[sheepdog] [PATCH v10 03/19] sheep: decrement generational reference count on vdi deletion

Hitoshi Mitake mitake.hitoshi at gmail.com
Mon Jun 2 17:08:57 CEST 2014


This removes the old vdi deletion code, which reclaimed objects only
when all relevant snapshots were deleted, and replaces it with a
generational reference counting algorithm.

Cc: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
Tested-by: Valerio Pachera <sirio81 at gmail.com>
Cc: Alessandro Bolgia <alessandro at extensys.it>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 sheep/vdi.c | 272 ++++++------------------------------------------------------
 1 file changed, 27 insertions(+), 245 deletions(-)

diff --git a/sheep/vdi.c b/sheep/vdi.c
index 6b51672..8204bae 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -810,41 +810,11 @@ int read_vdis(char *data, int len, unsigned int *rsp_len)
 
 struct deletion_work {
 	struct work work;
-
 	uint32_t target_vid;
-	int delete_vid_count;
-	uint32_t *delete_vid_array;
-
+	bool succeed;
 	int finish_fd;		/* eventfd for notifying finish */
 };
 
-static int delete_inode(uint32_t vid)
-{
-	struct sd_inode *inode = NULL;
-	int ret = SD_RES_SUCCESS;
-
-	inode = xzalloc(sizeof(*inode));
-	ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode,
-			     SD_INODE_HEADER_SIZE, 0);
-	if (ret != SD_RES_SUCCESS) {
-		ret = SD_RES_EIO;
-		goto out;
-	}
-
-	memset(inode->name, 0, sizeof(inode->name));
-
-	ret = sd_write_object(vid_to_vdi_oid(vid), (char *)inode,
-			      SD_INODE_HEADER_SIZE, 0, false);
-	if (ret != 0) {
-		ret = SD_RES_EIO;
-		goto out;
-	}
-
-out:
-	free(inode);
-	return ret;
-}
-
 static int notify_vdi_deletion(uint32_t vdi_id)
 {
 	struct sd_req hdr;
@@ -888,16 +858,20 @@ static void delete_cb(struct sd_index *idx, void *arg, int ignore)
 	}
 }
 
-static int delete_one_vdi(uint32_t vdi_id)
+static void delete_vdi_work(struct work *work)
 {
+	struct deletion_work *dw =
+		container_of(work, struct deletion_work, work);
 	int ret = 0;
 	uint32_t i, nr_deleted, nr_objs;
 	struct sd_inode *inode = NULL;
+	uint32_t vdi_id = dw->target_vid;
 
 	inode = malloc(sizeof(*inode));
 	if (!inode) {
 		sd_err("failed to allocate memory");
-		return -1;
+		dw->succeed = false;
+		return;
 	}
 
 	ret = read_backend_object(vid_to_vdi_oid(vdi_id),
@@ -905,7 +879,7 @@ static int delete_one_vdi(uint32_t vdi_id)
 
 	if (ret != SD_RES_SUCCESS) {
 		sd_err("cannot find VDI object");
-		ret = -1;
+		dw->succeed = false;
 		goto out;
 	}
 
@@ -922,21 +896,20 @@ static int delete_one_vdi(uint32_t vdi_id)
 				continue;
 
 			oid = vid_to_data_oid(vid, i);
-
-			if (vid != inode->vdi_id) {
-				sd_debug("object %" PRIx64 " is base's data, "
-					 "would not be deleted.", oid);
-				continue;
-			}
-
-			ret = sd_remove_object(oid);
+			ret = sd_dec_object_refcnt(oid,
+						inode->gref[i].generation,
+						inode->gref[i].count);
 			if (ret != SD_RES_SUCCESS)
-				sd_err("remove object %" PRIx64 " fail, %d",
+				sd_err("discard ref %" PRIx64 " fail, %d",
 				       oid, ret);
 
 			nr_deleted++;
 		}
 	} else {
+		/*
+		 * todo: generational reference counting is not supported by
+		 * hypervolume yet
+		 */
 		struct delete_arg arg = {inode, &nr_deleted};
 		sd_inode_index_walk(inode, delete_cb, &arg);
 	}
@@ -946,6 +919,8 @@ static int delete_one_vdi(uint32_t vdi_id)
 
 	inode->vdi_size = 0;
 	memset(inode->name, 0, sizeof(inode->name));
+	memset((char *)inode + SD_INODE_HEADER_SIZE, 0,
+	       SD_INODE_SIZE - SD_INODE_HEADER_SIZE);
 
 	sd_write_object(vid_to_vdi_oid(vdi_id), (void *)inode,
 			sizeof(*inode), 0, false);
@@ -954,173 +929,27 @@ static int delete_one_vdi(uint32_t vdi_id)
 		notify_vdi_deletion(vdi_id);
 out:
 	free(inode);
-	return ret;
+	dw->succeed = true;
 }
 
-static void delete_vdis_work(struct work *work)
-{
-	struct deletion_work *dw =
-		container_of(work, struct deletion_work, work);
-
-	for (int i = 0; i < dw->delete_vid_count; i++) {
-		int ret;
-
-		ret = delete_one_vdi(dw->delete_vid_array[i]);
-		if (ret < 0)
-			sd_err("deleting VDI %x failed",
-			       dw->delete_vid_array[i]);
-	}
-}
-
-static void delete_vdis_done(struct work *work)
+static void delete_vdi_done(struct work *work)
 {
 	struct deletion_work *dw =
 		container_of(work, struct deletion_work, work);
 
 	eventfd_xwrite(dw->finish_fd, 1);
-
-	/* the deletion info is completed */
-	free(dw->delete_vid_array);
+	if (!dw->succeed)
+		sd_err("deleting vdi: %x failed", dw->target_vid);
+	/* the deletion work is completed */
 	free(dw);
 }
 
-static int fill_delete_vid_array(struct deletion_work *dw, uint32_t root_vid)
-{
-	int ret = 0;
-	struct sd_inode *inode = NULL;
-	int done = 0;
-	uint32_t vid;
-
-	inode = malloc(SD_INODE_HEADER_SIZE);
-	if (!inode) {
-		sd_err("failed to allocate memory");
-		return -1;
-	}
-
-	dw->delete_vid_array[dw->delete_vid_count++] = root_vid;
-
-	do {
-		vid = dw->delete_vid_array[done++];
-		ret = read_backend_object(vid_to_vdi_oid(vid), (char *)inode,
-					  SD_INODE_HEADER_SIZE, 0);
-		if (ret != SD_RES_SUCCESS) {
-			sd_err("cannot find VDI object");
-			ret = -1;
-			break;
-		}
-
-		if (!vdi_is_deleted(inode) && vid != dw->target_vid) {
-			ret = 1;
-			break;
-		}
-
-		for (int i = 0; i < ARRAY_SIZE(inode->child_vdi_id); i++) {
-			if (!inode->child_vdi_id[i])
-				continue;
-
-			dw->delete_vid_array[dw->delete_vid_count++] =
-				inode->child_vdi_id[i];
-		}
-	} while (dw->delete_vid_array[done]);
-
-	free(inode);
-	return ret;
-}
-
-static uint64_t get_vdi_root(uint32_t vid, bool *cloned)
-{
-	int ret;
-	struct sd_inode *inode = NULL;
-
-	*cloned = false;
-
-	inode = malloc(SD_INODE_HEADER_SIZE);
-	if (!inode) {
-		sd_err("failed to allocate memory");
-		return 0;
-	}
-
-	do {
-		ret = read_backend_object(vid_to_vdi_oid(vid), (char *)inode,
-					  SD_INODE_HEADER_SIZE, 0);
-		if (ret != SD_RES_SUCCESS) {
-			sd_err("cannot find VDI object");
-			vid = 0;
-			break;
-		}
-
-		if (vid == inode->vdi_id && inode->snap_id == 1
-		    && inode->parent_vdi_id != 0 && !inode->snap_ctime) {
-			sd_debug("vdi %" PRIx32 " is a cloned vdi.", vid);
-			/* current vdi is a cloned vdi */
-			*cloned = true;
-		}
-
-		if (!inode->parent_vdi_id)
-			break;
-
-		vid = inode->parent_vdi_id;
-	} while (true);
-
-	free(inode);
-	return vid;
-}
-
-static void clear_parent_child_vdi(uint32_t vid)
-{
-	struct sd_inode * inode = xmalloc(SD_INODE_HEADER_SIZE);
-	uint32_t pvid, i;
-	int ret;
-
-	ret = read_backend_object(vid_to_vdi_oid(vid), (char *)inode,
-				  SD_INODE_HEADER_SIZE, 0);
-	if (ret != SD_RES_SUCCESS) {
-		sd_err("failed to read inode %"PRIx32, vid);
-		goto out;
-	}
-
-	pvid = inode->parent_vdi_id;
-	if (!pvid)
-		goto out;
-	ret = read_backend_object(vid_to_vdi_oid(pvid), (char *)inode,
-				  SD_INODE_HEADER_SIZE, 0);
-	if (ret != SD_RES_SUCCESS) {
-		sd_err("failed to read parent inode %"PRIx32, pvid);
-		goto out;
-	}
-
-	for (i = 0; i < MAX_CHILDREN; i++)
-		if (inode->child_vdi_id[i] == vid) {
-			inode->child_vdi_id[i] = 0;
-			break;
-		}
-
-	if (i == MAX_CHILDREN) {
-		sd_info("failed to find child %"PRIx32, vid);
-		goto out;
-	}
-
-	ret = sd_write_object(vid_to_vdi_oid(pvid), (char *)inode,
-			      SD_INODE_HEADER_SIZE, 0, false);
-	if (ret != SD_RES_SUCCESS) {
-		sd_err("failed to update parent %"PRIx32, pvid);
-		goto out;
-	}
-	sd_debug("parent %"PRIx32, pvid);
-out:
-	free(inode);
-}
-
 static int start_deletion(struct request *req, uint32_t vid)
 {
 	struct deletion_work *dw = NULL;
 	int ret = SD_RES_SUCCESS, finish_fd;
-	bool cloned;
-	uint32_t root_vid;
 
 	dw = xzalloc(sizeof(*dw));
-	dw->delete_vid_array = xzalloc(SD_INODE_SIZE - SD_INODE_HEADER_SIZE);
-	dw->delete_vid_count = 0;
 	dw->target_vid = vid;
 	finish_fd = dw->finish_fd = eventfd(0, 0);
 	if (dw->finish_fd < 0) {
@@ -1129,67 +958,20 @@ static int start_deletion(struct request *req, uint32_t vid)
 		goto out;
 	}
 
-	root_vid = get_vdi_root(dw->target_vid, &cloned);
-	if (!root_vid) {
-		ret = SD_RES_EIO;
-		goto out;
-	}
-
-	ret = fill_delete_vid_array(dw, root_vid);
-	if (ret < 0) {
-		ret = SD_RES_EIO;
-		goto out;
-	} else if (ret == 1) {
-		/*
-		 * if the VDI is a cloned VDI, delete its objects
-		 * no matter whether the VDI tree is clear.
-		 */
-		ret = SD_RES_SUCCESS;
-
-		if (cloned) {
-			dw->delete_vid_array[0] = vid;
-			dw->delete_vid_count = 1;
-			/*
-			 * FIXME:
-			 *
-			 * We can't clear snapshot's parent because it is not
-			 * removed. We only remove the whole snapshot chain. So
-			 * we can only create MAX_CHILDREN snapshots for one
-			 * base even if we later remove some of them.
-			 *
-			 * But for clone (writable snapshot, we can clear
-			 * parent for deletion, thus make room for new clones.
-			 */
-			clear_parent_child_vdi(vid);
-		} else {
-			sd_debug("snapshot chain has valid vdi, just mark vdi %"
-				 PRIx32 " as deleted.", dw->target_vid);
-			delete_inode(dw->target_vid);
-			goto out;
-		}
-	}
-
-	sd_debug("number of VDI deletion: %d", dw->delete_vid_count);
-
-	if (dw->delete_vid_count == 0)
-		goto out;
-
-	dw->work.fn = delete_vdis_work;
-	dw->work.done = delete_vdis_done;
+	dw->work.fn = delete_vdi_work;
+	dw->work.done = delete_vdi_done;
 
 	queue_work(sys->deletion_wqueue, &dw->work);
 
 	/*
-	 * the event fd is written by delete_one_vdi_done(), when all vdis of
-	 * deletion_work are deleted
+	 * the event fd is written by delete_vdi_done(), when all reference
+	 * counters are decremented
 	 */
 	eventfd_xread(finish_fd);
 	close(finish_fd);
 
 	return ret;
 out:
-	if (dw)
-		free(dw->delete_vid_array);
 	free(dw);
 
 	return ret;
-- 
1.9.1




More information about the sheepdog mailing list