[sheepdog] [PATCH] sheep: cleaning vdi deletion process of sheep daemon

Hitoshi Mitake mitake.hitoshi at lab.ntt.co.jp
Thu Dec 26 07:53:51 CET 2013


Current code of vdi deletion process in sheep deamon is a real
disaster. Especially, its irregular usage of work queue is terrible
thing for maintenance. In addition, the vdi deletion process can
potentially notify a completion of deletion to a dog process before
actual completion.

This patch cleans the VDI deletion code and removes the above bug.

Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 sheep/vdi.c |  301 ++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 176 insertions(+), 125 deletions(-)

diff --git a/sheep/vdi.c b/sheep/vdi.c
index eca4f43..49038c5 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -754,22 +754,6 @@ int vdi_snapshot(const struct vdi_iocb *iocb, uint32_t *new_vid)
 				  info.vid);
 }
 
-static int start_deletion(struct request *req, uint32_t vid);
-
-int vdi_delete(const struct vdi_iocb *iocb, struct request *req)
-{
-	struct vdi_info info;
-	int ret;
-
-	ret = vdi_lookup(iocb, &info);
-	if (ret != SD_RES_SUCCESS)
-		goto out;
-
-	ret = start_deletion(req, info.vid);
-out:
-	return ret;
-}
-
 int read_vdis(char *data, int len, unsigned int *rsp_len)
 {
 	if (len != sizeof(sys->vdi_inuse))
@@ -781,28 +765,35 @@ int read_vdis(char *data, int len, unsigned int *rsp_len)
 	return SD_RES_SUCCESS;
 }
 
-struct deletion_work {
-	uint32_t done;
+struct deletion_info {
+	uint32_t target_vid;
+
+	int delete_vid_count;
+	uint32_t *delete_vid_array;
+
+	int done_deletion;
 
+	int finish_fd;
+	struct list_node list;	/* connected deletion_info_queue */
+};
+
+struct deletion_work {
 	struct work work;
-	struct list_node list;
-	struct request *req;
 
 	uint32_t vid;
-
-	int count;
-	uint32_t *buf;
+	struct deletion_info *di;
 };
 
-static LIST_HEAD(deletion_work_list);
+static pthread_mutex_t deletion_info_queue_lock = PTHREAD_MUTEX_INITIALIZER;
+static LIST_HEAD(deletion_info_queue);
 
-static int delete_inode(struct deletion_work *dw)
+static int delete_inode(uint32_t vid)
 {
 	struct sd_inode *inode = NULL;
 	int ret = SD_RES_SUCCESS;
 
 	inode = xzalloc(sizeof(*inode));
-	ret = sd_read_object(vid_to_vdi_oid(dw->vid), (char *)inode,
+	ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode,
 			     SD_INODE_HEADER_SIZE, 0);
 	if (ret != SD_RES_SUCCESS) {
 		ret = SD_RES_EIO;
@@ -811,7 +802,7 @@ static int delete_inode(struct deletion_work *dw)
 
 	memset(inode->name, 0, sizeof(inode->name));
 
-	ret = sd_write_object(vid_to_vdi_oid(dw->vid), (char *)inode,
+	ret = sd_write_object(vid_to_vdi_oid(vid), (char *)inode,
 			      SD_INODE_HEADER_SIZE, 0, false);
 	if (ret != 0) {
 		ret = SD_RES_EIO;
@@ -871,16 +862,15 @@ static void delete_cb(void *data, enum btree_node_type type, void *arg)
 	}
 }
 
-static void delete_one(struct work *work)
+static void delete_one_vdi(struct work *work)
 {
-	struct deletion_work *dw = container_of(work, struct deletion_work, work);
-	uint32_t vdi_id = *(dw->buf + dw->count - dw->done - 1);
+	struct deletion_work *dw =
+		container_of(work, struct deletion_work, work);
+	uint32_t vdi_id = dw->vid;
 	int ret;
 	uint32_t i, nr_deleted, nr_objs;
 	struct sd_inode *inode = NULL;
 
-	sd_debug("%d %d, %16x", dw->done, dw->count, vdi_id);
-
 	inode = malloc(sizeof(*inode));
 	if (!inode) {
 		sd_err("failed to allocate memory");
@@ -942,74 +932,111 @@ out:
 	free(inode);
 }
 
-static void delete_one_done(struct work *work)
+static void delete_one_vdi_done(struct work *work);
+
+static struct deletion_work *create_next_deletion_work(struct deletion_info *di)
 {
-	struct deletion_work *dw = container_of(work, struct deletion_work, work);
-	struct request *req = dw->req;
+	struct deletion_work *dw = NULL;
 
-	dw->done++;
-	if (dw->done < dw->count) {
-		queue_work(sys->deletion_wqueue, &dw->work);
-		return;
+	if (!(di->done_deletion < di->delete_vid_count))
+		/* no more deletion work, this deletion info is completed */
+		return NULL;
+
+	sd_debug("done_deletion: %d, delete_vid_count: %d, next vid: %"PRIu32,
+		 di->done_deletion, di->delete_vid_count,
+		 di->delete_vid_array[di->done_deletion + 1]);
+
+	dw = xzalloc(sizeof(*dw));
+	dw->work.fn = delete_one_vdi;
+	dw->work.done = delete_one_vdi_done;
+	dw->vid = di->delete_vid_array[di->done_deletion++];
+
+	dw->di = di;
+
+	return dw;
+}
+
+static void delete_one_vdi_done(struct work *work)
+{
+	struct deletion_work *dw =
+		container_of(work, struct deletion_work, work);
+	struct deletion_work *next_dw;
+	struct deletion_info *di = dw->di;
+
+	next_dw = create_next_deletion_work(di);
+	if (next_dw) {
+		sd_debug("next vid of deletion: %"PRIx32, next_dw->vid);
+		queue_work(sys->deletion_wqueue, &next_dw->work);
+
+		goto out;
 	}
 
-	list_del(&dw->list);
+	eventfd_xwrite(di->finish_fd, 1);
 
-	put_request(req);
+	/* the deletion info is completed */
+	free(di->delete_vid_array);
+	free(di);
 
-	free(dw->buf);
-	free(dw);
+	pthread_mutex_lock(&deletion_info_queue_lock);
 
-	if (!list_empty(&deletion_work_list)) {
-		dw = list_first_entry(&deletion_work_list,
-				      struct deletion_work, list);
+	if (!list_empty(&deletion_info_queue)) {
+		di = list_first_entry(&deletion_info_queue,
+				      struct deletion_info, list);
+		list_del(&di->list);
 
-		queue_work(sys->deletion_wqueue, &dw->work);
+		next_dw = create_next_deletion_work(di);
+		assert(next_dw);
+		sd_debug("next vid of deletion: %"PRIx32, next_dw->vid);
+		queue_work(sys->deletion_wqueue, &next_dw->work);
 	}
+
+	pthread_mutex_unlock(&deletion_info_queue_lock);
+
+out:
+	free(dw);
 }
 
-static int fill_vdi_list(struct deletion_work *dw, uint32_t root_vid)
+static int fill_delete_vid_array(struct deletion_info *di, uint32_t root_vid)
 {
-	int ret, i;
+	int ret = 0;
 	struct sd_inode *inode = NULL;
-	int done = dw->count;
+	int done = di->delete_vid_count;
 	uint32_t vid;
 
 	inode = malloc(SD_INODE_HEADER_SIZE);
 	if (!inode) {
 		sd_err("failed to allocate memory");
-		goto err;
+		return -1;
 	}
 
-	dw->buf[dw->count++] = root_vid;
-again:
-	vid = dw->buf[done++];
-	ret = read_backend_object(vid_to_vdi_oid(vid), (char *)inode,
-				  SD_INODE_HEADER_SIZE, 0);
+	di->delete_vid_array[di->delete_vid_count++] = root_vid;
 
-	if (ret != SD_RES_SUCCESS) {
-		sd_err("cannot find VDI object");
-		goto err;
-	}
+	do {
+		vid = di->delete_vid_array[done++];
+		ret = read_backend_object(vid_to_vdi_oid(vid), (char *)inode,
+					  SD_INODE_HEADER_SIZE, 0);
+		if (ret != SD_RES_SUCCESS) {
+			sd_err("cannot find VDI object");
+			ret = -1;
+			break;
+		}
 
-	if (!vdi_is_deleted(inode) && vid != dw->vid)
-		goto out;
+		if (!vdi_is_deleted(inode) && vid != di->target_vid) {
+			ret = 1;
+			break;
+		}
 
-	for (i = 0; i < ARRAY_SIZE(inode->child_vdi_id); i++) {
-		if (!inode->child_vdi_id[i])
-			continue;
+		for (int i = 0; i < ARRAY_SIZE(inode->child_vdi_id); i++) {
+			if (!inode->child_vdi_id[i])
+				continue;
 
-		dw->buf[dw->count++] = inode->child_vdi_id[i];
-	}
+			di->delete_vid_array[di->delete_vid_count++] =
+				inode->child_vdi_id[i];
+		}
+	} while (di->delete_vid_array[done]);
 
-	if (dw->buf[done])
-		goto again;
-err:
-	free(inode);
-	return 0;
-out:
 	free(inode);
-	return 1;
+	return ret;
 }
 
 static uint64_t get_vdi_root(uint32_t vid, bool *cloned)
@@ -1022,100 +1049,124 @@ static uint64_t get_vdi_root(uint32_t vid, bool *cloned)
 	inode = malloc(SD_INODE_HEADER_SIZE);
 	if (!inode) {
 		sd_err("failed to allocate memory");
-		vid = 0;
-		goto out;
-	}
-next:
-	ret = read_backend_object(vid_to_vdi_oid(vid), (char *)inode,
-				  SD_INODE_HEADER_SIZE, 0);
-
-	if (vid == inode->vdi_id && inode->snap_id == 1
-	    && inode->parent_vdi_id != 0
-	    && !inode->snap_ctime) {
-		sd_debug("vdi %" PRIx32 " is a cloned vdi.", vid);
-		/* current vdi is a cloned vdi */
-		*cloned = true;
+		return 0;
 	}
 
-	if (ret != SD_RES_SUCCESS) {
-		sd_err("cannot find VDI object");
-		vid = 0;
-		goto out;
-	}
+	do {
+		ret = read_backend_object(vid_to_vdi_oid(vid), (char *)inode,
+					  SD_INODE_HEADER_SIZE, 0);
+		if (ret != SD_RES_SUCCESS) {
+			sd_err("cannot find VDI object");
+			vid = 0;
+			break;
+		}
 
-	if (!inode->parent_vdi_id)
-		goto out;
+		if (vid == inode->vdi_id && inode->snap_id == 1
+		    && inode->parent_vdi_id != 0 && !inode->snap_ctime) {
+			sd_debug("vdi %" PRIx32 " is a cloned vdi.", vid);
+			/* current vdi is a cloned vdi */
+			*cloned = true;
+		}
 
-	vid = inode->parent_vdi_id;
+		if (!inode->parent_vdi_id)
+			break;
 
-	goto next;
-out:
-	free(inode);
+		vid = inode->parent_vdi_id;
+	} while (true);
 
+	free(inode);
 	return vid;
 }
 
 static int start_deletion(struct request *req, uint32_t vid)
 {
+	struct deletion_info *di = NULL;
 	struct deletion_work *dw = NULL;
-	int ret = SD_RES_NO_MEM;
+	int ret = SD_RES_SUCCESS;
 	bool cloned;
 	uint32_t root_vid;
 
-	dw = xzalloc(sizeof(struct deletion_work));
-	/* buf is to store vdi id of every object */
-	dw->buf = xzalloc(SD_INODE_SIZE - SD_INODE_HEADER_SIZE);
-	dw->count = 0;
-	dw->vid = vid;
-	dw->req = req;
-
-	dw->work.fn = delete_one;
-	dw->work.done = delete_one_done;
+	di = xzalloc(sizeof(*di));
+	di->delete_vid_array = xzalloc(SD_INODE_SIZE - SD_INODE_HEADER_SIZE);
+	di->done_deletion = di->delete_vid_count = 0;
+	di->target_vid = vid;
+	di->finish_fd = eventfd(0, 0);
+	if (di->finish_fd < 0) {
+		sd_err("cannot create an eventfd for notifying finish of"
+		       " deletion info: %m");
+		goto out;
+	}
+	INIT_LIST_NODE(&di->list);
 
-	root_vid = get_vdi_root(dw->vid, &cloned);
+	root_vid = get_vdi_root(di->target_vid, &cloned);
 	if (!root_vid) {
 		ret = SD_RES_EIO;
 		goto out;
 	}
 
-	ret = fill_vdi_list(dw, root_vid);
-	if (ret) {
+	ret = fill_delete_vid_array(di, root_vid);
+	if (ret < 0) {
+		ret = SD_RES_EIO;
+		goto out;
+	} else if (ret == 1) {
 		/*
 		 * if the VDI is a cloned VDI, delete its objects
 		 * no matter whether the VDI tree is clear.
 		 */
 		if (cloned) {
-			dw->buf[0] = vid;
-			dw->count = 1;
+			di->delete_vid_array[0] = vid;
+			di->delete_vid_count = 1;
 		} else {
 			sd_debug("snapshot chain has valid vdi, just mark vdi %"
-				 PRIx32 " as deleted.", dw->vid);
-			delete_inode(dw);
+				 PRIx32 " as deleted.", di->target_vid);
+			delete_inode(di->target_vid);
 			ret = SD_RES_SUCCESS;
 			goto out;
 		}
 	}
 
-	sd_debug("%d", dw->count);
+	sd_debug("number of VDI deletion: %d", di->delete_vid_count);
 
-	ret = SD_RES_SUCCESS;
-	if (dw->count == 0)
+	if (di->delete_vid_count == 0)
 		goto out;
 
-	refcount_inc(&req->refcnt);
+	pthread_mutex_lock(&deletion_info_queue_lock);
 
-	if (list_empty(&deletion_work_list)) {
-		list_add_tail(&dw->list, &deletion_work_list);
+	if (list_empty(&deletion_info_queue)) {
+		dw = create_next_deletion_work(di);
+		assert(dw);
 		queue_work(sys->deletion_wqueue, &dw->work);
 	} else
-		list_add_tail(&dw->list, &deletion_work_list);
+		list_add_tail(&di->list, &deletion_info_queue);
+
+	pthread_mutex_unlock(&deletion_info_queue_lock);
+
+	/*
+	 * the event fd is written by delete_one_vdi_done(), when all vdis of
+	 * deletion_info are deleted
+	 */
+	eventfd_xread(di->finish_fd);
 
 	return ret;
 out:
-	if (dw)
-		free(dw->buf);
-	free(dw);
+	if (di)
+		free(di->delete_vid_array);
+	free(di);
+
+	return ret;
+}
 
+int vdi_delete(const struct vdi_iocb *iocb, struct request *req)
+{
+	struct vdi_info info;
+	int ret;
+
+	ret = vdi_lookup(iocb, &info);
+	if (ret != SD_RES_SUCCESS)
+		goto out;
+
+	ret = start_deletion(req, info.vid);
+out:
 	return ret;
 }
 
-- 
1.7.10.4




More information about the sheepdog mailing list