<p dir="ltr">this is core chang, it would be nice if you can describe current delete algorithm and your new approach, what it solves, pros and cons</p>
<p dir="ltr">thanks<br>
Yuan</p>
<div class="gmail_quote">2013-12-26 PM2:54于 "Hitoshi Mitake" <<a href="mailto:mitake.hitoshi@lab.ntt.co.jp">mitake.hitoshi@lab.ntt.co.jp</a>>写道:<br type="attribution"><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
Current code of vdi deletion process in sheep deamon is a real<br>
disaster. Especially, its irregular usage of work queue is terrible<br>
thing for maintenance. In addition, the vdi deletion process can<br>
potentially notify a completion of deletion to a dog process before<br>
actual completion.<br>
<br>
This patch cleans the VDI deletion code and removes the above bug.<br>
<br>
Signed-off-by: Hitoshi Mitake <<a href="mailto:mitake.hitoshi@lab.ntt.co.jp">mitake.hitoshi@lab.ntt.co.jp</a>><br>
---<br>
 sheep/vdi.c |  301 ++++++++++++++++++++++++++++++++++-------------------------<br>
 1 file changed, 176 insertions(+), 125 deletions(-)<br>
<br>
diff --git a/sheep/vdi.c b/sheep/vdi.c<br>
index eca4f43..49038c5 100644<br>
--- a/sheep/vdi.c<br>
+++ b/sheep/vdi.c<br>
@@ -754,22 +754,6 @@ int vdi_snapshot(const struct vdi_iocb *iocb, uint32_t *new_vid)<br>
                                  info.vid);<br>
 }<br>
<br>
-static int start_deletion(struct request *req, uint32_t vid);<br>
-<br>
-int vdi_delete(const struct vdi_iocb *iocb, struct request *req)<br>
-{<br>
-       struct vdi_info info;<br>
-       int ret;<br>
-<br>
-       ret = vdi_lookup(iocb, &info);<br>
-       if (ret != SD_RES_SUCCESS)<br>
-               goto out;<br>
-<br>
-       ret = start_deletion(req, info.vid);<br>
-out:<br>
-       return ret;<br>
-}<br>
-<br>
 int read_vdis(char *data, int len, unsigned int *rsp_len)<br>
 {<br>
        if (len != sizeof(sys->vdi_inuse))<br>
@@ -781,28 +765,35 @@ int read_vdis(char *data, int len, unsigned int *rsp_len)<br>
        return SD_RES_SUCCESS;<br>
 }<br>
<br>
-struct deletion_work {<br>
-       uint32_t done;<br>
+struct deletion_info {<br>
+       uint32_t target_vid;<br>
+<br>
+       int delete_vid_count;<br>
+       uint32_t *delete_vid_array;<br>
+<br>
+       int done_deletion;<br>
<br>
+       int finish_fd;<br>
+       struct list_node list;  /* connected deletion_info_queue */<br>
+};<br>
+<br>
+struct deletion_work {<br>
        struct work work;<br>
-       struct list_node list;<br>
-       struct request *req;<br>
<br>
        uint32_t vid;<br>
-<br>
-       int count;<br>
-       uint32_t *buf;<br>
+       struct deletion_info *di;<br>
 };<br>
<br>
-static LIST_HEAD(deletion_work_list);<br>
+static pthread_mutex_t deletion_info_queue_lock = PTHREAD_MUTEX_INITIALIZER;<br>
+static LIST_HEAD(deletion_info_queue);<br>
<br>
-static int delete_inode(struct deletion_work *dw)<br>
+static int delete_inode(uint32_t vid)<br>
 {<br>
        struct sd_inode *inode = NULL;<br>
        int ret = SD_RES_SUCCESS;<br>
<br>
        inode = xzalloc(sizeof(*inode));<br>
-       ret = sd_read_object(vid_to_vdi_oid(dw->vid), (char *)inode,<br>
+       ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode,<br>
                             SD_INODE_HEADER_SIZE, 0);<br>
        if (ret != SD_RES_SUCCESS) {<br>
                ret = SD_RES_EIO;<br>
@@ -811,7 +802,7 @@ static int delete_inode(struct deletion_work *dw)<br>
<br>
        memset(inode->name, 0, sizeof(inode->name));<br>
<br>
-       ret = sd_write_object(vid_to_vdi_oid(dw->vid), (char *)inode,<br>
+       ret = sd_write_object(vid_to_vdi_oid(vid), (char *)inode,<br>
                              SD_INODE_HEADER_SIZE, 0, false);<br>
        if (ret != 0) {<br>
                ret = SD_RES_EIO;<br>
@@ -871,16 +862,15 @@ static void delete_cb(void *data, enum btree_node_type type, void *arg)<br>
        }<br>
 }<br>
<br>
-static void delete_one(struct work *work)<br>
+static void delete_one_vdi(struct work *work)<br>
 {<br>
-       struct deletion_work *dw = container_of(work, struct deletion_work, work);<br>
-       uint32_t vdi_id = *(dw->buf + dw->count - dw->done - 1);<br>
+       struct deletion_work *dw =<br>
+               container_of(work, struct deletion_work, work);<br>
+       uint32_t vdi_id = dw->vid;<br>
        int ret;<br>
        uint32_t i, nr_deleted, nr_objs;<br>
        struct sd_inode *inode = NULL;<br>
<br>
-       sd_debug("%d %d, %16x", dw->done, dw->count, vdi_id);<br>
-<br>
        inode = malloc(sizeof(*inode));<br>
        if (!inode) {<br>
                sd_err("failed to allocate memory");<br>
@@ -942,74 +932,111 @@ out:<br>
        free(inode);<br>
 }<br>
<br>
-static void delete_one_done(struct work *work)<br>
+static void delete_one_vdi_done(struct work *work);<br>
+<br>
+static struct deletion_work *create_next_deletion_work(struct deletion_info *di)<br>
 {<br>
-       struct deletion_work *dw = container_of(work, struct deletion_work, work);<br>
-       struct request *req = dw->req;<br>
+       struct deletion_work *dw = NULL;<br>
<br>
-       dw->done++;<br>
-       if (dw->done < dw->count) {<br>
-               queue_work(sys->deletion_wqueue, &dw->work);<br>
-               return;<br>
+       if (!(di->done_deletion < di->delete_vid_count))<br>
+               /* no more deletion work, this deletion info is completed */<br>
+               return NULL;<br>
+<br>
+       sd_debug("done_deletion: %d, delete_vid_count: %d, next vid: %"PRIu32,<br>
+                di->done_deletion, di->delete_vid_count,<br>
+                di->delete_vid_array[di->done_deletion + 1]);<br>
+<br>
+       dw = xzalloc(sizeof(*dw));<br>
+       dw->work.fn = delete_one_vdi;<br>
+       dw->work.done = delete_one_vdi_done;<br>
+       dw->vid = di->delete_vid_array[di->done_deletion++];<br>
+<br>
+       dw->di = di;<br>
+<br>
+       return dw;<br>
+}<br>
+<br>
+static void delete_one_vdi_done(struct work *work)<br>
+{<br>
+       struct deletion_work *dw =<br>
+               container_of(work, struct deletion_work, work);<br>
+       struct deletion_work *next_dw;<br>
+       struct deletion_info *di = dw->di;<br>
+<br>
+       next_dw = create_next_deletion_work(di);<br>
+       if (next_dw) {<br>
+               sd_debug("next vid of deletion: %"PRIx32, next_dw->vid);<br>
+               queue_work(sys->deletion_wqueue, &next_dw->work);<br>
+<br>
+               goto out;<br>
        }<br>
<br>
-       list_del(&dw->list);<br>
+       eventfd_xwrite(di->finish_fd, 1);<br>
<br>
-       put_request(req);<br>
+       /* the deletion info is completed */<br>
+       free(di->delete_vid_array);<br>
+       free(di);<br>
<br>
-       free(dw->buf);<br>
-       free(dw);<br>
+       pthread_mutex_lock(&deletion_info_queue_lock);<br>
<br>
-       if (!list_empty(&deletion_work_list)) {<br>
-               dw = list_first_entry(&deletion_work_list,<br>
-                                     struct deletion_work, list);<br>
+       if (!list_empty(&deletion_info_queue)) {<br>
+               di = list_first_entry(&deletion_info_queue,<br>
+                                     struct deletion_info, list);<br>
+               list_del(&di->list);<br>
<br>
-               queue_work(sys->deletion_wqueue, &dw->work);<br>
+               next_dw = create_next_deletion_work(di);<br>
+               assert(next_dw);<br>
+               sd_debug("next vid of deletion: %"PRIx32, next_dw->vid);<br>
+               queue_work(sys->deletion_wqueue, &next_dw->work);<br>
        }<br>
+<br>
+       pthread_mutex_unlock(&deletion_info_queue_lock);<br>
+<br>
+out:<br>
+       free(dw);<br>
 }<br>
<br>
-static int fill_vdi_list(struct deletion_work *dw, uint32_t root_vid)<br>
+static int fill_delete_vid_array(struct deletion_info *di, uint32_t root_vid)<br>
 {<br>
-       int ret, i;<br>
+       int ret = 0;<br>
        struct sd_inode *inode = NULL;<br>
-       int done = dw->count;<br>
+       int done = di->delete_vid_count;<br>
        uint32_t vid;<br>
<br>
        inode = malloc(SD_INODE_HEADER_SIZE);<br>
        if (!inode) {<br>
                sd_err("failed to allocate memory");<br>
-               goto err;<br>
+               return -1;<br>
        }<br>
<br>
-       dw->buf[dw->count++] = root_vid;<br>
-again:<br>
-       vid = dw->buf[done++];<br>
-       ret = read_backend_object(vid_to_vdi_oid(vid), (char *)inode,<br>
-                                 SD_INODE_HEADER_SIZE, 0);<br>
+       di->delete_vid_array[di->delete_vid_count++] = root_vid;<br>
<br>
-       if (ret != SD_RES_SUCCESS) {<br>
-               sd_err("cannot find VDI object");<br>
-               goto err;<br>
-       }<br>
+       do {<br>
+               vid = di->delete_vid_array[done++];<br>
+               ret = read_backend_object(vid_to_vdi_oid(vid), (char *)inode,<br>
+                                         SD_INODE_HEADER_SIZE, 0);<br>
+               if (ret != SD_RES_SUCCESS) {<br>
+                       sd_err("cannot find VDI object");<br>
+                       ret = -1;<br>
+                       break;<br>
+               }<br>
<br>
-       if (!vdi_is_deleted(inode) && vid != dw->vid)<br>
-               goto out;<br>
+               if (!vdi_is_deleted(inode) && vid != di->target_vid) {<br>
+                       ret = 1;<br>
+                       break;<br>
+               }<br>
<br>
-       for (i = 0; i < ARRAY_SIZE(inode->child_vdi_id); i++) {<br>
-               if (!inode->child_vdi_id[i])<br>
-                       continue;<br>
+               for (int i = 0; i < ARRAY_SIZE(inode->child_vdi_id); i++) {<br>
+                       if (!inode->child_vdi_id[i])<br>
+                               continue;<br>
<br>
-               dw->buf[dw->count++] = inode->child_vdi_id[i];<br>
-       }<br>
+                       di->delete_vid_array[di->delete_vid_count++] =<br>
+                               inode->child_vdi_id[i];<br>
+               }<br>
+       } while (di->delete_vid_array[done]);<br>
<br>
-       if (dw->buf[done])<br>
-               goto again;<br>
-err:<br>
-       free(inode);<br>
-       return 0;<br>
-out:<br>
        free(inode);<br>
-       return 1;<br>
+       return ret;<br>
 }<br>
<br>
 static uint64_t get_vdi_root(uint32_t vid, bool *cloned)<br>
@@ -1022,100 +1049,124 @@ static uint64_t get_vdi_root(uint32_t vid, bool *cloned)<br>
        inode = malloc(SD_INODE_HEADER_SIZE);<br>
        if (!inode) {<br>
                sd_err("failed to allocate memory");<br>
-               vid = 0;<br>
-               goto out;<br>
-       }<br>
-next:<br>
-       ret = read_backend_object(vid_to_vdi_oid(vid), (char *)inode,<br>
-                                 SD_INODE_HEADER_SIZE, 0);<br>
-<br>
-       if (vid == inode->vdi_id && inode->snap_id == 1<br>
-           && inode->parent_vdi_id != 0<br>
-           && !inode->snap_ctime) {<br>
-               sd_debug("vdi %" PRIx32 " is a cloned vdi.", vid);<br>
-               /* current vdi is a cloned vdi */<br>
-               *cloned = true;<br>
+               return 0;<br>
        }<br>
<br>
-       if (ret != SD_RES_SUCCESS) {<br>
-               sd_err("cannot find VDI object");<br>
-               vid = 0;<br>
-               goto out;<br>
-       }<br>
+       do {<br>
+               ret = read_backend_object(vid_to_vdi_oid(vid), (char *)inode,<br>
+                                         SD_INODE_HEADER_SIZE, 0);<br>
+               if (ret != SD_RES_SUCCESS) {<br>
+                       sd_err("cannot find VDI object");<br>
+                       vid = 0;<br>
+                       break;<br>
+               }<br>
<br>
-       if (!inode->parent_vdi_id)<br>
-               goto out;<br>
+               if (vid == inode->vdi_id && inode->snap_id == 1<br>
+                   && inode->parent_vdi_id != 0 && !inode->snap_ctime) {<br>
+                       sd_debug("vdi %" PRIx32 " is a cloned vdi.", vid);<br>
+                       /* current vdi is a cloned vdi */<br>
+                       *cloned = true;<br>
+               }<br>
<br>
-       vid = inode->parent_vdi_id;<br>
+               if (!inode->parent_vdi_id)<br>
+                       break;<br>
<br>
-       goto next;<br>
-out:<br>
-       free(inode);<br>
+               vid = inode->parent_vdi_id;<br>
+       } while (true);<br>
<br>
+       free(inode);<br>
        return vid;<br>
 }<br>
<br>
 static int start_deletion(struct request *req, uint32_t vid)<br>
 {<br>
+       struct deletion_info *di = NULL;<br>
        struct deletion_work *dw = NULL;<br>
-       int ret = SD_RES_NO_MEM;<br>
+       int ret = SD_RES_SUCCESS;<br>
        bool cloned;<br>
        uint32_t root_vid;<br>
<br>
-       dw = xzalloc(sizeof(struct deletion_work));<br>
-       /* buf is to store vdi id of every object */<br>
-       dw->buf = xzalloc(SD_INODE_SIZE - SD_INODE_HEADER_SIZE);<br>
-       dw->count = 0;<br>
-       dw->vid = vid;<br>
-       dw->req = req;<br>
-<br>
-       dw->work.fn = delete_one;<br>
-       dw->work.done = delete_one_done;<br>
+       di = xzalloc(sizeof(*di));<br>
+       di->delete_vid_array = xzalloc(SD_INODE_SIZE - SD_INODE_HEADER_SIZE);<br>
+       di->done_deletion = di->delete_vid_count = 0;<br>
+       di->target_vid = vid;<br>
+       di->finish_fd = eventfd(0, 0);<br>
+       if (di->finish_fd < 0) {<br>
+               sd_err("cannot create an eventfd for notifying finish of"<br>
+                      " deletion info: %m");<br>
+               goto out;<br>
+       }<br>
+       INIT_LIST_NODE(&di->list);<br>
<br>
-       root_vid = get_vdi_root(dw->vid, &cloned);<br>
+       root_vid = get_vdi_root(di->target_vid, &cloned);<br>
        if (!root_vid) {<br>
                ret = SD_RES_EIO;<br>
                goto out;<br>
        }<br>
<br>
-       ret = fill_vdi_list(dw, root_vid);<br>
-       if (ret) {<br>
+       ret = fill_delete_vid_array(di, root_vid);<br>
+       if (ret < 0) {<br>
+               ret = SD_RES_EIO;<br>
+               goto out;<br>
+       } else if (ret == 1) {<br>
                /*<br>
                 * if the VDI is a cloned VDI, delete its objects<br>
                 * no matter whether the VDI tree is clear.<br>
                 */<br>
                if (cloned) {<br>
-                       dw->buf[0] = vid;<br>
-                       dw->count = 1;<br>
+                       di->delete_vid_array[0] = vid;<br>
+                       di->delete_vid_count = 1;<br>
                } else {<br>
                        sd_debug("snapshot chain has valid vdi, just mark vdi %"<br>
-                                PRIx32 " as deleted.", dw->vid);<br>
-                       delete_inode(dw);<br>
+                                PRIx32 " as deleted.", di->target_vid);<br>
+                       delete_inode(di->target_vid);<br>
                        ret = SD_RES_SUCCESS;<br>
                        goto out;<br>
                }<br>
        }<br>
<br>
-       sd_debug("%d", dw->count);<br>
+       sd_debug("number of VDI deletion: %d", di->delete_vid_count);<br>
<br>
-       ret = SD_RES_SUCCESS;<br>
-       if (dw->count == 0)<br>
+       if (di->delete_vid_count == 0)<br>
                goto out;<br>
<br>
-       refcount_inc(&req->refcnt);<br>
+       pthread_mutex_lock(&deletion_info_queue_lock);<br>
<br>
-       if (list_empty(&deletion_work_list)) {<br>
-               list_add_tail(&dw->list, &deletion_work_list);<br>
+       if (list_empty(&deletion_info_queue)) {<br>
+               dw = create_next_deletion_work(di);<br>
+               assert(dw);<br>
                queue_work(sys->deletion_wqueue, &dw->work);<br>
        } else<br>
-               list_add_tail(&dw->list, &deletion_work_list);<br>
+               list_add_tail(&di->list, &deletion_info_queue);<br>
+<br>
+       pthread_mutex_unlock(&deletion_info_queue_lock);<br>
+<br>
+       /*<br>
+        * the event fd is written by delete_one_vdi_done(), when all vdis of<br>
+        * deletion_info are deleted<br>
+        */<br>
+       eventfd_xread(di->finish_fd);<br>
<br>
        return ret;<br>
 out:<br>
-       if (dw)<br>
-               free(dw->buf);<br>
-       free(dw);<br>
+       if (di)<br>
+               free(di->delete_vid_array);<br>
+       free(di);<br>
+<br>
+       return ret;<br>
+}<br>
<br>
+int vdi_delete(const struct vdi_iocb *iocb, struct request *req)<br>
+{<br>
+       struct vdi_info info;<br>
+       int ret;<br>
+<br>
+       ret = vdi_lookup(iocb, &info);<br>
+       if (ret != SD_RES_SUCCESS)<br>
+               goto out;<br>
+<br>
+       ret = start_deletion(req, info.vid);<br>
+out:<br>
        return ret;<br>
 }<br>
<br>
--<br>
1.7.10.4<br>
<br>
--<br>
sheepdog mailing list<br>
<a href="mailto:sheepdog@lists.wpkg.org">sheepdog@lists.wpkg.org</a><br>
<a href="http://lists.wpkg.org/mailman/listinfo/sheepdog" target="_blank">http://lists.wpkg.org/mailman/listinfo/sheepdog</a><br>
</blockquote></div>