[sheepdog] [PATCH v2] sheep: cleaning vdi deletion process of sheep daemon
Hitoshi Mitake
mitake.hitoshi at lab.ntt.co.jp
Fri Dec 27 08:53:57 CET 2013
Current code of vdi deletion process in sheep deamon is a real
disaster. Especially, its irregular usage of work queue is terrible
thing for maintenance. In addition, the vdi deletion process can
potentially notify a completion of deletion to a dog process before
actual completion.
This patch cleans the VDI deletion code and removes the above bug.
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
v2: change an comparison operator
sheep/vdi.c | 301 ++++++++++++++++++++++++++++++++++-------------------------
1 file changed, 176 insertions(+), 125 deletions(-)
diff --git a/sheep/vdi.c b/sheep/vdi.c
index f8b6bac..ff35e92 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -754,22 +754,6 @@ int vdi_snapshot(const struct vdi_iocb *iocb, uint32_t *new_vid)
info.vid);
}
-static int start_deletion(struct request *req, uint32_t vid);
-
-int vdi_delete(const struct vdi_iocb *iocb, struct request *req)
-{
- struct vdi_info info;
- int ret;
-
- ret = vdi_lookup(iocb, &info);
- if (ret != SD_RES_SUCCESS)
- goto out;
-
- ret = start_deletion(req, info.vid);
-out:
- return ret;
-}
-
int read_vdis(char *data, int len, unsigned int *rsp_len)
{
if (len != sizeof(sys->vdi_inuse))
@@ -781,28 +765,35 @@ int read_vdis(char *data, int len, unsigned int *rsp_len)
return SD_RES_SUCCESS;
}
-struct deletion_work {
- uint32_t done;
+struct deletion_info {
+ uint32_t target_vid;
+
+ int delete_vid_count;
+ uint32_t *delete_vid_array;
+
+ int done_deletion;
+ int finish_fd;
+ struct list_node list; /* connected deletion_info_queue */
+};
+
+struct deletion_work {
struct work work;
- struct list_node list;
- struct request *req;
uint32_t vid;
-
- int count;
- uint32_t *buf;
+ struct deletion_info *di;
};
-static LIST_HEAD(deletion_work_list);
+static pthread_mutex_t deletion_info_queue_lock = PTHREAD_MUTEX_INITIALIZER;
+static LIST_HEAD(deletion_info_queue);
-static int delete_inode(struct deletion_work *dw)
+static int delete_inode(uint32_t vid)
{
struct sd_inode *inode = NULL;
int ret = SD_RES_SUCCESS;
inode = xzalloc(sizeof(*inode));
- ret = sd_read_object(vid_to_vdi_oid(dw->vid), (char *)inode,
+ ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode,
SD_INODE_HEADER_SIZE, 0);
if (ret != SD_RES_SUCCESS) {
ret = SD_RES_EIO;
@@ -811,7 +802,7 @@ static int delete_inode(struct deletion_work *dw)
memset(inode->name, 0, sizeof(inode->name));
- ret = sd_write_object(vid_to_vdi_oid(dw->vid), (char *)inode,
+ ret = sd_write_object(vid_to_vdi_oid(vid), (char *)inode,
SD_INODE_HEADER_SIZE, 0, false);
if (ret != 0) {
ret = SD_RES_EIO;
@@ -871,16 +862,15 @@ static void delete_cb(void *data, enum btree_node_type type, void *arg)
}
}
-static void delete_one(struct work *work)
+static void delete_one_vdi(struct work *work)
{
- struct deletion_work *dw = container_of(work, struct deletion_work, work);
- uint32_t vdi_id = *(dw->buf + dw->count - dw->done - 1);
+ struct deletion_work *dw =
+ container_of(work, struct deletion_work, work);
+ uint32_t vdi_id = dw->vid;
int ret;
uint32_t i, nr_deleted, nr_objs;
struct sd_inode *inode = NULL;
- sd_debug("%d %d, %16x", dw->done, dw->count, vdi_id);
-
inode = malloc(sizeof(*inode));
if (!inode) {
sd_err("failed to allocate memory");
@@ -942,74 +932,111 @@ out:
free(inode);
}
-static void delete_one_done(struct work *work)
+static void delete_one_vdi_done(struct work *work);
+
+static struct deletion_work *create_next_deletion_work(struct deletion_info *di)
{
- struct deletion_work *dw = container_of(work, struct deletion_work, work);
- struct request *req = dw->req;
+ struct deletion_work *dw = NULL;
- dw->done++;
- if (dw->done < dw->count) {
- queue_work(sys->deletion_wqueue, &dw->work);
- return;
+ if (di->done_deletion >= di->delete_vid_count)
+ /* no more deletion work, this deletion info is completed */
+ return NULL;
+
+ sd_debug("done_deletion: %d, delete_vid_count: %d, next vid: %"PRIu32,
+ di->done_deletion, di->delete_vid_count,
+ di->delete_vid_array[di->done_deletion + 1]);
+
+ dw = xzalloc(sizeof(*dw));
+ dw->work.fn = delete_one_vdi;
+ dw->work.done = delete_one_vdi_done;
+ dw->vid = di->delete_vid_array[di->done_deletion++];
+
+ dw->di = di;
+
+ return dw;
+}
+
+static void delete_one_vdi_done(struct work *work)
+{
+ struct deletion_work *dw =
+ container_of(work, struct deletion_work, work);
+ struct deletion_work *next_dw;
+ struct deletion_info *di = dw->di;
+
+ next_dw = create_next_deletion_work(di);
+ if (next_dw) {
+ sd_debug("next vid of deletion: %"PRIx32, next_dw->vid);
+ queue_work(sys->deletion_wqueue, &next_dw->work);
+
+ goto out;
}
- list_del(&dw->list);
+ eventfd_xwrite(di->finish_fd, 1);
- put_request(req);
+ /* the deletion info is completed */
+ free(di->delete_vid_array);
+ free(di);
- free(dw->buf);
- free(dw);
+ pthread_mutex_lock(&deletion_info_queue_lock);
- if (!list_empty(&deletion_work_list)) {
- dw = list_first_entry(&deletion_work_list,
- struct deletion_work, list);
+ if (!list_empty(&deletion_info_queue)) {
+ di = list_first_entry(&deletion_info_queue,
+ struct deletion_info, list);
+ list_del(&di->list);
- queue_work(sys->deletion_wqueue, &dw->work);
+ next_dw = create_next_deletion_work(di);
+ assert(next_dw);
+ sd_debug("next vid of deletion: %"PRIx32, next_dw->vid);
+ queue_work(sys->deletion_wqueue, &next_dw->work);
}
+
+ pthread_mutex_unlock(&deletion_info_queue_lock);
+
+out:
+ free(dw);
}
-static int fill_vdi_list(struct deletion_work *dw, uint32_t root_vid)
+static int fill_delete_vid_array(struct deletion_info *di, uint32_t root_vid)
{
- int ret, i;
+ int ret = 0;
struct sd_inode *inode = NULL;
- int done = dw->count;
+ int done = di->delete_vid_count;
uint32_t vid;
inode = malloc(SD_INODE_HEADER_SIZE);
if (!inode) {
sd_err("failed to allocate memory");
- goto err;
+ return -1;
}
- dw->buf[dw->count++] = root_vid;
-again:
- vid = dw->buf[done++];
- ret = read_backend_object(vid_to_vdi_oid(vid), (char *)inode,
- SD_INODE_HEADER_SIZE, 0);
+ di->delete_vid_array[di->delete_vid_count++] = root_vid;
- if (ret != SD_RES_SUCCESS) {
- sd_err("cannot find VDI object");
- goto err;
- }
+ do {
+ vid = di->delete_vid_array[done++];
+ ret = read_backend_object(vid_to_vdi_oid(vid), (char *)inode,
+ SD_INODE_HEADER_SIZE, 0);
+ if (ret != SD_RES_SUCCESS) {
+ sd_err("cannot find VDI object");
+ ret = -1;
+ break;
+ }
- if (!vdi_is_deleted(inode) && vid != dw->vid)
- goto out;
+ if (!vdi_is_deleted(inode) && vid != di->target_vid) {
+ ret = 1;
+ break;
+ }
- for (i = 0; i < ARRAY_SIZE(inode->child_vdi_id); i++) {
- if (!inode->child_vdi_id[i])
- continue;
+ for (int i = 0; i < ARRAY_SIZE(inode->child_vdi_id); i++) {
+ if (!inode->child_vdi_id[i])
+ continue;
- dw->buf[dw->count++] = inode->child_vdi_id[i];
- }
+ di->delete_vid_array[di->delete_vid_count++] =
+ inode->child_vdi_id[i];
+ }
+ } while (di->delete_vid_array[done]);
- if (dw->buf[done])
- goto again;
-err:
- free(inode);
- return 0;
-out:
free(inode);
- return 1;
+ return ret;
}
static uint64_t get_vdi_root(uint32_t vid, bool *cloned)
@@ -1022,100 +1049,124 @@ static uint64_t get_vdi_root(uint32_t vid, bool *cloned)
inode = malloc(SD_INODE_HEADER_SIZE);
if (!inode) {
sd_err("failed to allocate memory");
- vid = 0;
- goto out;
- }
-next:
- ret = read_backend_object(vid_to_vdi_oid(vid), (char *)inode,
- SD_INODE_HEADER_SIZE, 0);
-
- if (vid == inode->vdi_id && inode->snap_id == 1
- && inode->parent_vdi_id != 0
- && !inode->snap_ctime) {
- sd_debug("vdi %" PRIx32 " is a cloned vdi.", vid);
- /* current vdi is a cloned vdi */
- *cloned = true;
+ return 0;
}
- if (ret != SD_RES_SUCCESS) {
- sd_err("cannot find VDI object");
- vid = 0;
- goto out;
- }
+ do {
+ ret = read_backend_object(vid_to_vdi_oid(vid), (char *)inode,
+ SD_INODE_HEADER_SIZE, 0);
+ if (ret != SD_RES_SUCCESS) {
+ sd_err("cannot find VDI object");
+ vid = 0;
+ break;
+ }
- if (!inode->parent_vdi_id)
- goto out;
+ if (vid == inode->vdi_id && inode->snap_id == 1
+ && inode->parent_vdi_id != 0 && !inode->snap_ctime) {
+ sd_debug("vdi %" PRIx32 " is a cloned vdi.", vid);
+ /* current vdi is a cloned vdi */
+ *cloned = true;
+ }
- vid = inode->parent_vdi_id;
+ if (!inode->parent_vdi_id)
+ break;
- goto next;
-out:
- free(inode);
+ vid = inode->parent_vdi_id;
+ } while (true);
+ free(inode);
return vid;
}
static int start_deletion(struct request *req, uint32_t vid)
{
+ struct deletion_info *di = NULL;
struct deletion_work *dw = NULL;
- int ret = SD_RES_NO_MEM;
+ int ret = SD_RES_SUCCESS;
bool cloned;
uint32_t root_vid;
- dw = xzalloc(sizeof(struct deletion_work));
- /* buf is to store vdi id of every object */
- dw->buf = xzalloc(SD_INODE_SIZE - SD_INODE_HEADER_SIZE);
- dw->count = 0;
- dw->vid = vid;
- dw->req = req;
-
- dw->work.fn = delete_one;
- dw->work.done = delete_one_done;
+ di = xzalloc(sizeof(*di));
+ di->delete_vid_array = xzalloc(SD_INODE_SIZE - SD_INODE_HEADER_SIZE);
+ di->done_deletion = di->delete_vid_count = 0;
+ di->target_vid = vid;
+ di->finish_fd = eventfd(0, 0);
+ if (di->finish_fd < 0) {
+ sd_err("cannot create an eventfd for notifying finish of"
+ " deletion info: %m");
+ goto out;
+ }
+ INIT_LIST_NODE(&di->list);
- root_vid = get_vdi_root(dw->vid, &cloned);
+ root_vid = get_vdi_root(di->target_vid, &cloned);
if (!root_vid) {
ret = SD_RES_EIO;
goto out;
}
- ret = fill_vdi_list(dw, root_vid);
- if (ret) {
+ ret = fill_delete_vid_array(di, root_vid);
+ if (ret < 0) {
+ ret = SD_RES_EIO;
+ goto out;
+ } else if (ret == 1) {
/*
* if the VDI is a cloned VDI, delete its objects
* no matter whether the VDI tree is clear.
*/
if (cloned) {
- dw->buf[0] = vid;
- dw->count = 1;
+ di->delete_vid_array[0] = vid;
+ di->delete_vid_count = 1;
} else {
sd_debug("snapshot chain has valid vdi, just mark vdi %"
- PRIx32 " as deleted.", dw->vid);
- delete_inode(dw);
+ PRIx32 " as deleted.", di->target_vid);
+ delete_inode(di->target_vid);
ret = SD_RES_SUCCESS;
goto out;
}
}
- sd_debug("%d", dw->count);
+ sd_debug("number of VDI deletion: %d", di->delete_vid_count);
- ret = SD_RES_SUCCESS;
- if (dw->count == 0)
+ if (di->delete_vid_count == 0)
goto out;
- refcount_inc(&req->refcnt);
+ pthread_mutex_lock(&deletion_info_queue_lock);
- if (list_empty(&deletion_work_list)) {
- list_add_tail(&dw->list, &deletion_work_list);
+ if (list_empty(&deletion_info_queue)) {
+ dw = create_next_deletion_work(di);
+ assert(dw);
queue_work(sys->deletion_wqueue, &dw->work);
} else
- list_add_tail(&dw->list, &deletion_work_list);
+ list_add_tail(&di->list, &deletion_info_queue);
+
+ pthread_mutex_unlock(&deletion_info_queue_lock);
+
+ /*
+ * the event fd is written by delete_one_vdi_done(), when all vdis of
+ * deletion_info are deleted
+ */
+ eventfd_xread(di->finish_fd);
return ret;
out:
- if (dw)
- free(dw->buf);
- free(dw);
+ if (di)
+ free(di->delete_vid_array);
+ free(di);
+
+ return ret;
+}
+int vdi_delete(const struct vdi_iocb *iocb, struct request *req)
+{
+ struct vdi_info info;
+ int ret;
+
+ ret = vdi_lookup(iocb, &info);
+ if (ret != SD_RES_SUCCESS)
+ goto out;
+
+ ret = start_deletion(req, info.vid);
+out:
return ret;
}
--
1.7.10.4
More information about the sheepdog
mailing list