On 05/04/2012 02:49 PM, levin li wrote: > Cluster recovery may cause objects migrated from one node > to another, but the object list cache doesn't change, when > deleting an object we can not find the right node in whose > cache the id stays, so we need to notify the deletion object > list to all the node to make them delete the specified > object id from object list cache. > > I reused the deletion_work_list and deletion_wqueue for the > object list cleanup work, so that we save some memory by not > starting a new work queue. > > The del_vdi is called in worker thread, but it's locked by the > notification mechanism, other operations about deletion_work_list > and deletion_wqueue are done in main thread, so there's no need > to concern about the race condition. > > Signed-off-by: levin li <xingke.lwp at taobao.com> > --- > include/sheep.h | 1 + > sheep/group.c | 1 + > sheep/object_list_cache.c | 17 +++++++++ > sheep/ops.c | 57 ++++++++++++++++++++++++++++++ > sheep/sheep_priv.h | 21 ++++++++++++ > sheep/vdi.c | 84 ++++++++++++++++++++++++++++++--------------- > 6 files changed, 154 insertions(+), 27 deletions(-) > > diff --git a/include/sheep.h b/include/sheep.h > index 7e287c4..e941dc1 100644 > --- a/include/sheep.h > +++ b/include/sheep.h > @@ -46,6 +46,7 @@ > #define SD_OP_TRACE 0x95 > #define SD_OP_TRACE_CAT 0x96 > #define SD_OP_STAT_RECOVERY 0x97 > +#define SD_OP_NOTIFY_VDI_DEL 0x98 > > #define SD_FLAG_CMD_IO_LOCAL 0x0010 > #define SD_FLAG_CMD_RECOVERY 0x0020 > diff --git a/sheep/group.c b/sheep/group.c > index c7fd387..1f1aa8d 100644 > --- a/sheep/group.c > +++ b/sheep/group.c > @@ -1314,6 +1314,7 @@ int create_cluster(int port, int64_t zone, int nr_vnodes) > INIT_LIST_HEAD(&sys->req_wait_for_obj_list); > INIT_LIST_HEAD(&sys->consistent_obj_list); > INIT_LIST_HEAD(&sys->blocking_conn_list); > + INIT_LIST_HEAD(&sys->deletion_work_list); > > INIT_LIST_HEAD(&sys->request_queue); > INIT_LIST_HEAD(&sys->event_queue); > diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c > index 28cdbbc..0aa93e5 100644 > --- a/sheep/object_list_cache.c > +++ b/sheep/object_list_cache.c > @@ -149,3 +149,20 @@ int get_obj_list(const struct sd_list_req *hdr, struct sd_list_rsp *rsp, void *d > > return res; > } > + > +int del_vdi_from_objlist_cache(uint64_t *objs, int count) > +{ > + int i; > + > + dprintf("%d\n", count); > + for (i = 0; i < count; i++) { > + pthread_rwlock_wrlock(&obj_list_cache.lock); > + if (!objlist_cache_rb_remove(&obj_list_cache.root, objs[i])) { > + dprintf("remove oid %" PRIx64 " from objlist cache\n", objs[i]); > + obj_list_cache.cache_size--; > + } > + pthread_rwlock_unlock(&obj_list_cache.lock); > + } > + > + return 0; > +} > diff --git a/sheep/ops.c b/sheep/ops.c > index b6f8eb2..5763b6f 100644 > --- a/sheep/ops.c > +++ b/sheep/ops.c > @@ -550,6 +550,57 @@ static int cluster_cleanup(const struct sd_req *req, struct sd_rsp *rsp, > return ret; > } > > +static void objlist_cache_del_vdi_work(struct work *work) > +{ > + struct deletion_work *dw = container_of(work, struct deletion_work, work); > + > + del_vdi_from_objlist_cache(dw->deleted_objs, dw->deleted_count); > +} > + > +static void objlist_cache_del_vdi_done(struct work *work) > +{ > + struct deletion_work *dw = container_of(work, struct deletion_work, work); > + > + list_del(&dw->dw_siblings); > + free(dw); > + > + if (!list_empty(&sys->deletion_work_list)) { > + dw = list_first_entry(&sys->deletion_work_list, > + struct deletion_work, dw_siblings); > + > + queue_work(sys->deletion_wqueue, &dw->work); > + } > +} > + > +static int cluster_notify_vdi_deletion(const struct sd_req *req, struct sd_rsp *rsp, > + void *data) > +{ > + int count = req->data_length / sizeof(uint64_t); > + struct deletion_work *dw = NULL; > + > + dw = zalloc(sizeof(*dw)); > + if (!dw) { > + eprintf("no memory to allocate\n"); > + return SD_RES_NO_MEM; > + } > + > + dw->deleted_objs = data; > + dw->deleted_count = count; > + > + dw->work.fn = objlist_cache_del_vdi_work; > + dw->work.done = objlist_cache_del_vdi_done; > + > + if (!list_empty(&sys->deletion_work_list)) { > + list_add_tail(&dw->dw_siblings, &sys->deletion_work_list); > + return SD_RES_SUCCESS; > + } > + > + list_add_tail(&dw->dw_siblings, &sys->deletion_work_list); > + queue_work(sys->deletion_wqueue, &dw->work); > + > + return SD_RES_SUCCESS; > +} > + > static int cluster_restore(const struct sd_req *req, struct sd_rsp *rsp, > void *data) > { > @@ -945,6 +996,12 @@ static struct sd_op_template sd_ops[] = { > .process_main = cluster_cleanup, > }, > > + [SD_OP_NOTIFY_VDI_DEL] = { > + .type = SD_OP_TYPE_CLUSTER, > + .force = 1, > + .process_main = cluster_notify_vdi_deletion, > + }, > + > /* local operations */ > [SD_OP_GET_STORE_LIST] = { > .type = SD_OP_TYPE_LOCAL, > diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h > index 2275a93..427db03 100644 > --- a/sheep/sheep_priv.h > +++ b/sheep/sheep_priv.h > @@ -135,6 +135,7 @@ struct cluster_info { > struct list_head req_wait_for_obj_list; > struct list_head consistent_obj_list; > struct list_head blocking_conn_list; > + struct list_head deletion_work_list; > > int nr_copies; > > @@ -212,6 +213,25 @@ struct objlist_cache { > pthread_rwlock_t lock; > }; > > +struct deletion_work { > + uint32_t done; > + uint32_t epoch; > + > + struct work work; > + struct list_head dw_siblings; > + > + uint32_t vid; > + > + int count; > + uint32_t *buf; > + > + uint64_t *deleted_objs; > + uint32_t deleted_count; > + > + struct vnode_info *vnodes; > + int delete_error; > +}; > + > extern struct cluster_info *sys; > extern struct store_driver *sd_store; > extern char *obj_path; > @@ -313,6 +333,7 @@ int prealloc(int fd, uint32_t size); > int init_objlist_cache(void); > int objlist_cache_rb_remove(struct rb_root *root, uint64_t oid); > int check_and_insert_objlist_cache(uint64_t oid); > +int del_vdi_from_objlist_cache(uint64_t *objs, int count); > > /* Operations */ > > diff --git a/sheep/vdi.c b/sheep/vdi.c > index 5b96dbb..609066e 100644 > --- a/sheep/vdi.c > +++ b/sheep/vdi.c > @@ -10,6 +10,7 @@ > */ > #include <stdio.h> > #include <stdlib.h> > +#include <unistd.h> > #include <sys/time.h> > > #include "sheepdog_proto.h" > @@ -373,24 +374,6 @@ int read_vdis(char *data, int len, unsigned int *rsp_len) > return SD_RES_SUCCESS; > } > > -struct deletion_work { > - uint32_t done; > - uint32_t epoch; > - > - struct work work; > - struct list_head dw_siblings; > - > - uint32_t vid; > - > - int count; > - uint32_t *buf; > - > - struct vnode_info *vnodes; > - int delete_error; > -}; > - > -static LIST_HEAD(deletion_work_list); > - > static int delete_inode(struct deletion_work *dw) > { > struct sheepdog_inode *inode = NULL; > @@ -430,6 +413,39 @@ out: > return ret; > } > > +static int notify_deletion(uint64_t *oids, uint32_t count) > +{ > + int fd, ret; > + unsigned int wlen, rlen = 0; > + struct sd_vdi_req hdr; > + char host[128]; > + > + addr_to_str(host, sizeof(host), sys->this_node.addr, 0); > + > + fd = connect_to(host, sys->this_node.port); > + if (fd < 0) { > + eprintf("connect to local node fail\n"); > + return -1; > + } > + > + memset(&hdr, 0, sizeof(hdr)); > + > + hdr.proto_ver = SD_PROTO_VER; > + hdr.opcode = SD_OP_NOTIFY_VDI_DEL; > + hdr.flags = SD_FLAG_CMD_WRITE; > + hdr.data_length = sizeof(uint64_t) * count; > + wlen = hdr.data_length; > + > + ret = exec_req(fd, (struct sd_req *)&hdr, oids, &wlen, &rlen); > + close(fd); > + > + if (ret < 0) { > + eprintf("send request fail\n"); > + return -1; > + } > + > + return 0; > +} > > static void delete_one(struct work *work) > { > @@ -447,6 +463,12 @@ static void delete_one(struct work *work) > goto out; > } > > + dw->deleted_objs = malloc(sizeof(uint64_t) * MAX_DATA_OBJS); > + if (!dw->deleted_objs) { > + eprintf("failed to allocate memory\n"); > + goto out; > + } > + > nr_copies = get_nr_copies(dw->vnodes); > > ret = read_object(dw->vnodes, dw->epoch, vid_to_vdi_oid(vdi_id), > @@ -459,25 +481,33 @@ static void delete_one(struct work *work) > } > > for (i = 0; i < MAX_DATA_OBJS; i++) { > + uint64_t oid; > + > if (!inode->data_vdi_id[i]) > continue; > > + oid = vid_to_data_oid(inode->data_vdi_id[i], i); > + > if (inode->data_vdi_id[i] != inode->vdi_id) { > dprintf("object %" PRIx64 " is base's data, would not be deleted.\n", > - vid_to_data_oid(inode->data_vdi_id[i], i)); > + oid); > continue; > } > > ret = remove_object(dw->vnodes, dw->epoch, > - vid_to_data_oid(inode->data_vdi_id[i], i), > - nr_copies); > + oid, nr_copies); > > if (ret != SD_RES_SUCCESS) > dw->delete_error = 1; > - else > + else { > + dw->deleted_objs[dw->deleted_count++] = oid; > inode->data_vdi_id[i] = 0; > + } > } > > + if (dw->deleted_count > 0) > + notify_deletion(dw->deleted_objs, dw->deleted_count); This name looks confusing, I guess you mean undeleted objects as in dw->deleted_objs, dw->deleted_count. > + > if (dw->delete_error) { > write_object(dw->vnodes, dw->epoch, vid_to_vdi_oid(vdi_id), > (void *)inode, sizeof(*inode), 0, 0, nr_copies, 0); > @@ -505,8 +535,8 @@ static void delete_one_done(struct work *work) > free(dw->buf); > free(dw); > > - if (!list_empty(&deletion_work_list)) { > - dw = list_first_entry(&deletion_work_list, > + if (!list_empty(&sys->deletion_work_list)) { > + dw = list_first_entry(&sys->deletion_work_list, > struct deletion_work, dw_siblings); > > queue_work(sys->deletion_wqueue, &dw->work); > @@ -647,12 +677,12 @@ int start_deletion(uint32_t vid, uint32_t epoch) > if (dw->count == 0) > goto out; > > - if (!list_empty(&deletion_work_list)) { > - list_add_tail(&dw->dw_siblings, &deletion_work_list); > + if (!list_empty(&sys->deletion_work_list)) { > + list_add_tail(&dw->dw_siblings, &sys->deletion_work_list); > goto out; > } > > - list_add_tail(&dw->dw_siblings, &deletion_work_list); > + list_add_tail(&dw->dw_siblings, &sys->deletion_work_list); > queue_work(sys->deletion_wqueue, &dw->work); it looks redundant to have deletion_work_list here to get FIFO semantics. We only have one worker for deletion_wqueue, isn't it FIFO already? > out: > return SD_RES_SUCCESS; This code looks really confusing, I think you need rework the logic first, take object recovery as an example, recovery_object just recovers the targeted object once at a time. Your delete_one I guess serves the same purpose as its name suggests, but you actually try to delete all the objects in one go. By the way, please run script/checkpatch.pl, there are some style problems. Thanks, Yuan |