Cluster recovery may cause objects migrated from one node to another, but the object list cache doesn't change, when deleting an object we can not find the right node in whose cache the id stays, so we need to notify the deletion object list to all the node to make them delete the specified object id from object list cache. I reused the deletion_work_list and deletion_wqueue for the object list cleanup work, so that we save some memory by not starting a new work queue. The del_vdi is called in worker thread, but it's locked by the notification mechanism, other operations about deletion_work_list and deletion_wqueue are done in main thread, so there's no need to concern about the race condition. Signed-off-by: levin li <xingke.lwp at taobao.com> --- include/sheep.h | 1 + sheep/group.c | 1 + sheep/object_list_cache.c | 17 +++++++++ sheep/ops.c | 57 ++++++++++++++++++++++++++++++ sheep/sheep_priv.h | 21 ++++++++++++ sheep/vdi.c | 84 ++++++++++++++++++++++++++++++--------------- 6 files changed, 154 insertions(+), 27 deletions(-) diff --git a/include/sheep.h b/include/sheep.h index 7e287c4..e941dc1 100644 --- a/include/sheep.h +++ b/include/sheep.h @@ -46,6 +46,7 @@ #define SD_OP_TRACE 0x95 #define SD_OP_TRACE_CAT 0x96 #define SD_OP_STAT_RECOVERY 0x97 +#define SD_OP_NOTIFY_VDI_DEL 0x98 #define SD_FLAG_CMD_IO_LOCAL 0x0010 #define SD_FLAG_CMD_RECOVERY 0x0020 diff --git a/sheep/group.c b/sheep/group.c index c7fd387..1f1aa8d 100644 --- a/sheep/group.c +++ b/sheep/group.c @@ -1314,6 +1314,7 @@ int create_cluster(int port, int64_t zone, int nr_vnodes) INIT_LIST_HEAD(&sys->req_wait_for_obj_list); INIT_LIST_HEAD(&sys->consistent_obj_list); INIT_LIST_HEAD(&sys->blocking_conn_list); + INIT_LIST_HEAD(&sys->deletion_work_list); INIT_LIST_HEAD(&sys->request_queue); INIT_LIST_HEAD(&sys->event_queue); diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c index 28cdbbc..0aa93e5 100644 --- a/sheep/object_list_cache.c +++ b/sheep/object_list_cache.c @@ -149,3 +149,20 @@ int get_obj_list(const struct sd_list_req *hdr, struct sd_list_rsp *rsp, void *d return res; } + +int del_vdi_from_objlist_cache(uint64_t *objs, int count) +{ + int i; + + dprintf("%d\n", count); + for (i = 0; i < count; i++) { + pthread_rwlock_wrlock(&obj_list_cache.lock); + if (!objlist_cache_rb_remove(&obj_list_cache.root, objs[i])) { + dprintf("remove oid %" PRIx64 " from objlist cache\n", objs[i]); + obj_list_cache.cache_size--; + } + pthread_rwlock_unlock(&obj_list_cache.lock); + } + + return 0; +} diff --git a/sheep/ops.c b/sheep/ops.c index b6f8eb2..5763b6f 100644 --- a/sheep/ops.c +++ b/sheep/ops.c @@ -550,6 +550,57 @@ static int cluster_cleanup(const struct sd_req *req, struct sd_rsp *rsp, return ret; } +static void objlist_cache_del_vdi_work(struct work *work) +{ + struct deletion_work *dw = container_of(work, struct deletion_work, work); + + del_vdi_from_objlist_cache(dw->deleted_objs, dw->deleted_count); +} + +static void objlist_cache_del_vdi_done(struct work *work) +{ + struct deletion_work *dw = container_of(work, struct deletion_work, work); + + list_del(&dw->dw_siblings); + free(dw); + + if (!list_empty(&sys->deletion_work_list)) { + dw = list_first_entry(&sys->deletion_work_list, + struct deletion_work, dw_siblings); + + queue_work(sys->deletion_wqueue, &dw->work); + } +} + +static int cluster_notify_vdi_deletion(const struct sd_req *req, struct sd_rsp *rsp, + void *data) +{ + int count = req->data_length / sizeof(uint64_t); + struct deletion_work *dw = NULL; + + dw = zalloc(sizeof(*dw)); + if (!dw) { + eprintf("no memory to allocate\n"); + return SD_RES_NO_MEM; + } + + dw->deleted_objs = data; + dw->deleted_count = count; + + dw->work.fn = objlist_cache_del_vdi_work; + dw->work.done = objlist_cache_del_vdi_done; + + if (!list_empty(&sys->deletion_work_list)) { + list_add_tail(&dw->dw_siblings, &sys->deletion_work_list); + return SD_RES_SUCCESS; + } + + list_add_tail(&dw->dw_siblings, &sys->deletion_work_list); + queue_work(sys->deletion_wqueue, &dw->work); + + return SD_RES_SUCCESS; +} + static int cluster_restore(const struct sd_req *req, struct sd_rsp *rsp, void *data) { @@ -945,6 +996,12 @@ static struct sd_op_template sd_ops[] = { .process_main = cluster_cleanup, }, + [SD_OP_NOTIFY_VDI_DEL] = { + .type = SD_OP_TYPE_CLUSTER, + .force = 1, + .process_main = cluster_notify_vdi_deletion, + }, + /* local operations */ [SD_OP_GET_STORE_LIST] = { .type = SD_OP_TYPE_LOCAL, diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h index 2275a93..427db03 100644 --- a/sheep/sheep_priv.h +++ b/sheep/sheep_priv.h @@ -135,6 +135,7 @@ struct cluster_info { struct list_head req_wait_for_obj_list; struct list_head consistent_obj_list; struct list_head blocking_conn_list; + struct list_head deletion_work_list; int nr_copies; @@ -212,6 +213,25 @@ struct objlist_cache { pthread_rwlock_t lock; }; +struct deletion_work { + uint32_t done; + uint32_t epoch; + + struct work work; + struct list_head dw_siblings; + + uint32_t vid; + + int count; + uint32_t *buf; + + uint64_t *deleted_objs; + uint32_t deleted_count; + + struct vnode_info *vnodes; + int delete_error; +}; + extern struct cluster_info *sys; extern struct store_driver *sd_store; extern char *obj_path; @@ -313,6 +333,7 @@ int prealloc(int fd, uint32_t size); int init_objlist_cache(void); int objlist_cache_rb_remove(struct rb_root *root, uint64_t oid); int check_and_insert_objlist_cache(uint64_t oid); +int del_vdi_from_objlist_cache(uint64_t *objs, int count); /* Operations */ diff --git a/sheep/vdi.c b/sheep/vdi.c index 5b96dbb..609066e 100644 --- a/sheep/vdi.c +++ b/sheep/vdi.c @@ -10,6 +10,7 @@ */ #include <stdio.h> #include <stdlib.h> +#include <unistd.h> #include <sys/time.h> #include "sheepdog_proto.h" @@ -373,24 +374,6 @@ int read_vdis(char *data, int len, unsigned int *rsp_len) return SD_RES_SUCCESS; } -struct deletion_work { - uint32_t done; - uint32_t epoch; - - struct work work; - struct list_head dw_siblings; - - uint32_t vid; - - int count; - uint32_t *buf; - - struct vnode_info *vnodes; - int delete_error; -}; - -static LIST_HEAD(deletion_work_list); - static int delete_inode(struct deletion_work *dw) { struct sheepdog_inode *inode = NULL; @@ -430,6 +413,39 @@ out: return ret; } +static int notify_deletion(uint64_t *oids, uint32_t count) +{ + int fd, ret; + unsigned int wlen, rlen = 0; + struct sd_vdi_req hdr; + char host[128]; + + addr_to_str(host, sizeof(host), sys->this_node.addr, 0); + + fd = connect_to(host, sys->this_node.port); + if (fd < 0) { + eprintf("connect to local node fail\n"); + return -1; + } + + memset(&hdr, 0, sizeof(hdr)); + + hdr.proto_ver = SD_PROTO_VER; + hdr.opcode = SD_OP_NOTIFY_VDI_DEL; + hdr.flags = SD_FLAG_CMD_WRITE; + hdr.data_length = sizeof(uint64_t) * count; + wlen = hdr.data_length; + + ret = exec_req(fd, (struct sd_req *)&hdr, oids, &wlen, &rlen); + close(fd); + + if (ret < 0) { + eprintf("send request fail\n"); + return -1; + } + + return 0; +} static void delete_one(struct work *work) { @@ -447,6 +463,12 @@ static void delete_one(struct work *work) goto out; } + dw->deleted_objs = malloc(sizeof(uint64_t) * MAX_DATA_OBJS); + if (!dw->deleted_objs) { + eprintf("failed to allocate memory\n"); + goto out; + } + nr_copies = get_nr_copies(dw->vnodes); ret = read_object(dw->vnodes, dw->epoch, vid_to_vdi_oid(vdi_id), @@ -459,25 +481,33 @@ static void delete_one(struct work *work) } for (i = 0; i < MAX_DATA_OBJS; i++) { + uint64_t oid; + if (!inode->data_vdi_id[i]) continue; + oid = vid_to_data_oid(inode->data_vdi_id[i], i); + if (inode->data_vdi_id[i] != inode->vdi_id) { dprintf("object %" PRIx64 " is base's data, would not be deleted.\n", - vid_to_data_oid(inode->data_vdi_id[i], i)); + oid); continue; } ret = remove_object(dw->vnodes, dw->epoch, - vid_to_data_oid(inode->data_vdi_id[i], i), - nr_copies); + oid, nr_copies); if (ret != SD_RES_SUCCESS) dw->delete_error = 1; - else + else { + dw->deleted_objs[dw->deleted_count++] = oid; inode->data_vdi_id[i] = 0; + } } + if (dw->deleted_count > 0) + notify_deletion(dw->deleted_objs, dw->deleted_count); + if (dw->delete_error) { write_object(dw->vnodes, dw->epoch, vid_to_vdi_oid(vdi_id), (void *)inode, sizeof(*inode), 0, 0, nr_copies, 0); @@ -505,8 +535,8 @@ static void delete_one_done(struct work *work) free(dw->buf); free(dw); - if (!list_empty(&deletion_work_list)) { - dw = list_first_entry(&deletion_work_list, + if (!list_empty(&sys->deletion_work_list)) { + dw = list_first_entry(&sys->deletion_work_list, struct deletion_work, dw_siblings); queue_work(sys->deletion_wqueue, &dw->work); @@ -647,12 +677,12 @@ int start_deletion(uint32_t vid, uint32_t epoch) if (dw->count == 0) goto out; - if (!list_empty(&deletion_work_list)) { - list_add_tail(&dw->dw_siblings, &deletion_work_list); + if (!list_empty(&sys->deletion_work_list)) { + list_add_tail(&dw->dw_siblings, &sys->deletion_work_list); goto out; } - list_add_tail(&dw->dw_siblings, &deletion_work_list); + list_add_tail(&dw->dw_siblings, &sys->deletion_work_list); queue_work(sys->deletion_wqueue, &dw->work); out: return SD_RES_SUCCESS; -- 1.7.10 |