Cluster recovery may cause objects to be migrated from one node to another, but the object list cache is not updated accordingly. When deleting an object we therefore cannot tell which node's cache still holds its id, so we need to send the deletion list to all the nodes so that each of them deletes the specified object ids from its object list cache. Signed-off-by: levin li <xingke.lwp at taobao.com> --- include/sheep.h | 1 + sheep/object_list_cache.c | 17 ++++++++++++++ sheep/ops.c | 17 ++++++++++++++ sheep/sheep_priv.h | 1 + sheep/vdi.c | 57 ++++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 90 insertions(+), 3 deletions(-) diff --git a/include/sheep.h b/include/sheep.h index f72460d..b4692d0 100644 --- a/include/sheep.h +++ b/include/sheep.h @@ -46,6 +46,7 @@ #define SD_OP_TRACE 0x95 #define SD_OP_TRACE_CAT 0x96 #define SD_OP_STAT_RECOVERY 0x97 +#define SD_OP_NOTIFY_VDI_DEL 0x98 #define SD_FLAG_CMD_IO_LOCAL 0x0010 #define SD_FLAG_CMD_RECOVERY 0x0020 diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c index 28cdbbc..339c865 100644 --- a/sheep/object_list_cache.c +++ b/sheep/object_list_cache.c @@ -130,6 +130,23 @@ int check_and_insert_objlist_cache(uint64_t oid) return 0; } +int del_vdi_from_objlist_cache(uint64_t *oids, int count) +{ + int i; + + dprintf("%d\n", count); + for (i = 0; i < count; i++) { + pthread_rwlock_wrlock(&obj_list_cache.lock); + if (!objlist_cache_rb_remove(&obj_list_cache.root, oids[i])) { + dprintf("remove oid %" PRIx64 " from objlist cache\n", oids[i]); + obj_list_cache.cache_size--; + } + pthread_rwlock_unlock(&obj_list_cache.lock); + } + + return 0; +} + int get_obj_list(const struct sd_list_req *hdr, struct sd_list_rsp *rsp, void *data) { uint64_t *list = (uint64_t *)data; diff --git a/sheep/ops.c b/sheep/ops.c index 439b714..0fabdab 100644 --- a/sheep/ops.c +++ b/sheep/ops.c @@ -550,6 +550,17 @@ static int cluster_cleanup(const struct sd_req *req, struct sd_rsp *rsp, return ret; } +static int cluster_notify_vdi_deletion(const struct sd_req 
*req, struct sd_rsp *rsp, + void *data) +{ + int count = req->data_length / sizeof(uint64_t); + uint64_t *oids = data; + + del_vdi_from_objlist_cache(oids, count); + + return SD_RES_SUCCESS; +} + static int cluster_restore(const struct sd_req *req, struct sd_rsp *rsp, void *data) { @@ -945,6 +956,12 @@ static struct sd_op_template sd_ops[] = { .process_bottom = cluster_cleanup, }, + [SD_OP_NOTIFY_VDI_DEL] = { + .type = SD_OP_TYPE_CLUSTER, + .force = 1, + .process_bottom = cluster_notify_vdi_deletion, + }, + /* local operations */ [SD_OP_GET_STORE_LIST] = { .type = SD_OP_TYPE_LOCAL, diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h index 8258738..66722c5 100644 --- a/sheep/sheep_priv.h +++ b/sheep/sheep_priv.h @@ -293,6 +293,7 @@ void resume_recovery_work(void); int is_recoverying_oid(uint64_t oid); int node_in_recovery(void); +int del_vdi_from_objlist_cache(uint64_t *oids, int count); int write_object(struct vnode_info *vnodes, uint32_t node_version, uint64_t oid, char *data, unsigned int datalen, uint64_t offset, uint16_t flags, int nr, int create); diff --git a/sheep/vdi.c b/sheep/vdi.c index f240303..3fd5397 100644 --- a/sheep/vdi.c +++ b/sheep/vdi.c @@ -10,6 +10,7 @@ */ #include <stdio.h> #include <stdlib.h> +#include <unistd.h> #include <sys/time.h> #include "sheepdog_proto.h" @@ -430,6 +431,39 @@ out: return ret; } +static int notify_deletion(uint64_t *oids, uint32_t count) +{ + int fd, ret; + unsigned int wlen, rlen = 0; + struct sd_vdi_req hdr; + char host[128]; + + addr_to_str(host, sizeof(host), sys->this_node.addr, 0); + + fd = connect_to(host, sys->this_node.port); + if (fd < 0) { + eprintf("connect to local node fail\n"); + return -1; + } + + memset(&hdr, 0, sizeof(hdr)); + + hdr.proto_ver = SD_PROTO_VER; + hdr.opcode = SD_OP_NOTIFY_VDI_DEL; + hdr.flags = SD_FLAG_CMD_WRITE | SD_FLAG_CMD_WORKER; + hdr.data_length = sizeof(uint64_t) * count; + wlen = hdr.data_length; + + ret = exec_req(fd, (struct sd_req *)&hdr, oids, &wlen, &rlen); + close(fd); + + 
if (ret < 0) { + eprintf("send request fail\n"); + return -1; + } + + return 0; +} static void delete_one(struct work *work) { @@ -437,6 +471,8 @@ static void delete_one(struct work *work) uint32_t vdi_id = *(dw->buf + dw->count - dw->done - 1); int ret, i; struct sheepdog_inode *inode = NULL; + uint64_t *deleted_oids = NULL; + uint32_t deleted_count = 0; int nr_copies; eprintf("%d %d, %16x\n", dw->done, dw->count, vdi_id); @@ -447,6 +483,12 @@ static void delete_one(struct work *work) goto out; } + deleted_oids = malloc(sizeof(uint64_t) * MAX_DATA_OBJS); + if (!deleted_oids) { + eprintf("failed to allocate memory\n"); + goto out; + } + nr_copies = get_nr_copies(dw->vnodes); ret = read_object(dw->vnodes, dw->epoch, vid_to_vdi_oid(vdi_id), @@ -461,6 +503,8 @@ static void delete_one(struct work *work) dw->delete_error = 0; for (i = 0; i < MAX_DATA_OBJS; i++) { + uint64_t oid; + if (!inode->data_vdi_id[i]) continue; @@ -470,16 +514,22 @@ static void delete_one(struct work *work) continue; } + oid = vid_to_data_oid(inode->data_vdi_id[i], i); + ret = remove_object(dw->vnodes, dw->epoch, - vid_to_data_oid(inode->data_vdi_id[i], i), - nr_copies); + oid, inode->nr_copies); if (ret != SD_RES_SUCCESS) dw->delete_error = 1; - else + else { + deleted_oids[deleted_count++] = oid; inode->data_vdi_id[i] = 0; + } } + if (deleted_count > 0) + notify_deletion(deleted_oids, deleted_count); + if (dw->delete_error) { write_object(dw->vnodes, dw->epoch, vid_to_vdi_oid(vdi_id), (void *)inode, sizeof(*inode), 0, 0, nr_copies, 0); @@ -487,6 +537,7 @@ static void delete_one(struct work *work) out: free(inode); + free(deleted_oids); } static void delete_one_done(struct work *work) -- 1.7.10 |