On 2012年07月12日 06:01, MORITA Kazutaka wrote: > At Wed, 11 Jul 2012 14:42:47 +0800, > levin li wrote: >> >> From: levin li <xingke.lwp at taobao.com> >> >> Before reclaiming the cache belonging to the VDI just deleted, we should test >> whether the VDI exists, because after some node deletes it and before the >> notification is sent to all the nodes, another node may issue a VDI creation >> event and reuse the VDI id, in which case we should not reclaim the cached >> entry. > > Should this be explained in the source code where vdi_exist() is > called? > >> >> Signed-off-by: levin li <xingke.lwp at taobao.com> >> --- >> sheep/object_list_cache.c | 57 +++++++++++++++++++++++++++++++++++++++++++++ >> sheep/ops.c | 14 +++++++++++ >> sheep/sheep_priv.h | 2 + >> sheep/vdi.c | 27 +++++++++++++++++++++ >> 4 files changed, 100 insertions(+), 0 deletions(-) >> >> diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c >> index 39e8d49..479064c 100644 >> --- a/sheep/object_list_cache.c >> +++ b/sheep/object_list_cache.c >> @@ -37,6 +37,11 @@ struct objlist_cache { >> pthread_rwlock_t lock; >> }; >> >> +struct objlist_deletion_work { >> + uint32_t vid; >> + struct work work; >> +}; >> + >> struct objlist_cache obj_list_cache = { >> .tree_version = 1, >> .root = RB_ROOT, >> @@ -167,3 +172,55 @@ out: >> pthread_rwlock_unlock(&obj_list_cache.lock); >> return SD_RES_SUCCESS; >> } >> + >> +static void objlist_deletion_work(struct work *work) >> +{ >> + struct objlist_deletion_work *ow = >> + container_of(work, struct objlist_deletion_work, work); >> + struct objlist_cache_entry *entry, *t; >> + uint32_t vid = ow->vid, entry_vid; >> + >> + if (vdi_exist(vid)) { >> + eprintf("VDI (%" PRIx32 ") is still in use, can not be deleted\n", >> + vid); >> + return; >> + } >> + >> + pthread_rwlock_wrlock(&obj_list_cache.lock); >> + list_for_each_entry_safe(entry, t, &obj_list_cache.entry_list, list) { >> + entry_vid = oid_to_vid(entry->oid); >> + if (entry_vid != vid) >> + 
continue; >> + dprintf("delete object entry %" PRIx64 "\n", entry->oid); >> + list_del(&entry->list); >> + rb_erase(&entry->node, &obj_list_cache.root); >> + free(entry); >> + } >> + pthread_rwlock_unlock(&obj_list_cache.lock); >> +} >> + >> +static void objlist_deletion_done(struct work *work) >> +{ >> + struct objlist_deletion_work *ow = >> + container_of(work, struct objlist_deletion_work, work); >> + free(ow); >> +} >> + >> +/* During recovery, some objects may be migrated from one node to a >> + * new one, but we can't remove the object list cache entry in this >> + * case, it may causes recovery failure, so after recovery, we can >> + * not locate the cache entry correctly, causing objlist_cache_remove() >> + * fail to delete it, then we need this function to do the cleanup work >> + * in all nodes. */ > > I'm still a bit confused why we need both objlist_cache_delete and > objlist_cache_remove. If no recovery happens during object deletion, > we don't need objlist_cache_delete because objlist_cache_remove > removes the entry. objlist_cache_delete is necessary for the case > object deletion and recovery happen at the same time to clean up > unhandled entries by objlist_cache_remove. Is it correct? > Well, if no recovery happens before, objlist_cache_remove() can always delete the entry in object list cache while VDI deletion, but once recovery happens, it's something different. 
For example, obj A and B stay in node n1 at first, but after a recovery, A may be migrated from n1 to n2, and B stays in n1, while the objlist entries of A and B are still in n1; we can't migrate the objlist entry, or else it causes problems. Now when VDI deletion is issued, sheep tries to remove object A from n2 and B from n1; objlist_cache_remove() is OK for deleting the objlist entry for B, but not for A, because the entry of A is still in n1 while objlist_cache_remove(A) is called in n2. So this is the problem not only in the case where recovery and deletion happen at the same time, but all the time after a recovery. thanks, levin > Thanks, > > Kazutaka > >> +int objlist_cache_delete(uint32_t vid) >> +{ >> + struct objlist_deletion_work *ow; >> + >> + ow = zalloc(sizeof(*ow)); >> + ow->vid = vid; >> + ow->work.fn = objlist_deletion_work; >> + ow->work.done = objlist_deletion_done; >> + queue_work(sys->deletion_wqueue, &ow->work); >> + >> + return SD_RES_SUCCESS; >> +} >> diff --git a/sheep/ops.c b/sheep/ops.c >> index ecf4f2e..bd198a1 100644 >> --- a/sheep/ops.c >> +++ b/sheep/ops.c >> @@ -485,6 +485,14 @@ static int cluster_cleanup(const struct sd_req *req, struct sd_rsp *rsp, >> return ret; >> } >> >> +static int cluster_notify_vdi_del(const struct sd_req *req, struct sd_rsp *rsp, >> + void *data) >> +{ >> + uint32_t vid = *(uint32_t *)data; >> + >> + return objlist_cache_delete(vid); >> +} >> + >> static int cluster_restore(const struct sd_req *req, struct sd_rsp *rsp, >> void *data) >> { >> @@ -822,6 +830,12 @@ static struct sd_op_template sd_ops[] = { >> .process_main = cluster_cleanup, >> }, >> >> + [SD_OP_NOTIFY_VDI_DEL] = { >> + .type = SD_OP_TYPE_CLUSTER, >> + .force = 1, >> + .process_main = cluster_notify_vdi_del, >> + }, >> + >> /* local operations */ >> [SD_OP_GET_STORE_LIST] = { >> .type = SD_OP_TYPE_LOCAL, >> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h >> index afbc361..a4273e4 100644 >> @@ -195,6 
+195,7 @@ int create_listen_port(int port, void *data); >> int init_store(const char *dir, int enable_write_cache); >> int init_base_path(const char *dir); >> >> +int vdi_exist(uint32_t vid); >> int add_vdi(char *data, int data_len, uint64_t size, uint32_t *new_vid, >> uint32_t base_vid, int is_snapshot, unsigned int *nr_copies); >> >> @@ -257,6 +258,7 @@ uint32_t get_latest_epoch(void); >> int set_cluster_ctime(uint64_t ctime); >> uint64_t get_cluster_ctime(void); >> int get_obj_list(const struct sd_list_req *, struct sd_list_rsp *, void *); >> +int objlist_cache_delete(uint32_t vid); >> >> int start_recovery(struct vnode_info *cur_vnodes, >> struct vnode_info *old_vnodes); >> diff --git a/sheep/vdi.c b/sheep/vdi.c >> index bcb3df1..c9e070e 100644 >> --- a/sheep/vdi.c >> +++ b/sheep/vdi.c >> @@ -15,6 +15,33 @@ >> #include "sheepdog_proto.h" >> #include "sheep_priv.h" >> >> +int vdi_exist(uint32_t vid) >> +{ >> + struct sheepdog_inode *inode; >> + int ret = 1; >> + >> + inode = zalloc(sizeof(*inode)); >> + if (!inode) { >> + ret = 0; >> + goto out; >> + } >> + >> + ret = read_object(vid_to_vdi_oid(vid), (char *)inode, >> + sizeof(*inode), 0); >> + if (ret != SD_RES_SUCCESS) { >> + eprintf("fail to read vdi inode (%" PRIx32 ")\n", vid); >> + ret = 0; >> + goto out; >> + } >> + >> + if (*inode->name == '\0') >> + ret = 0; >> + ret = 1; >> + >> +out: >> + free(inode); >> + return ret; >> +} >> >> /* TODO: should be performed atomically */ >> static int create_vdi_obj(char *name, uint32_t new_vid, uint64_t size, >> -- >> 1.7.1 >> >> -- >> sheepdog mailing list >> sheepdog at lists.wpkg.org >> http://lists.wpkg.org/mailman/listinfo/sheepdog |