At Wed, 11 Jul 2012 14:42:47 +0800,
levin li wrote:
> 
> From: levin li <xingke.lwp at taobao.com>
> 
> Before reclaiming the cache belonging to the VDI just deleted, we should test
> whether the VDI exists, because after some node deletes it and before the
> notification is sent to all the nodes, another node may issue a VDI creation
> event and reuse the VDI id, in which case we should not reclaim the cached
> entry.

Should this be explained in the source code where vdi_exist() is called?

> 
> Signed-off-by: levin li <xingke.lwp at taobao.com>
> ---
>  sheep/object_list_cache.c |   57 +++++++++++++++++++++++++++++++++++++++++++++
>  sheep/ops.c               |   14 +++++++++++
>  sheep/sheep_priv.h        |    2 +
>  sheep/vdi.c               |   27 +++++++++++++++++++++
>  4 files changed, 100 insertions(+), 0 deletions(-)
> 
> diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
> index 39e8d49..479064c 100644
> --- a/sheep/object_list_cache.c
> +++ b/sheep/object_list_cache.c
> @@ -37,6 +37,11 @@ struct objlist_cache {
>  	pthread_rwlock_t lock;
>  };
>  
> +struct objlist_deletion_work {
> +	uint32_t vid;
> +	struct work work;
> +};
> +
>  struct objlist_cache obj_list_cache = {
>  	.tree_version	= 1,
>  	.root		= RB_ROOT,
> @@ -167,3 +172,55 @@ out:
>  	pthread_rwlock_unlock(&obj_list_cache.lock);
>  	return SD_RES_SUCCESS;
>  }
> +
> +static void objlist_deletion_work(struct work *work)
> +{
> +	struct objlist_deletion_work *ow =
> +		container_of(work, struct objlist_deletion_work, work);
> +	struct objlist_cache_entry *entry, *t;
> +	uint32_t vid = ow->vid, entry_vid;
> +
> +	if (vdi_exist(vid)) {
> +		eprintf("VDI (%" PRIx32 ") is still in use, can not be deleted\n",
> +			vid);
> +		return;
> +	}
> +
> +	pthread_rwlock_wrlock(&obj_list_cache.lock);
> +	list_for_each_entry_safe(entry, t, &obj_list_cache.entry_list, list) {
> +		entry_vid = oid_to_vid(entry->oid);
> +		if (entry_vid != vid)
> +			continue;
> +		dprintf("delete object entry %" PRIx64 "\n", entry->oid);
> +		list_del(&entry->list);
> +		rb_erase(&entry->node, &obj_list_cache.root);
> +		free(entry);
> +	}
> +	pthread_rwlock_unlock(&obj_list_cache.lock);
> +}
> +
> +static void objlist_deletion_done(struct work *work)
> +{
> +	struct objlist_deletion_work *ow =
> +		container_of(work, struct objlist_deletion_work, work);
> +	free(ow);
> +}
> +
> +/* During recovery, some objects may be migrated from one node to a
> + * new one, but we can't remove the object list cache entry in that
> + * case because it may cause recovery failure.  After recovery we may
> + * not be able to locate the cache entry correctly, so
> + * objlist_cache_remove() fails to delete it, and we need this function
> + * to do the cleanup work on all nodes. */

I'm still a bit confused about why we need both objlist_cache_delete and
objlist_cache_remove.  If no recovery happens during object deletion, we
don't need objlist_cache_delete, because objlist_cache_remove removes the
entry.  objlist_cache_delete is necessary for the case where object
deletion and recovery happen at the same time, to clean up the entries
that objlist_cache_remove failed to handle.  Is that correct?
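One more thing I noticed in the vdi.c hunk below: after the empty-name
check, ret is unconditionally set back to 1, so vdi_exist() seems to
return 1 even for a deleted inode whose name has been cleared.  If the
intent is to treat an empty name as "the VDI does not exist", the tail of
the function probably needs to be something like this (only a sketch of
the control flow I would expect, not tested):

	if (*inode->name == '\0') {
		/* name cleared -> VDI was deleted */
		ret = 0;
		goto out;
	}
	ret = 1;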
Thanks,

Kazutaka

> +int objlist_cache_delete(uint32_t vid)
> +{
> +	struct objlist_deletion_work *ow;
> +
> +	ow = zalloc(sizeof(*ow));
> +	ow->vid = vid;
> +	ow->work.fn = objlist_deletion_work;
> +	ow->work.done = objlist_deletion_done;
> +	queue_work(sys->deletion_wqueue, &ow->work);
> +
> +	return SD_RES_SUCCESS;
> +}
> diff --git a/sheep/ops.c b/sheep/ops.c
> index ecf4f2e..bd198a1 100644
> --- a/sheep/ops.c
> +++ b/sheep/ops.c
> @@ -485,6 +485,14 @@ static int cluster_cleanup(const struct sd_req *req, struct sd_rsp *rsp,
>  	return ret;
>  }
>  
> +static int cluster_notify_vdi_del(const struct sd_req *req, struct sd_rsp *rsp,
> +				  void *data)
> +{
> +	uint32_t vid = *(uint32_t *)data;
> +
> +	return objlist_cache_delete(vid);
> +}
> +
>  static int cluster_restore(const struct sd_req *req, struct sd_rsp *rsp,
>  			   void *data)
>  {
> @@ -822,6 +830,12 @@ static struct sd_op_template sd_ops[] = {
>  		.process_main = cluster_cleanup,
>  	},
>  
> +	[SD_OP_NOTIFY_VDI_DEL] = {
> +		.type = SD_OP_TYPE_CLUSTER,
> +		.force = 1,
> +		.process_main = cluster_notify_vdi_del,
> +	},
> +
>  	/* local operations */
>  	[SD_OP_GET_STORE_LIST] = {
>  		.type = SD_OP_TYPE_LOCAL,
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index afbc361..a4273e4 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -195,6 +195,7 @@ int create_listen_port(int port, void *data);
>  int init_store(const char *dir, int enable_write_cache);
>  int init_base_path(const char *dir);
>  
> +int vdi_exist(uint32_t vid);
>  int add_vdi(char *data, int data_len, uint64_t size, uint32_t *new_vid,
>  	    uint32_t base_vid, int is_snapshot, unsigned int *nr_copies);
>  
> @@ -257,6 +258,7 @@ uint32_t get_latest_epoch(void);
>  int set_cluster_ctime(uint64_t ctime);
>  uint64_t get_cluster_ctime(void);
>  int get_obj_list(const struct sd_list_req *, struct sd_list_rsp *, void *);
> +int objlist_cache_delete(uint32_t vid);
>  
>  int start_recovery(struct vnode_info *cur_vnodes,
>  		   struct vnode_info *old_vnodes);
> diff --git a/sheep/vdi.c b/sheep/vdi.c
> index bcb3df1..c9e070e 100644
> --- a/sheep/vdi.c
> +++ b/sheep/vdi.c
> @@ -15,6 +15,33 @@
>  #include "sheepdog_proto.h"
>  #include "sheep_priv.h"
>  
> +int vdi_exist(uint32_t vid)
> +{
> +	struct sheepdog_inode *inode;
> +	int ret = 1;
> +
> +	inode = zalloc(sizeof(*inode));
> +	if (!inode) {
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	ret = read_object(vid_to_vdi_oid(vid), (char *)inode,
> +			  sizeof(*inode), 0);
> +	if (ret != SD_RES_SUCCESS) {
> +		eprintf("fail to read vdi inode (%" PRIx32 ")\n", vid);
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	if (*inode->name == '\0')
> +		ret = 0;
> +	ret = 1;
> +
> +out:
> +	free(inode);
> +	return ret;
> +}
>  
>  /* TODO: should be performed atomically */
>  static int create_vdi_obj(char *name, uint32_t new_vid, uint64_t size,
> -- 
> 1.7.1
> 
> -- 
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog
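On the first question above (documenting why vdi_exist() is checked), a
brief comment at the call site in objlist_deletion_work() would probably
be enough.  One possible wording, based on the commit message (just a
suggestion, the exact text is up to the author):

	/*
	 * The VDI id may already have been reused by another node between
	 * the deletion and this notification.  In that case the cached
	 * entries belong to the new VDI, so keep them.
	 */
	if (vdi_exist(vid)) {
		eprintf("VDI (%" PRIx32 ") is still in use, can not be deleted\n",
			vid);
		return;
	}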