[sheepdog] [PATCH v2 3/3] object list cache: reclaim object list cache when receiving a deletion event.
levin li
levin108 at gmail.com
Thu Jul 12 03:55:27 CEST 2012
On 2012/07/12 06:01, MORITA Kazutaka wrote:
> At Wed, 11 Jul 2012 14:42:47 +0800,
> levin li wrote:
>>
>> From: levin li <xingke.lwp at taobao.com>
>>
>> Before reclaiming the cache belonging to the VDI just deleted, we should test
>> whether the VDI still exists, because after one node deletes it and before the
>> notification reaches all the nodes, another node may issue a VDI creation
>> event and reuse the VDI id, in which case we should not reclaim the cached
>> entries.
>
> Should this be explained in the source code where vdi_exist() is
> called?
>
>>
>> Signed-off-by: levin li <xingke.lwp at taobao.com>
>> ---
>> sheep/object_list_cache.c | 57 +++++++++++++++++++++++++++++++++++++++++++++
>> sheep/ops.c | 14 +++++++++++
>> sheep/sheep_priv.h | 2 +
>> sheep/vdi.c | 27 +++++++++++++++++++++
>> 4 files changed, 100 insertions(+), 0 deletions(-)
>>
>> diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
>> index 39e8d49..479064c 100644
>> --- a/sheep/object_list_cache.c
>> +++ b/sheep/object_list_cache.c
>> @@ -37,6 +37,11 @@ struct objlist_cache {
>> pthread_rwlock_t lock;
>> };
>>
>> +struct objlist_deletion_work {
>> + uint32_t vid;
>> + struct work work;
>> +};
>> +
>> struct objlist_cache obj_list_cache = {
>> .tree_version = 1,
>> .root = RB_ROOT,
>> @@ -167,3 +172,55 @@ out:
>> pthread_rwlock_unlock(&obj_list_cache.lock);
>> return SD_RES_SUCCESS;
>> }
>> +
>> +static void objlist_deletion_work(struct work *work)
>> +{
>> + struct objlist_deletion_work *ow =
>> + container_of(work, struct objlist_deletion_work, work);
>> + struct objlist_cache_entry *entry, *t;
>> + uint32_t vid = ow->vid, entry_vid;
>> +
>> + if (vdi_exist(vid)) {
>> + eprintf("VDI (%" PRIx32 ") is still in use, can not be deleted\n",
>> + vid);
>> + return;
>> + }
>> +
>> + pthread_rwlock_wrlock(&obj_list_cache.lock);
>> + list_for_each_entry_safe(entry, t, &obj_list_cache.entry_list, list) {
>> + entry_vid = oid_to_vid(entry->oid);
>> + if (entry_vid != vid)
>> + continue;
>> + dprintf("delete object entry %" PRIx64 "\n", entry->oid);
>> + list_del(&entry->list);
>> + rb_erase(&entry->node, &obj_list_cache.root);
>> + free(entry);
>> + }
>> + pthread_rwlock_unlock(&obj_list_cache.lock);
>> +}
>> +
>> +static void objlist_deletion_done(struct work *work)
>> +{
>> + struct objlist_deletion_work *ow =
>> + container_of(work, struct objlist_deletion_work, work);
>> + free(ow);
>> +}
>> +
>> +/* During recovery, an object may be migrated from one node to a
>> + * new one, but we cannot remove its object list cache entry at that
>> + * point because doing so may cause recovery failure.  After recovery
>> + * the entry may therefore no longer live on the node that holds the
>> + * object, so objlist_cache_remove() on that node fails to delete it.
>> + * This function does the cleanup work on all nodes instead. */
>
> I'm still a bit confused why we need both objlist_cache_delete and
> objlist_cache_remove. If no recovery happens during object deletion,
> we don't need objlist_cache_delete because objlist_cache_remove
> removes the entry. objlist_cache_delete is necessary for the case
> object deletion and recovery happen at the same time to clean up
> unhandled entries by objlist_cache_remove. Is it correct?
>
Well, if no recovery has happened, objlist_cache_remove() can always
delete the entry in the object list cache during VDI deletion, but once
recovery happens, things are different.

For example, objects A and B both stay on node n1 at first, but after a
recovery A may be migrated from n1 to n2 while B stays on n1.  The objlist
entries of A and B, however, remain on n1; we can't migrate the objlist
entry along with the object, or it would cause problems.  Now when a VDI
deletion is issued, sheep tries to remove object A from n2 and B from n1.
objlist_cache_remove() works for deleting the objlist entry of B, but not
of A, because A's entry is still on n1 while objlist_cache_remove(A) is
called on n2.  So this is a problem not only when recovery and deletion
happen at the same time, but at any time after a recovery.
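
That is also why the notification is a cluster operation: SD_OP_NOTIFY_VDI_DEL
makes every node run objlist_cache_delete() for the deleted vid, so the stale
entry of A on n1 is swept out as well.  The sender is not part of this patch;
roughly it would look like the sketch below (just a sketch on my side:
notify_vdi_deletion() is a placeholder name, exec_local_req() and the exact
header fields are assumptions, and the real sender should be in another patch
of this series):

static int notify_vdi_deletion(uint32_t vid)
{
	struct sd_req hdr;

	/* Sketch only: broadcast SD_OP_NOTIFY_VDI_DEL once the deletion
	 * worker has removed all data objects of the VDI, so that
	 * cluster_notify_vdi_del() runs on every node and each node
	 * reclaims its own object list cache entries. */
	memset(&hdr, 0, sizeof(hdr));
	hdr.opcode = SD_OP_NOTIFY_VDI_DEL;
	hdr.flags = SD_FLAG_CMD_WRITE;
	hdr.data_length = sizeof(vid);

	return exec_local_req(&hdr, &vid);
}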
thanks,
levin
> Thanks,
>
> Kazutaka
>
>> +int objlist_cache_delete(uint32_t vid)
>> +{
>> + struct objlist_deletion_work *ow;
>> +
>> + ow = zalloc(sizeof(*ow));
>> + ow->vid = vid;
>> + ow->work.fn = objlist_deletion_work;
>> + ow->work.done = objlist_deletion_done;
>> + queue_work(sys->deletion_wqueue, &ow->work);
>> +
>> + return SD_RES_SUCCESS;
>> +}
>> diff --git a/sheep/ops.c b/sheep/ops.c
>> index ecf4f2e..bd198a1 100644
>> --- a/sheep/ops.c
>> +++ b/sheep/ops.c
>> @@ -485,6 +485,14 @@ static int cluster_cleanup(const struct sd_req *req, struct sd_rsp *rsp,
>> return ret;
>> }
>>
>> +static int cluster_notify_vdi_del(const struct sd_req *req, struct sd_rsp *rsp,
>> + void *data)
>> +{
>> + uint32_t vid = *(uint32_t *)data;
>> +
>> + return objlist_cache_delete(vid);
>> +}
>> +
>> static int cluster_restore(const struct sd_req *req, struct sd_rsp *rsp,
>> void *data)
>> {
>> @@ -822,6 +830,12 @@ static struct sd_op_template sd_ops[] = {
>> .process_main = cluster_cleanup,
>> },
>>
>> + [SD_OP_NOTIFY_VDI_DEL] = {
>> + .type = SD_OP_TYPE_CLUSTER,
>> + .force = 1,
>> + .process_main = cluster_notify_vdi_del,
>> + },
>> +
>> /* local operations */
>> [SD_OP_GET_STORE_LIST] = {
>> .type = SD_OP_TYPE_LOCAL,
>> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
>> index afbc361..a4273e4 100644
>> --- a/sheep/sheep_priv.h
>> +++ b/sheep/sheep_priv.h
>> @@ -195,6 +195,7 @@ int create_listen_port(int port, void *data);
>> int init_store(const char *dir, int enable_write_cache);
>> int init_base_path(const char *dir);
>>
>> +int vdi_exist(uint32_t vid);
>> int add_vdi(char *data, int data_len, uint64_t size, uint32_t *new_vid,
>> uint32_t base_vid, int is_snapshot, unsigned int *nr_copies);
>>
>> @@ -257,6 +258,7 @@ uint32_t get_latest_epoch(void);
>> int set_cluster_ctime(uint64_t ctime);
>> uint64_t get_cluster_ctime(void);
>> int get_obj_list(const struct sd_list_req *, struct sd_list_rsp *, void *);
>> +int objlist_cache_delete(uint32_t vid);
>>
>> int start_recovery(struct vnode_info *cur_vnodes,
>> struct vnode_info *old_vnodes);
>> diff --git a/sheep/vdi.c b/sheep/vdi.c
>> index bcb3df1..c9e070e 100644
>> --- a/sheep/vdi.c
>> +++ b/sheep/vdi.c
>> @@ -15,6 +15,33 @@
>> #include "sheepdog_proto.h"
>> #include "sheep_priv.h"
>>
>> +int vdi_exist(uint32_t vid)
>> +{
>> + struct sheepdog_inode *inode;
>> + int ret = 1;
>> +
>> + inode = zalloc(sizeof(*inode));
>> + if (!inode) {
>> + ret = 0;
>> + goto out;
>> + }
>> +
>> + ret = read_object(vid_to_vdi_oid(vid), (char *)inode,
>> + sizeof(*inode), 0);
>> + if (ret != SD_RES_SUCCESS) {
>> + eprintf("fail to read vdi inode (%" PRIx32 ")\n", vid);
>> + ret = 0;
>> + goto out;
>> + }
>> +
>> + if (*inode->name == '\0') {
>> + ret = 0;
>> + goto out;
>> + }
>> + ret = 1;
>> +
>> +out:
>> + free(inode);
>> + return ret;
>> +}
>>
>> /* TODO: should be performed atomically */
>> static int create_vdi_obj(char *name, uint32_t new_vid, uint64_t size,
>> --
>> 1.7.1
>>
>> --
>> sheepdog mailing list
>> sheepdog at lists.wpkg.org
>> http://lists.wpkg.org/mailman/listinfo/sheepdog