[sheepdog] [PATCH v2 3/3] object list cache: reclaim object list cache when receiving a deletion event.
levin li
levin108 at gmail.com
Thu Jul 12 03:55:27 CEST 2012
On 2012/07/12 06:01, MORITA Kazutaka wrote:
> At Wed, 11 Jul 2012 14:42:47 +0800,
> levin li wrote:
>>
>> From: levin li <xingke.lwp at taobao.com>
>>
>> Before reclaiming the cache belonging to the VDI just deleted, we should test
>> whether the VDI still exists, because after one node deletes it and before the
>> notification reaches all the nodes, another node may issue a VDI creation
>> event and reuse the VDI id, in which case we should not reclaim the cached
>> entries.
>
> Should this be explained in the source code where vdi_exist() is
> called?
>
>>
>> Signed-off-by: levin li <xingke.lwp at taobao.com>
>> ---
>> sheep/object_list_cache.c | 57 +++++++++++++++++++++++++++++++++++++++++++++
>> sheep/ops.c | 14 +++++++++++
>> sheep/sheep_priv.h | 2 +
>> sheep/vdi.c | 27 +++++++++++++++++++++
>> 4 files changed, 100 insertions(+), 0 deletions(-)
>>
>> diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
>> index 39e8d49..479064c 100644
>> --- a/sheep/object_list_cache.c
>> +++ b/sheep/object_list_cache.c
>> @@ -37,6 +37,11 @@ struct objlist_cache {
>> pthread_rwlock_t lock;
>> };
>>
>> +struct objlist_deletion_work {
>> + uint32_t vid;
>> + struct work work;
>> +};
>> +
>> struct objlist_cache obj_list_cache = {
>> .tree_version = 1,
>> .root = RB_ROOT,
>> @@ -167,3 +172,55 @@ out:
>> pthread_rwlock_unlock(&obj_list_cache.lock);
>> return SD_RES_SUCCESS;
>> }
>> +
>> +static void objlist_deletion_work(struct work *work)
>> +{
>> + struct objlist_deletion_work *ow =
>> + container_of(work, struct objlist_deletion_work, work);
>> + struct objlist_cache_entry *entry, *t;
>> + uint32_t vid = ow->vid, entry_vid;
>> +
>> + if (vdi_exist(vid)) {
>> + eprintf("VDI (%" PRIx32 ") is still in use, can not be deleted\n",
>> + vid);
>> + return;
>> + }
>> +
>> + pthread_rwlock_wrlock(&obj_list_cache.lock);
>> + list_for_each_entry_safe(entry, t, &obj_list_cache.entry_list, list) {
>> + entry_vid = oid_to_vid(entry->oid);
>> + if (entry_vid != vid)
>> + continue;
>> + dprintf("delete object entry %" PRIx64 "\n", entry->oid);
>> + list_del(&entry->list);
>> + rb_erase(&entry->node, &obj_list_cache.root);
>> + free(entry);
>> + }
>> + pthread_rwlock_unlock(&obj_list_cache.lock);
>> +}
>> +
>> +static void objlist_deletion_done(struct work *work)
>> +{
>> + struct objlist_deletion_work *ow =
>> + container_of(work, struct objlist_deletion_work, work);
>> + free(ow);
>> +}
>> +
>> +/* During recovery, an object may be migrated from one node to a
>> + * new one, but we cannot remove its object list cache entry at that
>> + * point because doing so may cause recovery failure.  After recovery
>> + * the entry may therefore no longer live on the node that holds the
>> + * object, so objlist_cache_remove() on that node fails to delete it.
>> + * This function does the cleanup work on all nodes instead. */
>
> I'm still a bit confused why we need both objlist_cache_delete and
> objlist_cache_remove. If no recovery happens during object deletion,
> we don't need objlist_cache_delete because objlist_cache_remove
> removes the entry. objlist_cache_delete is necessary for the case
> object deletion and recovery happen at the same time to clean up
> unhandled entries by objlist_cache_remove. Is it correct?
>
Well, if no recovery has happened, objlist_cache_remove() can always
delete the entry in the object list cache during VDI deletion, but once
recovery happens, things are different.

For example, objects A and B both stay on node n1 at first, but after a
recovery A may be migrated from n1 to n2 while B stays on n1.  The objlist
entries of A and B, however, remain on n1; we can't migrate the objlist
entry along with the object, or it would cause problems.  Now when a VDI
deletion is issued, sheep tries to remove object A from n2 and B from n1.
objlist_cache_remove() works for deleting the objlist entry of B, but not
of A, because A's entry is still on n1 while objlist_cache_remove(A) is
called on n2.  So this is a problem not only when recovery and deletion
happen at the same time, but at any time after a recovery.
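
That is also why the notification is a cluster operation: SD_OP_NOTIFY_VDI_DEL
makes every node run objlist_cache_delete() for the deleted vid, so the stale
entry of A on n1 is swept out as well.  The sender is not part of this patch;
roughly it would look like the sketch below (just a sketch on my side:
notify_vdi_deletion() is a placeholder name, exec_local_req() and the exact
header fields are assumptions, and the real sender should be in another patch
of this series):

static int notify_vdi_deletion(uint32_t vid)
{
	struct sd_req hdr;

	/* Sketch only: broadcast SD_OP_NOTIFY_VDI_DEL once the deletion
	 * worker has removed all data objects of the VDI, so that
	 * cluster_notify_vdi_del() runs on every node and each node
	 * reclaims its own object list cache entries. */
	memset(&hdr, 0, sizeof(hdr));
	hdr.opcode = SD_OP_NOTIFY_VDI_DEL;
	hdr.flags = SD_FLAG_CMD_WRITE;
	hdr.data_length = sizeof(vid);

	return exec_local_req(&hdr, &vid);
}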
thanks,
levin
> Thanks,
>
> Kazutaka
>
>> +int objlist_cache_delete(uint32_t vid)
>> +{
>> + struct objlist_deletion_work *ow;
>> +
>> + ow = zalloc(sizeof(*ow));
>> + ow->vid = vid;
>> + ow->work.fn = objlist_deletion_work;
>> + ow->work.done = objlist_deletion_done;
>> + queue_work(sys->deletion_wqueue, &ow->work);
>> +
>> + return SD_RES_SUCCESS;
>> +}
>> diff --git a/sheep/ops.c b/sheep/ops.c
>> index ecf4f2e..bd198a1 100644
>> --- a/sheep/ops.c
>> +++ b/sheep/ops.c
>> @@ -485,6 +485,14 @@ static int cluster_cleanup(const struct sd_req *req, struct sd_rsp *rsp,
>> return ret;
>> }
>>
>> +static int cluster_notify_vdi_del(const struct sd_req *req, struct sd_rsp *rsp,
>> + void *data)
>> +{
>> + uint32_t vid = *(uint32_t *)data;
>> +
>> + return objlist_cache_delete(vid);
>> +}
>> +
>> static int cluster_restore(const struct sd_req *req, struct sd_rsp *rsp,
>> void *data)
>> {
>> @@ -822,6 +830,12 @@ static struct sd_op_template sd_ops[] = {
>> .process_main = cluster_cleanup,
>> },
>>
>> + [SD_OP_NOTIFY_VDI_DEL] = {
>> + .type = SD_OP_TYPE_CLUSTER,
>> + .force = 1,
>> + .process_main = cluster_notify_vdi_del,
>> + },
>> +
>> /* local operations */
>> [SD_OP_GET_STORE_LIST] = {
>> .type = SD_OP_TYPE_LOCAL,
>> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
>> index afbc361..a4273e4 100644
>> --- a/sheep/sheep_priv.h
>> +++ b/sheep/sheep_priv.h
>> @@ -195,6 +195,7 @@ int create_listen_port(int port, void *data);
>> int init_store(const char *dir, int enable_write_cache);
>> int init_base_path(const char *dir);
>>
>> +int vdi_exist(uint32_t vid);
>> int add_vdi(char *data, int data_len, uint64_t size, uint32_t *new_vid,
>> uint32_t base_vid, int is_snapshot, unsigned int *nr_copies);
>>
>> @@ -257,6 +258,7 @@ uint32_t get_latest_epoch(void);
>> int set_cluster_ctime(uint64_t ctime);
>> uint64_t get_cluster_ctime(void);
>> int get_obj_list(const struct sd_list_req *, struct sd_list_rsp *, void *);
>> +int objlist_cache_delete(uint32_t vid);
>>
>> int start_recovery(struct vnode_info *cur_vnodes,
>> struct vnode_info *old_vnodes);
>> diff --git a/sheep/vdi.c b/sheep/vdi.c
>> index bcb3df1..c9e070e 100644
>> --- a/sheep/vdi.c
>> +++ b/sheep/vdi.c
>> @@ -15,6 +15,33 @@
>> #include "sheepdog_proto.h"
>> #include "sheep_priv.h"
>>
>> +int vdi_exist(uint32_t vid)
>> +{
>> + struct sheepdog_inode *inode;
>> + int ret = 1;
>> +
>> + inode = zalloc(sizeof(*inode));
>> + if (!inode) {
>> + ret = 0;
>> + goto out;
>> + }
>> +
>> + ret = read_object(vid_to_vdi_oid(vid), (char *)inode,
>> + sizeof(*inode), 0);
>> + if (ret != SD_RES_SUCCESS) {
>> + eprintf("fail to read vdi inode (%" PRIx32 ")\n", vid);
>> + ret = 0;
>> + goto out;
>> + }
>> +
>> + if (*inode->name == '\0') {
>> + ret = 0;
>> + goto out;
>> + }
>> + ret = 1;
>> +
>> +out:
>> + free(inode);
>> + return ret;
>> +}
>>
>> /* TODO: should be performed atomically */
>> static int create_vdi_obj(char *name, uint32_t new_vid, uint64_t size,
>> --
>> 1.7.1
>>
>> --
>> sheepdog mailing list
>> sheepdog at lists.wpkg.org
>> http://lists.wpkg.org/mailman/listinfo/sheepdog