At Wed, 11 Jul 2012 14:42:47 +0800,
levin li wrote:
> 
> From: levin li <xingke.lwp at taobao.com>
> 
> Before reclaiming the cache belonging to the VDI just deleted, we should test
> whether the VDI exists, because after some node deletes it and before the
> notification is sent to all the nodes, another node may issue a VDI creation
> event and reuse the VDI id, in which case we should not reclaim the cached
> entry.

Should this be explained in the source code where vdi_exist() is called?

> 
> Signed-off-by: levin li <xingke.lwp at taobao.com>
> ---
>  sheep/object_list_cache.c |   57 +++++++++++++++++++++++++++++++++++++++++++++
>  sheep/ops.c               |   14 +++++++++++
>  sheep/sheep_priv.h        |    2 +
>  sheep/vdi.c               |   27 +++++++++++++++++++++
>  4 files changed, 100 insertions(+), 0 deletions(-)
> 
> diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
> index 39e8d49..479064c 100644
> --- a/sheep/object_list_cache.c
> +++ b/sheep/object_list_cache.c
> @@ -37,6 +37,11 @@ struct objlist_cache {
>  	pthread_rwlock_t lock;
>  };
>  
> +struct objlist_deletion_work {
> +	uint32_t vid;
> +	struct work work;
> +};
> +
>  struct objlist_cache obj_list_cache = {
>  	.tree_version	= 1,
>  	.root		= RB_ROOT,
> @@ -167,3 +172,55 @@ out:
>  	pthread_rwlock_unlock(&obj_list_cache.lock);
>  	return SD_RES_SUCCESS;
>  }
> +
> +static void objlist_deletion_work(struct work *work)
> +{
> +	struct objlist_deletion_work *ow =
> +		container_of(work, struct objlist_deletion_work, work);
> +	struct objlist_cache_entry *entry, *t;
> +	uint32_t vid = ow->vid, entry_vid;
> +
> +	if (vdi_exist(vid)) {
> +		eprintf("VDI (%" PRIx32 ") is still in use, can not be deleted\n",
> +			vid);
> +		return;
> +	}
> +
> +	pthread_rwlock_wrlock(&obj_list_cache.lock);
> +	list_for_each_entry_safe(entry, t, &obj_list_cache.entry_list, list) {
> +		entry_vid = oid_to_vid(entry->oid);
> +		if (entry_vid != vid)
> +			continue;
> +		dprintf("delete object entry %" PRIx64 "\n", entry->oid);
> +		list_del(&entry->list);
> +		rb_erase(&entry->node, &obj_list_cache.root);
> +		free(entry);
> +	}
> +	pthread_rwlock_unlock(&obj_list_cache.lock);
> +}
> +
> +static void objlist_deletion_done(struct work *work)
> +{
> +	struct objlist_deletion_work *ow =
> +		container_of(work, struct objlist_deletion_work, work);
> +	free(ow);
> +}
> +
> +/* During recovery, some objects may be migrated from one node to a
> + * new one, but we can't remove the object list cache entry in that
> + * case because it may cause recovery failure.  After recovery we may
> + * not be able to locate the cache entry correctly, so
> + * objlist_cache_remove() fails to delete it, and we need this function
> + * to do the cleanup work on all nodes. */

I'm still a bit confused about why we need both objlist_cache_delete and
objlist_cache_remove.  If no recovery happens during object deletion, we
don't need objlist_cache_delete, because objlist_cache_remove removes the
entry.  objlist_cache_delete is necessary for the case where object
deletion and recovery happen at the same time, to clean up the entries
that objlist_cache_remove failed to handle.  Is that correct?
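One more thing I noticed in the vdi.c hunk below: after the empty-name
check, ret is unconditionally set back to 1, so vdi_exist() seems to
return 1 even for a deleted inode whose name has been cleared.  If the
intent is to treat an empty name as "the VDI does not exist", the tail of
the function probably needs to be something like this (only a sketch of
the control flow I would expect, not tested):

	if (*inode->name == '\0') {
		/* name cleared -> VDI was deleted */
		ret = 0;
		goto out;
	}
	ret = 1;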
Thanks,

Kazutaka

> +int objlist_cache_delete(uint32_t vid)
> +{
> +	struct objlist_deletion_work *ow;
> +
> +	ow = zalloc(sizeof(*ow));
> +	ow->vid = vid;
> +	ow->work.fn = objlist_deletion_work;
> +	ow->work.done = objlist_deletion_done;
> +	queue_work(sys->deletion_wqueue, &ow->work);
> +
> +	return SD_RES_SUCCESS;
> +}
> diff --git a/sheep/ops.c b/sheep/ops.c
> index ecf4f2e..bd198a1 100644
> --- a/sheep/ops.c
> +++ b/sheep/ops.c
> @@ -485,6 +485,14 @@ static int cluster_cleanup(const struct sd_req *req, struct sd_rsp *rsp,
>  	return ret;
>  }
>  
> +static int cluster_notify_vdi_del(const struct sd_req *req, struct sd_rsp *rsp,
> +				  void *data)
> +{
> +	uint32_t vid = *(uint32_t *)data;
> +
> +	return objlist_cache_delete(vid);
> +}
> +
>  static int cluster_restore(const struct sd_req *req, struct sd_rsp *rsp,
>  			   void *data)
>  {
> @@ -822,6 +830,12 @@ static struct sd_op_template sd_ops[] = {
>  		.process_main = cluster_cleanup,
>  	},
>  
> +	[SD_OP_NOTIFY_VDI_DEL] = {
> +		.type = SD_OP_TYPE_CLUSTER,
> +		.force = 1,
> +		.process_main = cluster_notify_vdi_del,
> +	},
> +
>  	/* local operations */
>  	[SD_OP_GET_STORE_LIST] = {
>  		.type = SD_OP_TYPE_LOCAL,
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index afbc361..a4273e4 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -195,6 +195,7 @@ int create_listen_port(int port, void *data);
>  int init_store(const char *dir, int enable_write_cache);
>  int init_base_path(const char *dir);
>  
> +int vdi_exist(uint32_t vid);
>  int add_vdi(char *data, int data_len, uint64_t size, uint32_t *new_vid,
>  	    uint32_t base_vid, int is_snapshot, unsigned int *nr_copies);
>  
> @@ -257,6 +258,7 @@ uint32_t get_latest_epoch(void);
>  int set_cluster_ctime(uint64_t ctime);
>  uint64_t get_cluster_ctime(void);
>  int get_obj_list(const struct sd_list_req *, struct sd_list_rsp *, void *);
> +int objlist_cache_delete(uint32_t vid);
>  
>  int start_recovery(struct vnode_info *cur_vnodes,
>  		   struct vnode_info *old_vnodes);
> diff --git a/sheep/vdi.c b/sheep/vdi.c
> index bcb3df1..c9e070e 100644
> --- a/sheep/vdi.c
> +++ b/sheep/vdi.c
> @@ -15,6 +15,33 @@
>  #include "sheepdog_proto.h"
>  #include "sheep_priv.h"
>  
> +int vdi_exist(uint32_t vid)
> +{
> +	struct sheepdog_inode *inode;
> +	int ret = 1;
> +
> +	inode = zalloc(sizeof(*inode));
> +	if (!inode) {
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	ret = read_object(vid_to_vdi_oid(vid), (char *)inode,
> +			  sizeof(*inode), 0);
> +	if (ret != SD_RES_SUCCESS) {
> +		eprintf("fail to read vdi inode (%" PRIx32 ")\n", vid);
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	if (*inode->name == '\0')
> +		ret = 0;
> +	ret = 1;
> +
> +out:
> +	free(inode);
> +	return ret;
> +}
>  
>  /* TODO: should be performed atomically */
>  static int create_vdi_obj(char *name, uint32_t new_vid, uint64_t size,
> -- 
> 1.7.1
> 
> -- 
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog
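On the first question above (documenting why vdi_exist() is checked), a
brief comment at the call site in objlist_deletion_work() would probably
be enough.  One possible wording, based on the commit message (just a
suggestion, the exact text is up to the author):

	/*
	 * The VDI id may already have been reused by another node between
	 * the deletion and this notification.  In that case the cached
	 * entries belong to the new VDI, so keep them.
	 */
	if (vdi_exist(vid)) {
		eprintf("VDI (%" PRIx32 ") is still in use, can not be deleted\n",
			vid);
		return;
	}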