[Sheepdog] [PATCH v5] remove oids from object list cache when deleting a vdi

Liu Yuan namei.unix at gmail.com
Sat May 5 06:59:54 CEST 2012


On 05/04/2012 02:49 PM, levin li wrote:

> Cluster recovery may cause objects migrated from one node
> to another, but the object list cache doesn't change, when
> deleting an object we can not find the right node in whose
> cache the id stays, so we need to notify the deletion object
> list to all the node to make them delete the specified
> object id from object list cache.
> 
> I reused the deletion_work_list and deletion_wqueue for the
> object list cleanup work, so that we save some memory by not
> starting a new work queue.
> 
> The del_vdi is called in worker thread, but it's locked by the
> notification mechanism, other operations about deletion_work_list
> and deletion_wqueue are done in main thread, so there's no need
> to concern about the race condition.
> 
> Signed-off-by: levin li <xingke.lwp at taobao.com>
> ---
>  include/sheep.h           |    1 +
>  sheep/group.c             |    1 +
>  sheep/object_list_cache.c |   17 +++++++++
>  sheep/ops.c               |   57 ++++++++++++++++++++++++++++++
>  sheep/sheep_priv.h        |   21 ++++++++++++
>  sheep/vdi.c               |   84 ++++++++++++++++++++++++++++++---------------
>  6 files changed, 154 insertions(+), 27 deletions(-)
> 
> diff --git a/include/sheep.h b/include/sheep.h
> index 7e287c4..e941dc1 100644
> --- a/include/sheep.h
> +++ b/include/sheep.h
> @@ -46,6 +46,7 @@
>  #define SD_OP_TRACE          0x95
>  #define SD_OP_TRACE_CAT      0x96
>  #define SD_OP_STAT_RECOVERY  0x97
> +#define SD_OP_NOTIFY_VDI_DEL 0x98
>  
>  #define SD_FLAG_CMD_IO_LOCAL   0x0010
>  #define SD_FLAG_CMD_RECOVERY 0x0020
> diff --git a/sheep/group.c b/sheep/group.c
> index c7fd387..1f1aa8d 100644
> --- a/sheep/group.c
> +++ b/sheep/group.c
> @@ -1314,6 +1314,7 @@ int create_cluster(int port, int64_t zone, int nr_vnodes)
>  	INIT_LIST_HEAD(&sys->req_wait_for_obj_list);
>  	INIT_LIST_HEAD(&sys->consistent_obj_list);
>  	INIT_LIST_HEAD(&sys->blocking_conn_list);
> +	INIT_LIST_HEAD(&sys->deletion_work_list);
>  
>  	INIT_LIST_HEAD(&sys->request_queue);
>  	INIT_LIST_HEAD(&sys->event_queue);
> diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
> index 28cdbbc..0aa93e5 100644
> --- a/sheep/object_list_cache.c
> +++ b/sheep/object_list_cache.c
> @@ -149,3 +149,20 @@ int get_obj_list(const struct sd_list_req *hdr, struct sd_list_rsp *rsp, void *d
>  
>  	return res;
>  }
> +
> +int del_vdi_from_objlist_cache(uint64_t *objs, int count)
> +{
> +	int i;
> +
> +	dprintf("%d\n", count);
> +	for (i = 0; i < count; i++) {
> +		pthread_rwlock_wrlock(&obj_list_cache.lock);
> +		if (!objlist_cache_rb_remove(&obj_list_cache.root, objs[i])) {
> +			dprintf("remove oid %" PRIx64 " from objlist cache\n", objs[i]);
> +			obj_list_cache.cache_size--;
> +		}
> +		pthread_rwlock_unlock(&obj_list_cache.lock);
> +	}
> +
> +	return 0;
> +}
> diff --git a/sheep/ops.c b/sheep/ops.c
> index b6f8eb2..5763b6f 100644
> --- a/sheep/ops.c
> +++ b/sheep/ops.c
> @@ -550,6 +550,57 @@ static int cluster_cleanup(const struct sd_req *req, struct sd_rsp *rsp,
>  	return ret;
>  }
>  
> +static void objlist_cache_del_vdi_work(struct work *work)
> +{
> +	struct deletion_work *dw = container_of(work, struct deletion_work, work);
> +
> +	del_vdi_from_objlist_cache(dw->deleted_objs, dw->deleted_count);
> +}
> +
> +static void objlist_cache_del_vdi_done(struct work *work)
> +{
> +	struct deletion_work *dw = container_of(work, struct deletion_work, work);
> +
> +	list_del(&dw->dw_siblings);
> +	free(dw);
> +
> +	if (!list_empty(&sys->deletion_work_list)) {
> +		dw = list_first_entry(&sys->deletion_work_list,
> +				      struct deletion_work, dw_siblings);
> +
> +		queue_work(sys->deletion_wqueue, &dw->work);
> +	}
> +}
> +
> +static int cluster_notify_vdi_deletion(const struct sd_req *req, struct sd_rsp *rsp,
> +				void *data)
> +{
> +	int count = req->data_length / sizeof(uint64_t);
> +	struct deletion_work *dw = NULL;
> +
> +	dw = zalloc(sizeof(*dw));
> +	if (!dw) {
> +		eprintf("no memory to allocate\n");
> +		return SD_RES_NO_MEM;
> +	}
> +
> +	dw->deleted_objs = data;
> +	dw->deleted_count = count;
> +
> +	dw->work.fn = objlist_cache_del_vdi_work;
> +	dw->work.done = objlist_cache_del_vdi_done;
> +
> +	if (!list_empty(&sys->deletion_work_list)) {
> +		list_add_tail(&dw->dw_siblings, &sys->deletion_work_list);
> +		return SD_RES_SUCCESS;
> +	}
> +
> +	list_add_tail(&dw->dw_siblings, &sys->deletion_work_list);
> +	queue_work(sys->deletion_wqueue, &dw->work);
> +
> +	return SD_RES_SUCCESS;
> +}
> +
>  static int cluster_restore(const struct sd_req *req, struct sd_rsp *rsp,
>  			   void *data)
>  {
> @@ -945,6 +996,12 @@ static struct sd_op_template sd_ops[] = {
>  		.process_main = cluster_cleanup,
>  	},
>  
> +	[SD_OP_NOTIFY_VDI_DEL] = {
> +		.type = SD_OP_TYPE_CLUSTER,
> +		.force = 1,
> +		.process_main = cluster_notify_vdi_deletion,
> +	},
> +
>  	/* local operations */
>  	[SD_OP_GET_STORE_LIST] = {
>  		.type = SD_OP_TYPE_LOCAL,
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index 2275a93..427db03 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -135,6 +135,7 @@ struct cluster_info {
>  	struct list_head req_wait_for_obj_list;
>  	struct list_head consistent_obj_list;
>  	struct list_head blocking_conn_list;
> +	struct list_head deletion_work_list;
>  
>  	int nr_copies;
>  
> @@ -212,6 +213,25 @@ struct objlist_cache {
>  	pthread_rwlock_t lock;
>  };
>  
> +struct deletion_work {
> +	uint32_t done;
> +	uint32_t epoch;
> +
> +	struct work work;
> +	struct list_head dw_siblings;
> +
> +	uint32_t vid;
> +
> +	int count;
> +	uint32_t *buf;
> +
> +	uint64_t *deleted_objs;
> +	uint32_t deleted_count;
> +
> +	struct vnode_info *vnodes;
> +	int delete_error;
> +};
> +
>  extern struct cluster_info *sys;
>  extern struct store_driver *sd_store;
>  extern char *obj_path;
> @@ -313,6 +333,7 @@ int prealloc(int fd, uint32_t size);
>  int init_objlist_cache(void);
>  int objlist_cache_rb_remove(struct rb_root *root, uint64_t oid);
>  int check_and_insert_objlist_cache(uint64_t oid);
> +int del_vdi_from_objlist_cache(uint64_t *objs, int count);
>  
>  /* Operations */
>  
> diff --git a/sheep/vdi.c b/sheep/vdi.c
> index 5b96dbb..609066e 100644
> --- a/sheep/vdi.c
> +++ b/sheep/vdi.c
> @@ -10,6 +10,7 @@
>   */
>  #include <stdio.h>
>  #include <stdlib.h>
> +#include <unistd.h>
>  #include <sys/time.h>
>  
>  #include "sheepdog_proto.h"
> @@ -373,24 +374,6 @@ int read_vdis(char *data, int len, unsigned int *rsp_len)
>  	return SD_RES_SUCCESS;
>  }
>  
> -struct deletion_work {
> -	uint32_t done;
> -	uint32_t epoch;
> -
> -	struct work work;
> -	struct list_head dw_siblings;
> -
> -	uint32_t vid;
> -
> -	int count;
> -	uint32_t *buf;
> -
> -	struct vnode_info *vnodes;
> -	int delete_error;
> -};
> -
> -static LIST_HEAD(deletion_work_list);
> -
>  static int delete_inode(struct deletion_work *dw)
>  {
>  	struct sheepdog_inode *inode = NULL;
> @@ -430,6 +413,39 @@ out:
>  	return ret;
>  }
>  
> +static int notify_deletion(uint64_t *oids, uint32_t count)
> +{
> +	int fd, ret;
> +	unsigned int wlen, rlen = 0;
> +	struct sd_vdi_req hdr;
> +	char host[128];
> +
> +	addr_to_str(host, sizeof(host), sys->this_node.addr, 0);
> +
> +	fd = connect_to(host, sys->this_node.port);
> +	if (fd < 0) {
> +		eprintf("connect to local node fail\n");
> +		return -1;
> +	}
> +
> +	memset(&hdr, 0, sizeof(hdr));
> +
> +	hdr.proto_ver = SD_PROTO_VER;
> +	hdr.opcode = SD_OP_NOTIFY_VDI_DEL;
> +	hdr.flags = SD_FLAG_CMD_WRITE;
> +	hdr.data_length = sizeof(uint64_t) * count;
> +	wlen = hdr.data_length;
> +
> +	ret = exec_req(fd, (struct sd_req *)&hdr, oids, &wlen, &rlen);
> +	close(fd);
> +
> +	if (ret < 0) {
> +		eprintf("send request fail\n");
> +		return -1;
> +	}
> +
> +	return 0;
> +}
>  
>  static void delete_one(struct work *work)
>  {
> @@ -447,6 +463,12 @@ static void delete_one(struct work *work)
>  		goto out;
>  	}
>  
> +	dw->deleted_objs = malloc(sizeof(uint64_t) * MAX_DATA_OBJS);
> +	if (!dw->deleted_objs) {
> +		eprintf("failed to allocate memory\n");
> +		goto out;
> +	}
> +
>  	nr_copies = get_nr_copies(dw->vnodes);
>  
>  	ret = read_object(dw->vnodes, dw->epoch, vid_to_vdi_oid(vdi_id),
> @@ -459,25 +481,33 @@ static void delete_one(struct work *work)
>  	}
>  
>  	for (i = 0; i < MAX_DATA_OBJS; i++) {
> +		uint64_t oid;
> +
>  		if (!inode->data_vdi_id[i])
>  			continue;
>  
> +		oid = vid_to_data_oid(inode->data_vdi_id[i], i);
> +
>  		if (inode->data_vdi_id[i] != inode->vdi_id) {
>  			dprintf("object %" PRIx64 " is base's data, would not be deleted.\n",
> -					vid_to_data_oid(inode->data_vdi_id[i], i));
> +					oid);
>  			continue;
>  		}
>  
>  		ret = remove_object(dw->vnodes, dw->epoch,
> -			      vid_to_data_oid(inode->data_vdi_id[i], i),
> -			      nr_copies);
> +					oid, nr_copies);
>  
>  		if (ret != SD_RES_SUCCESS)
>  			dw->delete_error = 1;
> -		else
> +		else {
> +			dw->deleted_objs[dw->deleted_count++] = oid;
>  			inode->data_vdi_id[i] = 0;
> +		}
>  	}
>  
> +	if (dw->deleted_count > 0)
> +		notify_deletion(dw->deleted_objs, dw->deleted_count);


This name looks confusing, I guess you mean undeleted objects as in
dw->deleted_objs, dw->deleted_count.

> +
>  	if (dw->delete_error) {
>  		write_object(dw->vnodes, dw->epoch, vid_to_vdi_oid(vdi_id),
>  			     (void *)inode, sizeof(*inode), 0, 0, nr_copies, 0);
> @@ -505,8 +535,8 @@ static void delete_one_done(struct work *work)
>  	free(dw->buf);
>  	free(dw);
>  
> -	if (!list_empty(&deletion_work_list)) {
> -		dw = list_first_entry(&deletion_work_list,
> +	if (!list_empty(&sys->deletion_work_list)) {
> +		dw = list_first_entry(&sys->deletion_work_list,
>  				      struct deletion_work, dw_siblings);
>  
>  		queue_work(sys->deletion_wqueue, &dw->work);
> @@ -647,12 +677,12 @@ int start_deletion(uint32_t vid, uint32_t epoch)
>  	if (dw->count == 0)
>  		goto out;
>  
> -	if (!list_empty(&deletion_work_list)) {
> -		list_add_tail(&dw->dw_siblings, &deletion_work_list);
> +	if (!list_empty(&sys->deletion_work_list)) {
> +		list_add_tail(&dw->dw_siblings, &sys->deletion_work_list);
>  		goto out;
>  	}
>  
> -	list_add_tail(&dw->dw_siblings, &deletion_work_list);
> +	list_add_tail(&dw->dw_siblings, &sys->deletion_work_list);
>  	queue_work(sys->deletion_wqueue, &dw->work);


it looks redundant to have deletion_work_list here to get FIFO semantics.

We only have one worker for deletion_wqueue, isn't it FIFO already?

>  out:
>  	return SD_RES_SUCCESS;


This code looks really confusing, I think you need rework the logic
first, take object recovery as an example, recovery_object just recovers
the targeted object once at a time. Your delete_one I guess serves the
same purpose as its name suggests, but you actually try to delete all
the objects in one go.

By the way, please run script/checkpatch.pl, there are some style problems.

Thanks,
Yuan



More information about the sheepdog mailing list