[sheepdog] [PATCH v2] sheep/recovery: reduce network overhead of preparing object list

Ruoyu liangry at ucweb.com
Wed Jul 9 11:00:11 CEST 2014


Hi Yuan,

I am sorry to report that a critical bug in the patch is CONFIRMED.

Although the probability is rather low, the patch can cause the VM file 
system to crash.

Suppose there are 3 nodes in a cluster: A, B and C. A is fast and B is 
slow. Now C leaves accidentally. Let us focus on node B: if the 
GET_OBJ_LIST request from A arrives before the zookeeper EVENT_LEAVE 
message, B screens the object list against the old epoch and returns 
that to A. However, what node A wants is the object list for the new 
epoch, so some objects are never recovered. That is the problem.
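
To make the failure concrete, here is a small self-contained model of the 
race. Everything in it (the placement rule, the struct, the function names) 
is purely illustrative and is not the sheep code; it only shows why an 
answer computed under the old epoch can drop objects A now needs.

/*
 * Toy model of the stale-epoch answer. Node B screens the object list
 * with whatever membership view it currently holds; if the EVENT_LEAVE
 * for C has not been processed yet, that view is still the old epoch.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct view {
	int epoch;
	int nr_nodes;
};

/* Illustrative placement rule standing in for oid_to_vnodes() */
static bool object_belongs_to_A(uint64_t oid, const struct view *v)
{
	return oid % v->nr_nodes == 0;
}

int main(void)
{
	/* B's membership view; only updated when EVENT_LEAVE is handled */
	struct view b_view = { .epoch = 3, .nr_nodes = 3 };	/* A, B, C */
	uint64_t oid = 4;

	/* C leaves; A moves to epoch 4 and asks B for A's objects,
	 * but B has not seen EVENT_LEAVE yet and answers with epoch 3. */
	printf("epoch %d: belongs to A? %s\n", b_view.epoch,
	       object_belongs_to_A(oid, &b_view) ? "yes" : "no");

	/* Only later does B apply the leave event. */
	b_view.epoch = 4;
	b_view.nr_nodes = 2;					/* A, B */
	printf("epoch %d: belongs to A? %s\n", b_view.epoch,
	       object_belongs_to_A(oid, &b_view) ? "yes" : "no");

	/* The answers differ: the oid filtered out under epoch 3 is one
	 * that A must recover under epoch 4. */
	return 0;
}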

To keep things simple, I think it is better to go back to the old 
algorithm: fetch the entire object lists first, and then check which of 
the objects belong to the node itself.
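
The essence of that old algorithm is the screen_object_list() hunk that the 
patch removes from sheep/recovery.c below: for each fetched oid, compute 
its vnodes under the sender's own, current view and keep the oid only if 
one of them is local. A minimal sketch, assuming the helpers declared in 
sheep/sheep_priv.h; the function name screen_oids_locally and the flat 
output buffer are only illustrative:

/*
 * Sender-side screening, modeled on the removed screen_object_list().
 * Buffer growth and error handling are omitted for brevity.
 */
static size_t screen_oids_locally(struct vnode_info *cur_vinfo,
				  const uint64_t *fetched, size_t nr_fetched,
				  uint64_t *kept)
{
	const struct sd_vnode *vnodes[SD_MAX_COPIES];
	size_t nr_kept = 0;

	for (size_t i = 0; i < nr_fetched; i++) {
		int copies = get_obj_copy_number(fetched[i],
						 cur_vinfo->nr_zones);

		oid_to_vnodes(fetched[i], &cur_vinfo->vroot, copies, vnodes);
		for (int j = 0; j < copies; j++) {
			if (!vnode_is_local(vnodes[j]))
				continue;
			/* maps to this node under the current epoch */
			kept[nr_kept++] = fetched[i];
			break;
		}
	}
	return nr_kept;
}

Because the screening always runs on the sender with its own up-to-date 
vnode information, a slow peer answering from an older epoch can at worst 
send too many oids, never too few.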

Could you please help to revert the patch?


On 2014-06-20 13:37, Liu Yuan wrote:
> On Tue, Jun 17, 2014 at 06:08:40PM +0800, Ruoyu wrote:
>> In the stage of preparing the object list, every node tries to fetch
>> the entire object lists from all nodes of the cluster and then
>> screens out objects that don't belong to it. This behavior
>> transfers quite a lot of unnecessary object ids over the network.
>>
>> This patch resolves the issue by:
>> 1. the receiver node prepares only the objects that belong to the sender node.
>> 2. the sender node no longer screens out objects, because that is
>>     already done by the receiver; it only has to merge the object lists
>>     fetched from the different nodes.
>>
>> Signed-off-by: Ruoyu <liangry at ucweb.com>
>> ---
>>   include/sheepdog_proto.h  |  4 ++++
>>   sheep/object_list_cache.c | 38 +++++++++++++++++++++++++++++++++-----
>>   sheep/recovery.c          | 37 ++++++++++++++-----------------------
>>   sheep/sheep_priv.h        |  6 ++++++
>>   4 files changed, 57 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
>> index 8b5834b..76fad51 100644
>> --- a/include/sheepdog_proto.h
>> +++ b/include/sheepdog_proto.h
>> @@ -176,6 +176,10 @@ struct sd_req {
>>   			uint32_t	generation;
>>   			uint32_t	count;
>>   		} ref;
>> +		struct {
>> +			uint8_t		addr[16];
>> +			uint16_t	port;
>> +		} node_addr;
>>   
>>   		uint32_t		__pad[8];
>>   	};
>> diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
>> index eefa40a..9a7073d 100644
>> --- a/sheep/object_list_cache.c
>> +++ b/sheep/object_list_cache.c
>> @@ -97,8 +97,17 @@ int objlist_cache_insert(uint64_t oid)
>>   
>>   int get_obj_list(const struct sd_req *hdr, struct sd_rsp *rsp, void *data)
>>   {
>> -	int nr = 0;
>> +	int i = 0, j, copies;
>>   	struct objlist_cache_entry *entry;
>> +	struct node_id peer_nid;
>> +	struct request *req = container_of(hdr, struct request, rq);
>> +	struct vnode_info *peer_vinfo = req->vinfo;
>> +	const struct sd_vnode *vnodes[SD_MAX_COPIES];
>> +	int last = 0, end = 4096;
>> +	uint64_t *oids = xmalloc(end * sizeof(uint64_t));
>> +
>> +	memcpy(peer_nid.addr, hdr->node_addr.addr, sizeof(peer_nid.addr));
>> +	peer_nid.port = hdr->node_addr.port;
>>   
>>   	/* first try getting the cached buffer with only a read lock held */
>>   	sd_read_lock(&obj_list_cache.lock);
>> @@ -116,19 +125,38 @@ int get_obj_list(const struct sd_req *hdr, struct sd_rsp *rsp, void *data)
>>   				obj_list_cache.cache_size * sizeof(uint64_t));
>>   
>>   	rb_for_each_entry(entry, &obj_list_cache.root, node) {
>> -		obj_list_cache.buf[nr++] = entry->oid;
>> +		obj_list_cache.buf[i++] = entry->oid;
>>   	}
>>   
>>   out:
>> -	if (hdr->data_length < obj_list_cache.cache_size * sizeof(uint64_t)) {
>> +	/* Screen out objects that don't belong to that node */
>> +	for (i = 0; i < obj_list_cache.cache_size; i++) {
>> +		copies = get_obj_copy_number(obj_list_cache.buf[i],
>> +				peer_vinfo->nr_zones);
>> +		oid_to_vnodes(obj_list_cache.buf[i],
>> +				&peer_vinfo->vroot, copies, vnodes);
>> +		for (j = 0; j < copies; j++) {
>> +			if (!vnode_is_peer(vnodes[j], &peer_nid))
>> +				continue;
>> +			oids[last++] = obj_list_cache.buf[i];
>> +			if (last >= end) {
>> +				end *= 2;
>> +				oids = xrealloc(oids, end * sizeof(uint64_t));
>> +			}
>> +		}
>> +	}
>> +
>> +	if (hdr->data_length < last * sizeof(uint64_t)) {
>>   		sd_rw_unlock(&obj_list_cache.lock);
>>   		sd_err("GET_OBJ_LIST buffer too small");
>> +		free(oids);
>>   		return SD_RES_BUFFER_SMALL;
>>   	}
>>   
>> -	rsp->data_length = obj_list_cache.cache_size * sizeof(uint64_t);
>> -	memcpy(data, obj_list_cache.buf, rsp->data_length);
>> +	rsp->data_length = last * sizeof(uint64_t);
>> +	memcpy(data, oids, rsp->data_length);
>>   	sd_rw_unlock(&obj_list_cache.lock);
>> +	free(oids);
>>   	return SD_RES_SUCCESS;
>>   }
>>   
>> diff --git a/sheep/recovery.c b/sheep/recovery.c
>> index 4648966..ea67b5f 100644
>> --- a/sheep/recovery.c
>> +++ b/sheep/recovery.c
>> @@ -978,6 +978,9 @@ retry:
>>   	sd_init_req(&hdr, SD_OP_GET_OBJ_LIST);
>>   	hdr.data_length = buf_size;
>>   	hdr.epoch = epoch;
>> +	memcpy(hdr.node_addr.addr, sys->this_node.nid.addr,
>> +			sizeof(hdr.node_addr.addr));
>> +	hdr.node_addr.port = sys->this_node.nid.port;
>>   	ret = sheep_exec_req(&e->nid, &hdr, buf);
>>   
>>   	switch (ret) {
>> @@ -997,40 +1000,28 @@ retry:
>>   	}
>>   
>>   	*nr_oids = rsp->data_length / sizeof(uint64_t);
>> -	sd_debug("%zu", *nr_oids);
>> +	sd_debug("%s: %zu objects to be fetched",
>> +			addr_to_str(e->nid.addr, e->nid.port), *nr_oids);
>>   	return buf;
>>   }
>>   
>> -/* Screen out objects that don't belong to this node */
>> -static void screen_object_list(struct recovery_list_work *rlw,
>> +/* Merge object lists fetched from other nodes */
>> +static void merge_object_list(struct recovery_list_work *rlw,
>>   			       uint64_t *oids, size_t nr_oids)
>>   {
>> -	struct recovery_work *rw = &rlw->base;
>> -	const struct sd_vnode *vnodes[SD_MAX_COPIES];
>>   	uint64_t old_count = rlw->count;
>> -	uint64_t nr_objs;
>> -	uint64_t i, j;
>> +	uint64_t i;
>>   
>>   	for (i = 0; i < nr_oids; i++) {
>>   		if (xbsearch(&oids[i], rlw->oids, old_count, obj_cmp))
>>   			/* the object is already scheduled to be recovered */
>>   			continue;
>>   
>> -		nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones);
>> -
>> -		oid_to_vnodes(oids[i], &rw->cur_vinfo->vroot, nr_objs, vnodes);
>> -		for (j = 0; j < nr_objs; j++) {
>> -			if (!vnode_is_local(vnodes[j]))
>> -				continue;
>> -
>> -			rlw->oids[rlw->count++] = oids[i];
>> -			/* enlarge the list buffer if full */
>> -			if (rlw->count == list_buffer_size / sizeof(uint64_t)) {
>> -				list_buffer_size *= 2;
>> -				rlw->oids = xrealloc(rlw->oids,
>> -						     list_buffer_size);
>> -			}
>> -			break;
>> +		rlw->oids[rlw->count++] = oids[i];
>> +		/* enlarge the list buffer if full */
>> +		if (rlw->count == list_buffer_size / sizeof(uint64_t)) {
>> +			list_buffer_size *= 2;
>> +			rlw->oids = xrealloc(rlw->oids, list_buffer_size);
>>   		}
>>   	}
>>   
>> @@ -1072,7 +1063,7 @@ again:
>>   		oids = fetch_object_list(node, rw->epoch, &nr_oids);
>>   		if (!oids)
>>   			continue;
>> -		screen_object_list(rlw, oids, nr_oids);
>> +		merge_object_list(rlw, oids, nr_oids);
>>   		free(oids);
>>   	}
>>   
>> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
>> index 7b33f11..02f75ad 100644
>> --- a/sheep/sheep_priv.h
>> +++ b/sheep/sheep_priv.h
>> @@ -461,6 +461,12 @@ int gateway_to_peer_opcode(int opcode);
>>   
>>   extern uint32_t last_gathered_epoch;
>>   
>> +static inline bool vnode_is_peer(const struct sd_vnode *v,
>> +			const struct node_id *peer_nid)
>> +{
>> +	return node_id_cmp(&v->node->nid, peer_nid) == 0;
>> +}
>> +
>>   static inline bool vnode_is_local(const struct sd_vnode *v)
>>   {
>>   	return node_id_cmp(&v->node->nid, &sys->this_node.nid) == 0;
>> -- 
>> 1.8.3.2
>>
>>
>> -- 
>> sheepdog mailing list
>> sheepdog at lists.wpkg.org
>> http://lists.wpkg.org/mailman/listinfo/sheepdog
> Applied thanks
>
> Yuan




