[sheepdog] [PATCH] sheep/recovery: reduce network overhead of preparing object list

Ruoyu liangry at ucweb.com
Mon Jun 16 12:45:01 CEST 2014


On 2014-06-16 18:26, Liu Yuan wrote:
> On Mon, Jun 16, 2014 at 04:58:19PM +0800, Ruoyu wrote:
>> In the stage of preparing object list, every node tries to fetch
>> entire object lists from all nodes of the cluster, and then,
>> screens out objects that don't belong to it's own. This behavior
>> transfers quite a lot of unnecessary object id over the network.
>>
>> This patch resolves the issue by:
>> 1. fetch only the objects that belong to the requesting node.
>> 2. don't send a get-object-list request to itself.
>> 3. no need to screen out objects once step 1 is in place, but merging
>>     the object lists fetched from different nodes is still required.
>>
> Suppose three nodes a, b, c with 3 copies and one object N. Then
>
> a tries to get object list from b, c and get N from b and N from c because N
> belongs to this node. Then we'll have duplicate N if you don't screen out the
> objects.
I mean, there is no need to screen out objects that don't belong to this
node. Screening out duplicated objects, i.e. merging, is still needed.
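
To make the point concrete, here is a minimal sketch of the merge step I
have in mind (obj_cmp() and xbsearch() are the helpers already used in
sheep/recovery.c; buffer growth is left out for brevity):

    /*
     * Merge a peer's object list into the list already scheduled for
     * recovery.  Entries found by xbsearch() in the previously scheduled
     * (sorted) part are duplicates and are skipped, so no per-object
     * vnode screening is needed any more.
     */
    static void merge_oids(uint64_t *scheduled, uint64_t *count,
                           uint64_t old_count,
                           const uint64_t *fetched, size_t nr_fetched)
    {
        for (size_t i = 0; i < nr_fetched; i++) {
            if (xbsearch(&fetched[i], scheduled, old_count, obj_cmp))
                continue; /* already scheduled from another node's list */
            scheduled[(*count)++] = fetched[i];
        }
    }
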
>
>> Signed-off-by: Ruoyu <liangry at ucweb.com>
>> ---
>>   include/sheepdog_proto.h  |  4 ++++
>>   sheep/object_list_cache.c | 38 +++++++++++++++++++++++++++++++++-----
>>   sheep/recovery.c          | 41 ++++++++++++++++++-----------------------
>>   sheep/sheep_priv.h        |  6 ++++++
>>   4 files changed, 61 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
>> index 8b5834b..76fad51 100644
>> --- a/include/sheepdog_proto.h
>> +++ b/include/sheepdog_proto.h
>> @@ -176,6 +176,10 @@ struct sd_req {
>>   			uint32_t	generation;
>>   			uint32_t	count;
>>   		} ref;
>> +		struct {
>> +			uint8_t		addr[16];
>> +			uint16_t	port;
>> +		} node_addr;
>>   
>>   		uint32_t		__pad[8];
>>   	};
>> diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
>> index eefa40a..9a7073d 100644
>> --- a/sheep/object_list_cache.c
>> +++ b/sheep/object_list_cache.c
>> @@ -97,8 +97,17 @@ int objlist_cache_insert(uint64_t oid)
>>   
>>   int get_obj_list(const struct sd_req *hdr, struct sd_rsp *rsp, void *data)
>>   {
>> -	int nr = 0;
>> +	int i = 0, j, copies;
>>   	struct objlist_cache_entry *entry;
>> +	struct node_id peer_nid;
>> +	struct request *req = container_of(hdr, struct request, rq);
>> +	struct vnode_info *peer_vinfo = req->vinfo;
>> +	const struct sd_vnode *vnodes[SD_MAX_COPIES];
>> +	int last = 0, end = 4096;
>> +	uint64_t *oids = xmalloc(end * sizeof(uint64_t));
>> +
>> +	memcpy(peer_nid.addr, hdr->node_addr.addr, sizeof(peer_nid.addr));
>> +	peer_nid.port = hdr->node_addr.port;
>>   
>>   	/* first try getting the cached buffer with only a read lock held */
>>   	sd_read_lock(&obj_list_cache.lock);
>> @@ -116,19 +125,38 @@ int get_obj_list(const struct sd_req *hdr, struct sd_rsp *rsp, void *data)
>>   				obj_list_cache.cache_size * sizeof(uint64_t));
>>   
>>   	rb_for_each_entry(entry, &obj_list_cache.root, node) {
>> -		obj_list_cache.buf[nr++] = entry->oid;
>> +		obj_list_cache.buf[i++] = entry->oid;
>>   	}
>>   
>>   out:
>> -	if (hdr->data_length < obj_list_cache.cache_size * sizeof(uint64_t)) {
>> +	/* Screen out objects that don't belong to that node */
>> +	for (i = 0; i < obj_list_cache.cache_size; i++) {
>> +		copies = get_obj_copy_number(obj_list_cache.buf[i],
>> +				peer_vinfo->nr_zones);
>> +		oid_to_vnodes(obj_list_cache.buf[i],
>> +				&peer_vinfo->vroot, copies, vnodes);
>> +		for (j = 0; j < copies; j++) {
>> +			if (!vnode_is_peer(vnodes[j], &peer_nid))
>> +				continue;
>> +			oids[last++] = obj_list_cache.buf[i];
>> +			if (last >= end) {
>> +				end *= 2;
>> +				oids = xrealloc(oids, end * sizeof(uint64_t));
>> +			}
>> +		}
>> +	}
>> +
>> +	if (hdr->data_length < last * sizeof(uint64_t)) {
>>   		sd_rw_unlock(&obj_list_cache.lock);
>>   		sd_err("GET_OBJ_LIST buffer too small");
>> +		free(oids);
>>   		return SD_RES_BUFFER_SMALL;
>>   	}
>>   
>> -	rsp->data_length = obj_list_cache.cache_size * sizeof(uint64_t);
>> -	memcpy(data, obj_list_cache.buf, rsp->data_length);
>> +	rsp->data_length = last * sizeof(uint64_t);
>> +	memcpy(data, oids, rsp->data_length);
>>   	sd_rw_unlock(&obj_list_cache.lock);
>> +	free(oids);
>>   	return SD_RES_SUCCESS;
>>   }
>>   
>> diff --git a/sheep/recovery.c b/sheep/recovery.c
>> index 4648966..dfd27bb 100644
>> --- a/sheep/recovery.c
>> +++ b/sheep/recovery.c
>> @@ -978,6 +978,9 @@ retry:
>>   	sd_init_req(&hdr, SD_OP_GET_OBJ_LIST);
>>   	hdr.data_length = buf_size;
>>   	hdr.epoch = epoch;
>> +	memcpy(hdr.node_addr.addr, sys->this_node.nid.addr,
>> +			sizeof(hdr.node_addr.addr));
>> +	hdr.node_addr.port = sys->this_node.nid.port;
>>   	ret = sheep_exec_req(&e->nid, &hdr, buf);
>>   
>>   	switch (ret) {
>> @@ -997,40 +1000,28 @@ retry:
>>   	}
>>   
>>   	*nr_oids = rsp->data_length / sizeof(uint64_t);
>> -	sd_debug("%zu", *nr_oids);
>> +	sd_debug("%s: %zu objects to be fetched",
>> +			addr_to_str(e->nid.addr, e->nid.port), *nr_oids);
>>   	return buf;
>>   }
>>   
>> -/* Screen out objects that don't belong to this node */
>> -static void screen_object_list(struct recovery_list_work *rlw,
>> +/* Merge object lists fetched from other nodes */
>> +static void merge_object_list(struct recovery_list_work *rlw,
>>   			       uint64_t *oids, size_t nr_oids)
>>   {
>> -	struct recovery_work *rw = &rlw->base;
>> -	const struct sd_vnode *vnodes[SD_MAX_COPIES];
>>   	uint64_t old_count = rlw->count;
>> -	uint64_t nr_objs;
>> -	uint64_t i, j;
>> +	uint64_t i;
>>   
>>   	for (i = 0; i < nr_oids; i++) {
>>   		if (xbsearch(&oids[i], rlw->oids, old_count, obj_cmp))
>>   			/* the object is already scheduled to be recovered */
>>   			continue;
>>   
>> -		nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones);
>> -
>> -		oid_to_vnodes(oids[i], &rw->cur_vinfo->vroot, nr_objs, vnodes);
>> -		for (j = 0; j < nr_objs; j++) {
>> -			if (!vnode_is_local(vnodes[j]))
>> -				continue;
>> -
>> -			rlw->oids[rlw->count++] = oids[i];
>> -			/* enlarge the list buffer if full */
>> -			if (rlw->count == list_buffer_size / sizeof(uint64_t)) {
>> -				list_buffer_size *= 2;
>> -				rlw->oids = xrealloc(rlw->oids,
>> -						     list_buffer_size);
>> -			}
>> -			break;
>> +		rlw->oids[rlw->count++] = oids[i];
>> +		/* enlarge the list buffer if full */
>> +		if (rlw->count == list_buffer_size / sizeof(uint64_t)) {
>> +			list_buffer_size *= 2;
>> +			rlw->oids = xrealloc(rlw->oids, list_buffer_size);
>>   		}
>>   	}
>>   
>> @@ -1064,6 +1055,10 @@ again:
>>   		size_t nr_oids;
>>   		struct sd_node *node = nodes + i;
>>   
>> +		/* No need to fetch from local node */
>> +		if (node_is_local(node))
>> +			continue;
> We have to fetch from the local node too because we need to get the complete
> object list.
>
> Suppose we have 2 nodes and 2 objects with 1 copy.
>
> a[O1], b[O2]. If a wouldn't try to get object list from itself, O1 will be lost.
> and b will lose O2 for the same reason.
I wonder why a[O1] would be lost, since O1 is still stored on node a.
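
If the goal of skipping the request to ourselves is only to avoid the extra
network round trip, one possible compromise (a rough, untested sketch;
objlist_cache_get_entries() is a hypothetical helper that would copy the
local object list cache) is to merge the local cache directly instead of
leaving the local node out of the merge:

    /* Hypothetical alternative: instead of skipping the local node
     * entirely, merge the local object list cache so locally held
     * objects (e.g. O1 on node a) still reach the recovery list. */
    if (node_is_local(node)) {
        size_t nr_oids;
        uint64_t *oids = objlist_cache_get_entries(&nr_oids); /* hypothetical helper */

        merge_object_list(rlw, oids, nr_oids);
        free(oids);
        continue;
    }
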
>
> Thanks
> Yuan




