[sheepdog] [PATCH] sheep/recovery: reduce network overhead of preparing object list
Liu Yuan
namei.unix at gmail.com
Mon Jun 16 12:26:04 CEST 2014
On Mon, Jun 16, 2014 at 04:58:19PM +0800, Ruoyu wrote:
> In the stage of preparing the object list, every node tries to fetch
> the entire object lists from all nodes of the cluster and then
> screens out the objects that don't belong to it. This behavior
> transfers quite a lot of unnecessary object IDs over the network.
>
> This patch resolves the issue by:
> 1. fetching only the objects that belong to the sender node;
> 2. not sending the get-object-list request to the node itself;
> 3. no longer screening out objects once step 1 is in place, while still
> merging the object lists fetched from the different nodes.
>
Suppose three nodes a, b, c with 3 copies and one object N. Then
a tries to get the object list from b and c, and it gets N from b and N from c
because N belongs to this node. So we'll end up with a duplicate N if you don't
screen out the objects.
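
To make that concrete, here is a small standalone sketch (plain C with made-up
oids, not the sheepdog code or its API) of merging per-node lists: both replica
peers b and c report the same object N back to a, so the merge step needs a
duplicate check, otherwise N lands on the recovery list twice.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int oid_cmp(const void *a, const void *b)
{
	uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;

	return x < y ? -1 : x > y ? 1 : 0;
}

int main(void)
{
	/* object N has 3 copies, so peers b and c both report it back to a */
	uint64_t from_b[] = { 0x8000000000000001ULL };
	uint64_t from_c[] = { 0x8000000000000001ULL };
	uint64_t *lists[] = { from_b, from_c };
	size_t lens[] = { 1, 1 };
	uint64_t merged[8];
	size_t count = 0;

	for (size_t n = 0; n < 2; n++) {
		for (size_t i = 0; i < lens[n]; i++) {
			/* without this check the merged list holds N twice */
			if (bsearch(&lists[n][i], merged, count,
				    sizeof(merged[0]), oid_cmp))
				continue;
			merged[count++] = lists[n][i];
			/* keep the list sorted so bsearch() keeps working */
			qsort(merged, count, sizeof(merged[0]), oid_cmp);
		}
	}

	printf("merged %zu object(s)\n", count);	/* prints 1, not 2 */
	return 0;
}

Either the sender screens out foreign objects (as the current code does) or the
receiver deduplicates while merging; dropping both checks is where the
duplicates come from.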
> Signed-off-by: Ruoyu <liangry at ucweb.com>
> ---
> include/sheepdog_proto.h | 4 ++++
> sheep/object_list_cache.c | 38 +++++++++++++++++++++++++++++++++-----
> sheep/recovery.c | 41 ++++++++++++++++++-----------------------
> sheep/sheep_priv.h | 6 ++++++
> 4 files changed, 61 insertions(+), 28 deletions(-)
>
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index 8b5834b..76fad51 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -176,6 +176,10 @@ struct sd_req {
> uint32_t generation;
> uint32_t count;
> } ref;
> + struct {
> + uint8_t addr[16];
> + uint16_t port;
> + } node_addr;
>
> uint32_t __pad[8];
> };
> diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
> index eefa40a..9a7073d 100644
> --- a/sheep/object_list_cache.c
> +++ b/sheep/object_list_cache.c
> @@ -97,8 +97,17 @@ int objlist_cache_insert(uint64_t oid)
>
> int get_obj_list(const struct sd_req *hdr, struct sd_rsp *rsp, void *data)
> {
> - int nr = 0;
> + int i = 0, j, copies;
> struct objlist_cache_entry *entry;
> + struct node_id peer_nid;
> + struct request *req = container_of(hdr, struct request, rq);
> + struct vnode_info *peer_vinfo = req->vinfo;
> + const struct sd_vnode *vnodes[SD_MAX_COPIES];
> + int last = 0, end = 4096;
> + uint64_t *oids = xmalloc(end * sizeof(uint64_t));
> +
> + memcpy(peer_nid.addr, hdr->node_addr.addr, sizeof(peer_nid.addr));
> + peer_nid.port = hdr->node_addr.port;
>
> /* first try getting the cached buffer with only a read lock held */
> sd_read_lock(&obj_list_cache.lock);
> @@ -116,19 +125,38 @@ int get_obj_list(const struct sd_req *hdr, struct sd_rsp *rsp, void *data)
> obj_list_cache.cache_size * sizeof(uint64_t));
>
> rb_for_each_entry(entry, &obj_list_cache.root, node) {
> - obj_list_cache.buf[nr++] = entry->oid;
> + obj_list_cache.buf[i++] = entry->oid;
> }
>
> out:
> - if (hdr->data_length < obj_list_cache.cache_size * sizeof(uint64_t)) {
> + /* Screen out objects that don't belong to that node */
> + for (i = 0; i < obj_list_cache.cache_size; i++) {
> + copies = get_obj_copy_number(obj_list_cache.buf[i],
> + peer_vinfo->nr_zones);
> + oid_to_vnodes(obj_list_cache.buf[i],
> + &peer_vinfo->vroot, copies, vnodes);
> + for (j = 0; j < copies; j++) {
> + if (!vnode_is_peer(vnodes[j], &peer_nid))
> + continue;
> + oids[last++] = obj_list_cache.buf[i];
> + if (last >= end) {
> + end *= 2;
> + oids = xrealloc(oids, end * sizeof(uint64_t));
> + }
> + }
> + }
> +
> + if (hdr->data_length < last * sizeof(uint64_t)) {
> sd_rw_unlock(&obj_list_cache.lock);
> sd_err("GET_OBJ_LIST buffer too small");
> + free(oids);
> return SD_RES_BUFFER_SMALL;
> }
>
> - rsp->data_length = obj_list_cache.cache_size * sizeof(uint64_t);
> - memcpy(data, obj_list_cache.buf, rsp->data_length);
> + rsp->data_length = last * sizeof(uint64_t);
> + memcpy(data, oids, rsp->data_length);
> sd_rw_unlock(&obj_list_cache.lock);
> + free(oids);
> return SD_RES_SUCCESS;
> }
>
> diff --git a/sheep/recovery.c b/sheep/recovery.c
> index 4648966..dfd27bb 100644
> --- a/sheep/recovery.c
> +++ b/sheep/recovery.c
> @@ -978,6 +978,9 @@ retry:
> sd_init_req(&hdr, SD_OP_GET_OBJ_LIST);
> hdr.data_length = buf_size;
> hdr.epoch = epoch;
> + memcpy(hdr.node_addr.addr, sys->this_node.nid.addr,
> + sizeof(hdr.node_addr.addr));
> + hdr.node_addr.port = sys->this_node.nid.port;
> ret = sheep_exec_req(&e->nid, &hdr, buf);
>
> switch (ret) {
> @@ -997,40 +1000,28 @@ retry:
> }
>
> *nr_oids = rsp->data_length / sizeof(uint64_t);
> - sd_debug("%zu", *nr_oids);
> + sd_debug("%s: %zu objects to be fetched",
> + addr_to_str(e->nid.addr, e->nid.port), *nr_oids);
> return buf;
> }
>
> -/* Screen out objects that don't belong to this node */
> -static void screen_object_list(struct recovery_list_work *rlw,
> +/* Merge object lists fetched from other nodes */
> +static void merge_object_list(struct recovery_list_work *rlw,
> uint64_t *oids, size_t nr_oids)
> {
> - struct recovery_work *rw = &rlw->base;
> - const struct sd_vnode *vnodes[SD_MAX_COPIES];
> uint64_t old_count = rlw->count;
> - uint64_t nr_objs;
> - uint64_t i, j;
> + uint64_t i;
>
> for (i = 0; i < nr_oids; i++) {
> if (xbsearch(&oids[i], rlw->oids, old_count, obj_cmp))
> /* the object is already scheduled to be recovered */
> continue;
>
> - nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones);
> -
> - oid_to_vnodes(oids[i], &rw->cur_vinfo->vroot, nr_objs, vnodes);
> - for (j = 0; j < nr_objs; j++) {
> - if (!vnode_is_local(vnodes[j]))
> - continue;
> -
> - rlw->oids[rlw->count++] = oids[i];
> - /* enlarge the list buffer if full */
> - if (rlw->count == list_buffer_size / sizeof(uint64_t)) {
> - list_buffer_size *= 2;
> - rlw->oids = xrealloc(rlw->oids,
> - list_buffer_size);
> - }
> - break;
> + rlw->oids[rlw->count++] = oids[i];
> + /* enlarge the list buffer if full */
> + if (rlw->count == list_buffer_size / sizeof(uint64_t)) {
> + list_buffer_size *= 2;
> + rlw->oids = xrealloc(rlw->oids, list_buffer_size);
> }
> }
>
> @@ -1064,6 +1055,10 @@ again:
> size_t nr_oids;
> struct sd_node *node = nodes + i;
>
> + /* No need to fetch from local node */
> + if (node_is_local(node))
> + continue;
We have to fetch from the local node too because we need the complete object
list.
Suppose we have 2 nodes and 2 objects with 1 copy: a[O1], b[O2]. If a doesn't
try to get the object list from itself, O1 will be lost, and b will lose O2 for
the same reason.
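
The same point as a toy sketch (standalone C, made-up oids, not the sheepdog
API): with 1 copy, the union of the peers' lists alone is not the complete
list, so the local node's own objects have to be folded into the merge as well.

#include <stdint.h>
#include <stdio.h>

/* append src oids into dst, skipping duplicates; a linear scan is enough here */
static size_t merge(uint64_t *dst, size_t count, const uint64_t *src, size_t n)
{
	for (size_t i = 0; i < n; i++) {
		int dup = 0;

		for (size_t j = 0; j < count; j++)
			if (dst[j] == src[i])
				dup = 1;
		if (!dup)
			dst[count++] = src[i];
	}
	return count;
}

int main(void)
{
	/* 1 copy per object: O1 is stored only on a, O2 only on b */
	uint64_t on_a[] = { 0x01 }, on_b[] = { 0x02 };
	uint64_t merged[8];
	size_t count;

	/* node a asking only its peer b: O1 never shows up */
	count = merge(merged, 0, on_b, 1);
	printf("peers only: %zu oid(s)\n", count);	/* 1 */

	/* node a also including its own list: the complete set */
	count = merge(merged, count, on_a, 1);
	printf("with local: %zu oid(s)\n", count);	/* 2 */

	return 0;
}

However the local list is obtained, it has to be part of the merge, otherwise
locally-stored objects silently drop out of recovery.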
Thanks
Yuan