[sheepdog] [PATCH v2 3/5] sheep: change method of generating vnodes

Robin Dong robin.k.dong at gmail.com
Wed May 7 12:25:40 CEST 2014


From: Robin Dong <sanbai at taobao.com>

At present, sheepdog uses the node (that is, its network IP and port) to
generate vnodes on the hash ring, and every node generates about 128 vnodes.
When an oid comes in, sheep finds the corresponding nodes by searching the
vnodes and sends the oid to those nodes.
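(For illustration only, a condensed sketch of the current per-node scheme;
it leans on helpers that also appear in the hunks below -- sd_hash(),
sd_hash_next(), rb_insert(), oid_to_vnodes() -- while SD_MAX_COPIES as the
buffer bound is assumed from sheepdog's headers and send_object() is a
made-up stand-in for the real request path:)

    /* build the ring: each node seeds a hash chain from its network id
     * and drops its vnodes (roughly 128 points) onto the rbtree ring */
    uint64_t hval = sd_hash(&n->nid, offsetof(typeof(n->nid), io_addr));

    for (int i = 0; i < n->nr_vnodes; i++) {
            hval = sd_hash_next(hval);
            struct sd_vnode *v = xmalloc(sizeof(*v));
            v->hash = hval;
            v->node = n;
            rb_insert(vroot, v, rb, vnode_cmp);
    }

    /* place an oid: pick the nr_copies vnodes responsible for it and
     * read the owning node off each (mirrors the dog/vdi.c hunk below) */
    const struct sd_vnode *vnode_buf[SD_MAX_COPIES];

    oid_to_vnodes(oid, vroot, nr_copies, vnode_buf);
    for (int i = 0; i < nr_copies; i++)
            send_object(&vnode_buf[i]->node->nid, oid);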

Imagine a disk gets corrupted in a cluster that uses erasure coding: the
node will try to recover all the data on the corrupted disk by fetching it
from other nodes. This process takes a very long time, because in an m:n
erasure-coded cluster it has to fetch m copies from remote nodes to
regenerate (via the erasure-code algorithm) 1 copy, which costs a large
amount of network traffic.
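To put illustrative numbers on it: with 4:2 erasure coding, each lost copy
is regenerated from 4 remote copies, so rebuilding, say, 1TB of lost data
pulls roughly 4 x 1TB = 4TB over the network -- and under the old scheme
all of that traffic converges on the single recovering node.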

To solve this problem, we use the disk to generate vnodes; therefore, when
a disk is corrupted, all the nodes in the cluster join the recovery process
and the recovery becomes much shorter.
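Concretely (see the new node_disk_to_vnodes() in the include/sheep.h hunk
below): the hash chain is seeded per disk_id rather than per node id, and
each disk contributes disk_space / WEIGHT_MIN vnodes, so a disk's share of
the ring is proportional to its capacity and the objects of a failed disk
end up scattered across the rest of the cluster, letting all nodes recover
their shares in parallel.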

In our test environment, which holds 10TB of data, recovery takes 10
minutes with the old vnodes algorithm, but only 2 minutes when disks are
used to generate vnodes.

Signed-off-by: Robin Dong <sanbai at taobao.com>
---
 dog/dog.c                | 20 ++++++++++++++++++--
 dog/vdi.c                |  5 ++++-
 include/internal_proto.h |  3 ++-
 include/sheep.h          | 35 +++++++++++++++++++++++++++++++++++
 sheep/group.c            |  5 ++++-
 sheep/ops.c              |  1 +
 6 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/dog/dog.c b/dog/dog.c
index 7942b34..663b6ce 100644
--- a/dog/dog.c
+++ b/dog/dog.c
@@ -65,6 +65,8 @@ int update_node_list(int max_nodes)
 	struct sd_node *ent;
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+	struct epoch_log *logs = NULL;
+	int log_length;
 
 	size = sizeof(*ent) * max_nodes;
 	buf = xzalloc(size);
@@ -111,13 +113,27 @@ int update_node_list(int max_nodes)
 		if (j == sd_zones_nr)
 			sd_zones[sd_zones_nr++] = n->zone;
 	}
+	/* check whether the cluster uses disk mode */
+	log_length = sizeof(struct epoch_log);
+	logs = xmalloc(log_length);
+
+	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
+	hdr.data_length = log_length;
+
+	ret = dog_exec_req(&sd_nid, &hdr, logs);
+	if (ret < 0)
+		goto out;
+
+	if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
+		disks_to_vnodes(&sd_nroot, &sd_vroot);
+	else
+		nodes_to_vnodes(&sd_nroot, &sd_vroot);
 
-	nodes_to_vnodes(&sd_nroot, &sd_vroot);
 	sd_epoch = hdr.epoch;
 out:
 	if (buf)
 		free(buf);
-
+	free(logs);
 	return ret;
 }
 
diff --git a/dog/vdi.c b/dog/vdi.c
index 1851a2a..fc7b402 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -908,7 +908,10 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
 		}
 		for (int k = 0; k < logs[i].nr_nodes; k++)
 			rb_insert(&nroot, &logs[i].nodes[k], rb, node_cmp);
-		nodes_to_vnodes(&nroot, &vroot);
+		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
+			disks_to_vnodes(&nroot, &vroot);
+		else
+			nodes_to_vnodes(&nroot, &vroot);
 		oid_to_vnodes(oid, &vroot, nr_copies, vnode_buf);
 		for (j = 0; j < nr_copies; j++) {
 			const struct node_id *n = &vnode_buf[j]->node->nid;
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 488d6d9..73ed581 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -202,7 +202,8 @@ struct epoch_log {
 	uint8_t  disable_recovery;
 	uint8_t  nr_copies;
 	uint8_t  copy_policy;
-	uint8_t  __pad[1];
+	uint8_t  __pad[3];
+	uint16_t flags;
 	char drv_name[STORE_LEN];
 	struct sd_node nodes[SD_MAX_NODES];
 };
diff --git a/include/sheep.h b/include/sheep.h
index ea376cc..785883e 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -204,6 +204,41 @@ static inline bool node_eq(const struct sd_node *a, const struct sd_node *b)
 	return node_cmp(a, b) == 0;
 }
 
+static inline uint64_t
+node_disk_to_vnodes(const struct sd_node *n, struct rb_root *vroot)
+{
+
+	uint64_t node_hval = sd_hash(&n->nid, offsetof(typeof(n->nid),
+						       io_addr));
+	uint64_t hval, disk_vnodes, total = 0;
+
+	for (int j = 0; j < DISK_MAX; j++) {
+		if (!n->disks[j].disk_id)
+			continue;
+		hval = fnv_64a_64(node_hval, n->disks[j].disk_id);
+		disk_vnodes = n->disks[j].disk_space / WEIGHT_MIN;
+		total += disk_vnodes;
+		for (int k = 0; k < disk_vnodes; k++) {
+			hval = sd_hash_next(hval);
+			struct sd_vnode *v = xmalloc(sizeof(*v));
+			v->hash = hval;
+			v->node = n;
+			if (unlikely(rb_insert(vroot, v, rb, vnode_cmp)))
+				panic("vdisk hash collison");
+		}
+	}
+	return total;
+}
+
+static inline void
+disks_to_vnodes(struct rb_root *nroot, struct rb_root *vroot)
+{
+	struct sd_node *n;
+
+	rb_for_each_entry(n, nroot, rb)
+		n->nr_vnodes = node_disk_to_vnodes(n, vroot);
+}
+
 static inline void
 node_to_vnodes(const struct sd_node *n, struct rb_root *vroot)
 {
diff --git a/sheep/group.c b/sheep/group.c
index 1e861bd..efb47fb 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -146,7 +146,10 @@ struct vnode_info *alloc_vnode_info(const struct rb_root *nroot)
 
 	recalculate_vnodes(&vnode_info->nroot);
 
-	nodes_to_vnodes(&vnode_info->nroot, &vnode_info->vroot);
+	if (is_cluster_diskmode(&sys->cinfo))
+		disks_to_vnodes(&vnode_info->nroot, &vnode_info->vroot);
+	else
+		nodes_to_vnodes(&vnode_info->nroot, &vnode_info->vroot);
 	vnode_info->nr_zones = get_zones_nr_from(&vnode_info->nroot);
 	refcount_set(&vnode_info->refcnt, 1);
 	return vnode_info;
diff --git a/sheep/ops.c b/sheep/ops.c
index b9550f0..22bb8dc 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -463,6 +463,7 @@ static int local_stat_cluster(struct request *req)
 			elog->disable_recovery = sys->cinfo.disable_recovery;
 			elog->nr_copies = sys->cinfo.nr_copies;
 			elog->copy_policy = sys->cinfo.copy_policy;
+			elog->flags = sys->cinfo.flags;
 			strncpy(elog->drv_name, (char *)sys->cinfo.store,
 				STORE_LEN);
 		}
-- 
1.7.12.4



