[sheepdog] [PATCH v2 3/5] sheep: use rbtree to manage vnodes hash ring

Liu Yuan namei.unix at gmail.com
Tue Sep 10 04:53:50 CEST 2013


Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
 dog/dog.c                |    6 +-
 dog/dog.h                |    4 +-
 dog/vdi.c                |   20 ++---
 include/internal_proto.h |    1 -
 include/sheep.h          |  199 +++++++++++++++-------------------------------
 sheep/gateway.c          |    8 +-
 sheep/group.c            |   12 ++-
 sheep/plain_store.c      |    3 +-
 sheep/recovery.c         |    7 +-
 sheep/request.c          |    6 +-
 10 files changed, 98 insertions(+), 168 deletions(-)

diff --git a/dog/dog.c b/dog/dog.c
index 9ec41c1..0009c92 100644
--- a/dog/dog.c
+++ b/dog/dog.c
@@ -49,8 +49,8 @@ static void usage(const struct command *commands, int status);
 uint32_t sd_epoch;
 
 struct sd_node sd_nodes[SD_MAX_NODES];
-struct sd_vnode sd_vnodes[SD_MAX_VNODES];
-int sd_nodes_nr, sd_vnodes_nr;
+int sd_nodes_nr;
+struct rb_root sd_vroot = RB_ROOT;
 
 int update_node_list(int max_nodes)
 {
@@ -92,7 +92,7 @@ int update_node_list(int max_nodes)
 	}
 
 	memcpy(sd_nodes, buf, size);
-	sd_vnodes_nr = nodes_to_vnodes(sd_nodes, sd_nodes_nr, sd_vnodes);
+	nodes_to_vnodes(sd_nodes, sd_nodes_nr, &sd_vroot);
 	sd_epoch = hdr.epoch;
 out:
 	if (buf)
diff --git a/dog/dog.h b/dog/dog.h
index e1c499e..9b17eb7 100644
--- a/dog/dog.h
+++ b/dog/dog.h
@@ -56,8 +56,8 @@ extern bool verbose;
 
 extern uint32_t sd_epoch;
 extern struct sd_node sd_nodes[SD_MAX_NODES];
-extern struct sd_vnode sd_vnodes[SD_MAX_VNODES];
-extern int sd_nodes_nr, sd_vnodes_nr;
+extern struct rb_root sd_vroot;
+extern int sd_nodes_nr;
 
 bool is_current(const struct sd_inode *i);
 char *strnumber(uint64_t _size);
diff --git a/dog/vdi.c b/dog/vdi.c
index 82821ac..c99d0ae 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -904,14 +904,12 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
 	int i, j, ret;
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
-	struct sd_vnode *vnodes;
 	const struct sd_vnode *vnode_buf[SD_MAX_COPIES];
 	struct epoch_log *logs;
-	int vnodes_nr, nr_logs, log_length;
+	int nr_logs, log_length;
 
 	log_length = sd_epoch * sizeof(struct epoch_log);
 	logs = xmalloc(log_length);
-	vnodes = xmalloc(sizeof(*vnodes) * SD_MAX_VNODES);
 
 	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
 	hdr.data_length = log_length;
@@ -927,6 +925,9 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
 
 	nr_logs = rsp->data_length / sizeof(struct epoch_log);
 	for (i = nr_logs - 1; i >= 0; i--) {
+		struct rb_root vroot;
+		struct sd_vnode *v;
+
 		printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n",
 		       oid, logs[i].epoch, nr_copies);
 		printf("---------------------------------------------------\n");
@@ -943,22 +944,23 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
 			}
 			continue;
 		}
-		vnodes_nr = nodes_to_vnodes(logs[i].nodes,
-					    logs[i].nr_nodes, vnodes);
-		oid_to_vnodes(vnodes, vnodes_nr, oid, nr_copies, vnode_buf);
+		nodes_to_vnodes(logs[i].nodes, logs[i].nr_nodes, &vroot);
+		oid_to_vnodes(oid, &vroot, nr_copies, vnode_buf);
 		for (j = 0; j < nr_copies; j++) {
 			const struct node_id *n = &vnode_buf[j]->node->nid;
 
 			printf("%s\n", addr_to_str(n->addr, n->port));
 		}
+		rb_for_each_entry(v, &vroot, rb) {
+			rb_erase(&v->rb, &vroot);
+			free(v);
+		}
 	}
 
 	free(logs);
-	free(vnodes);
 	return EXIT_SUCCESS;
 error:
 	free(logs);
-	free(vnodes);
 	return EXIT_SYSFAIL;
 }
 
@@ -1541,7 +1543,7 @@ static void queue_vdi_check_work(const struct sd_inode *inode, uint64_t oid,
 	info->done = done;
 	info->wq = wq;
 
-	oid_to_vnodes(sd_vnodes, sd_vnodes_nr, oid, nr_copies, tgt_vnodes);
+	oid_to_vnodes(oid, &sd_vroot, nr_copies, tgt_vnodes);
 	for (int i = 0; i < nr_copies; i++) {
 		info->vcw[i].info = info;
 		info->vcw[i].vnode = tgt_vnodes[i];
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 8544c75..7d1df2c 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -29,7 +29,6 @@
 
 #define SD_MAX_NODES 1024
 #define SD_DEFAULT_VNODES 128
-#define SD_MAX_VNODES (SD_MAX_NODES * SD_DEFAULT_VNODES)
 
 /*
  * Operations with opcodes above 0x80 are considered part of the inter-sheep
diff --git a/include/sheep.h b/include/sheep.h
index bbb7721..6c4b601 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation.
+ * Copyright (C) 2012-2013 Taobao Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License version
@@ -17,19 +18,18 @@
 #include "bitops.h"
 #include "list.h"
 #include "net.h"
+#include "rbtree.h"
 
 struct sd_vnode {
+	struct rb_node rb;
 	const struct sd_node *node;
-	uint64_t        id;
+	uint64_t hash;
 };
 
 struct vnode_info {
-	struct sd_vnode vnodes[SD_MAX_VNODES];
-	int nr_vnodes;
-
+	struct rb_root vroot;
 	struct sd_node nodes[SD_MAX_NODES];
 	int nr_nodes;
-
 	int nr_zones;
 	refcnt_t refcnt;
 };
@@ -56,133 +56,77 @@ static inline void sd_init_req(struct sd_req *req, uint8_t opcode)
 	req->proto_ver = opcode < 0x80 ? SD_PROTO_VER : SD_SHEEP_PROTO_VER;
 }
 
-static inline int same_zone(const struct sd_vnode *e, int n1, int n2)
+static inline int same_zone(struct sd_vnode *v1, struct sd_vnode *v2)
 {
-	return e[n1].node->zone == e[n2].node->zone;
+	return v1->node->zone == v2->node->zone;
 }
 
-/* Get the first vnode's index which is matching the OID */
-static inline int get_vnode_first_idx(const struct sd_vnode *entries,
-				      int nr_entries, uint64_t oid)
+static inline int vnode_cmp(const struct sd_vnode *node1,
+			    const struct sd_vnode *node2)
 {
-	uint64_t id = sd_hash_oid(oid);
-	int start, end, pos;
-
-	assert(nr_entries > 0);
-
-	start = 0;
-	end = nr_entries - 1;
-
-	if (id > entries[end].id || id < entries[start].id)
-		return (end + 1) % nr_entries;
-
-	for (;;) {
-		pos = (end - start) / 2 + start;
-		if (entries[pos].id < id) {
-			if (entries[pos + 1].id >= id)
-				return (pos + 1) % nr_entries;
-			start = pos;
-		} else
-			end = pos;
-	}
+	return intcmp(node1->hash, node2->hash);
 }
 
-/* Get next vnode's index according to the PREV_IDXS */
-static inline int get_vnode_next_idx(const struct sd_vnode *entries,
-				     int nr_entries, int *prev_idxs,
-				     int nr_prev_idxs)
+/* If v1_hash < oid_hash <= v2_hash, then oid is resident on v2 */
+static inline struct sd_vnode *
+oid_to_first_vnode(uint64_t oid, struct rb_root *root)
 {
-	int i, idx, first_idx;
-	bool found;
-
-	first_idx = prev_idxs[0];
-	idx = prev_idxs[nr_prev_idxs - 1];
-	for (;;) {
-		idx = (idx + 1) % nr_entries;
-		if (unlikely(idx == first_idx))
-			panic("can't find next new idx");
-
-		for (found = false, i = 0; i < nr_prev_idxs; i++) {
-			if (same_zone(entries, idx, prev_idxs[i])) {
-				found = true;
-				break;
-			}
-		}
-
-		if (!found)
-			return idx;
-	}
+	struct sd_vnode dummy = {
+		.hash = sd_hash_oid(oid),
+	};
+	return rb_nsearch(root, &dummy, rb, vnode_cmp);
 }
 
-/* Get the n'th vnode's index which is matching the OID */
-static inline int get_vnode_nth_idx(const struct sd_vnode *entries,
-			int nr_entries, uint64_t oid, int nth)
+/* Replica are placed along the ring one by one with different zones */
+static inline void oid_to_vnodes(uint64_t oid, struct rb_root *root,
+				 int nr_copies,
+				 const struct sd_vnode **vnodes)
 {
-	int nr_idxs = 0, idxs[SD_MAX_COPIES];
-
-	idxs[nr_idxs++] = get_vnode_first_idx(entries, nr_entries, oid);
-
-	if (!nth)
-		return idxs[nth];
-
-	while (nr_idxs <= nth) {
-		idxs[nr_idxs] = get_vnode_next_idx(entries, nr_entries,
-						     idxs, nr_idxs);
-		nr_idxs++;
+	struct sd_vnode *next = oid_to_first_vnode(oid, root);
+
+	vnodes[0] = next;
+	for (int i = 1; i < nr_copies; i++) {
+next:
+		next = rb_entry(rb_next(&next->rb), struct sd_vnode, rb);
+		if (!next) /* Wrap around */
+			next = rb_entry(rb_first(root), struct sd_vnode, rb);
+		if (unlikely(next == vnodes[0]))
+			panic("can't find a valid vnode");
+		for (int j = 0; j < i; j++)
+			if (same_zone((struct sd_vnode *)vnodes[j], next))
+				goto next;
+		vnodes[i] = next;
 	}
-
-	return idxs[nth];
 }
 
-static inline const struct sd_vnode *oid_to_vnode(const struct sd_vnode *entries,
-						  int nr_entries, uint64_t oid,
-						  int copy_idx)
+static inline const struct sd_vnode *
+oid_to_vnode(uint64_t oid, struct rb_root *root, int copy_idx)
 {
-	int idx = get_vnode_nth_idx(entries, nr_entries, oid, copy_idx);
+	const struct sd_vnode *vnodes[SD_MAX_COPIES];
 
-	return &entries[idx];
+	oid_to_vnodes(oid, root, copy_idx + 1, vnodes);
+
+	return vnodes[copy_idx];
 }
 
-static inline const struct sd_node *oid_to_node(const struct sd_vnode *entries,
-						int nr_entries, uint64_t oid,
-						int copy_idx)
+static inline const struct sd_node *
+oid_to_node(uint64_t oid, struct rb_root *root, int copy_idx)
 {
 	const struct sd_vnode *vnode;
 
-	vnode = oid_to_vnode(entries, nr_entries, oid, copy_idx);
+	vnode = oid_to_vnode(oid, root, copy_idx);
 
 	return vnode->node;
 }
 
-static inline void oid_to_vnodes(const struct sd_vnode *entries, int nr_entries,
-				 uint64_t oid, int nr_copies,
-				 const struct sd_vnode **vnodes)
-{
-	int idx, idxs[SD_MAX_COPIES], i;
-
-	if (nr_entries == 0)
-		return;
-
-	idx = get_vnode_first_idx(entries, nr_entries, oid);
-	idxs[0] = idx;
-	vnodes[0] = &entries[idx];
-
-	for (i = 1; i < nr_copies; i++) {
-		idx = get_vnode_next_idx(entries, nr_entries, idxs, i);
-		idxs[i] = idx;
-		vnodes[i] = &entries[idx];
-	}
-}
-
-static inline void oid_to_nodes(const struct sd_vnode *entries, int nr_entries,
-				uint64_t oid, int nr_copies,
+static inline void oid_to_nodes(uint64_t oid, struct rb_root *root,
+				int nr_copies,
 				const struct sd_node **nodes)
 {
-	int i;
 	const struct sd_vnode *vnodes[SD_MAX_COPIES];
 
-	oid_to_vnodes(entries, nr_entries, oid, nr_copies, vnodes);
-	for (i = 0; i < nr_copies; i++)
+	oid_to_vnodes(oid, root, nr_copies, vnodes);
+	for (int i = 0; i < nr_copies; i++)
 		nodes[i] = vnodes[i]->node;
 }
 
@@ -273,39 +217,24 @@ static inline bool node_eq(const struct sd_node *a, const struct sd_node *b)
 	return node_cmp(a, b) == 0;
 }
 
-static inline int vnode_cmp(const struct sd_vnode *node1,
-			    const struct sd_vnode *node2)
-{
-	return intcmp(node1->id, node2->id);
-}
-
-static inline int node_to_vnodes(const struct sd_node *n,
-				 struct sd_vnode *vnodes)
+static inline void
+nodes_to_vnodes(const struct sd_node *nodes, int nr_nodes, struct rb_root *root)
 {
-	int nr = 0;
-	uint64_t hval = sd_hash(&n->nid, offsetof(typeof(n->nid), io_addr));
-
-	for (int i = 0; i < n->nr_vnodes; i++) {
-		hval = sd_hash_next(hval);
-		vnodes[nr].id = hval;
-		vnodes[nr].node = n;
-		nr++;
+	for (int j = 0; j < nr_nodes; j++) {
+		const struct sd_node *n = nodes + j;
+		uint64_t hval = sd_hash(&n->nid, offsetof(typeof(n->nid),
+							  io_addr));
+
+		for (int i = 0; i < n->nr_vnodes; i++) {
+			struct sd_vnode *v = xmalloc(sizeof(*v));
+
+			hval = sd_hash_next(hval);
+			v->hash = hval;
+			v->node = n;
+			if (unlikely(rb_insert(root, v, rb, vnode_cmp)))
+				panic("vdisk hash collison");
+		}
 	}
-
-	return nr;
-}
-
-static inline int nodes_to_vnodes(const struct sd_node *nodes, int nr_nodes,
-				  struct sd_vnode *vnodes)
-{
-	int nr_vnodes = 0;
-
-	for (int i = 0; i < nr_nodes; i++)
-		nr_vnodes += node_to_vnodes(nodes + i, vnodes + nr_vnodes);
-
-	xqsort(vnodes, nr_vnodes, vnode_cmp);
-
-	return nr_vnodes;
 }
 
 #define MAX_NODE_STR_LEN 256
diff --git a/sheep/gateway.c b/sheep/gateway.c
index 33f04e0..1b0810b 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -43,8 +43,7 @@ int gateway_read_obj(struct request *req)
 
 	nr_copies = get_req_copy_number(req);
 
-	oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid,
-		      nr_copies, obj_vnodes);
+	oid_to_vnodes(oid, &req->vinfo->vroot, nr_copies, obj_vnodes);
 	for (i = 0; i < nr_copies; i++) {
 		v = obj_vnodes[i];
 		if (!vnode_is_local(v))
@@ -239,11 +238,10 @@ static int init_target_nodes(struct request *req, uint64_t oid,
 			     const struct sd_node **target_nodes)
 {
 	int nr_to_send;
-	const struct vnode_info *vinfo = req->vinfo;
+	struct vnode_info *vinfo = req->vinfo;
 
 	nr_to_send = get_req_copy_number(req);
-	oid_to_nodes(vinfo->vnodes, vinfo->nr_vnodes, oid, nr_to_send,
-		     target_nodes);
+	oid_to_nodes(oid, &vinfo->vroot, nr_to_send, target_nodes);
 
 	return nr_to_send;
 }
diff --git a/sheep/group.c b/sheep/group.c
index e2ee266..378d084 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -101,8 +101,14 @@ main_fn struct vnode_info *get_vnode_info(void)
 void put_vnode_info(struct vnode_info *vnode_info)
 {
 	if (vnode_info) {
-		if (refcount_dec(&vnode_info->refcnt) == 0)
+		if (refcount_dec(&vnode_info->refcnt) == 0) {
+			struct sd_vnode *v;
+			rb_for_each_entry(v, &vnode_info->vroot, rb) {
+				rb_erase(&v->rb, &vnode_info->vroot);
+				free(v);
+			}
 			free(vnode_info);
+		}
 	}
 }
 
@@ -139,14 +145,14 @@ struct vnode_info *alloc_vnode_info(const struct sd_node *nodes,
 
 	vnode_info = xzalloc(sizeof(*vnode_info));
 
+	INIT_RB_ROOT(&vnode_info->vroot);
 	vnode_info->nr_nodes = nr_nodes;
 	memcpy(vnode_info->nodes, nodes, sizeof(*nodes) * nr_nodes);
 	xqsort(vnode_info->nodes, nr_nodes, node_cmp);
 
 	recalculate_vnodes(vnode_info->nodes, nr_nodes);
 
-	vnode_info->nr_vnodes = nodes_to_vnodes(vnode_info->nodes, nr_nodes,
-						vnode_info->vnodes);
+	nodes_to_vnodes(vnode_info->nodes, nr_nodes, &vnode_info->vroot);
 	vnode_info->nr_zones = get_zones_nr_from(nodes, nr_nodes);
 	refcount_set(&vnode_info->refcnt, 1);
 	return vnode_info;
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 7cba585..dc5e3d9 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -397,8 +397,7 @@ static bool oid_stale(uint64_t oid)
 	vinfo = get_vnode_info();
 	nr_copies = get_obj_copy_number(oid, vinfo->nr_zones);
 
-	oid_to_vnodes(vinfo->vnodes, vinfo->nr_vnodes, oid,
-		      nr_copies, obj_vnodes);
+	oid_to_vnodes(oid, &vinfo->vroot, nr_copies, obj_vnodes);
 	for (i = 0; i < nr_copies; i++) {
 		v = obj_vnodes[i];
 		if (vnode_is_local(v)) {
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 6a02adf..da5e711 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -184,7 +184,7 @@ static int recover_object_from_replica(struct recovery_obj_work *row,
 	for (int i = 0; i < nr_copies; i++) {
 		const struct sd_vnode *vnode;
 
-		vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, i);
+		vnode = oid_to_vnode(oid, &old->vroot, i);
 
 		if (vnode_is_local(vnode)) {
 			start = i;
@@ -197,7 +197,7 @@ static int recover_object_from_replica(struct recovery_obj_work *row,
 		const struct sd_node *node;
 		int idx = (i + start) % nr_copies;
 
-		node = oid_to_node(old->vnodes, old->nr_vnodes, oid, idx);
+		node = oid_to_node(oid, &old->vroot, idx);
 
 		if (invalid_node(node, row->base.cur_vinfo))
 			continue;
@@ -724,8 +724,7 @@ static void screen_object_list(struct recovery_list_work *rlw,
 
 		nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones);
 
-		oid_to_vnodes(rw->cur_vinfo->vnodes, rw->cur_vinfo->nr_vnodes,
-			      oids[i], nr_objs, vnodes);
+		oid_to_vnodes(oids[i], &rw->cur_vinfo->vroot, nr_objs, vnodes);
 		for (j = 0; j < nr_objs; j++) {
 			if (!vnode_is_local(vnodes[j]))
 				continue;
diff --git a/sheep/request.c b/sheep/request.c
index 5bd917b..5113fca 100644
--- a/sheep/request.c
+++ b/sheep/request.c
@@ -28,9 +28,7 @@ static bool is_access_local(struct request *req, uint64_t oid)
 	int i;
 
 	nr_copies = get_req_copy_number(req);
-	oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid,
-		      nr_copies, obj_vnodes);
-
+	oid_to_vnodes(oid, &req->vinfo->vroot, nr_copies, obj_vnodes);
 	for (i = 0; i < nr_copies; i++) {
 		if (vnode_is_local(obj_vnodes[i]))
 			return true;
@@ -310,7 +308,7 @@ static void queue_gateway_request(struct request *req)
 			return;
 
 queue_work:
-	if (req->vinfo->nr_vnodes == 0) {
+	if (RB_EMPTY_ROOT(&req->vinfo->vroot)) {
 		sd_err("there is no living nodes");
 		req->rp.result = SD_RES_HALT;
 		put_request(req);
-- 
1.7.9.5




More information about the sheepdog mailing list