[sheepdog] [PATCH 7/8] sheep: use rbtree to manage struct sd_node

Liu Yuan namei.unix at gmail.com
Fri Sep 13 12:06:39 CEST 2013


No algorithm or functionality changes, but this involves many API changes to
interfaces that assumed a static array of sd_nodes.
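
For example, the vnode_info allocation helper (see the sheep_priv.h hunk
below) changes from taking an array plus a length to taking the node rbtree:

	/* before */
	struct vnode_info *alloc_vnode_info(const struct sd_node *nodes,
					    size_t nr_nodes);
	/* after */
	struct vnode_info *alloc_vnode_info(const struct rb_root *);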

With this patch, the cluster drivers pass an rbtree of nodes directly to the
sheep handlers, and all the way down we work with the rbtree instead of the
old static array.
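
Concretely, call sites that used to index the static array now walk the tree.
A minimal sketch of the pattern (use_node() is just a placeholder for the
per-node work):

	struct sd_node *n;

	/* old: for (i = 0; i < sd_nodes_nr; i++) use_node(&sd_nodes[i]); */
	rb_for_each_entry(n, &sd_nroot, rb)
		use_node(n);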

The static sd_node arrays in struct epoch_log and struct cluster_info are not
yet converted to dynamic structures, so we are still limited by the maximum
node count, but we are very close to getting rid of it.

The shepherd driver is left to a later patch because it might be redesigned
in the future and is not stable for now.

Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
 dog/dog.c                 |   14 +++-
 dog/dog.h                 |    2 +-
 dog/node.c                |   59 +++++++------
 dog/vdi.c                 |   15 ++--
 include/internal_proto.h  |    2 +
 include/sheep.h           |   20 +++--
 include/sockfd_cache.h    |    2 +-
 lib/sockfd_cache.c        |   10 +--
 sheep/cluster.h           |    6 +-
 sheep/cluster/corosync.c  |   25 +++---
 sheep/cluster/local.c     |   41 ++++++----
 sheep/cluster/zookeeper.c |   50 ++++++------
 sheep/group.c             |  200 +++++++++++++++++++++++----------------------
 sheep/ops.c               |   16 ++--
 sheep/recovery.c          |   17 ++--
 sheep/sheep_priv.h        |    3 +-
 16 files changed, 267 insertions(+), 215 deletions(-)

diff --git a/dog/dog.c b/dog/dog.c
index 0009c92..16298ad 100644
--- a/dog/dog.c
+++ b/dog/dog.c
@@ -48,15 +48,15 @@ static void usage(const struct command *commands, int status);
 
 uint32_t sd_epoch;
 
-struct sd_node sd_nodes[SD_MAX_NODES];
 int sd_nodes_nr;
 struct rb_root sd_vroot = RB_ROOT;
+struct rb_root sd_nroot = RB_ROOT;
 
 int update_node_list(int max_nodes)
 {
 	int ret;
 	unsigned int size;
-	char *buf = NULL;
+	struct sd_node *buf = NULL;
 	struct sd_node *ent;
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
@@ -91,8 +91,14 @@ int update_node_list(int max_nodes)
 		goto out;
 	}
 
-	memcpy(sd_nodes, buf, size);
-	nodes_to_vnodes(sd_nodes, sd_nodes_nr, &sd_vroot);
+	for (int i = 0; i < sd_nodes_nr; i++) {
+		struct sd_node *n = xmalloc(sizeof(*n));
+
+		*n = buf[i];
+		rb_insert(&sd_nroot, n, rb, node_cmp);
+	}
+
+	nodes_to_vnodes(&sd_nroot, &sd_vroot);
 	sd_epoch = hdr.epoch;
 out:
 	if (buf)
diff --git a/dog/dog.h b/dog/dog.h
index 9b17eb7..8c54c10 100644
--- a/dog/dog.h
+++ b/dog/dog.h
@@ -55,8 +55,8 @@ extern bool raw_output;
 extern bool verbose;
 
 extern uint32_t sd_epoch;
-extern struct sd_node sd_nodes[SD_MAX_NODES];
 extern struct rb_root sd_vroot;
+extern struct rb_root sd_nroot;
 extern int sd_nodes_nr;
 
 bool is_current(const struct sd_inode *i);
diff --git a/dog/node.c b/dog/node.c
index 2c1450f..052739c 100644
--- a/dog/node.c
+++ b/dog/node.c
@@ -29,16 +29,16 @@ static void cal_total_vdi_size(uint32_t vid, const char *name, const char *tag,
 
 static int node_list(int argc, char **argv)
 {
-	int i;
+	struct sd_node *n;
+	int i = 0;
 
 	if (!raw_output)
 		printf("  Id   Host:Port         V-Nodes       Zone\n");
-	for (i = 0; i < sd_nodes_nr; i++) {
-		const char *host = addr_to_str(sd_nodes[i].nid.addr,
-					       sd_nodes[i].nid.port);
+	rb_for_each_entry(n, &sd_nroot, rb) {
+		const char *host = addr_to_str(n->nid.addr, n->nid.port);
 
 		printf(raw_output ? "%d %s %d %u\n" : "%4d   %-20s\t%2d%11u\n",
-		       i, host, sd_nodes[i].nr_vnodes, sd_nodes[i].zone);
+		       i++, host, n->nr_vnodes, n->zone);
 	}
 
 	return EXIT_SUCCESS;
@@ -46,26 +46,27 @@ static int node_list(int argc, char **argv)
 
 static int node_info(int argc, char **argv)
 {
-	int i, ret, success = 0;
+	int ret, success = 0, i = 0;
 	uint64_t total_size = 0, total_avail = 0, total_vdi_size = 0;
+	struct sd_node *n;
 
 	if (!raw_output)
 		printf("Id\tSize\tUsed\tAvail\tUse%%\n");
 
-	for (i = 0; i < sd_nodes_nr; i++) {
+	rb_for_each_entry(n, &sd_nroot, rb) {
 		struct sd_req req;
 		struct sd_rsp *rsp = (struct sd_rsp *)&req;
 
 		sd_init_req(&req, SD_OP_STAT_SHEEP);
 
-		ret = send_light_req(&sd_nodes[i].nid, &req);
+		ret = send_light_req(&n->nid, &req);
 		if (!ret) {
 			int ratio = (int)(((double)(rsp->node.store_size -
 						    rsp->node.store_free) /
 					   rsp->node.store_size) * 100);
 			printf(raw_output ? "%d %s %s %s %d%%\n" :
 					"%2d\t%s\t%s\t%s\t%3d%%\n",
-			       i,
+			       i++,
 			       strnumber(rsp->node.store_size),
 			       strnumber(rsp->node.store_size -
 					   rsp->node.store_free),
@@ -182,7 +183,8 @@ static int node_recovery_progress(void)
 
 static int node_recovery(int argc, char **argv)
 {
-	int i, ret;
+	struct sd_node *n;
+	int ret, i = 0;
 
 	if (node_cmd_data.recovery_progress)
 		return node_recovery_progress();
@@ -193,7 +195,7 @@ static int node_recovery(int argc, char **argv)
 		       "       Progress\n");
 	}
 
-	for (i = 0; i < sd_nodes_nr; i++) {
+	rb_for_each_entry(n, &sd_nroot, rb) {
 		struct sd_req req;
 		struct sd_rsp *rsp = (struct sd_rsp *)&req;
 		struct recovery_state state;
@@ -203,7 +205,7 @@ static int node_recovery(int argc, char **argv)
 		sd_init_req(&req, SD_OP_STAT_RECOVERY);
 		req.data_length = sizeof(state);
 
-		ret = dog_exec_req(&sd_nodes[i].nid, &req, &state);
+		ret = dog_exec_req(&n->nid, &req, &state);
 		if (ret < 0)
 			return EXIT_SYSFAIL;
 		if (rsp->result != SD_RES_SUCCESS) {
@@ -212,24 +214,35 @@ static int node_recovery(int argc, char **argv)
 		}
 
 		if (state.in_recovery) {
-			const char *host = addr_to_str(sd_nodes[i].nid.addr,
-						       sd_nodes[i].nid.port);
+			const char *host = addr_to_str(n->nid.addr,
+						       n->nid.port);
 			if (raw_output)
 				printf("%d %s %d %d %"PRIu64" %"PRIu64"\n", i,
-				       host, sd_nodes[i].nr_vnodes,
-				       sd_nodes[i].zone, state.nr_finished,
+				       host, n->nr_vnodes,
+				       n->zone, state.nr_finished,
 				       state.nr_total);
 			else
 				printf("%4d   %-20s%5d%11d%11.1f%%\n", i, host,
-				       sd_nodes[i].nr_vnodes, sd_nodes[i].zone,
+				       n->nr_vnodes, n->zone,
 				       100 * (float)state.nr_finished
 				       / state.nr_total);
 		}
+		i++;
 	}
 
 	return EXIT_SUCCESS;
 }
 
+static struct sd_node *idx_to_node(struct rb_root *nroot, int idx)
+{
+	struct sd_node *n = rb_entry(rb_first(nroot), struct sd_node, rb);
+
+	while (idx--)
+		n = rb_entry(rb_next(&n->rb), struct sd_node, rb);
+
+	return n;
+}
+
 static int node_kill(int argc, char **argv)
 {
 	int node_id, ret;
@@ -249,8 +262,7 @@ static int node_kill(int argc, char **argv)
 	}
 
 	sd_init_req(&req, SD_OP_KILL_NODE);
-
-	ret = send_light_req(&sd_nodes[node_id].nid, &req);
+	ret = send_light_req(&idx_to_node(&sd_nroot, node_id)->nid, &req);
 	if (ret) {
 		sd_err("Failed to execute request");
 		exit(EXIT_FAILURE);
@@ -335,16 +347,17 @@ static int node_md_info(struct node_id *nid)
 
 static int md_info(int argc, char **argv)
 {
-	int i, ret;
+	struct sd_node *n;
+	int ret, i = 0;
 
 	fprintf(stdout, "Id\tSize\tUsed\tAvail\tUse%%\tPath\n");
 
 	if (!node_cmd_data.all_nodes)
 		return node_md_info(&sd_nid);
 
-	for (i = 0; i < sd_nodes_nr; i++) {
-		fprintf(stdout, "Node %d:\n", i);
-		ret = node_md_info(&sd_nodes[i].nid);
+	rb_for_each_entry(n, &sd_nroot, rb) {
+		fprintf(stdout, "Node %d:\n", i++);
+		ret = node_md_info(&n->nid);
 		if (ret != EXIT_SUCCESS)
 			return EXIT_FAILURE;
 	}
diff --git a/dog/vdi.c b/dog/vdi.c
index c5813f6..63fb0b1 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -315,11 +315,12 @@ static int get_data_oid(const char *sheep, uint64_t oid, struct sd_rsp *rsp,
 
 static void parse_objs(uint64_t oid, obj_parser_func_t func, void *data, unsigned size)
 {
-	int i, ret, cb_ret;
+	int ret, cb_ret;
+	struct sd_node *n;
 	char *buf;
 
 	buf = xzalloc(size);
-	for (i = 0; i < sd_nodes_nr; i++) {
+	rb_for_each_entry(n, &sd_nroot, rb) {
 		struct sd_req hdr;
 		struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
 
@@ -330,7 +331,7 @@ static void parse_objs(uint64_t oid, obj_parser_func_t func, void *data, unsigne
 
 		hdr.obj.oid = oid;
 
-		ret = dog_exec_req(&sd_nodes[i].nid, &hdr, buf);
+		ret = dog_exec_req(&n->nid, &hdr, buf);
 		if (ret < 0)
 			continue;
 		switch (rsp->result) {
@@ -341,8 +342,7 @@ static void parse_objs(uint64_t oid, obj_parser_func_t func, void *data, unsigne
 		untrim_zero_blocks(buf, rsp->obj.offset, rsp->data_length,
 				   size);
 
-		cb_ret = func(addr_to_str(sd_nodes[i].nid.addr,
-					  sd_nodes[i].nid.port),
+		cb_ret = func(addr_to_str(n->nid.addr, n->nid.port),
 			      oid, rsp, buf, data);
 		if (cb_ret)
 			break;
@@ -926,6 +926,7 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
 	nr_logs = rsp->data_length / sizeof(struct epoch_log);
 	for (i = nr_logs - 1; i >= 0; i--) {
 		struct rb_root vroot = RB_ROOT;
+		struct rb_root nroot = RB_ROOT;
 
 		printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n",
 		       oid, logs[i].epoch, nr_copies);
@@ -943,7 +944,9 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
 			}
 			continue;
 		}
-		nodes_to_vnodes(logs[i].nodes, logs[i].nr_nodes, &vroot);
+		for (int k = 0; k < logs[i].nr_nodes; k++)
+			rb_insert(&nroot, &logs[i].nodes[k], rb, node_cmp);
+		nodes_to_vnodes(&nroot, &vroot);
 		oid_to_vnodes(oid, &vroot, nr_copies, vnode_buf);
 		for (j = 0; j < nr_copies; j++) {
 			const struct node_id *n = &vnode_buf[j]->node->nid;
diff --git a/include/internal_proto.h b/include/internal_proto.h
index a33b376..2402a1a 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -21,6 +21,7 @@
 #include <netinet/in.h>
 
 #include "sheepdog_proto.h"
+#include "rbtree.h"
 
 #define SD_SHEEP_PROTO_VER 0x08
 
@@ -124,6 +125,7 @@ struct node_id {
 };
 
 struct sd_node {
+	struct rb_node  rb;
 	struct node_id  nid;
 	uint16_t	nr_vnodes;
 	uint32_t	zone;
diff --git a/include/sheep.h b/include/sheep.h
index 7945ac1..1795855 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -28,7 +28,7 @@ struct sd_vnode {
 
 struct vnode_info {
 	struct rb_root vroot;
-	struct sd_node nodes[SD_MAX_NODES];
+	struct rb_root nroot;
 	int nr_nodes;
 	int nr_zones;
 	refcnt_t refcnt;
@@ -219,10 +219,11 @@ static inline bool node_eq(const struct sd_node *a, const struct sd_node *b)
 }
 
 static inline void
-nodes_to_vnodes(const struct sd_node *nodes, int nr_nodes, struct rb_root *root)
+nodes_to_vnodes(struct rb_root *nroot, struct rb_root *vroot)
 {
-	for (int j = 0; j < nr_nodes; j++) {
-		const struct sd_node *n = nodes + j;
+	struct sd_node *n;
+
+	rb_for_each_entry(n, nroot, rb) {
 		uint64_t hval = sd_hash(&n->nid, offsetof(typeof(n->nid),
 							  io_addr));
 
@@ -232,12 +233,21 @@ nodes_to_vnodes(const struct sd_node *nodes, int nr_nodes, struct rb_root *root)
 			hval = sd_hash_next(hval);
 			v->hash = hval;
 			v->node = n;
-			if (unlikely(rb_insert(root, v, rb, vnode_cmp)))
+			if (unlikely(rb_insert(vroot, v, rb, vnode_cmp)))
 				panic("vdisk hash collison");
 		}
 	}
 }
 
+static inline void nodes_to_buffer(struct rb_root *nroot, void *buffer)
+{
+	struct sd_node *n, *buf = buffer;
+
+	rb_for_each_entry(n, nroot, rb) {
+		memcpy(buf++, n, sizeof(*n));
+	}
+}
+
 #define MAX_NODE_STR_LEN 256
 
 static inline const char *node_to_str(const struct sd_node *id)
diff --git a/include/sockfd_cache.h b/include/sockfd_cache.h
index d91c56a..21cc2bf 100644
--- a/include/sockfd_cache.h
+++ b/include/sockfd_cache.h
@@ -9,7 +9,7 @@ void sockfd_cache_put(const struct node_id *nid, struct sockfd *sfd);
 void sockfd_cache_del_node(const struct node_id *nid);
 void sockfd_cache_del(const struct node_id *nid, struct sockfd *sfd);
 void sockfd_cache_add(const struct node_id *nid);
-void sockfd_cache_add_group(const struct sd_node *nodes, int nr);
+void sockfd_cache_add_group(const struct rb_root *nroot);
 
 int sockfd_init(void);
 
diff --git a/lib/sockfd_cache.c b/lib/sockfd_cache.c
index 5d0f2f2..0bfb274 100644
--- a/lib/sockfd_cache.c
+++ b/lib/sockfd_cache.c
@@ -205,15 +205,13 @@ static void sockfd_cache_add_nolock(const struct node_id *nid)
 }
 
 /* Add group of nodes to the cache */
-void sockfd_cache_add_group(const struct sd_node *nodes, int nr)
+void sockfd_cache_add_group(const struct rb_root *nroot)
 {
-	const struct sd_node *p;
+	struct sd_node *n;
 
-	sd_debug("%d", nr);
 	sd_write_lock(&sockfd_cache.lock);
-	while (nr--) {
-		p = nodes + nr;
-		sockfd_cache_add_nolock(&p->nid);
+	rb_for_each_entry(n, nroot, rb) {
+		sockfd_cache_add_nolock(&n->nid);
 	}
 	sd_unlock(&sockfd_cache.lock);
 }
diff --git a/sheep/cluster.h b/sheep/cluster.h
index 37a2bb7..a267443 100644
--- a/sheep/cluster.h
+++ b/sheep/cluster.h
@@ -163,16 +163,16 @@ static inline const char *get_cdrv_option(const struct cluster_driver *cdrv,
 
 /* callbacks back into sheepdog from the cluster drivers */
 void sd_accept_handler(const struct sd_node *joined,
-		       const struct sd_node *members, size_t nr_members,
+		       const struct rb_root *nroot, size_t nr_members,
 		       const void *opaque);
-void sd_leave_handler(const struct sd_node *left, const struct sd_node *members,
+void sd_leave_handler(const struct sd_node *left, const struct rb_root *nroot,
 		      size_t nr_members);
 void sd_notify_handler(const struct sd_node *sender, void *msg, size_t msg_len);
 bool sd_block_handler(const struct sd_node *sender);
 int sd_reconnect_handler(void);
 void sd_update_node_handler(struct sd_node *);
 bool sd_join_handler(const struct sd_node *joining,
-		     const struct sd_node *nodes, size_t nr_nodes,
+		     const struct rb_root *nroot, size_t nr_nodes,
 		     void *opaque);
 
 #endif
diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
index 8c38cd7..954add9 100644
--- a/sheep/cluster/corosync.c
+++ b/sheep/cluster/corosync.c
@@ -237,13 +237,11 @@ find_event(enum corosync_event_type type, struct cpg_node *sender)
 		return find_nonblock_event(type, sender);
 }
 
-static void build_node_list(const struct cpg_node *nodes, size_t nr_nodes,
-			    struct sd_node *entries)
+static void build_node_list(struct cpg_node *nodes, size_t nr_nodes,
+			    struct rb_root *nroot)
 {
-	int i;
-
-	for (i = 0; i < nr_nodes; i++)
-		entries[i] = nodes[i].node;
+	for (int i = 0; i < nr_nodes; i++)
+		rb_insert(nroot, &nodes[i].node, rb, node_cmp);
 }
 
 /*
@@ -253,8 +251,9 @@ static void build_node_list(const struct cpg_node *nodes, size_t nr_nodes,
  */
 static bool __corosync_dispatch_one(struct corosync_event *cevent)
 {
-	struct sd_node entries[SD_MAX_NODES], *node;
+	struct sd_node *node;
 	struct cpg_node *n;
+	struct rb_root nroot = RB_ROOT;
 	int idx;
 
 	switch (cevent->type) {
@@ -267,8 +266,8 @@ static bool __corosync_dispatch_one(struct corosync_event *cevent)
 			/* sd_join_handler() must be called only once */
 			return false;
 
-		build_node_list(cpg_nodes, nr_cpg_nodes, entries);
-		if (sd_join_handler(&cevent->sender.node, entries,
+		build_node_list(cpg_nodes, nr_cpg_nodes, &nroot);
+		if (sd_join_handler(&cevent->sender.node, &nroot,
 				    nr_cpg_nodes, cevent->msg)) {
 			send_message(COROSYNC_MSG_TYPE_ACCEPT, &cevent->sender,
 				     cpg_nodes, nr_cpg_nodes, cevent->msg,
@@ -281,8 +280,8 @@ static bool __corosync_dispatch_one(struct corosync_event *cevent)
 		add_cpg_node(cpg_nodes, nr_cpg_nodes, &cevent->sender);
 		nr_cpg_nodes++;
 
-		build_node_list(cpg_nodes, nr_cpg_nodes, entries);
-		sd_accept_handler(&cevent->sender.node, entries, nr_cpg_nodes,
+		build_node_list(cpg_nodes, nr_cpg_nodes, &nroot);
+		sd_accept_handler(&cevent->sender.node, &nroot, nr_cpg_nodes,
 				  cevent->msg);
 		break;
 	case COROSYNC_EVENT_TYPE_LEAVE:
@@ -294,8 +293,8 @@ static bool __corosync_dispatch_one(struct corosync_event *cevent)
 
 		del_cpg_node(cpg_nodes, nr_cpg_nodes, &cevent->sender);
 		nr_cpg_nodes--;
-		build_node_list(cpg_nodes, nr_cpg_nodes, entries);
-		sd_leave_handler(&cevent->sender.node, entries, nr_cpg_nodes);
+		build_node_list(cpg_nodes, nr_cpg_nodes, &nroot);
+		sd_leave_handler(&cevent->sender.node, &nroot, nr_cpg_nodes);
 		break;
 	case COROSYNC_EVENT_TYPE_BLOCK:
 		if (cevent->callbacked)
diff --git a/sheep/cluster/local.c b/sheep/cluster/local.c
index de3dfb1..b0e7fc2 100644
--- a/sheep/cluster/local.c
+++ b/sheep/cluster/local.c
@@ -83,7 +83,6 @@ struct local_event {
 	struct local_node lnodes[SD_MAX_NODES];
 };
 
-
 /* shared memory queue */
 
 static struct shm_queue {
@@ -93,6 +92,12 @@ static struct shm_queue {
 	struct local_event nonblock_events[MAX_EVENTS];
 } *shm_queue;
 
+static inline void node_insert(struct sd_node *new, struct rb_root *root)
+{
+	if (rb_insert(root, new, rb, node_cmp))
+		panic("insert duplicate %s", node_to_str(new));
+}
+
 static void shm_queue_lock(void)
 {
 	flock(shmfd, LOCK_EX);
@@ -382,8 +387,8 @@ static bool local_process_event(void)
 {
 	struct local_event *ev;
 	int i;
-	struct sd_node nodes[SD_MAX_NODES];
-	size_t nr_nodes;
+	struct rb_root root = RB_ROOT;
+	size_t nr_nodes = 0;
 
 	ev = shm_queue_peek();
 	if (!ev)
@@ -392,13 +397,6 @@ static bool local_process_event(void)
 	sd_debug("type = %d, sender = %s", ev->type, lnode_to_str(&ev->sender));
 	sd_debug("callbacked = %d, removed = %d", ev->callbacked, ev->removed);
 
-	nr_nodes = 0;
-	for (i = 0; i < ev->nr_lnodes; i++) {
-		sd_debug("%d: %s", i, lnode_to_str(ev->lnodes + i));
-		if (!ev->lnodes[i].gateway)
-			nodes[nr_nodes++] = ev->lnodes[i].node;
-	}
-
 	if (ev->removed)
 		goto out;
 
@@ -421,12 +419,23 @@ static bool local_process_event(void)
 		}
 	}
 
+	for (i = 0; i < ev->nr_lnodes; i++) {
+		sd_debug("%d: %s", i, lnode_to_str(ev->lnodes + i));
+		if (!ev->lnodes[i].gateway) {
+			node_insert(&ev->lnodes[i].node, &root);
+			nr_nodes++;
+		}
+	}
+
 	switch (ev->type) {
 	case EVENT_JOIN:
-		/* nodes[nr_nodes - 1] is a sender, so don't include it */
-		assert(node_eq(&ev->sender.node, &nodes[nr_nodes - 1]));
-		if (sd_join_handler(&ev->sender.node, nodes, nr_nodes - 1,
-				      ev->buf)) {
+		for (i = 0; i < ev->nr_lnodes; i++)
+			if (node_eq(&ev->sender.node, &ev->lnodes[i].node)) {
+				rb_erase(&ev->lnodes[i].node.rb, &root);
+				nr_nodes--;
+			}
+		if (sd_join_handler(&ev->sender.node, &root, nr_nodes,
+				    ev->buf)) {
 			ev->type = EVENT_ACCEPT;
 			msync(ev, sizeof(*ev), MS_SYNC);
 
@@ -435,7 +444,7 @@ static bool local_process_event(void)
 
 		return false;
 	case EVENT_ACCEPT:
-		sd_accept_handler(&ev->sender.node, nodes, nr_nodes, ev->buf);
+		sd_accept_handler(&ev->sender.node, &root, nr_nodes, ev->buf);
 		break;
 	case EVENT_LEAVE:
 		if (ev->sender.gateway) {
@@ -445,7 +454,7 @@ static bool local_process_event(void)
 		}
 		/* fall through */
 	case EVENT_GATEWAY:
-		sd_leave_handler(&ev->sender.node, nodes, nr_nodes);
+		sd_leave_handler(&ev->sender.node, &root, nr_nodes);
 		break;
 	case EVENT_BLOCK:
 		ev->callbacked = sd_block_handler(&ev->sender.node);
diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index 238c114..7bda011 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -57,6 +57,8 @@ struct zk_node {
 	bool gone;
 };
 
+#define ZK_MAX_BUF_SIZE (2*1024*1024) /* 2M */
+
 struct zk_event {
 	uint64_t id;
 	enum zk_event_type type;
@@ -64,10 +66,10 @@ struct zk_event {
 	size_t msg_len;
 	size_t nr_nodes;
 	size_t buf_len;
-	uint8_t buf[SD_MAX_EVENT_BUF_SIZE];
+	uint8_t buf[ZK_MAX_BUF_SIZE];
 };
 
-static struct sd_node sd_nodes[SD_MAX_NODES];
+static struct rb_root sd_node_root = RB_ROOT;
 static size_t nr_sd_nodes;
 static struct rb_root zk_node_root = RB_ROOT;
 static struct sd_lock zk_tree_lock = SD_LOCK_INITIALIZER;
@@ -372,12 +374,14 @@ static inline void *zk_event_sd_nodes(struct zk_event *ev)
 static int push_join_response(struct zk_event *ev)
 {
 	char path[MAX_NODE_STR_LEN];
+	struct sd_node *n, *np = zk_event_sd_nodes(ev);
 	int len;
 
 	ev->type = EVENT_ACCEPT;
 	ev->nr_nodes = nr_sd_nodes;
-	memcpy(zk_event_sd_nodes(ev), sd_nodes,
-	       nr_sd_nodes * sizeof(struct sd_node));
+	rb_for_each_entry(n, &sd_node_root, rb) {
+		memcpy(np++, n, sizeof(struct sd_node));
+	}
 	queue_pos--;
 
 	len = offsetof(typeof(*ev), buf) + ev->buf_len;
@@ -417,35 +421,24 @@ static inline void zk_tree_add(struct zk_node *node)
 	 * Even node list will be built later, we need this because in master
 	 * transfer case, we need this information to destroy the tree.
 	 */
-	sd_nodes[nr_sd_nodes++] = zk->node;
+	rb_insert(&sd_node_root, &zk->node, rb, node_cmp);
+	nr_sd_nodes++;
 out:
 	sd_unlock(&zk_tree_lock);
 }
 
-static inline void zk_tree_del_nolock(struct zk_node *node)
-{
-	rb_erase(&node->rb, &zk_node_root);
-	free(node);
-}
-
 static inline void zk_tree_del(struct zk_node *node)
 {
 	sd_write_lock(&zk_tree_lock);
-	zk_tree_del_nolock(node);
+	rb_erase(&node->rb, &zk_node_root);
+	free(node);
 	sd_unlock(&zk_tree_lock);
 }
 
 static inline void zk_tree_destroy(void)
 {
-	struct zk_node *zk;
-	int i;
-
 	sd_write_lock(&zk_tree_lock);
-	for (i = 0; i < nr_sd_nodes; i++) {
-		zk = zk_tree_search_nolock(&sd_nodes[i].nid);
-		if (zk)
-			zk_tree_del_nolock(zk);
-	}
+	rb_destroy(&zk_node_root, struct zk_node, rb);
 	sd_unlock(&zk_tree_lock);
 }
 
@@ -454,8 +447,11 @@ static inline void build_node_list(void)
 	struct zk_node *zk;
 
 	nr_sd_nodes = 0;
-	rb_for_each_entry(zk, &zk_node_root, rb)
-		sd_nodes[nr_sd_nodes++] = zk->node;
+	INIT_RB_ROOT(&sd_node_root);
+	rb_for_each_entry(zk, &zk_node_root, rb) {
+		rb_insert(&sd_node_root, &zk->node, rb, node_cmp);
+		nr_sd_nodes++;
+	}
 
 	sd_debug("nr_sd_nodes:%zu", nr_sd_nodes);
 }
@@ -564,7 +560,6 @@ static int add_join_event(void *msg, size_t msg_len)
 	struct zk_event ev;
 	size_t len = msg_len + sizeof(struct sd_node) * SD_MAX_NODES;
 
-	assert(len <= SD_MAX_EVENT_BUF_SIZE);
 	ev.id = get_uniq_id();
 	ev.type = EVENT_JOIN;
 	ev.sender = this_node;
@@ -814,7 +809,7 @@ static void zk_handle_join(struct zk_event *ev)
 		return;
 	}
 
-	sd_join_handler(&ev->sender.node, sd_nodes, nr_sd_nodes, ev->buf);
+	sd_join_handler(&ev->sender.node, &sd_node_root, nr_sd_nodes, ev->buf);
 	push_join_response(ev);
 
 	sd_debug("I'm the master now");
@@ -878,7 +873,8 @@ static void zk_handle_accept(struct zk_event *ev)
 	zk_tree_add(&ev->sender);
 
 	build_node_list();
-	sd_accept_handler(&ev->sender.node, sd_nodes, nr_sd_nodes, ev->buf);
+	sd_accept_handler(&ev->sender.node, &sd_node_root, nr_sd_nodes,
+			  ev->buf);
 }
 
 static void kick_block_event(void)
@@ -916,7 +912,7 @@ static void zk_handle_leave(struct zk_event *ev)
 	block_event_list_del(n);
 	zk_tree_del(n);
 	build_node_list();
-	sd_leave_handler(&ev->sender.node, sd_nodes, nr_sd_nodes);
+	sd_leave_handler(&ev->sender.node, &sd_node_root, nr_sd_nodes);
 }
 
 static void zk_handle_block(struct zk_event *ev)
@@ -997,9 +993,9 @@ static inline void handle_session_expire(void)
 	INIT_RB_ROOT(&zk_node_root);
 	INIT_LIST_HEAD(&zk_block_list);
 	nr_sd_nodes = 0;
+	INIT_RB_ROOT(&sd_node_root);
 	first_push = true;
 	joined = false;
-	memset(sd_nodes, 0, sizeof(struct sd_node) * SD_MAX_NODES);
 
 	while (sd_reconnect_handler()) {
 		sd_err("failed to reconnect. sleep and retry...");
diff --git a/sheep/group.c b/sheep/group.c
index eee8385..660fc5a 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -20,8 +20,7 @@ struct get_vdis_work {
 	struct work work;
 	DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
 	struct sd_node joined;
-	size_t nr_members;
-	struct sd_node members[];
+	struct rb_root nroot;
 };
 
 static pthread_mutex_t wait_vdis_lock = PTHREAD_MUTEX_INITIALIZER;
@@ -32,26 +31,27 @@ static main_thread(struct vnode_info *) current_vnode_info;
 static main_thread(struct list_head *) pending_block_list;
 static main_thread(struct list_head *) pending_notify_list;
 
-static int get_zones_nr_from(const struct sd_node *nodes, int nr_nodes)
+static int get_zones_nr_from(struct rb_root *nroot)
 {
-	int nr_zones = 0, i, j;
+	int nr_zones = 0, j;
 	uint32_t zones[SD_MAX_COPIES];
+	struct sd_node *n;
 
-	for (i = 0; i < nr_nodes; i++) {
+	rb_for_each_entry(n, nroot, rb) {
 		/*
 		 * Only count zones that actually store data, pure gateways
 		 * don't contribute to the redundancy level.
 		 */
-		if (!nodes[i].nr_vnodes)
+		if (!n->nr_vnodes)
 			continue;
 
 		for (j = 0; j < nr_zones; j++) {
-			if (nodes[i].zone == zones[j])
+			if (n->zone == zones[j])
 				break;
 		}
 
 		if (j == nr_zones) {
-			zones[nr_zones] = nodes[i].zone;
+			zones[nr_zones] = n->zone;
 			if (++nr_zones == ARRAY_SIZE(zones))
 				break;
 		}
@@ -60,15 +60,6 @@ static int get_zones_nr_from(const struct sd_node *nodes, int nr_nodes)
 	return nr_zones;
 }
 
-static int get_node_idx(struct vnode_info *vnode_info, struct sd_node *ent)
-{
-	ent = xbsearch(ent, vnode_info->nodes, vnode_info->nr_nodes, node_cmp);
-	if (!ent)
-		return -1;
-
-	return ent - vnode_info->nodes;
-}
-
 /*
  * Grab an additional reference to the passed in vnode info.
  *
@@ -103,20 +94,22 @@ void put_vnode_info(struct vnode_info *vnode_info)
 	if (vnode_info) {
 		if (refcount_dec(&vnode_info->refcnt) == 0) {
 			rb_destroy(&vnode_info->vroot, struct sd_vnode, rb);
+			rb_destroy(&vnode_info->nroot, struct sd_node, rb);
 			free(vnode_info);
 		}
 	}
 }
 
-static void recalculate_vnodes(struct sd_node *nodes, int nr_nodes)
+static void recalculate_vnodes(struct rb_root *nroot)
 {
-	int i, nr_non_gateway_nodes = 0;
+	int nr_non_gateway_nodes = 0;
 	uint64_t avg_size = 0;
+	struct sd_node *n;
 	float factor;
 
-	for (i = 0; i < nr_nodes; i++) {
-		if (nodes[i].space) {
-			avg_size += nodes[i].space;
+	rb_for_each_entry(n, nroot, rb) {
+		if (n->space) {
+			avg_size += n->space;
 			nr_non_gateway_nodes++;
 		}
 	}
@@ -126,30 +119,35 @@ static void recalculate_vnodes(struct sd_node *nodes, int nr_nodes)
 
 	avg_size /= nr_non_gateway_nodes;
 
-	for (i = 0; i < nr_nodes; i++) {
-		factor = (float)nodes[i].space / (float)avg_size;
-		nodes[i].nr_vnodes = rintf(SD_DEFAULT_VNODES * factor);
-		sd_debug("node %d has %d vnodes, free space %" PRIu64,
-			 nodes[i].nid.port, nodes[i].nr_vnodes, nodes[i].space);
+	rb_for_each_entry(n, nroot, rb) {
+		factor = (float)n->space / (float)avg_size;
+		n->nr_vnodes = rintf(SD_DEFAULT_VNODES * factor);
+		sd_debug("node %s has %d vnodes, free space %" PRIu64,
+			 node_to_str(n), n->nr_vnodes, n->space);
 	}
 }
 
-struct vnode_info *alloc_vnode_info(const struct sd_node *nodes,
-				    size_t nr_nodes)
+struct vnode_info *alloc_vnode_info(const struct rb_root *nroot)
 {
 	struct vnode_info *vnode_info;
+	struct sd_node *n;
 
 	vnode_info = xzalloc(sizeof(*vnode_info));
 
 	INIT_RB_ROOT(&vnode_info->vroot);
-	vnode_info->nr_nodes = nr_nodes;
-	memcpy(vnode_info->nodes, nodes, sizeof(*nodes) * nr_nodes);
-	xqsort(vnode_info->nodes, nr_nodes, node_cmp);
+	INIT_RB_ROOT(&vnode_info->nroot);
+	rb_for_each_entry(n, nroot, rb) {
+		struct sd_node *new = xmalloc(sizeof(*new));
+		*new = *n;
+		if (unlikely(rb_insert(&vnode_info->nroot, new, rb, node_cmp)))
+			panic("node hash collision");
+		vnode_info->nr_nodes++;
+	}
 
-	recalculate_vnodes(vnode_info->nodes, nr_nodes);
+	recalculate_vnodes(&vnode_info->nroot);
 
-	nodes_to_vnodes(vnode_info->nodes, nr_nodes, &vnode_info->vroot);
-	vnode_info->nr_zones = get_zones_nr_from(nodes, nr_nodes);
+	nodes_to_vnodes(&vnode_info->nroot, &vnode_info->vroot);
+	vnode_info->nr_zones = get_zones_nr_from(&vnode_info->nroot);
 	refcount_set(&vnode_info->refcnt, 1);
 	return vnode_info;
 }
@@ -158,6 +156,7 @@ struct vnode_info *get_vnode_info_epoch(uint32_t epoch,
 					struct vnode_info *cur_vinfo)
 {
 	struct sd_node nodes[SD_MAX_NODES];
+	struct rb_root nroot = RB_ROOT;
 	int nr_nodes;
 
 	nr_nodes = epoch_log_read(epoch, nodes, sizeof(nodes));
@@ -167,20 +166,21 @@ struct vnode_info *get_vnode_info_epoch(uint32_t epoch,
 		if (nr_nodes == 0)
 			return NULL;
 	}
+	for (int i = 0; i < nr_nodes; i++)
+		rb_insert(&nroot, &nodes[i], rb, node_cmp);
 
-	return alloc_vnode_info(nodes, nr_nodes);
+	return alloc_vnode_info(&nroot);
 }
 
 int local_get_node_list(const struct sd_req *req, struct sd_rsp *rsp,
-			       void *data)
+			void *data)
 {
 	int nr_nodes;
 	struct vnode_info *cur_vinfo = main_thread_get(current_vnode_info);
 
 	if (cur_vinfo) {
 		nr_nodes = cur_vinfo->nr_nodes;
-		memcpy(data, cur_vinfo->nodes,
-			sizeof(struct sd_node) * nr_nodes);
+		nodes_to_buffer(&cur_vinfo->nroot, data);
 		rsp->data_length = nr_nodes * sizeof(struct sd_node);
 		rsp->node.nr_nodes = nr_nodes;
 	} else {
@@ -335,14 +335,13 @@ error:
 int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
 			  time_t *timestamp, struct vnode_info *vinfo)
 {
-	int i, nr, ret;
 	char buf[SD_MAX_NODES * sizeof(struct sd_node) + sizeof(time_t)];
+	const struct sd_node *node;
+	int ret;
 
-	nr = vinfo->nr_nodes;
-	for (i = 0; i < nr; i++) {
+	rb_for_each_entry(node, &vinfo->nroot, rb) {
 		struct sd_req hdr;
 		struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
-		const struct sd_node *node = vinfo->nodes + i;
 		int nodes_len;
 
 		if (node_is_local(node))
@@ -393,13 +392,13 @@ static bool cluster_ctime_check(const struct cluster_info *cinfo)
  */
 static bool enough_nodes_gathered(struct cluster_info *cinfo,
 				  const struct sd_node *joining,
-				  const struct sd_node *nodes,
+				  const struct rb_root *nroot,
 				  size_t nr_nodes)
 {
 	for (int i = 0; i < cinfo->nr_nodes; i++) {
 		const struct sd_node *key = cinfo->nodes + i, *n;
 
-		n = xlfind(key, nodes, nr_nodes, node_cmp);
+		n = rb_search(nroot, key, rb, node_cmp);
 		if (n == NULL && !node_eq(key, joining)) {
 			sd_debug("%s doesn't join yet", node_to_str(key));
 			return false;
@@ -412,7 +411,7 @@ static bool enough_nodes_gathered(struct cluster_info *cinfo,
 }
 
 static enum sd_status cluster_wait_check(const struct sd_node *joining,
-					 const struct sd_node *nodes,
+					 const struct rb_root *nroot,
 					 size_t nr_nodes,
 					 struct cluster_info *cinfo)
 {
@@ -432,7 +431,7 @@ static enum sd_status cluster_wait_check(const struct sd_node *joining,
 	 * node list, we can set the cluster live now.
 	 */
 	if (sys->cinfo.epoch > 0 &&
-	    enough_nodes_gathered(&sys->cinfo, joining, nodes, nr_nodes))
+	    enough_nodes_gathered(&sys->cinfo, joining, nroot, nr_nodes))
 		return SD_STATUS_OK;
 
 	return sys->cinfo.status;
@@ -473,7 +472,8 @@ static void do_get_vdis(struct work *work)
 {
 	struct get_vdis_work *w =
 		container_of(work, struct get_vdis_work, work);
-	int i, ret;
+	struct sd_node *n;
+	int ret;
 
 	if (!node_is_local(&w->joined)) {
 		sd_debug("try to get vdi bitmap from %s",
@@ -485,18 +485,17 @@ static void do_get_vdis(struct work *work)
 		return;
 	}
 
-	for (i = 0; i < w->nr_members; i++) {
+	rb_for_each_entry(n, &w->nroot, rb) {
 		/* We should not fetch vdi_bitmap and copy list from myself */
-		if (node_is_local(&w->members[i]))
+		if (node_is_local(n))
 			continue;
 
-		sd_debug("try to get vdi bitmap from %s",
-			 node_to_str(&w->members[i]));
-		ret = get_vdis_from(&w->members[i]);
+		sd_debug("try to get vdi bitmap from %s", node_to_str(n));
+		ret = get_vdis_from(n);
 		if (ret != SD_RES_SUCCESS) {
 			/* try to read from another node */
 			sd_alert("failed to get vdi bitmap from %s",
-				 node_to_str(&w->members[i]));
+				 node_to_str(n));
 			continue;
 		}
 
@@ -518,6 +517,7 @@ static void get_vdis_done(struct work *work)
 	pthread_cond_broadcast(&wait_vdis_cond);
 	pthread_mutex_unlock(&wait_vdis_lock);
 
+	rb_destroy(&w->nroot, struct sd_node, rb);
 	free(w);
 }
 
@@ -528,8 +528,7 @@ int inc_and_log_epoch(void)
 	if (cur_vinfo) {
 		/* update cluster info to the latest state */
 		sys->cinfo.nr_nodes = cur_vinfo->nr_nodes;
-		memcpy(sys->cinfo.nodes, cur_vinfo->nodes,
-		       sizeof(cur_vinfo->nodes[0]) * cur_vinfo->nr_nodes);
+		nodes_to_buffer(&cur_vinfo->nroot, sys->cinfo.nodes);
 	} else
 		sys->cinfo.nr_nodes = 0;
 
@@ -540,16 +539,28 @@ int inc_and_log_epoch(void)
 }
 
 static struct vnode_info *alloc_old_vnode_info(const struct sd_node *joined,
-					       const struct sd_node *nodes,
-					       size_t nr_nodes)
+					       const struct rb_root *nroot)
 {
-	struct sd_node old_nodes[SD_MAX_NODES];
+	struct rb_root old_root = RB_ROOT;
+	struct sd_node *n;
+	struct vnode_info *old;
 
 	/* exclude the newly added one */
-	memcpy(old_nodes, nodes, sizeof(*nodes) * nr_nodes);
-	xlremove(joined, old_nodes, &nr_nodes, node_cmp);
+	rb_for_each_entry(n, nroot, rb) {
+		struct sd_node *new = xmalloc(sizeof(*new));
 
-	return alloc_vnode_info(old_nodes, nr_nodes);
+		*new = *n;
+		if (node_eq(joined, new)) {
+			free(new);
+			continue;
+		}
+		if (rb_insert(&old_root, new, rb, node_cmp))
+			panic("node hash collision");
+	}
+
+	old = alloc_vnode_info(&old_root);
+	rb_destroy(&old_root, struct sd_node, rb);
+	return old;
 }
 
 static void setup_backend_store(const struct cluster_info *cinfo)
@@ -581,17 +592,14 @@ static void setup_backend_store(const struct cluster_info *cinfo)
 	}
 }
 
-static void get_vdis(const struct sd_node *nodes, size_t nr_nodes,
-		     const struct sd_node *joined)
+static void get_vdis(const struct rb_root *nroot, const struct sd_node *joined)
 {
-	int array_len = nr_nodes * sizeof(struct sd_node);
 	struct get_vdis_work *w;
 
-	w = xmalloc(sizeof(*w) + array_len);
+	w = xmalloc(sizeof(*w));
 	w->joined = *joined;
-	w->nr_members = nr_nodes;
-	memcpy(w->members, nodes, array_len);
-
+	INIT_RB_ROOT(&w->nroot);
+	rb_copy(nroot, struct sd_node, rb, &w->nroot, node_cmp);
 	refcount_inc(&nr_get_vdis_works);
 
 	w->work.fn = do_get_vdis;
@@ -613,7 +621,7 @@ void wait_get_vdis_done(void)
 
 static void update_cluster_info(const struct cluster_info *cinfo,
 				const struct sd_node *joined,
-				const struct sd_node *nodes,
+				const struct rb_root *nroot,
 				size_t nr_nodes)
 {
 	struct vnode_info *old_vnode_info;
@@ -624,13 +632,12 @@ static void update_cluster_info(const struct cluster_info *cinfo,
 		setup_backend_store(cinfo);
 
 	if (node_is_local(joined))
-		sockfd_cache_add_group(nodes, nr_nodes);
+		sockfd_cache_add_group(nroot);
 
 	old_vnode_info = main_thread_get(current_vnode_info);
-	main_thread_set(current_vnode_info,
-			  alloc_vnode_info(nodes, nr_nodes));
+	main_thread_set(current_vnode_info, alloc_vnode_info(nroot));
 
-	get_vdis(nodes, nr_nodes, joined);
+	get_vdis(nroot, joined);
 
 	if (cinfo->status == SD_STATUS_OK) {
 		if (!is_cluster_formatted())
@@ -643,10 +650,9 @@ static void update_cluster_info(const struct cluster_info *cinfo,
 				panic("cannot log current epoch %d",
 				      sys->cinfo.epoch);
 
-			if (!old_vnode_info) {
+			if (!old_vnode_info)
 				old_vnode_info = alloc_old_vnode_info(joined,
-						nodes, nr_nodes);
-			}
+								      nroot);
 
 			start_recovery(main_thread_get(current_vnode_info),
 				       old_vnode_info, true);
@@ -718,7 +724,7 @@ main_fn void sd_notify_handler(const struct sd_node *sender, void *data,
  * cluster must call this function and succeed in accept of the joining node.
  */
 main_fn bool sd_join_handler(const struct sd_node *joining,
-			     const struct sd_node *nodes, size_t nr_nodes,
+			     const struct rb_root *nroot, size_t nr_nodes,
 			     void *opaque)
 {
 	struct cluster_info *cinfo = opaque;
@@ -737,7 +743,7 @@ main_fn bool sd_join_handler(const struct sd_node *joining,
 	sd_debug("check %s, %d", node_to_str(joining), sys->cinfo.status);
 
 	if (sys->cinfo.status == SD_STATUS_WAIT)
-		status = cluster_wait_check(joining, nodes, nr_nodes, cinfo);
+		status = cluster_wait_check(joining, nroot, nr_nodes, cinfo);
 	else
 		status = sys->cinfo.status;
 
@@ -850,11 +856,11 @@ static bool cluster_join_check(const struct cluster_info *cinfo)
 }
 
 main_fn void sd_accept_handler(const struct sd_node *joined,
-			       const struct sd_node *members, size_t nr_members,
+			       const struct rb_root *nroot, size_t nr_nodes,
 			       const void *opaque)
 {
-	int i;
 	const struct cluster_info *cinfo = opaque;
+	struct sd_node *n;
 
 	if (node_is_local(joined) && !cluster_join_check(cinfo)) {
 		sd_err("failed to join Sheepdog");
@@ -864,13 +870,14 @@ main_fn void sd_accept_handler(const struct sd_node *joined,
 	sys->cinfo = *cinfo;
 
 	sd_debug("join %s", node_to_str(joined));
-	for (i = 0; i < nr_members; i++)
-		sd_debug("[%x] %s", i, node_to_str(members + i));
+	rb_for_each_entry(n, nroot, rb) {
+		sd_debug("%s", node_to_str(n));
+	}
 
 	if (sys->cinfo.status == SD_STATUS_SHUTDOWN)
 		return;
 
-	update_cluster_info(cinfo, joined, members, nr_members);
+	update_cluster_info(cinfo, joined, nroot, nr_nodes);
 
 	if (node_is_local(joined))
 		/* this output is used for testing */
@@ -878,15 +885,16 @@ main_fn void sd_accept_handler(const struct sd_node *joined,
 }
 
 main_fn void sd_leave_handler(const struct sd_node *left,
-			      const struct sd_node *members,
-			      size_t nr_members)
+			      const struct rb_root *nroot, size_t nr_nodes)
 {
 	struct vnode_info *old_vnode_info;
-	int i, ret;
+	struct sd_node *n;
+	int ret;
 
 	sd_debug("leave %s", node_to_str(left));
-	for (i = 0; i < nr_members; i++)
-		sd_debug("[%x] %s", i, node_to_str(members + i));
+	rb_for_each_entry(n, nroot, rb) {
+		sd_debug("%s", node_to_str(n));
+	}
 
 	if (sys->cinfo.status == SD_STATUS_SHUTDOWN)
 		return;
@@ -896,8 +904,7 @@ main_fn void sd_leave_handler(const struct sd_node *left,
 		sys->this_node.nr_vnodes = 0;
 
 	old_vnode_info = main_thread_get(current_vnode_info);
-	main_thread_set(current_vnode_info,
-			  alloc_vnode_info(members, nr_members));
+	main_thread_set(current_vnode_info, alloc_vnode_info(nroot));
 	if (sys->cinfo.status == SD_STATUS_OK) {
 		ret = inc_and_log_epoch();
 		if (ret != 0)
@@ -914,9 +921,11 @@ main_fn void sd_leave_handler(const struct sd_node *left,
 static void update_node_size(struct sd_node *node)
 {
 	struct vnode_info *cur_vinfo = main_thread_get(current_vnode_info);
-	int idx = get_node_idx(cur_vinfo, node);
-	assert(idx != -1);
-	cur_vinfo->nodes[idx].space = node->space;
+	struct sd_node *n = rb_search(&cur_vinfo->nroot, node, rb, node_cmp);
+
+	if (unlikely(!n))
+		panic("can't find %s", node_to_str(node));
+	n->space = node->space;
 }
 
 static void kick_node_recover(void)
@@ -924,8 +933,7 @@ static void kick_node_recover(void)
 	struct vnode_info *old = main_thread_get(current_vnode_info);
 	int ret;
 
-	main_thread_set(current_vnode_info,
-			alloc_vnode_info(old->nodes, old->nr_nodes));
+	main_thread_set(current_vnode_info, alloc_vnode_info(&old->nroot));
 	ret = inc_and_log_epoch();
 	if (ret != 0)
 		panic("cannot log current epoch %d", sys->cinfo.epoch);
diff --git a/sheep/ops.c b/sheep/ops.c
index 3be73c7..c14c80b 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -512,15 +512,14 @@ static int cluster_force_recover_work(struct request *req)
 	}
 
 	if (req->rq.data_length <
-	    sizeof(*old_vnode_info->nodes) * old_vnode_info->nr_nodes) {
+	    sizeof(struct sd_node) * old_vnode_info->nr_nodes) {
 		sd_err("too small buffer size, %d", req->rq.data_length);
 		return SD_RES_INVALID_PARMS;
 	}
 
 	req->rp.epoch = epoch;
-	req->rp.data_length = sizeof(*old_vnode_info->nodes) *
-		old_vnode_info->nr_nodes;
-	memcpy(req->data, old_vnode_info->nodes, req->rp.data_length);
+	req->rp.data_length = sizeof(struct sd_node) * old_vnode_info->nr_nodes;
+	nodes_to_buffer(&old_vnode_info->nroot, req->data);
 
 	put_vnode_info(old_vnode_info);
 
@@ -535,6 +534,7 @@ static int cluster_force_recover_main(const struct sd_req *req,
 	int ret = SD_RES_SUCCESS;
 	struct sd_node *nodes = data;
 	size_t nr_nodes = rsp->data_length / sizeof(*nodes);
+	struct rb_root nroot = RB_ROOT;
 
 	if (rsp->epoch != sys->cinfo.epoch) {
 		sd_err("epoch was incremented while cluster_force_recover");
@@ -553,8 +553,11 @@ static int cluster_force_recover_main(const struct sd_req *req,
 
 	sys->cinfo.status = SD_STATUS_OK;
 
+	for (int i = 0; i < nr_nodes; i++)
+		rb_insert(&nroot, &nodes[i], rb, node_cmp);
+
 	vnode_info = get_vnode_info();
-	old_vnode_info = alloc_vnode_info(nodes, nr_nodes);
+	old_vnode_info = alloc_vnode_info(&nroot);
 	start_recovery(vnode_info, old_vnode_info, true);
 	put_vnode_info(vnode_info);
 	put_vnode_info(old_vnode_info);
@@ -654,7 +657,8 @@ static int cluster_recovery_completion(const struct sd_req *req,
 
 	if (vnode_info->nr_nodes == nr_recovereds) {
 		for (i = 0; i < nr_recovereds; ++i) {
-			if (!node_eq(vnode_info->nodes + i, recovereds + i))
+			if (!rb_search(&vnode_info->nroot, &recovereds[i],
+				       rb, node_cmp))
 				break;
 		}
 		if (i == nr_recovereds) {
diff --git a/sheep/recovery.c b/sheep/recovery.c
index da5e711..0df3a5a 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -164,7 +164,7 @@ static int recover_object_from(struct recovery_obj_work *row,
 static bool invalid_node(const struct sd_node *n, struct vnode_info *info)
 {
 
-	if (xbsearch(n, info->nodes, info->nr_nodes, node_cmp))
+	if (rb_search(&info->nroot, n, rb, node_cmp))
 		return false;
 	return true;
 }
@@ -751,25 +751,28 @@ static void prepare_object_list(struct work *work)
 	struct recovery_list_work *rlw = container_of(rw,
 						      struct recovery_list_work,
 						      base);
-	struct sd_node *cur = rw->cur_vinfo->nodes;
-	int cur_nr = rw->cur_vinfo->nr_nodes;
-	int start = random() % cur_nr, i, end = cur_nr;
+	int nr_nodes = rw->cur_vinfo->nr_nodes;
+	int start = random() % nr_nodes, i, end = nr_nodes;
 	uint64_t *oids;
+	struct sd_node *nodes;
 
 	if (node_is_gateway_only())
 		return;
 
 	sd_debug("%u", rw->epoch);
 	wait_get_vdis_done();
+
+	nodes = xmalloc(sizeof(struct sd_node) * nr_nodes);
+	nodes_to_buffer(&rw->cur_vinfo->nroot, nodes);
 again:
 	/* We need to start at random node for better load balance */
 	for (i = start; i < end; i++) {
 		size_t nr_oids;
-		struct sd_node *node = cur + i;
+		struct sd_node *node = nodes + i;
 
 		if (uatomic_read(&next_rinfo)) {
 			sd_debug("go to the next recovery");
-			return;
+			goto out;
 		}
 
 		oids = fetch_object_list(node, rw->epoch, &nr_oids);
@@ -786,6 +789,8 @@ again:
 	}
 
 	sd_debug("%"PRIu64, rlw->count);
+out:
+	free(nodes);
 }
 
 int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo,
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 7796d86..588a61c 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -299,8 +299,7 @@ int local_get_node_list(const struct sd_req *req, struct sd_rsp *rsp,
 struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info);
 struct vnode_info *get_vnode_info(void);
 void put_vnode_info(struct vnode_info *vinfo);
-struct vnode_info *alloc_vnode_info(const struct sd_node *nodes,
-				    size_t nr_nodes);
+struct vnode_info *alloc_vnode_info(const struct rb_root *);
 struct vnode_info *get_vnode_info_epoch(uint32_t epoch,
 					struct vnode_info *cur_vinfo);
 void wait_get_vdis_done(void);
-- 
1.7.9.5



