[Sheepdog] [PATCH] introduce virtual nodes

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Tue Apr 5 18:09:05 CEST 2011


Currently, Sheepdog data balancing has some problems:

 - When there are only few physical nodes in the cluster, the
   consistent hash ring becomes sparse and data cannot be balanced
   well.
 - Even if some nodes have a larger disk space, we cannot allocate
   more data to the nodes.

This adds preliminary support for virtual nodes; Sheepdog assigns
multiple virtual nodes to each physical node, and creates a consistent
hash ring with virtual nodes.  The number of virtual nodes are fixed
to 64 in this patch, but we can extend it in future.

This patch changes the map between objects and nodes.  So, we need to
reformat Sheepdog cluster to try this feature, but this is a necessary
change, I think.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 collie/collie.c    |   33 +++---
 include/sheep.h    |  134 +++++++++++++++++----
 sheep/group.c      |   66 +++++------
 sheep/sdnet.c      |   36 +++---
 sheep/sheep_priv.h |   36 +++---
 sheep/store.c      |  339 +++++++++++++++++++++++-----------------------------
 sheep/vdi.c        |   79 ++++++------
 7 files changed, 389 insertions(+), 334 deletions(-)

diff --git a/collie/collie.c b/collie/collie.c
index fb1998c..e0ecc78 100644
--- a/collie/collie.c
+++ b/collie/collie.c
@@ -70,7 +70,8 @@ Common parameters:\n\
 static uint64_t node_list_version;
 
 static struct sheepdog_node_list_entry node_list_entries[SD_MAX_NODES];
-static int nr_nodes;
+static struct sheepdog_vnode_list_entry vnode_list_entries[SD_MAX_VNODES];
+static int nr_nodes, nr_vnodes;
 static unsigned master_idx;
 
 static int is_current(struct sheepdog_inode *i)
@@ -148,6 +149,7 @@ static int update_node_list(int max_nodes, int epoch)
 	}
 
 	memcpy(node_list_entries, buf, size);
+	nr_vnodes = nodes_to_vnodes(node_list_entries, nr_nodes, vnode_list_entries);
 	node_list_version = hdr.epoch;
 	master_idx = rsp->master_idx;
 out:
@@ -286,13 +288,13 @@ static int parse_vdi(vdi_parser_func_t func, size_t size, void *data)
 			continue;
 
 		oid = vid_to_vdi_oid(nr);
-		n = obj_to_sheep(node_list_entries, nr_nodes, oid, 0);
-		addr_to_str(name, sizeof(name), node_list_entries[n].addr, 0);
+		n = obj_to_sheep(vnode_list_entries, nr_vnodes, oid, 0);
+		addr_to_str(name, sizeof(name), vnode_list_entries[n].addr, 0);
 
-		fd = connect_to(name, node_list_entries[n].port);
+		fd = connect_to(name, vnode_list_entries[n].port);
 		if (fd < 0) {
 			printf("failed to connect %s:%d\n", name,
-			       node_list_entries[n].port);
+			       vnode_list_entries[n].port);
 			continue;
 		}
 
@@ -595,21 +597,22 @@ static int node_list(int argc, char **argv)
 {
 	int i;
 
-	printf("  Idx\tNode id (FNV-1a) - Host:Port\n");
+	printf("   Idx - Host:Port              Number of vnodes\n");
 	printf("------------------------------------------------\n");
 	for (i = 0; i < nr_nodes; i++) {
 		char data[128];
 
-		print_node_list_entry(&node_list_entries[i], data, sizeof(data));
+		addr_to_str(data, sizeof(data), node_list_entries[i].addr,
+			    node_list_entries[i].port);
 
 		if (i == master_idx) {
 			if (highlight)
 				printf(TEXT_BOLD);
-			printf("* %d\t%s\n", i, data);
+			printf("* %4d - %-20s\t%d\n", i, data, node_list_entries[i].nr_vnodes);
 			if (highlight)
 				printf(TEXT_NORMAL);
 		} else
-			printf("  %d\t%s\n", i, data);
+			printf("  %4d - %-20s\t%d\n", i, data, node_list_entries[i].nr_vnodes);
 	}
 
 	return 0;
@@ -941,11 +944,11 @@ reread:
 		else
 			wlen = strlen(value);
 
-		n = obj_to_sheep(node_list_entries, nr_nodes, oid, i);
+		n = obj_to_sheep(vnode_list_entries, nr_vnodes, oid, i);
 
-		addr_to_str(name, sizeof(name), node_list_entries[n].addr, 0);
+		addr_to_str(name, sizeof(name), vnode_list_entries[n].addr, 0);
 
-		fd = connect_to(name, node_list_entries[n].port);
+		fd = connect_to(name, vnode_list_entries[n].port);
 		if (fd < 0) {
 			printf("%s(%d): %s, %m\n", __func__, __LINE__,
 			       name);
@@ -1024,11 +1027,11 @@ static int vdi_getattr(int argc, char **argv)
 		rlen = SD_MAX_VDI_ATTR_VALUE_LEN;
 		wlen = 0;
 
-		n = obj_to_sheep(node_list_entries, nr_nodes, oid, i);
+		n = obj_to_sheep(vnode_list_entries, nr_vnodes, oid, i);
 
-		addr_to_str(name, sizeof(name), node_list_entries[n].addr, 0);
+		addr_to_str(name, sizeof(name), vnode_list_entries[n].addr, 0);
 
-		fd = connect_to(name, node_list_entries[n].port);
+		fd = connect_to(name, vnode_list_entries[n].port);
 		if (fd < 0) {
 			printf("%s(%d): %s, %m\n", __func__, __LINE__,
 			       name);
diff --git a/include/sheep.h b/include/sheep.h
index 87d7c2a..5e0d8b4 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -16,9 +16,14 @@
 #include "list.h"
 #include "net.h"
 
-#define SD_SHEEP_PROTO_VER 0x02
+#define SD_SHEEP_PROTO_VER 0x03
+
+#define SD_DEFAULT_REDUNDANCY 3
+#define SD_MAX_REDUNDANCY 8
 
 #define SD_MAX_NODES 1024
+#define SD_DEFAULT_VNODES 64
+#define SD_MAX_VNODES 65536
 #define SD_MAX_VMS   4096 /* FIXME: should be removed */
 
 #define SD_OP_SHEEP         0x80
@@ -78,10 +83,8 @@ struct sd_list_req {
 	uint32_t	epoch;
 	uint32_t        id;
 	uint32_t        data_length;
-	uint64_t        start;
-	uint64_t        end;
 	uint32_t        tgt_epoch;
-	uint32_t        pad[3];
+	uint32_t        pad[7];
 };
 
 struct sd_list_rsp {
@@ -92,9 +95,7 @@ struct sd_list_rsp {
 	uint32_t        id;
 	uint32_t        data_length;
 	uint32_t        result;
-	uint32_t        rsvd;
-	uint64_t        next;
-	uint32_t        pad[4];
+	uint32_t        pad[7];
 };
 
 struct sd_node_req {
@@ -124,10 +125,18 @@ struct sd_node_rsp {
 };
 
 struct sheepdog_node_list_entry {
+	uint8_t         addr[16];
+	uint16_t        port;
+	uint16_t	nr_vnodes;
+	uint16_t	pad[2];
+};
+
+struct sheepdog_vnode_list_entry {
 	uint64_t        id;
 	uint8_t         addr[16];
 	uint16_t        port;
-	uint16_t	pad[3];
+	uint16_t	node_idx;
+	uint16_t	pad[2];
 };
 
 struct epoch_log {
@@ -137,22 +146,50 @@ struct epoch_log {
 	struct sheepdog_node_list_entry nodes[SD_MAX_NODES];
 };
 
-static inline int hval_to_sheep(struct sheepdog_node_list_entry *entries,
+static inline int same_node(struct sheepdog_vnode_list_entry *e, int n1, int n2)
+{
+	if (memcmp(e[n1].addr, e[n2].addr, sizeof(e->addr)) == 0 &&
+	    e[n1].port == e[n2].port)
+		return 1;
+
+	return 0;
+}
+
+/* traverse the virtual node list and return the n'th one */
+static inline int get_nth_node(struct sheepdog_vnode_list_entry *entries,
+			       int nr_entries, int base, int n)
+{
+	int nodes[SD_MAX_REDUNDANCY];
+	int nr = 0, idx = base, i;
+
+	while (n--) {
+		nodes[nr++] = idx;
+next:
+		idx = (idx + 1) % nr_entries;
+		for (i = 0; i < nr; i++)
+			if (same_node(entries, idx, nodes[i]))
+				/* this node is already selected, so skip here */
+				goto next;
+	}
+
+	return idx;
+}
+
+static inline int hval_to_sheep(struct sheepdog_vnode_list_entry *entries,
 				int nr_entries, uint64_t id, int idx)
 {
 	int i;
-	struct sheepdog_node_list_entry *e = entries, *n;
+	struct sheepdog_vnode_list_entry *e = entries, *n;
 
 	for (i = 0; i < nr_entries - 1; i++, e++) {
 		n = e + 1;
 		if (id > e->id && id <= n->id)
 			break;
 	}
-
-	return (i + 1 + idx) % nr_entries;
+	return get_nth_node(entries, nr_entries, (i + 1) % nr_entries, idx);
 }
 
-static inline int obj_to_sheep(struct sheepdog_node_list_entry *entries,
+static inline int obj_to_sheep(struct sheepdog_vnode_list_entry *entries,
 			       int nr_entries, uint64_t oid, int idx)
 {
 	uint64_t id = fnv_64a_buf(&oid, sizeof(oid), FNV1A_64_INIT);
@@ -160,14 +197,6 @@ static inline int obj_to_sheep(struct sheepdog_node_list_entry *entries,
 	return hval_to_sheep(entries, nr_entries, id, idx);
 }
 
-static inline void print_node_list_entry(struct sheepdog_node_list_entry *e,
-					 char *str, size_t size)
-{
-	char  buf[INET6_ADDRSTRLEN];
-	snprintf(str, size, "%016" PRIx64 " - %s",
-		 e->id, addr_to_str(buf, sizeof(buf), e->addr, e->port));
-}
-
 static inline int is_sheep_op(uint8_t op)
 {
 	return op & SD_OP_SHEEP;
@@ -221,4 +250,67 @@ static inline const char *sd_strerror(int err)
 	return "Invalid error code";
 }
 
+static inline int node_cmp(const void *a, const void *b)
+{
+	const struct sheepdog_node_list_entry *node1 = a;
+	const struct sheepdog_node_list_entry *node2 = b;
+	int cmp;
+
+	cmp = memcmp(node1->addr, node2->addr, sizeof(node1->addr));
+	if (cmp != 0)
+		return cmp;
+
+	if (node1->port < node2->port)
+		return -1;
+	if (node1->port > node2->port)
+		return 1;
+	return 0;
+}
+
+static inline int vnode_cmp(const void *a, const void *b)
+{
+	const struct sheepdog_vnode_list_entry *node1 = a;
+	const struct sheepdog_vnode_list_entry *node2 = b;
+
+	if (node1->id < node2->id)
+		return -1;
+	if (node1->id > node2->id)
+		return 1;
+	return 0;
+}
+
+static inline int nodes_to_vnodes(struct sheepdog_node_list_entry *nodes, int nr,
+				  struct sheepdog_vnode_list_entry *vnodes)
+{
+	struct sheepdog_node_list_entry *n = nodes;
+	int i, j, nr_vnodes = 0;
+	uint64_t hval;
+
+	while (nr--) {
+		hval = FNV1A_64_INIT;
+
+		for (i = 0; i < n->nr_vnodes; i++) {
+			if (vnodes) {
+				hval = fnv_64a_buf(&n->port, sizeof(n->port), hval);
+				for (j = ARRAY_SIZE(n->addr) - 1; j >= 0; j--)
+					hval = fnv_64a_buf(&n->addr[j], 1, hval);
+
+				vnodes[nr_vnodes].id = hval;
+				memcpy(vnodes[nr_vnodes].addr, n->addr, sizeof(n->addr));
+				vnodes[nr_vnodes].port = n->port;
+				vnodes[nr_vnodes].node_idx = n - nodes;
+			}
+
+			nr_vnodes++;
+		}
+
+		n++;
+	}
+
+	if (vnodes)
+		qsort(vnodes, nr_vnodes, sizeof(*vnodes), vnode_cmp);
+
+	return nr_vnodes;
+}
+
 #endif
diff --git a/sheep/group.c b/sheep/group.c
index e0dcc64..b67ca4e 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -98,7 +98,7 @@ struct work_confchg {
 	char name[128];						\
 	list_for_each_entry(node, node_list, list) {		\
 		dprintf("%c nodeid: %x, pid: %d, ip: %s\n",	\
-			is_myself(&node->ent) ? 'l' : ' ',	\
+			is_myself(node->ent.addr, node->ent.port) ? 'l' : ' ',	\
 			node->nodeid, node->pid,		\
 			addr_to_str(name, sizeof(name),		\
 			node->ent.addr, node->ent.port));	\
@@ -130,18 +130,6 @@ CPG_EVENT_WORK_FNS(RUNNING, running)
 CPG_EVENT_WORK_FNS(SUSPENDED, suspended)
 CPG_EVENT_WORK_FNS(JOINING, joining)
 
-static int node_cmp(const void *a, const void *b)
-{
-	const struct sheepdog_node_list_entry *node1 = a;
-	const struct sheepdog_node_list_entry *node2 = b;
-
-	if (node1->id < node2->id)
-		return -1;
-	if (node1->id > node2->id)
-		return 1;
-	return 0;
-}
-
 static int send_message(cpg_handle_t handle, struct message_header *msg)
 {
 	struct iovec iov;
@@ -198,9 +186,26 @@ int get_ordered_sd_node_list(struct sheepdog_node_list_entry *entries)
 	return build_node_list(&sys->sd_node_list, entries);
 }
 
-int setup_ordered_sd_node_list(struct request *req)
+void get_ordered_sd_vnode_list(struct sheepdog_vnode_list_entry *entries,
+			       int *nr_vnodes, int *nr_nodes)
+{
+	struct sheepdog_node_list_entry nodes[SD_MAX_NODES];
+	int nr;
+
+	nr = build_node_list(&sys->sd_node_list, nodes);
+	*nr_nodes = nr;
+
+	if (sys->nr_vnodes == 0)
+		sys->nr_vnodes = nodes_to_vnodes(nodes, nr, sys->vnodes);
+
+	memcpy(entries, sys->vnodes, sizeof(*entries) * sys->nr_vnodes);
+
+	*nr_vnodes = sys->nr_vnodes;
+}
+
+void setup_ordered_sd_vnode_list(struct request *req)
 {
-	return get_ordered_sd_node_list(req->entry);
+	get_ordered_sd_vnode_list(req->entry, &req->nr_vnodes, &req->nr_nodes);
 }
 
 static void get_node_list(struct sd_node_req *req,
@@ -330,7 +335,7 @@ static int is_master(void)
 		return 0;
 
 	node = list_first_entry(&sys->sd_node_list, struct node, list);
-	if (is_myself(&node->ent))
+	if (is_myself(node->ent.addr, node->ent.port))
 		return 1;
 	return 0;
 }
@@ -496,7 +501,7 @@ static void get_vdi_bitmap_from_all(void)
 	nr_nodes = get_ordered_sd_node_list(entry);
 
 	for (i = 0; i < nr_nodes; i++) {
-		if (is_myself(&entry[i]))
+		if (is_myself(entry[i].addr, entry[i].port))
 			continue;
 
 		addr_to_str(host, sizeof(host), entry[i].addr, 0);
@@ -539,11 +544,11 @@ static int move_node_to_sd_list(uint32_t nodeid, uint32_t pid,
 	if (!node)
 		return 1;
 
-	if (!node->ent.id)
-		node->ent = ent;
+	node->ent = ent;
 
 	list_del(&node->list);
 	list_add_tail(&node->list, &sys->sd_node_list);
+	sys->nr_vnodes = 0;
 
 	return 0;
 }
@@ -555,7 +560,7 @@ static void update_cluster_info(struct join_message *msg)
 	struct sheepdog_node_list_entry entry[SD_MAX_NODES];
 
 	if (msg->result != SD_RES_SUCCESS) {
-		if (is_myself(&msg->header.from)) {
+		if (is_myself(msg->header.from.addr, msg->header.from.port)) {
 			eprintf("failed to join sheepdog, %d\n", msg->result);
 			sys->status = SD_STATUS_JOIN_FAILED;
 		}
@@ -746,7 +751,6 @@ static void vdi_op_done(struct vdi_op_message *msg)
 			eprintf("can't write epoch %u\n", sys->epoch);
 		update_epoch_store(sys->epoch);
 
-		set_nodeid(sys->this_node.id);
 		set_global_nr_copies(sys->nr_sobjs);
 
 		sys->status = SD_STATUS_OK;
@@ -759,7 +763,7 @@ static void vdi_op_done(struct vdi_op_message *msg)
 		ret = SD_RES_UNKNOWN;
 	}
 out:
-	if (!is_myself(&msg->header.from))
+	if (!is_myself(msg->header.from.addr, msg->header.from.port))
 		return;
 
 	req = list_first_entry(&sys->pending_list, struct request, pending_list);
@@ -804,8 +808,7 @@ static void __sd_deliver(struct cpg_event *cevent)
 			return;
 		}
 
-		if (!node->ent.id)
-			node->ent = m->from;
+		node->ent = m->from;
 	}
 
 	if (m->state == DM_INIT && is_master()) {
@@ -949,6 +952,7 @@ static void del_node(struct cpg_address *addr, struct work_confchg *w)
 		struct sheepdog_node_list_entry e[SD_MAX_NODES];
 
 		w->sd_node_left++;
+		sys->nr_vnodes = 0;
 
 		list_del(&node->list);
 		free(node);
@@ -1557,19 +1561,7 @@ join_retry:
 
 	set_addr(nodeid, port);
 	sys->this_node.port = port;
-
-	ret = get_nodeid(&sys->this_node.id);
-	if (!sys->this_node.id) {
-		uint64_t hval;
-		int i;
-
-		hval = fnv_64a_buf(&sys->this_node.port,
-				   sizeof(sys->this_node.port),
-				   FNV1A_64_INIT);
-		for (i = ARRAY_SIZE(sys->this_node.addr) - 1; i >= 0; i--)
-			hval = fnv_64a_buf(&sys->this_node.addr[i], 1, hval);
-		sys->this_node.id = hval;
-	}
+	sys->this_node.nr_vnodes = SD_DEFAULT_VNODES;
 
 	if (get_latest_epoch() == 0)
 		sys->status = SD_STATUS_WAIT_FOR_FORMAT;
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index 25853d4..3c8668a 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -51,7 +51,7 @@ void resume_pending_requests(void)
 		start_cpg_event_work();
 }
 
-static int is_access_local(struct sheepdog_node_list_entry *e, int nr_nodes,
+static int is_access_local(struct sheepdog_vnode_list_entry *e, int nr_nodes,
 			   uint64_t oid, int copies)
 {
 	int i, n;
@@ -59,7 +59,7 @@ static int is_access_local(struct sheepdog_node_list_entry *e, int nr_nodes,
 	for (i = 0; i < copies; i++) {
 		n = obj_to_sheep(e, nr_nodes, oid, i);
 
-		if (is_myself(&e[n]))
+		if (is_myself(e[n].addr, e[n].port))
 			return 1;
 	}
 
@@ -80,7 +80,7 @@ static void setup_access_to_local_objects(struct request *req)
 	if (!copies)
 		copies = sys->nr_sobjs;
 
-	if (is_access_local(req->entry, req->nr_nodes, hdr->oid, copies))
+	if (is_access_local(req->entry, req->nr_vnodes, hdr->oid, copies))
 		req->local_oid = hdr->oid;
 }
 
@@ -123,7 +123,7 @@ static void __done(struct work *work, int idx)
 		     req->rp.result == SD_RES_WAIT_FOR_FORMAT)) {
 
 			req->rq.epoch = sys->epoch;
-			req->nr_nodes = setup_ordered_sd_node_list(req);
+			setup_ordered_sd_vnode_list(req);
 			setup_access_to_local_objects(req);
 
 			list_add_tail(&cevent->cpg_event_list, &sys->cpg_event_siblings);
@@ -252,7 +252,7 @@ static void queue_request(struct request *req)
 	if (!(hdr->flags & SD_FLAG_CMD_DIRECT))
 		hdr->epoch = sys->epoch;
 
-	req->nr_nodes = setup_ordered_sd_node_list(req);
+	setup_ordered_sd_vnode_list(req);
 	setup_access_to_local_objects(req);
 
 	cevent->ctype = CPG_EVENT_REQUEST;
@@ -568,8 +568,8 @@ int create_listen_port(int port, void *data)
 	return create_listen_ports(port, create_listen_port_fn, data);
 }
 
-int write_object(struct sheepdog_node_list_entry *e,
-		 int nodes, uint32_t node_version,
+int write_object(struct sheepdog_vnode_list_entry *e,
+		 int vnodes, int nodes, uint32_t node_version,
 		 uint64_t oid, char *data, unsigned int datalen,
 		 uint64_t offset, int nr, int create)
 {
@@ -583,7 +583,7 @@ int write_object(struct sheepdog_node_list_entry *e,
 	for (i = 0; i < nr; i++) {
 		unsigned rlen = 0, wlen = datalen;
 
-		n = obj_to_sheep(e, nodes, oid, i);
+		n = obj_to_sheep(e, vnodes, oid, i);
 
 		if (memcmp(e[n].addr, sys->this_node.addr, sizeof(e[n].addr)) == 0 &&
 		    e[n].port == sys->this_node.port) {
@@ -631,8 +631,8 @@ int write_object(struct sheepdog_node_list_entry *e,
 	return !success;
 }
 
-int read_object(struct sheepdog_node_list_entry *e,
-		int nodes, uint32_t node_version,
+int read_object(struct sheepdog_vnode_list_entry *e,
+		int vnodes, int nodes, uint32_t node_version,
 		uint64_t oid, char *data, unsigned int datalen,
 		uint64_t offset, int nr)
 {
@@ -646,7 +646,7 @@ int read_object(struct sheepdog_node_list_entry *e,
 
 	/* search a local object first */
 	for (i = 0; i < nr; i++) {
-		n = obj_to_sheep(e, nodes, oid, i);
+		n = obj_to_sheep(e, vnodes, oid, i);
 
 		if (memcmp(e[n].addr, sys->this_node.addr, sizeof(e[n].addr)) == 0 &&
 		    e[n].port == sys->this_node.port) {
@@ -666,7 +666,7 @@ int read_object(struct sheepdog_node_list_entry *e,
 	for (i = 0; i < nr; i++) {
 		unsigned wlen = 0, rlen = datalen;
 
-		n = obj_to_sheep(e, nodes, oid, i);
+		n = obj_to_sheep(e, vnodes, oid, i);
 
 		addr_to_str(name, sizeof(name), e[n].addr, 0);
 
@@ -703,8 +703,8 @@ int read_object(struct sheepdog_node_list_entry *e,
 	return -last_error;
 }
 
-int remove_object(struct sheepdog_node_list_entry *e,
-		  int nodes, uint32_t node_version,
+int remove_object(struct sheepdog_vnode_list_entry *e,
+		  int vnodes, int nodes, uint32_t node_version,
 		  uint64_t oid, int nr)
 {
 	char name[128];
@@ -718,7 +718,7 @@ int remove_object(struct sheepdog_node_list_entry *e,
 	for (i = 0; i < nr; i++) {
 		unsigned wlen = 0, rlen = 0;
 
-		n = obj_to_sheep(e, nodes, oid, i);
+		n = obj_to_sheep(e, vnodes, oid, i);
 
 		addr_to_str(name, sizeof(name), e[n].addr, 0);
 
@@ -775,7 +775,7 @@ static int set_nodelay(int fd)
 	return ret;
 }
 
-int get_sheep_fd(struct sheepdog_node_list_entry *e, int node_idx,
+int get_sheep_fd(uint8_t *addr, uint16_t port, int node_idx,
 		 uint32_t epoch, int worker_idx)
 {
 	static int cached_fds[NR_WORKER_THREAD][SD_MAX_NODES];
@@ -817,9 +817,9 @@ int get_sheep_fd(struct sheepdog_node_list_entry *e, int node_idx,
 		return fd;
 	}
 
-	addr_to_str(name, sizeof(name), e[node_idx].addr, 0);
+	addr_to_str(name, sizeof(name), addr, 0);
 
-	fd = connect_to(name, e[node_idx].port);
+	fd = connect_to(name, port);
 	if (fd < 0)
 		return -1;
 
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index cebfd4d..8c2199d 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -21,9 +21,6 @@
 #include "net.h"
 #include "sheep.h"
 
-#define SD_DEFAULT_REDUNDANCY 3
-#define SD_MAX_REDUNDANCY 8
-
 #define SD_OP_REMOVE_OBJ     0x91
 
 #define SD_OP_GET_OBJ_LIST   0xA1
@@ -83,7 +80,8 @@ struct request {
 
 	uint64_t local_oid;
 
-	struct sheepdog_node_list_entry entry[SD_MAX_NODES];
+	struct sheepdog_vnode_list_entry entry[SD_MAX_VNODES];
+	int nr_vnodes;
 	int nr_nodes;
 	int check_consistency;
 
@@ -116,6 +114,10 @@ struct cluster_info {
 	struct list_head cpg_node_list;
 	struct list_head sd_node_list;
 
+	/* this array contains a list of ordered virtual nodes */
+	struct sheepdog_vnode_list_entry vnodes[SD_MAX_VNODES];
+	int nr_vnodes;
+
 	struct list_head pending_list;
 
 	DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
@@ -157,8 +159,10 @@ int read_vdis(char *data, int len, unsigned int *rsp_len);
 int get_vdi_attr(uint32_t epoch, char *data, int data_len, uint32_t vid,
 		 uint32_t *attrid, int creat, int excl);
 
-int setup_ordered_sd_node_list(struct request *req);
 int get_ordered_sd_node_list(struct sheepdog_node_list_entry *entries);
+void setup_ordered_sd_vnode_list(struct request *req);
+void get_ordered_sd_vnode_list(struct sheepdog_vnode_list_entry *entries,
+			       int *nr_vnodes, int *nr_nodes);
 int is_access_to_busy_objects(uint64_t oid);
 
 void resume_pending_requests(void);
@@ -180,8 +184,6 @@ int update_epoch_store(uint32_t epoch);
 
 int set_global_nr_copies(uint32_t copies);
 int get_global_nr_copies(uint32_t *copies);
-int set_nodeid(uint64_t nodeid);
-int get_nodeid(uint64_t *nodeid);
 
 #define NR_WORKER_THREAD 64
 
@@ -196,19 +198,19 @@ int start_recovery(uint32_t epoch);
 void resume_recovery_work(void);
 int is_recoverying_oid(uint64_t oid);
 
-int write_object(struct sheepdog_node_list_entry *e,
-		 int nodes, uint32_t node_version,
+int write_object(struct sheepdog_vnode_list_entry *e,
+		 int vnodes, int nodes, uint32_t node_version,
 		 uint64_t oid, char *data, unsigned int datalen,
 		 uint64_t offset, int nr, int create);
-int read_object(struct sheepdog_node_list_entry *e,
-		int nodes, uint32_t node_version,
+int read_object(struct sheepdog_vnode_list_entry *e,
+		int vnodes, int nodes, uint32_t node_version,
 		uint64_t oid, char *data, unsigned int datalen,
 		uint64_t offset, int nr);
-int remove_object(struct sheepdog_node_list_entry *e,
-		  int nodes, uint32_t node_version,
+int remove_object(struct sheepdog_vnode_list_entry *e,
+		  int vnodes, int nodes, uint32_t node_version,
 		  uint64_t oid, int nr);
 
-int get_sheep_fd(struct sheepdog_node_list_entry *e, int node_idx,
+int get_sheep_fd(uint8_t *addr, uint16_t port, int node_idx,
 		 uint32_t epoch, int worker_idx);
 
 /* Journal */
@@ -271,9 +273,11 @@ inline int jrnl_commit_data(struct jrnl_descriptor *jd);
 int jrnl_perform(struct jrnl_descriptor *jd);
 int jrnl_recover(void);
 
-static inline int is_myself(struct sheepdog_node_list_entry *e)
+static inline int is_myself(uint8_t *addr, uint16_t port)
 {
-	return e->id == sys->this_node.id;
+	return (memcmp(addr, sys->this_node.addr,
+		       sizeof(sys->this_node.addr)) == 0) &&
+		port == sys->this_node.port;
 }
 
 #define panic(fmt, args...)			\
diff --git a/sheep/store.c b/sheep/store.c
index 8038629..e63e3c0 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -26,7 +26,6 @@
 
 #define ANAME_CTIME "user.sheepdog.ctime"
 #define ANAME_COPIES "user.sheepdog.copies"
-#define ANAME_NODEID "user.sheepdog.nodeid"
 
 static char *obj_path;
 static char *epoch_path;
@@ -108,16 +107,6 @@ static int stat_sheep(uint64_t *store_size, uint64_t *store_free, uint32_t epoch
 	return SD_RES_SUCCESS;
 }
 
-static int is_obj_in_range(uint64_t oid, uint64_t start, uint64_t end)
-{
-	uint64_t hval = fnv_64a_buf(&oid, sizeof(oid), FNV1A_64_INIT);
-
-	if (start < end)
-		return (start < hval && hval <= end);
-	else
-		return (start < hval || hval <= end);
-}
-
 static int get_obj_list(struct request *req)
 {
 	DIR *dir;
@@ -125,17 +114,12 @@ static int get_obj_list(struct request *req)
 	struct sd_list_req *hdr = (struct sd_list_req *)&req->rq;
 	struct sd_list_rsp *rsp = (struct sd_list_rsp *)&req->rp;
 	uint64_t oid;
-	uint64_t start_hash = hdr->start;
-	uint64_t end_hash = hdr->end;
 	uint32_t epoch = hdr->tgt_epoch;
 	char path[1024];
 	uint64_t *p = (uint64_t *)req->data;
 	int nr = 0;
 	uint64_t *objlist = NULL;
 	int obj_nr = 0, fd, i;
-	struct sheepdog_node_list_entry *e;
-	int e_nr;
-	int idx;
 	int res = SD_RES_SUCCESS;
 	int buf_len;
 	char *buf;
@@ -166,11 +150,7 @@ static int get_obj_list(struct request *req)
 	obj_nr /= sizeof(uint64_t);
 	objlist = (uint64_t *)buf;
 	for (i = 0; i < obj_nr; i++) {
-		if (is_obj_in_range(objlist[i], start_hash, end_hash)) {
-			dprintf("%u, %016"PRIx64", %016"PRIx64" %016"PRIx64"\n", epoch,
-				objlist[i], start_hash, end_hash);
-			p[nr++] = objlist[i];
-		}
+		p[nr++] = objlist[i];
 
 		if (nr * sizeof(uint64_t) >= hdr->data_length)
 			break;
@@ -203,11 +183,7 @@ local:
 		if (i < obj_nr)
 			continue;
 
-		if (is_obj_in_range(oid, start_hash, end_hash)) {
-			dprintf("%u, %016"PRIx64", %016"PRIx64" %016"PRIx64"\n", epoch,
-				oid, start_hash, end_hash);
-			p[nr++] = oid;
-		}
+		p[nr++] = oid;
 
 		if (nr * sizeof(uint64_t) >= hdr->data_length)
 			break;
@@ -215,39 +191,7 @@ local:
 
 	eprintf("nr = %"PRIu32"\n", nr);
 
-	e_nr = epoch_log_read(epoch, buf, buf_len);
-	e_nr /= sizeof(*e);
-	e = (struct sheepdog_node_list_entry *)buf;
-
-	if (e_nr <= sys->nr_sobjs) {
-		rsp->next = end_hash;
-		closedir(dir);
-		goto out;
-	}
-
-	for (idx = 0; idx < e_nr; idx++) {
-		if (e[idx].id == sys->this_node.id)
-			break;
-	}
-	if (idx != e_nr) {
-		uint64_t hval = e[idx % e_nr].id;
-
-		rsp->next = end_hash;
-
-		if (start_hash < end_hash) {
-			if (start_hash < hval && hval <= end_hash)
-				rsp->next = hval;
-		} else
-			if (start_hash < hval || hval <= end_hash)
-				rsp->next = hval;
-
-		dprintf("%u, %016"PRIx64", %016"PRIx64" %016"PRIx64"\n", epoch, hval,
-			start_hash, end_hash);
-	} else
-		res = SD_RES_SYSTEM_ERROR;
-
 	closedir(dir);
-
 out:
 	free(buf);
 	rsp->data_length = nr * sizeof(uint64_t);
@@ -265,19 +209,19 @@ static int read_from_one(struct request *req, uint32_t epoch, uint64_t oid,
 	int i, n, nr, fd, ret;
 	unsigned wlen, rlen;
 	char name[128];
-	struct sheepdog_node_list_entry *e;
+	struct sheepdog_vnode_list_entry *e;
 	struct sd_obj_req hdr;
 	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr;
 
 	e = req->entry;
-	nr = req->nr_nodes;
+	nr = req->nr_vnodes;
 
 	for (i = 0; i < nr; i++) {
 		n = obj_to_sheep(e, nr, oid, i);
 
 		addr_to_str(name, sizeof(name), e[n].addr, 0);
 
-		if (is_myself(&e[n])) {
+		if (is_myself(e[n].addr, e[n].port)) {
 			fd = ob_open(epoch, oid, 0, &ret);
 			if (fd < 0 || ret != 0)
 				continue;
@@ -352,12 +296,12 @@ static int forward_read_obj_req(struct request *req, int idx)
 	unsigned wlen, rlen;
 	struct sd_obj_req hdr = *(struct sd_obj_req *)&req->rq;
 	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr;
-	struct sheepdog_node_list_entry *e;
+	struct sheepdog_vnode_list_entry *e;
 	uint64_t oid = hdr.oid;
 	int copies;
 
 	e = req->entry;
-	nr = req->nr_nodes;
+	nr = req->nr_vnodes;
 
 	copies = hdr.copies;
 
@@ -373,7 +317,7 @@ static int forward_read_obj_req(struct request *req, int idx)
 	for (i = 0; i < copies; i++) {
 		n = obj_to_sheep(e, nr, oid, i);
 
-		if (is_myself(&e[n])) {
+		if (is_myself(e[n].addr, e[n].port)) {
 			ret = store_queue_request_local(req, hdr.epoch);
 			goto out;
 		}
@@ -381,7 +325,7 @@ static int forward_read_obj_req(struct request *req, int idx)
 
 	n = obj_to_sheep(e, nr, oid, 0);
 
-	fd = get_sheep_fd(e, n, hdr.epoch, idx);
+	fd = get_sheep_fd(e[n].addr, e[n].port, e[n].node_idx, hdr.epoch, idx);
 	if (fd < 0) {
 		ret = SD_RES_NETWORK_ERROR;
 		goto out;
@@ -409,7 +353,7 @@ static int forward_write_obj_req(struct request *req, int idx)
 	char name[128];
 	struct sd_obj_req hdr = *(struct sd_obj_req *)&req->rq;
 	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp;
-	struct sheepdog_node_list_entry *e;
+	struct sheepdog_vnode_list_entry *e;
 	uint64_t oid = hdr.oid;
 	int copies;
 	struct pollfd pfds[SD_MAX_REDUNDANCY];
@@ -417,7 +361,7 @@ static int forward_write_obj_req(struct request *req, int idx)
 
 	dprintf("%"PRIx64"\n", oid);
 	e = req->entry;
-	nr = req->nr_nodes;
+	nr = req->nr_vnodes;
 
 	copies = hdr.copies;
 
@@ -443,12 +387,12 @@ static int forward_write_obj_req(struct request *req, int idx)
 
 		addr_to_str(name, sizeof(name), e[n].addr, 0);
 
-		if (is_myself(&e[n])) {
+		if (is_myself(e[n].addr, e[n].port)) {
 			local = 1;
 			continue;
 		}
 
-		fd = get_sheep_fd(e, n, hdr.epoch, idx);
+		fd = get_sheep_fd(e[n].addr, e[n].port, e[n].node_idx, hdr.epoch, idx);
 		if (fd < 0) {
 			eprintf("failed to connect to %s:%"PRIu32"\n", name, e[n].port);
 			ret = SD_RES_NETWORK_ERROR;
@@ -1053,11 +997,6 @@ uint64_t get_cluster_ctime(void)
 	return ctime;
 }
 
-static int node_distance(int my, int her, int nr)
-{
-	return (my + nr - her) % nr;
-}
-
 /*
  * contains_node - checks that the node id is included in the target nodes
  *
@@ -1065,14 +1004,17 @@ static int node_distance(int my, int her, int nr)
  * from the base_idx'th on the consistent hash ring, where N is the
  * number of copies of objects.
  */
-static int contains_node(uint64_t id, struct sheepdog_node_list_entry *entry,
+static int contains_node(struct sheepdog_vnode_list_entry *key,
+			 struct sheepdog_vnode_list_entry *entry,
 			 int nr, int base_idx)
 {
 	int i;
 
 	for (i = 0; i < sys->nr_sobjs; i++) {
-		if (entry[(base_idx + i) % nr].id == id)
-			return (base_idx + i) % nr;
+		int idx = get_nth_node(entry, nr, base_idx, i);
+		if (memcmp(key->addr, entry[idx].addr, sizeof(key->addr)) == 0
+		    && key->port == entry[idx].port)
+			return idx;
 	}
 	return -1;
 }
@@ -1126,17 +1068,17 @@ static uint64_t blocking_oid;
  * The node D, E, F, and A can recover objects from local, and the
  * node G recovers from the node B.
  */
-static int find_tgt_node(struct sheepdog_node_list_entry *old_entry, int old_nr, int old_idx,
-			 struct sheepdog_node_list_entry *cur_entry, int cur_nr, int cur_idx,
+static int find_tgt_node(struct sheepdog_vnode_list_entry *old_entry, int old_nr, int old_idx,
+			 struct sheepdog_vnode_list_entry *cur_entry, int cur_nr, int cur_idx,
 			 int copy_idx)
 {
 	int i, j, idx;
 
 	dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n", old_idx, old_nr, cur_idx, cur_nr, copy_idx);
 
-	if (copy_idx < cur_nr) {
+	if (copy_idx < sys->nr_sobjs) {
 		/* If the same node is in the previous target nodes, return its index */
-		idx = contains_node(cur_entry[(cur_idx + copy_idx) % cur_nr].id,
+		idx = contains_node(cur_entry + get_nth_node(cur_entry, cur_nr, cur_idx, copy_idx),
 				    old_entry, old_nr, old_idx);
 		if (idx >= 0) {
 			dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n", idx, copy_idx, cur_idx, cur_nr);
@@ -1145,57 +1087,55 @@ static int find_tgt_node(struct sheepdog_node_list_entry *old_entry, int old_nr,
 	}
 
 	for (i = 0, j = 0; ; i++, j++) {
-		if (i < cur_nr) {
+		if (i < copy_idx) {
 			/* Skip if the node can recover from its local */
-			idx = contains_node(cur_entry[(cur_idx + i) % cur_nr].id,
+			idx = contains_node(cur_entry + get_nth_node(cur_entry, cur_nr, cur_idx, i),
 					    old_entry, old_nr, old_idx);
 			if (idx >= 0)
 				continue;
 
 			/* Find the next target which needs to recover from remote */
-			while (contains_node(old_entry[(old_idx + j) % old_nr].id,
-					     cur_entry, cur_nr, cur_idx) >= 0 && j < old_nr)
+			while (j < sys->nr_sobjs &&
+			       contains_node(old_entry + get_nth_node(old_entry, old_nr, old_idx, j),
+					     cur_entry, cur_nr, cur_idx) >= 0)
 				j++;
 		}
-		if (j == old_nr) {
+		if (j == sys->nr_sobjs) {
 			/*
 			 * Cannot find the target because the number of nodes
 			 * is smaller than the number of copies.  We can select
 			 * any node in this case, so select the first one.
 			 */
-
-			/* old_nr should be smaller than sys->nr_sobjs */
-			if (old_nr >= sys->nr_sobjs)
-				eprintf("bug: %"PRIu32", %"PRIu32"\n", old_nr, sys->nr_sobjs);
-
 			return old_idx;
 		}
 
 		if (i == copy_idx) {
 			/* Found the target node correspoinding to copy_idx */
-			dprintf("%"PRIu32", %"PRIu32", %"PRIu32"\n", (old_idx + j) % old_nr, copy_idx,
-				(cur_idx + i) % cur_nr);
-			return (old_idx + j) % old_nr;
+			dprintf("%"PRIu32", %"PRIu32", %"PRIu32"\n",
+				get_nth_node(old_entry, old_nr, old_idx, j),
+				copy_idx, (cur_idx + i) % cur_nr);
+			return get_nth_node(old_entry, old_nr, old_idx, j);
 		}
 
 	}
+
 	return -1;
 }
 
 static int __recover_one(struct recovery_work *rw,
-			 struct sheepdog_node_list_entry *_old_entry, int old_nr,
-			 struct sheepdog_node_list_entry *_cur_entry, int cur_nr, int cur_idx,
+			 struct sheepdog_vnode_list_entry *_old_entry, int old_nr,
+			 struct sheepdog_vnode_list_entry *_cur_entry, int cur_nr, int cur_idx,
 			 int copy_idx, uint32_t epoch, uint32_t tgt_epoch,
 			 uint64_t oid, char *buf, int buf_len)
 {
-	struct sheepdog_node_list_entry *e;
+	struct sheepdog_vnode_list_entry *e;
 	struct sd_obj_req hdr;
 	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr;
 	char name[128];
 	unsigned wlen = 0, rlen;
 	int fd, ret;
-	struct sheepdog_node_list_entry old_entry[SD_MAX_NODES],
-		cur_entry[SD_MAX_NODES], *next_entry;
+	struct sheepdog_vnode_list_entry old_entry[SD_MAX_VNODES],
+		cur_entry[SD_MAX_VNODES], next_entry[SD_MAX_VNODES];
 	int next_nr;
 	int tgt_idx = -1;
 	int old_idx;
@@ -1213,7 +1153,7 @@ next:
 	}
 	e = old_entry + tgt_idx;
 
-	if (e->id == sys->this_node.id) {
+	if (is_myself(e->addr, e->port)) {
 		char old[PATH_MAX], new[PATH_MAX];
 
 		snprintf(old, sizeof(old), "%s%08u/%016" PRIx64, obj_path,
@@ -1230,8 +1170,9 @@ next:
 				eprintf("no previous epoch, %"PRIu32"\n", tgt_epoch - 1);
 				return -1;
 			}
-			next_entry = (struct sheepdog_node_list_entry *)buf;
-			next_nr /= sizeof(*next_entry);
+			next_nr = nodes_to_vnodes((struct sheepdog_node_list_entry *)buf,
+						  next_nr / sizeof(struct sheepdog_node_list_entry),
+						  next_entry);
 			goto not_found;
 		}
 
@@ -1321,14 +1262,21 @@ next:
 		eprintf("%"PRIu32"\n", rsp->result);
 		return -1;
 	}
-	next_entry = (struct sheepdog_node_list_entry *)buf;
-	next_nr = rsp->data_length / sizeof(*old_entry);
+	next_nr = nodes_to_vnodes((struct sheepdog_node_list_entry *)buf,
+				  rsp->data_length / sizeof(struct sheepdog_node_list_entry),
+				  next_entry);
 
 not_found:
-	copy_idx = node_distance(tgt_idx, old_idx, old_nr);
+	for (copy_idx = 0; copy_idx < sys->nr_sobjs; copy_idx++)
+		if (get_nth_node(old_entry, old_nr, old_idx, copy_idx) == tgt_idx)
+			break;
+	if (copy_idx == sys->nr_sobjs) {
+		eprintf("bug: cannot find the proper copy_idx\n");
+		return -1;
+	}
+
 	dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n", rsp->result, rsp->data_length, tgt_idx,
 		old_idx, old_nr, copy_idx);
-
 	memcpy(cur_entry, old_entry, sizeof(*old_entry) * old_nr);
 	cur_nr = old_nr;
 	cur_idx = old_idx;
@@ -1346,12 +1294,15 @@ static void recover_one(struct work *work, int idx)
 	char *buf = NULL;
 	int ret;
 	uint64_t oid = *(((uint64_t *)rw->buf) + rw->done);
-	struct sheepdog_node_list_entry old_entry[SD_MAX_NODES],
-		cur_entry[SD_MAX_NODES];
-	int old_nr, cur_nr;
+	struct sheepdog_node_list_entry old_nodes[SD_MAX_NODES];
+	struct sheepdog_node_list_entry cur_nodes[SD_MAX_NODES];
+	struct sheepdog_vnode_list_entry old_vnodes[SD_MAX_VNODES];
+	struct sheepdog_vnode_list_entry cur_vnodes[SD_MAX_VNODES];
+	int old_nr_nodes, cur_nr_nodes, old_nr_vnodes, cur_nr_vnodes;
 	uint32_t epoch = rw->epoch;
-	int i, my_idx = -1, copy_idx = 0, cur_idx = -1;
+	int i, copy_idx = 0, cur_idx = -1;
 	int fd;
+	int nr_objs;
 
 	eprintf("%"PRIu32" %"PRIu32", %16"PRIx64"\n", rw->done, rw->count, oid);
 
@@ -1370,35 +1321,48 @@ static void recover_one(struct work *work, int idx)
 	else
 		buf = malloc(sizeof(struct sheepdog_inode));
 
-	cur_nr = epoch_log_read(epoch, (char *)cur_entry, sizeof(cur_entry));
-	if (cur_nr <= 0) {
+	cur_nr_nodes = epoch_log_read(epoch, (char *)cur_nodes, sizeof(cur_nodes));
+	if (cur_nr_nodes <= 0) {
 		eprintf("failed to read current epoch, %"PRIu32"\n", epoch);
 		goto out;
 	}
-	cur_nr /= sizeof(struct sheepdog_node_list_entry);
+	cur_nr_nodes /= sizeof(struct sheepdog_node_list_entry);
 
-	old_nr = epoch_log_read(epoch - 1, (char *)old_entry, sizeof(old_entry));
-	if (old_nr <= 0) {
+	old_nr_nodes = epoch_log_read(epoch - 1, (char *)old_nodes, sizeof(old_nodes));
+	if (old_nr_nodes <= 0) {
 		eprintf("failed to read previous epoch, %"PRIu32"\n", epoch - 1);
 		goto fail;
 	}
-	old_nr /= sizeof(struct sheepdog_node_list_entry);
+	old_nr_nodes /= sizeof(struct sheepdog_node_list_entry);
+
+	old_nr_vnodes = nodes_to_vnodes(old_nodes, old_nr_nodes, old_vnodes);
+	cur_nr_vnodes = nodes_to_vnodes(cur_nodes, cur_nr_nodes, cur_vnodes);
 
 	if (!sys->nr_sobjs)
 		goto fail;
 
-	cur_idx = obj_to_sheep(cur_entry, cur_nr, oid, 0);
+	cur_idx = obj_to_sheep(cur_vnodes, cur_nr_vnodes, oid, 0);
 
-	for (i = 0; i < cur_nr; i++) {
-		if (cur_entry[i].id == sys->this_node.id) {
-			my_idx = i;
+	nr_objs = sys->nr_sobjs;
+	if (nr_objs > cur_nr_nodes)
+		nr_objs = cur_nr_nodes;
+
+	copy_idx = -1;
+	for (i = 0; i < nr_objs; i++) {
+		int n = obj_to_sheep(cur_vnodes, cur_nr_vnodes, oid, i);
+		if (is_myself(cur_vnodes[n].addr, cur_vnodes[n].port)) {
+			copy_idx = i;
 			break;
 		}
 	}
-	copy_idx = node_distance(my_idx, cur_idx, cur_nr);
-	dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n", my_idx, cur_idx, cur_nr, copy_idx);
+	if (copy_idx < 0) {
+		eprintf("shouldn't happen\n");
+		goto out;
+	}
+
+	dprintf("%"PRIu32", %"PRIu32", %"PRIu32"\n", cur_idx, cur_nr_nodes, copy_idx);
 
-	ret = __recover_one(rw, old_entry, old_nr, cur_entry, cur_nr,
+	ret = __recover_one(rw, old_vnodes, old_nr_vnodes, cur_vnodes, cur_nr_vnodes,
 			    cur_idx, copy_idx, epoch, epoch - 1, oid,
 			    buf, SD_DATA_OBJ_SIZE);
 	if (ret == 0)
@@ -1407,8 +1371,8 @@ static void recover_one(struct work *work, int idx)
 	for (i = 0; i < sys->nr_sobjs; i++) {
 		if (i == copy_idx)
 			continue;
-		ret = __recover_one(rw, old_entry, old_nr,
-				    cur_entry, cur_nr, cur_idx, i,
+		ret = __recover_one(rw, old_vnodes, old_nr_vnodes,
+				    cur_vnodes, cur_nr_vnodes, cur_idx, i,
 				    epoch, epoch - 1, oid, buf, SD_DATA_OBJ_SIZE);
 		if (ret == 0)
 			goto out;
@@ -1547,12 +1511,10 @@ static void recover_done(struct work *work, int idx)
 	}
 }
 
-static int __fill_obj_list(struct recovery_work *rw,
-			   struct sheepdog_node_list_entry *e,
-			   uint64_t start_hash, uint64_t end_hash, uint64_t *done_hash)
+static int __fill_obj_list(struct sheepdog_node_list_entry *e, uint32_t epoch,
+			   uint8_t *buf, size_t buf_size)
 {
 	int fd, ret;
-	uint32_t epoch = rw->epoch;
 	unsigned wlen, rlen;
 	char name[128];
 	struct sd_list_req hdr;
@@ -1569,72 +1531,98 @@ static int __fill_obj_list(struct recovery_work *rw,
 	}
 
 	wlen = 0;
-	rlen = (1 << 20) - (rw->count * sizeof(uint64_t));
+	rlen = buf_size;
 
 	memset(&hdr, 0, sizeof(hdr));
 	hdr.opcode = SD_OP_GET_OBJ_LIST;
 	/* we don't need to set epoch */
 	hdr.epoch = epoch;
-	hdr.start = start_hash;
-	hdr.end = end_hash;
 	hdr.tgt_epoch = epoch - 1;
 	hdr.flags = 0;
 	hdr.data_length = rlen;
 
-	dprintf("%016"PRIx64", %016"PRIx64"\n", hdr.start, hdr.end);
-
-	ret = exec_req(fd, (struct sd_req *)&hdr, rw->buf + rw->count * sizeof(uint64_t), &wlen, &rlen);
+	ret = exec_req(fd, (struct sd_req *)&hdr, buf, &wlen, &rlen);
 
 	close(fd);
 
 	rsp = (struct sd_list_rsp *)&hdr;
 
 	if (ret || rsp->result != SD_RES_SUCCESS) {
-		rw->retry = 1;
-		*done_hash = end_hash;
 		eprintf("try again, %"PRIu32", %"PRIu32"\n", ret, rsp->result);
-		return 0;
+		return -1;
 	}
 
 	dprintf("%"PRIu32"\n", rsp->data_length);
 
-	if (rsp->data_length)
-		rw->count += rsp->data_length / sizeof(uint64_t);
+	return rsp->data_length / sizeof(uint64_t);
+}
 
-	*done_hash = rsp->next;
+static int merge_objlist(struct sheepdog_vnode_list_entry *entries, int nr_entries,
+			 uint64_t *list1, int nr_list1,
+			 uint64_t *list2, int nr_list2, int nr_objs)
+{
+	int i, j, idx;
 
-	return 0;
+	for (i = 0; i < nr_list2; i++) {
+		for (j = 0; j < nr_objs; j++) {
+			idx = obj_to_sheep(entries, nr_entries, list2[i], j);
+			if (is_myself(entries[idx].addr, entries[idx].port))
+				break;
+		}
+		if (j == nr_objs)
+			continue;
+
+		if (bsearch(list2 + i, list1, nr_list1, sizeof(*list1), obj_cmp))
+			continue;
+
+		list1[nr_list1++] = list2[i];
+	}
+
+	qsort(list1, nr_list1, sizeof(*list1), obj_cmp);
+
+	return nr_list1;
 }
 
 static int fill_obj_list(struct recovery_work *rw,
 			 struct sheepdog_node_list_entry *old_entry, int old_nr,
 			 struct sheepdog_node_list_entry *cur_entry, int cur_nr,
-			 uint64_t start_hval, uint64_t end_hval, int nr_objs)
+			 int nr_objs)
 {
-	int i, idx, old_idx, cur_idx;
-	uint64_t hval, done_hval = end_hval;
+	int i, j;
+	uint8_t *buf = NULL;
+	size_t buf_size = SD_DATA_OBJ_SIZE; /* FIXME */
+	struct sheepdog_vnode_list_entry vnodes[SD_MAX_VNODES];
+	int nr_vnodes;
 
-	hval = start_hval;
-again:
-	old_idx = hval_to_sheep(old_entry, old_nr, hval + 1, 0);
-	cur_idx = hval_to_sheep(cur_entry, cur_nr, hval + 1, 0);
+	buf = malloc(buf_size);
+	if (!buf)
+		goto fail;
 
-	for (i = 0; i < nr_objs; i++) {
-		idx = find_tgt_node(old_entry, old_nr, old_idx, cur_entry, cur_nr, cur_idx, i);
-		dprintf("%"PRIu32", %"PRIu32"\n", idx, i);
-		if (__fill_obj_list(rw, old_entry + idx, hval, end_hval, &done_hval) == 0)
-			break;
-	}
-	if (i == nr_objs)
-		return -1;
+	nr_vnodes = nodes_to_vnodes(cur_entry, cur_nr, vnodes);
+	for (i = 0; i < cur_nr; i++) {
+		int nr;
 
-	if (done_hval != end_hval) {
-		dprintf("%"PRIx64", %"PRIx64"\n", done_hval, end_hval);
-		hval = done_hval;
-		goto again;
+		for (j = 0; j < old_nr; j++)
+			if (node_cmp(cur_entry + i, old_entry + j) == 0)
+				break;
+
+		if (j == old_nr)
+			/* cur_entry[i] doesn't have a list file */
+			continue;
+
+		nr  = __fill_obj_list(cur_entry + i, rw->epoch, buf, buf_size);
+		if (nr < 0)
+			goto fail;
+		rw->count = merge_objlist(vnodes, nr_vnodes, (uint64_t *)rw->buf,
+					  rw->count, (uint64_t *)buf, nr, nr_objs);
 	}
 
+	free(buf);
 	return 0;
+fail:
+	free(buf);
+	rw->retry = 1;
+	return -1;
 }
 
 static void __start_recovery(struct work *work, int idx)
@@ -1644,9 +1632,7 @@ static void __start_recovery(struct work *work, int idx)
 	struct sheepdog_node_list_entry old_entry[SD_MAX_NODES],
 		cur_entry[SD_MAX_NODES];
 	int old_nr, cur_nr, nr_objs;
-	int my_idx = -1;
-	int i, fd;
-	uint64_t start_hash, end_hash;
+	int fd;
 	char path[PATH_MAX], tmp_path[PATH_MAX];
 	int ret;
 
@@ -1672,18 +1658,7 @@ static void __start_recovery(struct work *work, int idx)
 	if (!nr_objs)
 		goto fail;
 
-	for (i = 0; i < cur_nr; i++) {
-		if (cur_entry[i].id == sys->this_node.id) {
-			my_idx = i;
-			break;
-		}
-	}
-	start_hash = cur_entry[(my_idx - nr_objs + cur_nr) % cur_nr].id;
-	end_hash = cur_entry[my_idx].id;
-
-	dprintf("fill obj list (from 0x%"PRIx64" to 0x%"PRIx64")\n", start_hash, end_hash);
-	if (fill_obj_list(rw, old_entry, old_nr, cur_entry, cur_nr,
-			  start_hash, end_hash, nr_objs) != 0) {
+	if (fill_obj_list(rw, old_entry, old_nr, cur_entry, cur_nr, nr_objs) != 0) {
 		eprintf("fatal recovery error\n");
 		goto fail;
 	}
@@ -1691,8 +1666,6 @@ static void __start_recovery(struct work *work, int idx)
 	if (rw->retry)
 		goto fail;
 
-	qsort(rw->buf, rw->count, sizeof(uint64_t), obj_cmp);
-
 	snprintf(path, sizeof(path), "%s%08u/list", obj_path, epoch);
 	snprintf(tmp_path, sizeof(tmp_path), "%s%08u/list.tmp", obj_path, epoch);
 
@@ -1810,16 +1783,6 @@ static int attr(char *path, const char *name, void *var, int len, int set)
 	return SD_RES_SUCCESS;
 }
 
-int set_nodeid(uint64_t nodeid)
-{
-	return attr(epoch_path, ANAME_NODEID, &nodeid, sizeof(nodeid), 1);
-}
-
-int get_nodeid(uint64_t *nodeid)
-{
-	return attr(epoch_path, ANAME_NODEID, nodeid, sizeof(*nodeid), 0);
-}
-
 static int init_base_path(const char *d)
 {
 	int new = 0;
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 77f2704..9518085 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -21,17 +21,17 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 			  uint32_t base_vid, uint32_t cur_vid, uint32_t copies,
 			  uint32_t snapid, int is_snapshot)
 {
-	struct sheepdog_node_list_entry entries[SD_MAX_NODES];
+	struct sheepdog_vnode_list_entry entries[SD_MAX_VNODES];
 	/* we are not called concurrently */
 	static struct sheepdog_inode new, base, cur;
 	struct timeval tv;
-	int ret, nr_nodes;
+	int ret, nr_vnodes, nr_nodes;
 	unsigned long block_size = SD_DATA_OBJ_SIZE;
 
-	nr_nodes = get_ordered_sd_node_list(entries);
+	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_nodes);
 
 	if (base_vid) {
-		ret = read_object(entries, nr_nodes, epoch,
+		ret = read_object(entries, nr_vnodes, nr_nodes, epoch,
 				  vid_to_vdi_oid(base_vid), (char *)&base,
 				  sizeof(base), 0, copies);
 		if (ret < 0)
@@ -45,7 +45,7 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 			vprintf(SDOG_INFO "tree snapshot %s %" PRIx32 " %" PRIx32 "\n",
 				name, cur_vid, base_vid);
 
-			ret = read_object(entries, nr_nodes, epoch,
+			ret = read_object(entries, nr_vnodes, nr_nodes, epoch,
 					  vid_to_vdi_oid(cur_vid), (char *)&cur,
 					  SD_INODE_HEADER_SIZE, 0, copies);
 			if (ret < 0) {
@@ -88,7 +88,7 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 	}
 
 	if (is_snapshot && cur_vid != base_vid) {
-		ret = write_object(entries, nr_nodes, epoch,
+		ret = write_object(entries, nr_vnodes, nr_nodes, epoch,
 				   vid_to_vdi_oid(cur_vid), (char *)&cur,
 				   SD_INODE_HEADER_SIZE, 0, copies, 0);
 		if (ret != 0) {
@@ -98,7 +98,7 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 	}
 
 	if (base_vid) {
-		ret = write_object(entries, nr_nodes, epoch,
+		ret = write_object(entries, nr_vnodes, nr_nodes, epoch,
 				   vid_to_vdi_oid(base_vid), (char *)&base,
 				   SD_INODE_HEADER_SIZE, 0, copies, 0);
 		if (ret != 0) {
@@ -107,7 +107,7 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 		}
 	}
 
-	ret = write_object(entries, nr_nodes, epoch,
+	ret = write_object(entries, nr_vnodes, nr_nodes, epoch,
 			   vid_to_vdi_oid(new_vid), (char *)&new, sizeof(new),
 			   0, copies, 1);
 	if (ret != 0)
@@ -121,20 +121,20 @@ static int find_first_vdi(uint32_t epoch, unsigned long start, unsigned long end
 			  unsigned long *deleted_nr, uint32_t *next_snap,
 			  unsigned int *nr_copies)
 {
-	struct sheepdog_node_list_entry entries[SD_MAX_NODES];
+	struct sheepdog_vnode_list_entry entries[SD_MAX_VNODES];
 	static struct sheepdog_inode inode;
 	unsigned long i;
-	int nr_nodes, nr_reqs;
+	int nr_vnodes, nr_nodes, nr_reqs;
 	int ret, vdi_found = 0;
 
-	nr_nodes = get_ordered_sd_node_list(entries);
+	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_nodes);
 
 	nr_reqs = sys->nr_sobjs;
 	if (nr_reqs > nr_nodes)
 		nr_reqs = nr_nodes;
 
 	for (i = start; i >= end; i--) {
-		ret = read_object(entries, nr_nodes, epoch,
+		ret = read_object(entries, nr_vnodes, nr_nodes, epoch,
 				  vid_to_vdi_oid(i), (char *)&inode,
 				  SD_INODE_HEADER_SIZE, 0, nr_reqs);
 		if (ret < 0)
@@ -297,8 +297,8 @@ int del_vdi(uint32_t epoch, char *data, int data_len, uint32_t *vid,
 	uint32_t dummy0;
 	unsigned long dummy1, dummy2;
 	int ret;
-	struct sheepdog_node_list_entry entries[SD_MAX_NODES];
-	int nr_nodes, nr_reqs;
+	struct sheepdog_vnode_list_entry entries[SD_MAX_VNODES];
+	int nr_vnodes, nr_nodes, nr_reqs;
 	static struct sheepdog_inode inode;
 
 	if (data_len == SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN)
@@ -313,12 +313,12 @@ int del_vdi(uint32_t epoch, char *data, int data_len, uint32_t *vid,
 	if (ret != SD_RES_SUCCESS)
 		return ret;
 
-	nr_nodes = get_ordered_sd_node_list(entries);
+	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_nodes);
 	nr_reqs = sys->nr_sobjs;
 	if (nr_reqs > nr_nodes)
 		nr_reqs = nr_nodes;
 
-	ret = read_object(entries, nr_nodes, epoch,
+	ret = read_object(entries, nr_vnodes, nr_nodes, epoch,
 			  vid_to_vdi_oid(*vid), (char *)&inode,
 			  SD_INODE_HEADER_SIZE, 0, nr_reqs);
 	if (ret < 0)
@@ -326,7 +326,7 @@ int del_vdi(uint32_t epoch, char *data, int data_len, uint32_t *vid,
 
 	memset(inode.name, 0, sizeof(inode.name));
 
-	ret = write_object(entries, nr_nodes, epoch,
+	ret = write_object(entries, nr_vnodes, nr_nodes, epoch,
 			   vid_to_vdi_oid(*vid), (char *)&inode,
 			   SD_INODE_HEADER_SIZE, 0, nr_reqs, 0);
 	if (ret != 0)
@@ -369,8 +369,8 @@ static void delete_one(struct work *work, int idx)
 {
 	struct deletion_work *dw = container_of(work, struct deletion_work, work);
 	uint32_t vdi_id = *(((uint32_t *)dw->buf) + dw->count - dw->done - 1);
-	struct sheepdog_node_list_entry entries[SD_MAX_NODES];
-	int nr_nodes;
+	struct sheepdog_vnode_list_entry entries[SD_MAX_VNODES];
+	int nr_vnodes, nr_nodes;
 	int ret, i;
 	static struct sheepdog_inode inode;
 
@@ -381,9 +381,9 @@ static void delete_one(struct work *work, int idx)
 	 * is called in threads and not serialized with cpg_event so
 	 * we can't access to epoch and sd_node_list safely.
 	 */
-	nr_nodes = get_ordered_sd_node_list(entries);
+	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_nodes);
 
-	ret = read_object(entries, nr_nodes, dw->epoch,
+	ret = read_object(entries, nr_vnodes, nr_nodes, dw->epoch,
 			  vid_to_vdi_oid(vdi_id), (void *)&inode, sizeof(inode),
 			  0, sys->nr_sobjs);
 
@@ -396,7 +396,7 @@ static void delete_one(struct work *work, int idx)
 		if (!inode.data_vdi_id[i])
 			continue;
 
-		remove_object(entries, nr_nodes, dw->epoch,
+		remove_object(entries, nr_vnodes, nr_nodes, dw->epoch,
 			      vid_to_data_oid(inode.data_vdi_id[i], i),
 			      inode.nr_copies);
 	}
@@ -426,8 +426,8 @@ static void delete_one_done(struct work *work, int idx)
 }
 
 static int fill_vdi_list(struct deletion_work *dw,
-			 struct sheepdog_node_list_entry *entries,
-			 int nr_entries, uint32_t root_vid)
+			 struct sheepdog_vnode_list_entry *entries,
+			 int nr_vnodes, int nr_nodes, uint32_t root_vid)
 {
 	int ret, i;
 	static struct sheepdog_inode inode;
@@ -437,7 +437,7 @@ static int fill_vdi_list(struct deletion_work *dw,
 	((uint32_t *)dw->buf)[dw->count++] = root_vid;
 again:
 	vid = ((uint32_t *)dw->buf)[done++];
-	ret = read_object(entries, nr_entries, dw->epoch,
+	ret = read_object(entries, nr_vnodes, nr_nodes, dw->epoch,
 			  vid_to_vdi_oid(vid), (void *)&inode,
 			  SD_INODE_HEADER_SIZE, 0, sys->nr_sobjs);
 
@@ -462,14 +462,15 @@ again:
 	return 0;
 }
 
-static uint64_t get_vdi_root(struct sheepdog_node_list_entry *entries,
-			     int nr_entries, uint32_t epoch, uint32_t vid)
+static uint64_t get_vdi_root(struct sheepdog_vnode_list_entry *entries,
+			     int nr_vnodes, int nr_nodes, uint32_t epoch,
+			     uint32_t vid)
 {
 	int ret;
 	static struct sheepdog_inode inode;
 
 next:
-	ret = read_object(entries, nr_entries, epoch,
+	ret = read_object(entries, nr_vnodes, nr_nodes, epoch,
 			  vid_to_vdi_oid(vid), (void *)&inode,
 			  SD_INODE_HEADER_SIZE, 0, sys->nr_sobjs);
 
@@ -489,8 +490,8 @@ next:
 int start_deletion(uint32_t vid, uint32_t epoch)
 {
 	struct deletion_work *dw;
-	struct sheepdog_node_list_entry entries[SD_MAX_NODES];
-	int nr_nodes, ret;
+	struct sheepdog_vnode_list_entry entries[SD_MAX_VNODES];
+	int nr_vnodes, nr_nodes, ret;
 	uint32_t root_vid;
 
 	dw = zalloc(sizeof(struct deletion_work));
@@ -510,13 +511,13 @@ int start_deletion(uint32_t vid, uint32_t epoch)
 	dw->work.fn = delete_one;
 	dw->work.done = delete_one_done;
 
-	nr_nodes = get_ordered_sd_node_list(entries);
+	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_nodes);
 
-	root_vid = get_vdi_root(entries, nr_nodes, dw->epoch, dw->vid);
+	root_vid = get_vdi_root(entries, nr_vnodes, nr_nodes, dw->epoch, dw->vid);
 	if (!root_vid)
 		return SD_RES_EIO;
 
-	ret = fill_vdi_list(dw, entries, nr_nodes, root_vid);
+	ret = fill_vdi_list(dw, entries, nr_vnodes, nr_nodes, root_vid);
 	if (ret)
 		return SD_RES_SUCCESS;
 
@@ -539,18 +540,18 @@ int start_deletion(uint32_t vid, uint32_t epoch)
 int get_vdi_attr(uint32_t epoch, char *data, int data_len, uint32_t vid,
 		 uint32_t *attrid, int creat, int excl)
 {
-	struct sheepdog_node_list_entry entries[SD_MAX_NODES];
+	struct sheepdog_vnode_list_entry entries[SD_MAX_VNODES];
 	char attr_buf[SD_ATTR_HEADER_SIZE], inode_buf[SD_INODE_HEADER_SIZE];
 	uint64_t oid;
 	uint32_t end;
-	int ret, nr_nodes, copies;
+	int ret, nr_nodes, nr_vnodes, copies;
 
 	if (data_len != SD_ATTR_HEADER_SIZE)
 		return SD_RES_INVALID_PARMS;
 
-	nr_nodes = get_ordered_sd_node_list(entries);
+	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_nodes);
 
-	ret = read_object(entries, nr_nodes, epoch, vid_to_vdi_oid(vid),
+	ret = read_object(entries, nr_vnodes, nr_nodes, epoch, vid_to_vdi_oid(vid),
 			  inode_buf, sizeof(inode_buf), 0, sys->nr_sobjs);
 	if (ret != SD_INODE_HEADER_SIZE) {
 		eprintf("failed to read vdi object, %"PRIx32"\n", vid);
@@ -565,11 +566,11 @@ int get_vdi_attr(uint32_t epoch, char *data, int data_len, uint32_t vid,
 	end = *attrid - 1;
 	while (*attrid != end) {
 		oid = vid_to_attr_oid(vid, *attrid);
-		ret = read_object(entries, nr_nodes, epoch, oid, attr_buf,
+		ret = read_object(entries, nr_vnodes, nr_nodes, epoch, oid, attr_buf,
 				  sizeof(attr_buf), 0, copies);
 
 		if (ret == -SD_RES_NO_OBJ && creat) {
-			ret = write_object(entries, nr_nodes, epoch, oid, data,
+			ret = write_object(entries, nr_vnodes, nr_nodes, epoch, oid, data,
 					   data_len, 0, copies, 1);
 			if (ret)
 				return SD_RES_EIO;
-- 
1.5.6.5




More information about the sheepdog mailing list