[Sheepdog] [PATCH] support location-aware data placement

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Wed Aug 3 10:53:41 CEST 2011


This introduces the concept of a zone, which identifies the location
of a sheep daemon.  Replicas of the same object are never placed on
two daemons that share a zone id, which allows more flexible replica
placement:

 - When storage servers have multiple disks and you run multiple sheep
   daemons on them, you can ensure that the data is replicated to
   multiple servers by assigning the same zone id to all the daemons
   on the same server.  If you don't specify a zone id, all copies of
   the data could end up on disks of the same server.

 - You can replicate data to different racks by assigning the same
   zone id to all the daemons in the same rack.

To use this feature, pass a zone id (between 1 and 65535) to sheep
with the -z (--zone) command line option.

Example:
    $ sheep /store/0 -p 7000 -z 1
    $ sheep /store/1 -p 7001 -z 1
    $ sheep /store/2 -p 7002 -z 2
    $ sheep /store/3 -p 7003 -z 2
    $ sheep /store/4 -p 7004 -z 2
    $ sheep /store/5 -p 7005 -z 3
    $ collie cluster format -c 3
    $ collie node list
       Idx - Host:Port          Vnodes   Zone
    -----------------------------------------
    *    0 - 10.68.14.1:7000        64      1
         1 - 10.68.14.1:7001        64      1
         2 - 10.68.14.1:7002        64      2
         3 - 10.68.14.1:7003        64      2
         4 - 10.68.14.1:7004        64      2
         5 - 10.68.14.1:7005        64      3
    $ qemu-img convert 6g.raw sheepdog:test
    $ collie node info
    Id      Size    Used    Use%
     0      420 GB  3.1 GB    0%
     1      420 GB  2.9 GB    0%
     2      419 GB  2.3 GB    0%
     3      419 GB  2.0 GB    0%
     4      419 GB  1.7 GB    0%
     5      423 GB  6.0 GB    1%

    Total   2.5 TB  18 GB     0%, total virtual VDI Size    6.0 GB

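In the above example, no two copies of the same data are stored in the
same zone, so the total used sizes of the three zones are equal:
zone 1 uses 3.1 + 2.9 = 6.0 GB, zone 2 uses 2.3 + 2.0 + 1.7 = 6.0 GB,
and zone 3 uses 6.0 GB.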

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 collie/collie.c    |   14 +++---
 include/sheep.h    |   22 ++++++++--
 sheep/group.c      |   49 +++++++++++++++------
 sheep/sdnet.c      |   26 ++++++------
 sheep/sheep.c      |   12 +++++-
 sheep/sheep_priv.h |   10 ++--
 sheep/store.c      |  123 +++++++++++++++++++++++++++++++++-------------------
 sheep/vdi.c        |   68 ++++++++++++++--------------
 8 files changed, 203 insertions(+), 121 deletions(-)

diff --git a/collie/collie.c b/collie/collie.c
index ae33e0c..2cdc161 100644
--- a/collie/collie.c
+++ b/collie/collie.c
@@ -625,8 +625,8 @@ static int node_list(int argc, char **argv)
 	int i;
 
 	if (!raw_output) {
-		printf("   Idx - Host:Port              Number of vnodes\n");
-		printf("------------------------------------------------\n");
+		printf("   Idx - Host:Port          Vnodes   Zone\n");
+		printf("-----------------------------------------\n");
 	}
 	for (i = 0; i < nr_nodes; i++) {
 		char data[128];
@@ -637,13 +637,15 @@ static int node_list(int argc, char **argv)
 		if (i == master_idx) {
 			if (highlight)
 				printf(TEXT_BOLD);
-			printf(raw_output ? "* %d %s %d\n" : "* %4d - %-20s\t%d\n",
-			       i, data, node_list_entries[i].nr_vnodes);
+			printf(raw_output ? "* %d %s %d %d\n" : "* %4d - %-20s\t%d\t%d\n",
+			       i, data, node_list_entries[i].nr_vnodes,
+			       node_list_entries[i].zone);
 			if (highlight)
 				printf(TEXT_NORMAL);
 		} else
-			printf(raw_output ? "- %d %s %d\n" : "  %4d - %-20s\t%d\n",
-			       i, data, node_list_entries[i].nr_vnodes);
+			printf(raw_output ? "- %d %s %d %d\n" : "  %4d - %-20s\t%d\t%d\n",
+			       i, data, node_list_entries[i].nr_vnodes,
+			       node_list_entries[i].zone);
 	}
 
 	return EXIT_SUCCESS;
diff --git a/include/sheep.h b/include/sheep.h
index 31f1159..ea78c1b 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -128,7 +128,8 @@ struct sheepdog_node_list_entry {
 	uint8_t         addr[16];
 	uint16_t        port;
 	uint16_t	nr_vnodes;
-	uint16_t	pad[2];
+	uint16_t	zone;
+	uint16_t	pad;
 };
 
 struct sheepdog_vnode_list_entry {
@@ -136,7 +137,8 @@ struct sheepdog_vnode_list_entry {
 	uint8_t         addr[16];
 	uint16_t        port;
 	uint16_t	node_idx;
-	uint16_t	pad[2];
+	uint16_t	zone;
+	uint16_t	pad;
 };
 
 struct epoch_log {
@@ -155,6 +157,11 @@ static inline int same_node(struct sheepdog_vnode_list_entry *e, int n1, int n2)
 	return 0;
 }
 
+static inline int same_zone(struct sheepdog_vnode_list_entry *e, int n1, int n2)
+{
+	return e[n1].zone != 0 && e[n1].zone == e[n2].zone;
+}
+
 /* traverse the virtual node list and return the n'th one */
 static inline int get_nth_node(struct sheepdog_vnode_list_entry *entries,
 			       int nr_entries, int base, int n)
@@ -166,12 +173,18 @@ static inline int get_nth_node(struct sheepdog_vnode_list_entry *entries,
 		nodes[nr++] = idx;
 next:
 		idx = (idx + 1) % nr_entries;
-		if (idx == base)
+		if (idx == base) {
+			abort();
 			return -1; /* not found */
-		for (i = 0; i < nr; i++)
+		}
+		for (i = 0; i < nr; i++) {
 			if (same_node(entries, idx, nodes[i]))
 				/* this node is already selected, so skip here */
 				goto next;
+			if (same_zone(entries, idx, nodes[i]))
+				/* this node is in the same zone, so skip here */
+				goto next;
+		}
 	}
 
 	return idx;
@@ -307,6 +320,7 @@ static inline int nodes_to_vnodes(struct sheepdog_node_list_entry *nodes, int nr
 				memcpy(vnodes[nr_vnodes].addr, n->addr, sizeof(n->addr));
 				vnodes[nr_vnodes].port = n->port;
 				vnodes[nr_vnodes].node_idx = n - nodes;
+				vnodes[nr_vnodes].zone = n->zone;
 			}
 
 			nr_vnodes++;
diff --git a/sheep/group.c b/sheep/group.c
index 957daf1..235320d 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -168,36 +168,59 @@ static int get_node_idx(struct sheepdog_node_list_entry *ent,
 	return ent - entries;
 }
 
-static int build_node_list(struct list_head *node_list,
-			   struct sheepdog_node_list_entry *entries)
+static void build_node_list(struct list_head *node_list,
+			    struct sheepdog_node_list_entry *entries,
+			    int *nr_nodes, int *nr_zones)
 {
 	struct node *node;
-	int nr = 0;
+	int nr = 0, nr_zero_zones = 0, i;
+	uint16_t zones[SD_MAX_REDUNDANCY];
+
+	if (nr_zones)
+		*nr_zones = 0;
 
 	list_for_each_entry(node, node_list, list) {
 		if (entries)
 			memcpy(entries + nr, &node->ent, sizeof(*entries));
 		nr++;
+
+		if (nr_zones && *nr_zones < ARRAY_SIZE(zones)) {
+			if (node->ent.zone != 0) {
+				for (i = 0; i < *nr_zones; i++) {
+					if (zones[i] == node->ent.zone)
+						break;
+				}
+				if (i == *nr_zones)
+					zones[(*nr_zones)++] = node->ent.zone;
+			} else
+				nr_zero_zones++;
+		}
 	}
 	if (entries)
 		qsort(entries, nr, sizeof(*entries), node_cmp);
-
-	return nr;
+	if (nr_nodes)
+		*nr_nodes = nr;
+	if (nr_zones)
+		/* Zero zone nodes behave as if they have different zones */
+		*nr_zones += nr_zero_zones;
 }
 
 int get_ordered_sd_node_list(struct sheepdog_node_list_entry *entries)
 {
-	return build_node_list(&sys->sd_node_list, entries);
+	int nr_nodes;
+
+	build_node_list(&sys->sd_node_list, entries, &nr_nodes, NULL);
+
+	return nr_nodes;
 }
 
 void get_ordered_sd_vnode_list(struct sheepdog_vnode_list_entry *entries,
-			       int *nr_vnodes, int *nr_nodes)
+			       int *nr_vnodes, int *nr_zones)
 {
 	struct sheepdog_node_list_entry nodes[SD_MAX_NODES];
 	int nr;
 
-	nr = build_node_list(&sys->sd_node_list, nodes);
-	*nr_nodes = nr;
+	build_node_list(&sys->sd_node_list, nodes, &nr, nr_zones);
 
 	if (sys->nr_vnodes == 0)
 		sys->nr_vnodes = nodes_to_vnodes(nodes, nr, sys->vnodes);
@@ -209,7 +232,7 @@ void get_ordered_sd_vnode_list(struct sheepdog_vnode_list_entry *entries,
 
 void setup_ordered_sd_vnode_list(struct request *req)
 {
-	get_ordered_sd_vnode_list(req->entry, &req->nr_vnodes, &req->nr_nodes);
+	get_ordered_sd_vnode_list(req->entry, &req->nr_vnodes, &req->nr_zones);
 }
 
 static void get_node_list(struct sd_node_req *req,
@@ -1471,8 +1494,8 @@ do_retry:
 		if (is_io_request(req->rq.opcode)) {
 			int copies = sys->nr_sobjs;
 
-			if (copies > req->nr_nodes)
-				copies = req->nr_nodes;
+			if (copies > req->nr_zones)
+				copies = req->nr_zones;
 
 			if (__is_access_to_recoverying_objects(req)) {
 				if (req->rq.flags & SD_FLAG_CMD_DIRECT) {
@@ -1662,7 +1685,7 @@ static void set_addr(unsigned int nodeid, int port)
 		memcpy(sys->this_node.addr, saddr, 16);
 	} else if (ss->ss_family == AF_INET) {
 		saddr = &sin->sin_addr;
-		memcpy(sys->this_node.addr + 12, saddr, 16);
+		memcpy(sys->this_node.addr + 12, saddr, 4);
 	} else {
 		vprintf(SDOG_ERR "unknown protocol %d\n", ss->ss_family);
 		exit(1);
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index 089e7f6..7c588cb 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -85,8 +85,8 @@ static void setup_access_to_local_objects(struct request *req)
 	copies = hdr->copies;
 	if (!copies)
 		copies = sys->nr_sobjs;
-	if (copies > req->nr_nodes)
-		copies = req->nr_nodes;
+	if (copies > req->nr_zones)
+		copies = req->nr_zones;
 
 	if (is_access_local(req->entry, req->nr_vnodes, hdr->oid, copies))
 		req->local_oid = hdr->oid;
@@ -99,8 +99,8 @@ static void __done(struct work *work, int idx)
 	int again = 0;
 	int copies = sys->nr_sobjs;
 
-	if (copies > req->nr_nodes)
-		copies = req->nr_nodes;
+	if (copies > req->nr_zones)
+		copies = req->nr_zones;
 
 	switch (hdr->opcode) {
 	case SD_OP_NEW_VDI:
@@ -606,7 +606,7 @@ int create_listen_port(int port, void *data)
 }
 
 int write_object(struct sheepdog_vnode_list_entry *e,
-		 int vnodes, int nodes, uint32_t node_version,
+		 int vnodes, int zones, uint32_t node_version,
 		 uint64_t oid, char *data, unsigned int datalen,
 		 uint64_t offset, int nr, int create)
 {
@@ -614,8 +614,8 @@ int write_object(struct sheepdog_vnode_list_entry *e,
 	int i, n, fd, ret, success = 0;
 	char name[128];
 
-	if (nr > nodes)
-		nr = nodes;
+	if (nr > zones)
+		nr = zones;
 
 	for (i = 0; i < nr; i++) {
 		unsigned rlen = 0, wlen = datalen;
@@ -669,7 +669,7 @@ int write_object(struct sheepdog_vnode_list_entry *e,
 }
 
 int read_object(struct sheepdog_vnode_list_entry *e,
-		int vnodes, int nodes, uint32_t node_version,
+		int vnodes, int zones, uint32_t node_version,
 		uint64_t oid, char *data, unsigned int datalen,
 		uint64_t offset, int nr)
 {
@@ -678,8 +678,8 @@ int read_object(struct sheepdog_vnode_list_entry *e,
 	char name[128];
 	int i = 0, n, fd, ret, last_error = SD_RES_SUCCESS;
 
-	if (nr > nodes)
-		nr = nodes;
+	if (nr > zones)
+		nr = zones;
 
 	/* search a local object first */
 	for (i = 0; i < nr; i++) {
@@ -741,7 +741,7 @@ int read_object(struct sheepdog_vnode_list_entry *e,
 }
 
 int remove_object(struct sheepdog_vnode_list_entry *e,
-		  int vnodes, int nodes, uint32_t node_version,
+		  int vnodes, int zones, uint32_t node_version,
 		  uint64_t oid, int nr)
 {
 	char name[128];
@@ -749,8 +749,8 @@ int remove_object(struct sheepdog_vnode_list_entry *e,
 	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr;
 	int i = 0, n, fd, ret;
 
-	if (nr > nodes)
-		nr = nodes;
+	if (nr > zones)
+		nr = zones;
 
 	for (i = 0; i < nr; i++) {
 		unsigned wlen = 0, rlen = 0;
diff --git a/sheep/sheep.c b/sheep/sheep.c
index 8755dc4..9ed8b90 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -34,11 +34,12 @@ static struct option const long_options[] = {
 	{"loglevel", required_argument, NULL, 'l'},
 	{"debug", no_argument, NULL, 'd'},
 	{"directio", no_argument, NULL, 'D'},
+	{"zone", required_argument, NULL, 'z'},
 	{"help", no_argument, NULL, 'h'},
 	{NULL, 0, NULL, 0},
 };
 
-static const char *short_options = "p:fl:dDh";
+static const char *short_options = "p:fl:dDz:h";
 
 static void usage(int status)
 {
@@ -54,6 +55,7 @@ Sheepdog Daemon, version %s\n\
   -l, --loglevel          specify the message level printed by default\n\
   -d, --debug             print debug messages\n\
   -D, --directio          use direct IO\n\
+  -z, --zone              specify the zone id\n\
   -h, --help              display this help and exit\n\
 ", PACKAGE_VERSION);
 	}
@@ -94,6 +96,14 @@ int main(int argc, char **argv)
 			dprintf("direct IO mode\n");
 			sys->use_directio = 1;
 			break;
+		case 'z':
+			sys->this_node.zone = atoi(optarg);
+			if (sys->this_node.zone == 0) {
+				eprintf("zone id must be between 1 and 65535\n");
+				exit(1);
+			}
+			dprintf("zone id = %d\n", sys->this_node.zone);
+			break;
 		case 'h':
 			usage(0);
 			break;
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 06edfc7..67c651b 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -83,7 +83,7 @@ struct request {
 
 	struct sheepdog_vnode_list_entry entry[SD_MAX_VNODES];
 	int nr_vnodes;
-	int nr_nodes;
+	int nr_zones;
 	int check_consistency;
 
 	req_end_t done;
@@ -165,7 +165,7 @@ int get_vdi_attr(uint32_t epoch, char *data, int data_len, uint32_t vid,
 int get_ordered_sd_node_list(struct sheepdog_node_list_entry *entries);
 void setup_ordered_sd_vnode_list(struct request *req);
 void get_ordered_sd_vnode_list(struct sheepdog_vnode_list_entry *entries,
-			       int *nr_vnodes, int *nr_nodes);
+			       int *nr_vnodes, int *nr_zones);
 int is_access_to_busy_objects(uint64_t oid);
 int is_access_local(struct sheepdog_vnode_list_entry *e, int nr_nodes,
 		    uint64_t oid, int copies);
@@ -205,15 +205,15 @@ void resume_recovery_work(void);
 int is_recoverying_oid(uint64_t oid);
 
 int write_object(struct sheepdog_vnode_list_entry *e,
-		 int vnodes, int nodes, uint32_t node_version,
+		 int vnodes, int zones, uint32_t node_version,
 		 uint64_t oid, char *data, unsigned int datalen,
 		 uint64_t offset, int nr, int create);
 int read_object(struct sheepdog_vnode_list_entry *e,
-		int vnodes, int nodes, uint32_t node_version,
+		int vnodes, int zones, uint32_t node_version,
 		uint64_t oid, char *data, unsigned int datalen,
 		uint64_t offset, int nr);
 int remove_object(struct sheepdog_vnode_list_entry *e,
-		  int vnodes, int nodes, uint32_t node_version,
+		  int vnodes, int zones, uint32_t node_version,
 		  uint64_t oid, int nr);
 
 int get_sheep_fd(uint8_t *addr, uint16_t port, int node_idx,
diff --git a/sheep/store.c b/sheep/store.c
index eb1d4f1..cdf6a94 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -308,8 +308,8 @@ static int forward_read_obj_req(struct request *req, int idx)
 	/* temporary hack */
 	if (!copies)
 		copies = sys->nr_sobjs;
-	if (copies > req->nr_nodes)
-		copies = req->nr_nodes;
+	if (copies > req->nr_zones)
+		copies = req->nr_zones;
 
 	hdr.flags |= SD_FLAG_CMD_DIRECT;
 
@@ -368,8 +368,8 @@ static int forward_write_obj_req(struct request *req, int idx)
 	/* temporary hack */
 	if (!copies)
 		copies = sys->nr_sobjs;
-	if (copies > req->nr_nodes)
-		copies = req->nr_nodes;
+	if (copies > req->nr_zones)
+		copies = req->nr_zones;
 
 	nr_fds = 0;
 	memset(pfds, 0, sizeof(pfds));
@@ -1022,6 +1022,32 @@ uint64_t get_cluster_ctime(void)
 	return ctime;
 }
 
+static int get_max_copies(struct sheepdog_node_list_entry *entries, int nr)
+{
+	int i, j;
+	unsigned int nr_zones = 0, nr_zero_zones = 0;
+	uint16_t zones[SD_MAX_REDUNDANCY];
+
+	for (i = 0; i < nr; i++) {
+		if (nr_zones >= ARRAY_SIZE(zones))
+			break;
+
+		if (entries[i].zone == 0) {
+			nr_zero_zones++;
+			continue;
+		}
+
+		for (j = 0; j < nr_zones; j++) {
+			if (zones[j] == entries[i].zone)
+				break;
+		}
+		if (j == nr_zones)
+			zones[nr_zones++] = entries[i].zone;
+	}
+
+	return min(sys->nr_sobjs, nr_zones + nr_zero_zones);
+}
+
 /*
  * contains_node - checks that the node id is included in the target nodes
  *
@@ -1031,11 +1057,11 @@ uint64_t get_cluster_ctime(void)
  */
 static int contains_node(struct sheepdog_vnode_list_entry *key,
 			 struct sheepdog_vnode_list_entry *entry,
-			 int nr, int base_idx)
+			 int nr, int base_idx, int copies)
 {
 	int i;
 
-	for (i = 0; i < sys->nr_sobjs; i++) {
+	for (i = 0; i < copies; i++) {
 		int idx = get_nth_node(entry, nr, base_idx, i);
 		if (memcmp(key->addr, entry[idx].addr, sizeof(key->addr)) == 0
 		    && key->port == entry[idx].port)
@@ -1100,41 +1126,42 @@ static struct recovery_work *recovering_work;
  * The node D, E, F, and A can recover objects from local, and the
  * node G recovers from the node B.
  */
-static int find_tgt_node(struct sheepdog_vnode_list_entry *old_entry, int old_nr, int old_idx,
-			 struct sheepdog_vnode_list_entry *cur_entry, int cur_nr, int cur_idx,
+static int find_tgt_node(struct sheepdog_vnode_list_entry *old_entry,
+			 int old_nr, int old_idx, int old_copies,
+			 struct sheepdog_vnode_list_entry *cur_entry,
+			 int cur_nr, int cur_idx, int cur_copies,
 			 int copy_idx)
 {
 	int i, j, idx;
 
-	dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n", old_idx, old_nr, cur_idx, cur_nr, copy_idx);
+	dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n",
+		old_idx, old_nr, old_copies, cur_idx, cur_nr, cur_copies, copy_idx);
 
-	if (copy_idx < sys->nr_sobjs) {
-		/* If the same node is in the previous target nodes, return its index */
-		idx = contains_node(cur_entry + get_nth_node(cur_entry, cur_nr, cur_idx, copy_idx),
-				    old_entry, old_nr, old_idx);
-		if (idx >= 0) {
-			dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n", idx, copy_idx, cur_idx, cur_nr);
-			return idx;
-		}
+	/* If the same node is in the previous target nodes, return its index */
+	idx = contains_node(cur_entry + get_nth_node(cur_entry, cur_nr, cur_idx, copy_idx),
+			    old_entry, old_nr, old_idx, old_copies);
+	if (idx >= 0) {
+		dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n", idx, copy_idx, cur_idx, cur_nr);
+		return idx;
 	}
 
 	for (i = 0, j = 0; ; i++, j++) {
 		if (i < copy_idx) {
 			/* Skip if the node can recover from its local */
 			idx = contains_node(cur_entry + get_nth_node(cur_entry, cur_nr, cur_idx, i),
-					    old_entry, old_nr, old_idx);
+					    old_entry, old_nr, old_idx, old_copies);
 			if (idx >= 0)
 				continue;
 
 			/* Find the next target which needs to recover from remote */
-			while (j < sys->nr_sobjs &&
+			while (j < old_copies &&
 			       contains_node(old_entry + get_nth_node(old_entry, old_nr, old_idx, j),
-					     cur_entry, cur_nr, cur_idx) >= 0)
+					     cur_entry, cur_nr, cur_idx, cur_copies) >= 0)
 				j++;
 		}
-		if (j == sys->nr_sobjs) {
+		if (j == old_copies) {
 			/*
-			 * Cannot find the target because the number of nodes
+			 * Cannot find the target because the number of zones
 			 * is smaller than the number of copies.  We can select
 			 * any node in this case, so select the first one.
 			 */
@@ -1155,8 +1182,10 @@ static int find_tgt_node(struct sheepdog_vnode_list_entry *old_entry, int old_nr
 }
 
 static int __recover_one(struct recovery_work *rw,
-			 struct sheepdog_vnode_list_entry *_old_entry, int old_nr,
-			 struct sheepdog_vnode_list_entry *_cur_entry, int cur_nr, int cur_idx,
+			 struct sheepdog_vnode_list_entry *_old_entry,
+			 int old_nr, int old_copies,
+			 struct sheepdog_vnode_list_entry *_cur_entry,
+			 int cur_nr, int cur_copies, int cur_idx,
 			 int copy_idx, uint32_t epoch, uint32_t tgt_epoch,
 			 uint64_t oid, char *buf, int buf_len)
 {
@@ -1168,7 +1197,7 @@ static int __recover_one(struct recovery_work *rw,
 	int fd, ret;
 	struct sheepdog_vnode_list_entry old_entry[SD_MAX_VNODES],
 		cur_entry[SD_MAX_VNODES], next_entry[SD_MAX_VNODES];
-	int next_nr;
+	int next_nr, next_copies;
 	int tgt_idx = -1;
 	int old_idx;
 
@@ -1178,7 +1207,8 @@ next:
 	dprintf("recover obj %"PRIx64" from epoch %"PRIu32"\n", oid, tgt_epoch);
 	old_idx = obj_to_sheep(old_entry, old_nr, oid, 0);
 
-	tgt_idx = find_tgt_node(old_entry, old_nr, old_idx, cur_entry, cur_nr, cur_idx, copy_idx);
+	tgt_idx = find_tgt_node(old_entry, old_nr, old_idx, old_copies,
+				cur_entry, cur_nr, cur_idx, cur_copies, copy_idx);
 	if (tgt_idx < 0) {
 		eprintf("cannot find target node, %"PRIx64"\n", oid);
 		return -1;
@@ -1202,9 +1232,11 @@ next:
 				eprintf("no previous epoch, %"PRIu32"\n", tgt_epoch - 1);
 				return -1;
 			}
+			next_nr /= sizeof(struct sheepdog_node_list_entry);
+			next_copies = get_max_copies((struct sheepdog_node_list_entry *)buf,
+						     next_nr);
 			next_nr = nodes_to_vnodes((struct sheepdog_node_list_entry *)buf,
-						  next_nr / sizeof(struct sheepdog_node_list_entry),
-						  next_entry);
+						  next_nr, next_entry);
 			goto not_found;
 		}
 
@@ -1297,15 +1329,16 @@ next:
 		eprintf("%"PRIu32"\n", rsp->result);
 		return -1;
 	}
+	next_nr = rsp->data_length / sizeof(struct sheepdog_node_list_entry);
+	next_copies = get_max_copies((struct sheepdog_node_list_entry *)buf, next_nr);
 	next_nr = nodes_to_vnodes((struct sheepdog_node_list_entry *)buf,
-				  rsp->data_length / sizeof(struct sheepdog_node_list_entry),
-				  next_entry);
+				  next_nr, next_entry);
 
 not_found:
-	for (copy_idx = 0; copy_idx < sys->nr_sobjs; copy_idx++)
+	for (copy_idx = 0; copy_idx < old_copies; copy_idx++)
 		if (get_nth_node(old_entry, old_nr, old_idx, copy_idx) == tgt_idx)
 			break;
-	if (copy_idx == sys->nr_sobjs) {
+	if (copy_idx == old_copies) {
 		eprintf("bug: cannot find the proper copy_idx\n");
 		return -1;
 	}
@@ -1313,10 +1346,12 @@ not_found:
 	dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n", rsp->result, rsp->data_length, tgt_idx,
 		old_idx, old_nr, copy_idx);
 	memcpy(cur_entry, old_entry, sizeof(*old_entry) * old_nr);
+	cur_copies = old_copies;
 	cur_nr = old_nr;
 	cur_idx = old_idx;
 
 	memcpy(old_entry, next_entry, next_nr * sizeof(*next_entry));
+	old_copies = next_copies;
 	old_nr = next_nr;
 
 	tgt_epoch--;
@@ -1334,10 +1369,10 @@ static void recover_one(struct work *work, int idx)
 	struct sheepdog_vnode_list_entry old_vnodes[SD_MAX_VNODES];
 	struct sheepdog_vnode_list_entry cur_vnodes[SD_MAX_VNODES];
 	int old_nr_nodes, cur_nr_nodes, old_nr_vnodes, cur_nr_vnodes;
+	int old_copies, cur_copies;
 	uint32_t epoch = rw->epoch;
 	int i, copy_idx = 0, cur_idx = -1;
 	int fd;
-	int nr_objs;
 
 	eprintf("%"PRIu32" %"PRIu32", %16"PRIx64"\n", rw->done, rw->count, oid);
 
@@ -1379,12 +1414,11 @@ static void recover_one(struct work *work, int idx)
 
 	cur_idx = obj_to_sheep(cur_vnodes, cur_nr_vnodes, oid, 0);
 
-	nr_objs = sys->nr_sobjs;
-	if (nr_objs > cur_nr_nodes)
-		nr_objs = cur_nr_nodes;
+	old_copies = get_max_copies(old_nodes, old_nr_nodes);
+	cur_copies = get_max_copies(cur_nodes, cur_nr_nodes);
 
 	copy_idx = -1;
-	for (i = 0; i < nr_objs; i++) {
+	for (i = 0; i < cur_copies; i++) {
 		int n = obj_to_sheep(cur_vnodes, cur_nr_vnodes, oid, i);
 		if (is_myself(cur_vnodes[n].addr, cur_vnodes[n].port)) {
 			copy_idx = i;
@@ -1398,17 +1432,18 @@ static void recover_one(struct work *work, int idx)
 
 	dprintf("%"PRIu32", %"PRIu32", %"PRIu32"\n", cur_idx, cur_nr_nodes, copy_idx);
 
-	ret = __recover_one(rw, old_vnodes, old_nr_vnodes, cur_vnodes, cur_nr_vnodes,
+	ret = __recover_one(rw, old_vnodes, old_nr_vnodes, old_copies,
+			    cur_vnodes, cur_nr_vnodes, cur_copies,
 			    cur_idx, copy_idx, epoch, epoch - 1, oid,
 			    buf, SD_DATA_OBJ_SIZE);
 	if (ret == 0)
 		goto out;
 
-	for (i = 0; i < sys->nr_sobjs; i++) {
+	for (i = 0; i < cur_copies; i++) {
 		if (i == copy_idx)
 			continue;
-		ret = __recover_one(rw, old_vnodes, old_nr_vnodes,
-				    cur_vnodes, cur_nr_vnodes, cur_idx, i,
+		ret = __recover_one(rw, old_vnodes, old_nr_vnodes, old_copies,
+				    cur_vnodes, cur_nr_vnodes, cur_copies, cur_idx, i,
 				    epoch, epoch - 1, oid, buf, SD_DATA_OBJ_SIZE);
 		if (ret == 0)
 			goto out;
@@ -1722,11 +1757,9 @@ static void __start_recovery(struct work *work, int idx)
 	}
 	old_nr /= sizeof(struct sheepdog_node_list_entry);
 
-	nr_objs = sys->nr_sobjs;
-	if (nr_objs > cur_nr)
-		nr_objs = cur_nr;
-	if (!nr_objs)
+	if (!sys->nr_sobjs)
 		goto fail;
+	nr_objs = get_max_copies(cur_entry, cur_nr);
 
 	if (fill_obj_list(rw, old_entry, old_nr, cur_entry, cur_nr, nr_objs) != 0) {
 		eprintf("fatal recovery error\n");
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 9f8ce01..b6ffeed 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -25,13 +25,13 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 	/* we are not called concurrently */
 	static struct sheepdog_inode new, base, cur;
 	struct timeval tv;
-	int ret, nr_vnodes, nr_nodes;
+	int ret, nr_vnodes, nr_zones;
 	unsigned long block_size = SD_DATA_OBJ_SIZE;
 
-	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_nodes);
+	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_zones);
 
 	if (base_vid) {
-		ret = read_object(entries, nr_vnodes, nr_nodes, epoch,
+		ret = read_object(entries, nr_vnodes, nr_zones, epoch,
 				  vid_to_vdi_oid(base_vid), (char *)&base,
 				  sizeof(base), 0, copies);
 		if (ret < 0)
@@ -45,7 +45,7 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 			vprintf(SDOG_INFO "tree snapshot %s %" PRIx32 " %" PRIx32 "\n",
 				name, cur_vid, base_vid);
 
-			ret = read_object(entries, nr_vnodes, nr_nodes, epoch,
+			ret = read_object(entries, nr_vnodes, nr_zones, epoch,
 					  vid_to_vdi_oid(cur_vid), (char *)&cur,
 					  SD_INODE_HEADER_SIZE, 0, copies);
 			if (ret < 0) {
@@ -88,7 +88,7 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 	}
 
 	if (is_snapshot && cur_vid != base_vid) {
-		ret = write_object(entries, nr_vnodes, nr_nodes, epoch,
+		ret = write_object(entries, nr_vnodes, nr_zones, epoch,
 				   vid_to_vdi_oid(cur_vid), (char *)&cur,
 				   SD_INODE_HEADER_SIZE, 0, copies, 0);
 		if (ret != 0) {
@@ -98,7 +98,7 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 	}
 
 	if (base_vid) {
-		ret = write_object(entries, nr_vnodes, nr_nodes, epoch,
+		ret = write_object(entries, nr_vnodes, nr_zones, epoch,
 				   vid_to_vdi_oid(base_vid), (char *)&base,
 				   SD_INODE_HEADER_SIZE, 0, copies, 0);
 		if (ret != 0) {
@@ -107,7 +107,7 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 		}
 	}
 
-	ret = write_object(entries, nr_vnodes, nr_nodes, epoch,
+	ret = write_object(entries, nr_vnodes, nr_zones, epoch,
 			   vid_to_vdi_oid(new_vid), (char *)&new, sizeof(new),
 			   0, copies, 1);
 	if (ret != 0)
@@ -124,17 +124,17 @@ static int find_first_vdi(uint32_t epoch, unsigned long start, unsigned long end
 	struct sheepdog_vnode_list_entry entries[SD_MAX_VNODES];
 	static struct sheepdog_inode inode;
 	unsigned long i;
-	int nr_vnodes, nr_nodes, nr_reqs;
+	int nr_vnodes, nr_zones, nr_reqs;
 	int ret, vdi_found = 0;
 
-	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_nodes);
+	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_zones);
 
 	nr_reqs = sys->nr_sobjs;
-	if (nr_reqs > nr_nodes)
-		nr_reqs = nr_nodes;
+	if (nr_reqs > nr_zones)
+		nr_reqs = nr_zones;
 
 	for (i = start; i >= end; i--) {
-		ret = read_object(entries, nr_vnodes, nr_nodes, epoch,
+		ret = read_object(entries, nr_vnodes, nr_zones, epoch,
 				  vid_to_vdi_oid(i), (char *)&inode,
 				  SD_INODE_HEADER_SIZE, 0, nr_reqs);
 		if (ret < 0)
@@ -298,7 +298,7 @@ int del_vdi(uint32_t epoch, char *data, int data_len, uint32_t *vid,
 	unsigned long dummy1, dummy2;
 	int ret;
 	struct sheepdog_vnode_list_entry entries[SD_MAX_VNODES];
-	int nr_vnodes, nr_nodes, nr_reqs;
+	int nr_vnodes, nr_zones, nr_reqs;
 	static struct sheepdog_inode inode;
 
 	if (data_len == SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN)
@@ -313,12 +313,12 @@ int del_vdi(uint32_t epoch, char *data, int data_len, uint32_t *vid,
 	if (ret != SD_RES_SUCCESS)
 		return ret;
 
-	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_nodes);
+	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_zones);
 	nr_reqs = sys->nr_sobjs;
-	if (nr_reqs > nr_nodes)
-		nr_reqs = nr_nodes;
+	if (nr_reqs > nr_zones)
+		nr_reqs = nr_zones;
 
-	ret = read_object(entries, nr_vnodes, nr_nodes, epoch,
+	ret = read_object(entries, nr_vnodes, nr_zones, epoch,
 			  vid_to_vdi_oid(*vid), (char *)&inode,
 			  SD_INODE_HEADER_SIZE, 0, nr_reqs);
 	if (ret < 0)
@@ -326,7 +326,7 @@ int del_vdi(uint32_t epoch, char *data, int data_len, uint32_t *vid,
 
 	memset(inode.name, 0, sizeof(inode.name));
 
-	ret = write_object(entries, nr_vnodes, nr_nodes, epoch,
+	ret = write_object(entries, nr_vnodes, nr_zones, epoch,
 			   vid_to_vdi_oid(*vid), (char *)&inode,
 			   SD_INODE_HEADER_SIZE, 0, nr_reqs, 0);
 	if (ret != 0)
@@ -370,7 +370,7 @@ static void delete_one(struct work *work, int idx)
 	struct deletion_work *dw = container_of(work, struct deletion_work, work);
 	uint32_t vdi_id = *(((uint32_t *)dw->buf) + dw->count - dw->done - 1);
 	struct sheepdog_vnode_list_entry entries[SD_MAX_VNODES];
-	int nr_vnodes, nr_nodes;
+	int nr_vnodes, nr_zones;
 	int ret, i;
 	static struct sheepdog_inode inode;
 
@@ -381,9 +381,9 @@ static void delete_one(struct work *work, int idx)
 	 * is called in threads and not serialized with cpg_event so
 	 * we can't access to epoch and sd_node_list safely.
 	 */
-	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_nodes);
+	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_zones);
 
-	ret = read_object(entries, nr_vnodes, nr_nodes, dw->epoch,
+	ret = read_object(entries, nr_vnodes, nr_zones, dw->epoch,
 			  vid_to_vdi_oid(vdi_id), (void *)&inode, sizeof(inode),
 			  0, sys->nr_sobjs);
 
@@ -396,7 +396,7 @@ static void delete_one(struct work *work, int idx)
 		if (!inode.data_vdi_id[i])
 			continue;
 
-		remove_object(entries, nr_vnodes, nr_nodes, dw->epoch,
+		remove_object(entries, nr_vnodes, nr_zones, dw->epoch,
 			      vid_to_data_oid(inode.data_vdi_id[i], i),
 			      inode.nr_copies);
 	}
@@ -427,7 +427,7 @@ static void delete_one_done(struct work *work, int idx)
 
 static int fill_vdi_list(struct deletion_work *dw,
 			 struct sheepdog_vnode_list_entry *entries,
-			 int nr_vnodes, int nr_nodes, uint32_t root_vid)
+			 int nr_vnodes, int nr_zones, uint32_t root_vid)
 {
 	int ret, i;
 	static struct sheepdog_inode inode;
@@ -437,7 +437,7 @@ static int fill_vdi_list(struct deletion_work *dw,
 	((uint32_t *)dw->buf)[dw->count++] = root_vid;
 again:
 	vid = ((uint32_t *)dw->buf)[done++];
-	ret = read_object(entries, nr_vnodes, nr_nodes, dw->epoch,
+	ret = read_object(entries, nr_vnodes, nr_zones, dw->epoch,
 			  vid_to_vdi_oid(vid), (void *)&inode,
 			  SD_INODE_HEADER_SIZE, 0, sys->nr_sobjs);
 
@@ -463,14 +463,14 @@ again:
 }
 
 static uint64_t get_vdi_root(struct sheepdog_vnode_list_entry *entries,
-			     int nr_vnodes, int nr_nodes, uint32_t epoch,
+			     int nr_vnodes, int nr_zones, uint32_t epoch,
 			     uint32_t vid)
 {
 	int ret;
 	static struct sheepdog_inode inode;
 
 next:
-	ret = read_object(entries, nr_vnodes, nr_nodes, epoch,
+	ret = read_object(entries, nr_vnodes, nr_zones, epoch,
 			  vid_to_vdi_oid(vid), (void *)&inode,
 			  SD_INODE_HEADER_SIZE, 0, sys->nr_sobjs);
 
@@ -491,7 +491,7 @@ int start_deletion(uint32_t vid, uint32_t epoch)
 {
 	struct deletion_work *dw;
 	struct sheepdog_vnode_list_entry entries[SD_MAX_VNODES];
-	int nr_vnodes, nr_nodes, ret;
+	int nr_vnodes, nr_zones, ret;
 	uint32_t root_vid;
 
 	dw = zalloc(sizeof(struct deletion_work));
@@ -511,13 +511,13 @@ int start_deletion(uint32_t vid, uint32_t epoch)
 	dw->work.fn = delete_one;
 	dw->work.done = delete_one_done;
 
-	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_nodes);
+	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_zones);
 
-	root_vid = get_vdi_root(entries, nr_vnodes, nr_nodes, dw->epoch, dw->vid);
+	root_vid = get_vdi_root(entries, nr_vnodes, nr_zones, dw->epoch, dw->vid);
 	if (!root_vid)
 		return SD_RES_EIO;
 
-	ret = fill_vdi_list(dw, entries, nr_vnodes, nr_nodes, root_vid);
+	ret = fill_vdi_list(dw, entries, nr_vnodes, nr_zones, root_vid);
 	if (ret)
 		return SD_RES_SUCCESS;
 
@@ -544,12 +544,12 @@ int get_vdi_attr(uint32_t epoch, char *data, int data_len, uint32_t vid,
 	char attr_buf[SD_ATTR_HEADER_SIZE];
 	uint64_t oid;
 	uint32_t end;
-	int ret, nr_nodes, nr_vnodes;
+	int ret, nr_zones, nr_vnodes;
 
 	if (data_len != SD_ATTR_HEADER_SIZE)
 		return SD_RES_INVALID_PARMS;
 
-	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_nodes);
+	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_zones);
 
 	*attrid = fnv_64a_buf(data, data_len, FNV1A_64_INIT);
 	*attrid &= (UINT64_C(1) << VDI_SPACE_SHIFT) - 1;
@@ -557,11 +557,11 @@ int get_vdi_attr(uint32_t epoch, char *data, int data_len, uint32_t vid,
 	end = *attrid - 1;
 	while (*attrid != end) {
 		oid = vid_to_attr_oid(vid, *attrid);
-		ret = read_object(entries, nr_vnodes, nr_nodes, epoch, oid, attr_buf,
+		ret = read_object(entries, nr_vnodes, nr_zones, epoch, oid, attr_buf,
 				  sizeof(attr_buf), 0, copies);
 
 		if (ret == -SD_RES_NO_OBJ && creat) {
-			ret = write_object(entries, nr_vnodes, nr_nodes, epoch, oid, data,
+			ret = write_object(entries, nr_vnodes, nr_zones, epoch, oid, data,
 					   data_len, 0, copies, 1);
 			if (ret)
 				return SD_RES_EIO;
-- 
1.7.2.5




More information about the sheepdog mailing list