[sheepdog] [PATCH v2] sheep, dog: add vnodes fixed options

Saeki Masaki saeki.masaki at po.ntts.co.jp
Tue Jan 13 10:36:25 CET 2015


v2: rebase onto master and fix coding style.

v1: In the current sheepdog, vnodes are recalculated whenever a node
    is added or removed.

During auto recovery, a node first fetches objects from the other nodes
and only at the end deletes the objects it no longer needs.
So while auto recovery is running, available disk space decreases.
In the worst case, it exhausts the available disk space.

Add the following new commands and options.
1. option to specify vnodes in sheep. (-V, --vnodes)
  - In the past this was implemented as the -v (--vnodes) option.
  - Now -v is used to print the version, so it is added as -V (uppercase).
  $ sheep -V 100 /var/lib/sheepdog
  If -V is specified, vnodes strategy of sheep is 'fixed'
  (default value is 'auto')

2. option to dog cluster format with vnode fixed. (-V, --fixedvnodes)
  $ dog cluster format -V
  If 'fixed' and 'auto' vnodes strategies are mixed,
  the cluster format command fails.
  (sheep with different vnodes strategies can not be mixed in a cluster)

3. dog command to change the vnodes
  $ dog node vnodes set <vnodes>
  After changing the vnodes, a new epoch is created
  and auto recovery will start.

If you want to operate with fixed vnodes,
you have to manage the vnodes yourself according to the capacity of
the data store in each node.
So you should use this option carefully.

For example of using fixed vnodes strategy:

1) start sheep with fixed vnodes strategy.
  $ sheep -V 100 /var/lib/sheepdog
  $ sheep -V 120 /var/lib/sheepdog
  $ dog node list
    Id   Host:Port         V-Nodes       Zone
     0   172.16.4.205:7000        100 1812140204
     1   172.16.4.206:7000        120 1828917420

2) format the cluster with fixed vnodes strategy.
  $ dog cluster format -V

3) check vnodes strategy of cluster.
  $ dog cluster info -v
    Cluster status: running, auto-recovery enabled
    Cluster store: plain with 3 redundancy policy
    Cluster vnodes strategy: fixed
    Cluster vnode mode: node
    Cluster created at Wed Dec 17 18:20:10 2014

    Epoch Time          Version [Host:Port:V-Nodes,,,]
    2014-12-17 18:20:10     1 [172.16.4.205:7000:100, 172.16.4.206:7000:120]

4) change of vnodes.
  $ dog node vnodes set 140
  $ dog node list
    Id   Host:Port         V-Nodes       Zone
     0   172.16.4.205:7000        140 1812140204
     1   172.16.4.206:7000        120 1828917420

Signed-off-by: Masaki Saeki <saeki.masaki at po.ntts.co.jp>

---
 dog/cluster.c            |   86 ++++++++++++++++++++++++++++++++++++++--------
 dog/node.c               |   67 +++++++++++++++++++++++++++++++++++
 include/internal_proto.h |    3 ++
 include/sheep.h          |    8 ++++
 include/sheepdog_proto.h |    2 +
 sheep/config.c           |   14 +++++++-
 sheep/group.c            |   60 +++++++++++++++++++++++++++++---
 sheep/ops.c              |   82 +++++++++++++++++++++++++++++++++++++++++++
 sheep/sheep.c            |   32 +++++++++++++++--
 9 files changed, 330 insertions(+), 24 deletions(-)

diff --git a/dog/cluster.c b/dog/cluster.c
index 20f190b..6a2db6e 100644
--- a/dog/cluster.c
+++ b/dog/cluster.c
@@ -15,6 +15,7 @@
 #include <sys/time.h>
 
 #include "dog.h"
+#include "sheep.h"
 #include "farm/farm.h"
 
 static struct sd_option cluster_options[] = {
@@ -27,6 +28,7 @@ static struct sd_option cluster_options[] = {
 	 "do not serve write request if number of nodes is not sufficient"},
 	{'z', "block_size_shift", true, "specify the shift num of default"
 	      " data object size"},
+	{'V', "fixedvnodes", false, "disable automatic vnodes calculation"},
 	{ 0, NULL, false, NULL },
 };
 
@@ -38,6 +40,7 @@ static struct cluster_cmd_data {
 	bool force;
 	bool strict;
 	char name[STORE_LEN];
+	bool fixed_vnodes;
 } cluster_cmd_data;
 
 #define DEFAULT_STORE	"plain"
@@ -87,6 +90,41 @@ static int cluster_format(int argc, char **argv)
 	struct timeval tv;
 	char store_name[STORE_LEN];
 	static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
+	struct sd_node *n;
+
+	rb_for_each_entry(n, &sd_nroot, rb) {
+		struct sd_req info_req;
+		struct sd_rsp *info_rsp = (struct sd_rsp *)&info_req;
+		struct cluster_info cinfo;
+
+		sd_init_req(&info_req, SD_OP_CLUSTER_INFO);
+		info_req.data_length = sizeof(cinfo);
+		ret = dog_exec_req(&n->nid, &info_req, &cinfo);
+		if (ret < 0) {
+			sd_err("Fail to execute request");
+			return EXIT_FAILURE;
+		}
+		if (info_rsp->result != SD_RES_SUCCESS) {
+			sd_err("%s", sd_strerror(info_rsp->result));
+			return EXIT_FAILURE;
+		}
+
+		if (n->nr_vnodes != 0) {
+			if ((cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
+				&& cluster_cmd_data.fixed_vnodes) {
+				sd_err("Can not apply the option of '-V', "
+					"because there are vnode strategy of sheep "
+					"is auto in the cluster");
+				return EXIT_FAILURE;
+			} else if (!(cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
+				&& !cluster_cmd_data.fixed_vnodes) {
+				sd_err("Need to specify the option of '-V', "
+					"because there are vnode strategy of sheep "
+					"is fixed in the cluster");
+				return EXIT_FAILURE;
+			}
+		}
+	}
 
 	if (cluster_cmd_data.copies > sd_nodes_nr) {
 		char info[1024];
@@ -132,6 +170,11 @@ static int cluster_format(int argc, char **argv)
 	hdr.cluster.flags |= SD_CLUSTER_FLAG_DISKMODE;
 #endif
 
+	if (cluster_cmd_data.fixed_vnodes)
+		hdr.cluster.flags &= ~SD_CLUSTER_FLAG_AUTO_VNODES;
+	else
+		hdr.cluster.flags |= SD_CLUSTER_FLAG_AUTO_VNODES;
+
 	printf("using backend %s store\n", store_name);
 	ret = dog_exec_req(&sd_nid, &hdr, store_name);
 	if (ret < 0)
@@ -160,14 +203,15 @@ static void print_nodes(const struct epoch_log *logs, uint16_t flags)
 				if (entry->disks[nr_disk].disk_id == 0)
 					break;
 			}
-			printf("%s%s(%d)",
-			       (i == 0) ? "" : ", ",
-			       addr_to_str(entry->nid.addr, entry->nid.port),
-			       nr_disk);
+			printf("%s%s:%d(%d)",
+				(i == 0) ? "" : ", ",
+				addr_to_str(entry->nid.addr, entry->nid.port),
+					entry->nr_vnodes, nr_disk);
 		} else
-			printf("%s%s",
-			       (i == 0) ? "" : ", ",
-			       addr_to_str(entry->nid.addr, entry->nid.port));
+			printf("%s%s:%d",
+				(i == 0) ? "" : ", ",
+				addr_to_str(entry->nid.addr, entry->nid.port),
+					entry->nr_vnodes);
 	}
 }
 
@@ -232,6 +276,15 @@ retry:
 			}
 			printf("%s with %s redundancy policy\n",
 			       logs->drv_name, copy);
+
+			/* show vnode strategy */
+			if (!raw_output)
+				printf("Cluster vnodes strategy: ");
+			if (logs->flags & SD_CLUSTER_FLAG_AUTO_VNODES)
+				printf("auto\n");
+			else
+				printf("fixed\n");
+
 		} else
 			printf("%s\n", sd_strerror(rsp->result));
 
@@ -239,15 +292,17 @@ retry:
 		if (!raw_output)
 			printf("Cluster vnode mode: ");
 		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
-			printf("disk");
+			printf("disk\n");
 		else
-			printf("node");
-	}
+			printf("node\n");
+	} else
+		printf("\n");
 
 	if (!raw_output && rsp->data_length > 0) {
 		ct = logs[0].ctime >> 32;
-		printf("\nCluster created at %s\n", ctime(&ct));
-		printf("Epoch Time           Version\n");
+		printf("Cluster created at %s\n", ctime(&ct));
+		printf("Epoch Time           Version [Host:Port:V-Nodes,,,]");
+		printf("\n");
 	}
 
 	nr_logs = rsp->data_length / (sizeof(struct epoch_log)
@@ -761,7 +816,7 @@ failure:
 static struct subcommand cluster_cmd[] = {
 	{"info", NULL, "aprhvT", "show cluster information",
 	 NULL, CMD_NEED_NODELIST, cluster_info, cluster_options},
-	{"format", NULL, "bctaphzT", "create a Sheepdog store",
+	{"format", NULL, "bctaphzTV", "create a Sheepdog store",
 	 NULL, CMD_NEED_NODELIST, cluster_format, cluster_options},
 	{"shutdown", NULL, "aphT", "stop Sheepdog",
 	 NULL, 0, cluster_shutdown, cluster_options},
@@ -823,9 +878,10 @@ static int cluster_parser(int ch, const char *opt)
 			" Please set shift bit larger than 20");
 			exit(EXIT_FAILURE);
 		}
-
 		cluster_cmd_data.block_size_shift = block_size_shift;
-
+		break;
+	case 'V':
+		cluster_cmd_data.fixed_vnodes = true;
 		break;
 	}
 
diff --git a/dog/node.c b/dog/node.c
index d4c8fe7..36141ad 100644
--- a/dog/node.c
+++ b/dog/node.c
@@ -764,6 +764,71 @@ static int node_log(int argc, char **argv)
 	return do_generic_subcommand(node_log_cmd, argc, argv);
 }
 
+/* Send SD_OP_SET_VNODES with *nr_vnodes to nid; EXIT_* code on failure. */
+static int do_vnodes_set(const struct node_id *nid, int *nr_vnodes)
+{
+	int ret;
+	struct sd_req hdr;
+	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+
+	sd_init_req(&hdr, SD_OP_SET_VNODES);
+	hdr.flags = SD_FLAG_CMD_WRITE;
+	/* payload is the pointed-to integer, so size it by value, not pointer */
+	hdr.data_length = sizeof(*nr_vnodes);
+
+	ret = dog_exec_req(nid, &hdr, nr_vnodes);
+	if (ret < 0)
+		return EXIT_SYSFAIL;
+	if (rsp->result != SD_RES_SUCCESS)
+		return EXIT_FAILURE;
+	return ret;
+}
+
+/* "dog node vnodes set <n>": validate <n>, then send it to sd_nid. */
+static int node_vnodes_set(int argc, char **argv)
+{
+	int ret;
+	char *p;
+	int32_t nr_vnodes;
+	/* keep strtol()'s full range so huge inputs can't wrap into 1..65535 */
+	long val = strtol(argv[optind], &p, 10);
+
+	if (argv[optind] == p || val < 1 || val > UINT16_MAX || *p != '\0') {
+		sd_err("Invalid number of vnodes '%s': must be an integer "
+			"between 1 and %u",
+			argv[optind], UINT16_MAX);
+		exit(EXIT_USAGE);
+	}
+	nr_vnodes = val;
+	ret = do_vnodes_set(&sd_nid, &nr_vnodes);
+	switch (ret) {
+	case EXIT_FAILURE:
+	case EXIT_SYSFAIL:
+		sd_err("Failed to execute request");
+		ret = -1;
+		break;
+	case EXIT_SUCCESS:
+		break;
+	default:
+		sd_err("unknown return code of do_vnodes_set(): %d", ret);
+		ret = -1;
+		break;
+	}
+
+	return ret;
+}
+
+static struct subcommand node_vnodes_cmd[] = {
+	{"set", "<num of vnodes>", NULL, "set new vnodes",
+	 NULL, CMD_NEED_ARG, node_vnodes_set},
+	{NULL},
+};
+
+static int node_vnodes(int argc, char **argv)
+{
+	return do_generic_subcommand(node_vnodes_cmd, argc, argv);
+}
+
 static struct subcommand node_cmd[] = {
 	{"kill", "<node id>", "aprhlT", "kill node", NULL,
 	 CMD_NEED_NODELIST, node_kill, node_options},
@@ -780,6 +845,8 @@ static struct subcommand node_cmd[] = {
 	 0, node_stat, node_options},
 	{"log", NULL, "aphT", "show or set log level of the node", node_log_cmd,
 	 CMD_NEED_ARG, node_log},
+	{"vnodes", "<num of vnodes>", "aph", "set new vnodes", node_vnodes_cmd,
+	 CMD_NEED_ARG, node_vnodes},
 	{NULL,},
 };
 
diff --git a/include/internal_proto.h b/include/internal_proto.h
index c1ffe53..225cc28 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -113,6 +113,8 @@
 #define SD_OP_READ_DEL_VDIS  0xC9
 #define SD_OP_GET_RECOVERY      0xCA
 #define SD_OP_SET_RECOVERY      0xCB
+#define SD_OP_SET_VNODES 0xCC
+#define SD_OP_GET_VNODES 0xCD
 
 /* internal flags for hdr.flags, must be above 0x80 */
 #define SD_FLAG_CMD_RECOVERY 0x0080
@@ -145,6 +147,7 @@
 
 #define SD_CLUSTER_FLAG_STRICT		0x0001 /* Strict mode for write */
 #define SD_CLUSTER_FLAG_DISKMODE	0x0002 /* Disk mode for cluster */
+#define SD_CLUSTER_FLAG_AUTO_VNODES	0x0004 /* Cluster vnodes strategy */
 
 enum sd_status {
 	SD_STATUS_OK = 1,
diff --git a/include/sheep.h b/include/sheep.h
index 22524c1..fe6f066 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -149,6 +149,9 @@ static inline const char *sd_strerror(int err)
 			"IO has halted as there are not enough living nodes",
 		[SD_RES_READONLY] = "Object is read-only",
 		[SD_RES_INODE_INVALIDATED] = "Inode object is invalidated",
+		[SD_RES_INVALID_VNODES_STRATEGY] =
+			"Invalid cluster vnodes strategy",
+		[SD_RES_GATEWAY_MODE] = "Targeted node is gateway mode",
 
 		/* from internal_proto.h */
 		[SD_RES_OLD_NODE_VER] = "Request has an old epoch",
@@ -328,4 +331,9 @@ static inline bool is_cluster_diskmode(const struct cluster_info *cinfo)
 	return (cinfo->flags & SD_CLUSTER_FLAG_DISKMODE) > 0;
 }
 
+static inline bool is_cluster_autovnodes(const struct cluster_info *cinfo)
+{
+	return (cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES) > 0;
+}
+
 #endif
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 9495742..3910bd5 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -87,6 +87,8 @@
 #define SD_RES_INCOMPLETE    0x1B /* Object (in kv) is incomplete uploading */
 #define SD_RES_COLLECTING_CINFO 0x1C /* sheep is collecting cluster wide status, not ready for operation */
 #define SD_RES_INODE_INVALIDATED 0x1D /* inode object in client is invalidated, refreshing is required */
+#define SD_RES_GATEWAY_MODE  0x1E /* Target node is gateway mode */
+#define SD_RES_INVALID_VNODES_STRATEGY 0x1F /* Invalid vnodes strategy */
 
 /* errors above 0x80 are sheepdog-internal */
 
diff --git a/sheep/config.c b/sheep/config.c
index dfad5fd..9518109 100644
--- a/sheep/config.c
+++ b/sheep/config.c
@@ -62,7 +62,11 @@ static int get_cluster_config(struct cluster_info *cinfo)
 {
 	cinfo->ctime = config.ctime;
 	cinfo->nr_copies = config.copies;
-	cinfo->flags = config.flags;
+	if (config.ctime > 0)
+		cinfo->flags = config.flags;
+	else
+		cinfo->flags = (config.flags & ~SD_CLUSTER_FLAG_AUTO_VNODES) |
+			(cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES);
 	cinfo->copy_policy = config.copy_policy;
 	cinfo->block_size_shift = config.block_size_shift;
 	memcpy(cinfo->store, config.store, sizeof(config.store));
@@ -122,6 +126,14 @@ int init_config_file(void)
 	}
 
 reload:
+	if ((config.flags & SD_CLUSTER_FLAG_AUTO_VNODES) !=
+			(sys->cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
+		&& !sys->gateway_only
+		&& config.ctime > 0) {
+		sd_err("Designation of before a restart and a vnodes option is different.");
+		return -1;
+	}
+
 	ret = 0;
 	get_cluster_config(&sys->cinfo);
 	if ((config.flags & SD_CLUSTER_FLAG_DISKMODE) !=
diff --git a/sheep/group.c b/sheep/group.c
index 9462aa4..2034300 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -145,7 +145,8 @@ struct vnode_info *alloc_vnode_info(const struct rb_root *nroot)
 		vnode_info->nr_nodes++;
 	}
 
-	recalculate_vnodes(&vnode_info->nroot);
+	if (is_cluster_autovnodes(&sys->cinfo))
+		recalculate_vnodes(&vnode_info->nroot);
 
 	if (is_cluster_diskmode(&sys->cinfo))
 		disks_to_vnodes(&vnode_info->nroot, &vnode_info->vroot);
@@ -1133,6 +1134,20 @@ static bool cluster_join_check(const struct cluster_info *cinfo)
 	if (!cluster_ctime_check(cinfo))
 		return false;
 
+	if (cinfo->ctime > 0 && sys->this_node.nr_vnodes != 0) {
+		if (!is_cluster_autovnodes(&sys->cinfo)
+			&& is_cluster_autovnodes(cinfo)) {
+			sd_err("failed to join for vnodes strategy unmatch. "
+				" cluster:fixed, joined:auto");
+			return false;
+		} else if (is_cluster_autovnodes(&sys->cinfo)
+			&& !is_cluster_autovnodes(cinfo)) {
+			sd_err("failed to join for vnodes strategy unmatch. "
+				" cluster:auto, joined:fixed");
+			return false;
+		}
+	}
+
 	/*
 	 * Sheepdog's recovery code assumes every node have the same epoch
 	 * history. But we don't check epoch history of joining node because:
@@ -1155,6 +1170,13 @@ main_fn void sd_accept_handler(const struct sd_node *joined,
 	const struct cluster_info *cinfo = opaque;
 	struct sd_node *n;
 	enum sd_status prev_status = sys->cinfo.status;
+	uint16_t flags;
+
+	if (node_is_local(joined) && sys->gateway_only
+		&& sys->cinfo.ctime <= 0)
+		flags = cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES;
+	else
+		flags = sys->cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES;
 
 	if (node_is_local(joined) && !cluster_join_check(cinfo)) {
 		sd_err("failed to join Sheepdog");
@@ -1163,6 +1185,9 @@ main_fn void sd_accept_handler(const struct sd_node *joined,
 
 	cluster_info_copy(&sys->cinfo, cinfo);
 
+	sys->cinfo.flags &= ~SD_CLUSTER_FLAG_AUTO_VNODES;
+	sys->cinfo.flags |= flags;
+
 	sd_debug("join %s", node_to_str(joined));
 	rb_for_each_entry(n, nroot, rb) {
 		sd_debug("%s", node_to_str(n));
@@ -1244,7 +1269,7 @@ main_fn void sd_leave_handler(const struct sd_node *left,
 	remove_node_from_participants(&left->nid);
 }
 
-static void update_node_size(struct sd_node *node)
+static void update_node_info(struct sd_node *node)
 {
 	struct vnode_info *cur_vinfo = get_vnode_info();
 	struct sd_node *n = rb_search(&cur_vinfo->nroot, node, rb, node_cmp);
@@ -1252,6 +1277,10 @@ static void update_node_size(struct sd_node *node)
 	if (unlikely(!n))
 		panic("can't find %s", node_to_str(node));
 	n->space = node->space;
+
+	if (!is_cluster_autovnodes(&sys->cinfo))
+		n->nr_vnodes = node->nr_vnodes;
+
 	if (is_cluster_diskmode(&sys->cinfo)) {
 		memset(n->disks, 0, sizeof(struct disk_info) * DISK_MAX);
 		for (int i = 0; i < DISK_MAX; i++)
@@ -1280,14 +1309,14 @@ static void kick_node_recover(void)
 
 main_fn void sd_update_node_handler(struct sd_node *node)
 {
-	update_node_size(node);
+	update_node_info(node);
 	kick_node_recover();
 }
 
 int create_cluster(int port, int64_t zone, int nr_vnodes,
 		   bool explicit_addr)
 {
-	int nr_nodes = 0, ret;
+	int nr_nodes = 0, ret, i, vnodes = 0;
 
 	if (!sys->cdrv) {
 		sys->cdrv = find_cdrv(DEFAULT_CLUSTER_DRIVER);
@@ -1323,11 +1352,32 @@ int create_cluster(int port, int64_t zone, int nr_vnodes,
 	sys->cinfo.epoch = get_latest_epoch();
 	if (sys->cinfo.epoch) {
 		ret = epoch_log_read(sys->cinfo.epoch, sys->cinfo.nodes,
-				sizeof(sys->cinfo.nodes), &nr_nodes);
+			sizeof(sys->cinfo.nodes), &nr_nodes);
 		if (ret != SD_RES_SUCCESS)
 			return -1;
 		sys->cinfo.nr_nodes = nr_nodes;
 	}
+
+	if (!is_cluster_autovnodes(&sys->cinfo)) {
+		for (i = 0; i < nr_nodes; i++) {
+			if ((addr_to_str(sys->this_node.nid.addr,
+					sys->this_node.nid.port)
+				== addr_to_str(sys->cinfo.nodes[i].nid.addr,
+					sys->cinfo.nodes[i].nid.port))
+				&& (sys->this_node.nid.port
+					== sys->cinfo.nodes[i].nid.port)) {
+				vnodes = sys->cinfo.nodes[i].nr_vnodes;
+				break;
+			}
+		}
+		if (sys->cinfo.epoch != 0 && sys->this_node.nr_vnodes != vnodes
+			&& !sys->gateway_only) {
+			sd_err("mismatch specified vnodes is compared with the previous. "
+				"previous vnodes:%d", vnodes);
+			return -1;
+		}
+	}
+
 	sys->cinfo.status = SD_STATUS_WAIT;
 
 	main_thread_set(pending_block_list,
diff --git a/sheep/ops.c b/sheep/ops.c
index d097a15..dad03a6 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -265,6 +265,29 @@ static int remove_epoch(uint32_t epoch)
 	return SD_RES_SUCCESS;
 }
 
+static int get_vnodes(struct vnode_info *vinfo, int *nr_vnodes)
+{
+	int ret;
+	struct sd_node *node;
+
+	rb_for_each_entry(node, &vinfo->nroot, rb) {
+		struct sd_req hdr;
+		if (node_is_local(node))
+			continue;
+		if (node->nr_vnodes == 0)
+			continue;
+
+		sd_init_req(&hdr, SD_OP_GET_VNODES);
+		hdr.data_length = sizeof(*nr_vnodes);
+		hdr.epoch = sys_epoch();
+		ret = sheep_exec_req(&node->nid, &hdr, nr_vnodes);
+		if (ret != SD_RES_SUCCESS)
+			return ret;
+		node->nr_vnodes = *nr_vnodes;
+	}
+	return SD_RES_SUCCESS;
+}
+
 static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
 			   void *data, const struct sd_node *sender)
 {
@@ -272,6 +295,8 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
 	uint32_t latest_epoch;
 	struct store_driver *driver;
 	char *store_name = data;
+	int32_t nr_vnodes;
+	struct vnode_info *vinfo = get_vnode_info();
 
 	driver = find_store_driver(data);
 	if (!driver)
@@ -290,6 +315,12 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
 	if (ret != SD_RES_SUCCESS)
 		return ret;
 
+	if (sys->gateway_only) {
+		ret = get_vnodes(vinfo, &nr_vnodes);
+		if (ret != SD_RES_SUCCESS)
+			return ret;
+	}
+
 	sys->cinfo.nr_copies = req->cluster.copies;
 	sys->cinfo.copy_policy = req->cluster.copy_policy;
 	sys->cinfo.block_size_shift = req->cluster.block_size_shift;
@@ -1479,6 +1510,45 @@ static int local_set_recovery(struct request *req)
 	return SD_RES_SUCCESS;
 }
 
+/* Reply to SD_OP_GET_VNODES with this node's nr_vnodes in req->data. */
+static int local_get_vnodes(struct request *req)
+{
+	int *nr_vnodes = req->data;
+
+	/* reply length is the integer's size, not the size of the pointer */
+	req->rp.data_length = sizeof(*nr_vnodes);
+	*nr_vnodes = sys->this_node.nr_vnodes;
+	return SD_RES_SUCCESS;
+}
+
+/* SD_OP_SET_VNODES handler: apply a new fixed vnodes count to this node. */
+static int local_set_vnodes(const struct sd_req *req,
+				struct sd_rsp *rsp, void *data,
+				const struct sd_node *sender)
+{
+	int *nr_vnodes = (int *)data;
+
+	if (sys->gateway_only) {
+		sd_err("failed to set vnodes, cause operating in gateway mode.");
+		return SD_RES_GATEWAY_MODE;
+	}
+	if (is_cluster_autovnodes(&sys->cinfo)) {
+		sd_err("failed to set vnodes, cause operating in auto vnodes strategy.");
+		return SD_RES_INVALID_VNODES_STRATEGY;
+	}
+	/* never dereference a missing or truncated payload */
+	if (!nr_vnodes || req->data_length < sizeof(*nr_vnodes))
+		return SD_RES_INVALID_PARMS;
+	if (1 > *nr_vnodes || *nr_vnodes > UINT16_MAX) {
+		sd_err("invalid vnodes: %d", *nr_vnodes);
+		return SD_RES_INVALID_PARMS;
+	}
+
+	sys->this_node.nr_vnodes = *nr_vnodes;
+	/* hand the updated node to the cluster driver */
+	return sys->cdrv->update_node(&sys->this_node);
+}
+
 static struct sd_op_template sd_ops[] = {
 
 	/* cluster operations */
@@ -1872,6 +1942,18 @@ static struct sd_op_template sd_ops[] = {
 		.process_main = local_get_cluster_default,
 	},
 
+	[SD_OP_GET_VNODES] = {
+		.name = "GET_VNODES",
+		.type = SD_OP_TYPE_LOCAL,
+		.process_work = local_get_vnodes,
+	},
+
+	[SD_OP_SET_VNODES] = {
+		.name = "SET_VNODES",
+		.type = SD_OP_TYPE_LOCAL,
+		.process_main = local_set_vnodes,
+	},
+
 	/* gateway I/O operations */
 	[SD_OP_CREATE_AND_WRITE_OBJ] = {
 		.name = "CREATE_AND_WRITE_OBJ",
diff --git a/sheep/sheep.c b/sheep/sheep.c
index 9471a3b..e0a034f 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -121,6 +121,10 @@ static const char recovery_help[] =
 "\tinterval=: object recovery interval time (millisec)\n"
 "Example:\n\t$ sheep -R max=50,interval=1000 ...\n";
 
+static const char vnodes_help[] =
+"Example:\n\t$ sheep -V 128\n"
+"\tset number of vnodes\n";
+
 static struct sd_option sheep_options[] = {
 	{'b', "bindaddr", true, "specify IP address of interface to listen on",
 	 bind_help},
@@ -147,6 +151,7 @@ static struct sd_option sheep_options[] = {
 	 recovery_help},
 	{'u', "upgrade", false, "upgrade to the latest data layout"},
 	{'v', "version", false, "show the version"},
+	{'V', "vnodes", true, "set number of vnodes", vnodes_help},
 	{'w', "cache", true, "enable object cache", cache_help},
 	{'y', "myaddr", true, "specify the address advertised to other sheep",
 	 myaddr_help},
@@ -646,11 +651,12 @@ static void sighup_handler(int signum)
 int main(int argc, char **argv)
 {
 	int ch, longindex, ret, port = SD_LISTEN_PORT, io_port = SD_LISTEN_PORT;
-	int nr_vnodes = SD_DEFAULT_VNODES, rc = 1;
+	int rc = 1;
 	const char *dirp = DEFAULT_OBJECT_DIR, *short_options;
 	char *dir, *p, *pid_file = NULL, *bindaddr = NULL, log_path[PATH_MAX],
 	     *argp = NULL;
 	bool explicit_addr = false;
+	int32_t nr_vnodes = -1;
 	int64_t zone = -1;
 	struct cluster_driver *cdrv;
 	struct option *long_options;
@@ -659,6 +665,7 @@ int main(int argc, char **argv)
 	struct stat logdir_st;
 	enum log_dst_type log_dst_type;
 
+	sys->cinfo.flags |= SD_CLUSTER_FLAG_AUTO_VNODES;
 	sys->node_status = SD_NODE_STATUS_INITIALIZATION;
 
 	sys->rthrottling.max_exec_count = 0;
@@ -707,7 +714,10 @@ int main(int argc, char **argv)
 			sys->backend_dio = true;
 			break;
 		case 'g':
-			/* same as '-v 0' */
+			if (nr_vnodes > 0) {
+				sd_err("Options '-g' and '-V' can not be both specified");
+				exit(1);
+			}
 			nr_vnodes = 0;
 			break;
 		case 'z':
@@ -797,6 +807,21 @@ int main(int argc, char **argv)
 				PACKAGE_VERSION);
 			exit(0);
 			break;
+		case 'V':
+			sys->cinfo.flags &= ~SD_CLUSTER_FLAG_AUTO_VNODES;
+			if (nr_vnodes == 0) {
+				sd_err("Options '-g' and '-V' can not be both specified");
+				exit(1);
+			}
+			nr_vnodes = strtol(optarg, &p, 10);
+			if (optarg == p || nr_vnodes < 1
+				|| UINT16_MAX < nr_vnodes || *p != '\0') {
+				sd_err("Invalid number of vnodes '%s': must be "
+					"an integer between 1 and %u",
+					optarg, UINT16_MAX);
+				exit(1);
+			}
+			break;
 		default:
 			usage(1);
 			break;
@@ -813,7 +838,8 @@ int main(int argc, char **argv)
 	if (nr_vnodes == 0) {
 		sys->gateway_only = true;
 		sys->disk_space = 0;
-	}
+	} else if (nr_vnodes == -1)
+		nr_vnodes = SD_DEFAULT_VNODES;
 
 	if (optind != argc) {
 		argp = strdup(argv[optind]);
-- 
1.7.1


-- 
NTTソフトウェア株式会社
クラウド事業部
第一事業ユニット
佐伯 昌樹
TEL: 045-212-7393
FAX: 045-662-7856
Mail: saeki.masaki at po.ntts.co.jp
--




More information about the sheepdog mailing list