[sheepdog] [PATCH v2] sheep, dog: add vnodes fixed options

Hitoshi Mitake mitake.hitoshi at lab.ntt.co.jp
Wed Jan 14 06:00:38 CET 2015


At Tue, 13 Jan 2015 18:36:25 +0900,
Saeki Masaki wrote:
> 
> v2: rebase onto master and fix coding style.
> 
> v1: In the current sheepdog, vnodes are recalculated whenever a node
>     is added or removed.
> 
> During auto recovery, a node first fetches objects from the other nodes
> and only afterwards deletes the objects it no longer needs.
> While auto recovery runs, the available disk space decreases.
> In the worst case, it exhausts the available disk space.
> 
> Add the following new commands and options.
> 1. option to specify vnodes in sheep. (-V, --vnodes)
>   - In the past this was implemented as the -v (--vnodes) option.
>   - The -v option is now used to print the version, so uppercase -V is used instead.
>   $ sheep -V 100 /var/lib/sheepdog
>   If -V is specified, vnodes strategy of sheep is 'fixed'
>   (default value is 'auto')
> 
> 2. option for dog cluster format to use fixed vnodes. (-V, --fixedvnodes)
>   $ dog cluster format -V
>   If sheep with the 'fixed' and 'auto' vnodes strategies are mixed,
>   the cluster format command fails.
>   (sheep with different vnodes strategies cannot be mixed in one cluster)
> 
> 3. dog command to change the vnodes
>   $ dog node vnodes set <vnodes>
>   After changing the vnodes, a new epoch is created
>   and auto recovery will start.
> 
> If you want to operate with fixed vnodes,
> you have to manage the vnodes of each node yourself,
> according to the capacity of its data store.
> So you should use this option carefully.
> 
> Example of using the fixed vnodes strategy:
> 
> 1) start sheep with fixed vnodes strategy.
>   $ sheep -V 100 /var/lib/sheepdog
>   $ sheep -V 120 /var/lib/sheepdog
>   $ dog node list
>     Id   Host:Port         V-Nodes       Zone
>      0   172.16.4.205:7000        100 1812140204
>      1   172.16.4.206:7000        120 1828917420
> 
> 2) format the cluster with fixed vnodes strategy.
>   $ dog cluster format -V
> 
> 3) check vnodes strategy of cluster.
>   $ dog cluster info -v
>     Cluster status: running, auto-recovery enabled
>     Cluster store: plain with 3 redundancy policy
>     Cluster vnodes strategy: fixed
>     Cluster vnode mode: node
>     Cluster created at Wed Dec 17 18:20:10 2014
> 
>     Epoch Time          Version [Host:Port:V-Nodes,,,]
>     2014-12-17 18:20:10     1 [172.16.4.205:7000:100, 172.16.4.206:7000:120]
> 
> 4) change of vnodes.
>   $ dog node vnodes set 140
>   $ dog node list
>     Id   Host:Port         V-Nodes       Zone
>      0   172.16.4.205:7000        140 1812140204
>      1   172.16.4.206:7000        120 1828917420
> 
> Signed-off-by: Masaki Saeki <saeki.masaki at po.ntts.co.jp>
> 
> ---
>  dog/cluster.c            |   86 ++++++++++++++++++++++++++++++++++++++--------
>  dog/node.c               |   67 +++++++++++++++++++++++++++++++++++
>  include/internal_proto.h |    3 ++
>  include/sheep.h          |    8 ++++
>  include/sheepdog_proto.h |    2 +
>  sheep/config.c           |   14 +++++++-
>  sheep/group.c            |   60 +++++++++++++++++++++++++++++---
>  sheep/ops.c              |   82 +++++++++++++++++++++++++++++++++++++++++++
>  sheep/sheep.c            |   32 +++++++++++++++--
>  9 files changed, 330 insertions(+), 24 deletions(-)

Applied, thanks.
Hitoshi

> 
> diff --git a/dog/cluster.c b/dog/cluster.c
> index 20f190b..6a2db6e 100644
> --- a/dog/cluster.c
> +++ b/dog/cluster.c
> @@ -15,6 +15,7 @@
>  #include <sys/time.h>
>  
>  #include "dog.h"
> +#include "sheep.h"
>  #include "farm/farm.h"
>  
>  static struct sd_option cluster_options[] = {
> @@ -27,6 +28,7 @@ static struct sd_option cluster_options[] = {
>  	 "do not serve write request if number of nodes is not sufficient"},
>  	{'z', "block_size_shift", true, "specify the shift num of default"
>  	      " data object size"},
> +	{'V', "fixedvnodes", false, "disable automatic vnodes calculation"},
>  	{ 0, NULL, false, NULL },
>  };
>  
> @@ -38,6 +40,7 @@ static struct cluster_cmd_data {
>  	bool force;
>  	bool strict;
>  	char name[STORE_LEN];
> +	bool fixed_vnodes;
>  } cluster_cmd_data;
>  
>  #define DEFAULT_STORE	"plain"
> @@ -87,6 +90,41 @@ static int cluster_format(int argc, char **argv)
>  	struct timeval tv;
>  	char store_name[STORE_LEN];
>  	static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
> +	struct sd_node *n;
> +
> +	rb_for_each_entry(n, &sd_nroot, rb) {
> +		struct sd_req info_req;
> +		struct sd_rsp *info_rsp = (struct sd_rsp *)&info_req;
> +		struct cluster_info cinfo;
> +
> +		sd_init_req(&info_req, SD_OP_CLUSTER_INFO);
> +		info_req.data_length = sizeof(cinfo);
> +		ret = dog_exec_req(&n->nid, &info_req, &cinfo);
> +		if (ret < 0) {
> +			sd_err("Fail to execute request");
> +			return EXIT_FAILURE;
> +		}
> +		if (info_rsp->result != SD_RES_SUCCESS) {
> +			sd_err("%s", sd_strerror(info_rsp->result));
> +			return EXIT_FAILURE;
> +		}
> +
> +		if (n->nr_vnodes != 0) {
> +			if ((cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
> +				&& cluster_cmd_data.fixed_vnodes) {
> +				sd_err("Can not apply the option of '-V', "
> +					"because there are vnode strategy of sheep "
> +					"is auto in the cluster");
> +				return EXIT_FAILURE;
> +			} else if (!(cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
> +				&& !cluster_cmd_data.fixed_vnodes) {
> +				sd_err("Need to specify the option of '-V', "
> +					"because there are vnode strategy of sheep "
> +					"is fixed in the cluster");
> +				return EXIT_FAILURE;
> +			}
> +		}
> +	}
>  
>  	if (cluster_cmd_data.copies > sd_nodes_nr) {
>  		char info[1024];
> @@ -132,6 +170,11 @@ static int cluster_format(int argc, char **argv)
>  	hdr.cluster.flags |= SD_CLUSTER_FLAG_DISKMODE;
>  #endif
>  
> +	if (cluster_cmd_data.fixed_vnodes)
> +		hdr.cluster.flags &= ~SD_CLUSTER_FLAG_AUTO_VNODES;
> +	else
> +		hdr.cluster.flags |= SD_CLUSTER_FLAG_AUTO_VNODES;
> +
>  	printf("using backend %s store\n", store_name);
>  	ret = dog_exec_req(&sd_nid, &hdr, store_name);
>  	if (ret < 0)
> @@ -160,14 +203,15 @@ static void print_nodes(const struct epoch_log *logs, uint16_t flags)
>  				if (entry->disks[nr_disk].disk_id == 0)
>  					break;
>  			}
> -			printf("%s%s(%d)",
> -			       (i == 0) ? "" : ", ",
> -			       addr_to_str(entry->nid.addr, entry->nid.port),
> -			       nr_disk);
> +			printf("%s%s:%d(%d)",
> +				(i == 0) ? "" : ", ",
> +				addr_to_str(entry->nid.addr, entry->nid.port),
> +					entry->nr_vnodes, nr_disk);
>  		} else
> -			printf("%s%s",
> -			       (i == 0) ? "" : ", ",
> -			       addr_to_str(entry->nid.addr, entry->nid.port));
> +			printf("%s%s:%d",
> +				(i == 0) ? "" : ", ",
> +				addr_to_str(entry->nid.addr, entry->nid.port),
> +					entry->nr_vnodes);
>  	}
>  }
>  
> @@ -232,6 +276,15 @@ retry:
>  			}
>  			printf("%s with %s redundancy policy\n",
>  			       logs->drv_name, copy);
> +
> +			/* show vnode strategy */
> +			if (!raw_output)
> +				printf("Cluster vnodes strategy: ");
> +			if (logs->flags & SD_CLUSTER_FLAG_AUTO_VNODES)
> +				printf("auto\n");
> +			else
> +				printf("fixed\n");
> +
>  		} else
>  			printf("%s\n", sd_strerror(rsp->result));
>  
> @@ -239,15 +292,17 @@ retry:
>  		if (!raw_output)
>  			printf("Cluster vnode mode: ");
>  		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
> -			printf("disk");
> +			printf("disk\n");
>  		else
> -			printf("node");
> -	}
> +			printf("node\n");
> +	} else
> +		printf("\n");
>  
>  	if (!raw_output && rsp->data_length > 0) {
>  		ct = logs[0].ctime >> 32;
> -		printf("\nCluster created at %s\n", ctime(&ct));
> -		printf("Epoch Time           Version\n");
> +		printf("Cluster created at %s\n", ctime(&ct));
> +		printf("Epoch Time           Version [Host:Port:V-Nodes,,,]");
> +		printf("\n");
>  	}
>  
>  	nr_logs = rsp->data_length / (sizeof(struct epoch_log)
> @@ -761,7 +816,7 @@ failure:
>  static struct subcommand cluster_cmd[] = {
>  	{"info", NULL, "aprhvT", "show cluster information",
>  	 NULL, CMD_NEED_NODELIST, cluster_info, cluster_options},
> -	{"format", NULL, "bctaphzT", "create a Sheepdog store",
> +	{"format", NULL, "bctaphzTV", "create a Sheepdog store",
>  	 NULL, CMD_NEED_NODELIST, cluster_format, cluster_options},
>  	{"shutdown", NULL, "aphT", "stop Sheepdog",
>  	 NULL, 0, cluster_shutdown, cluster_options},
> @@ -823,9 +878,10 @@ static int cluster_parser(int ch, const char *opt)
>  			" Please set shift bit larger than 20");
>  			exit(EXIT_FAILURE);
>  		}
> -
>  		cluster_cmd_data.block_size_shift = block_size_shift;
> -
> +		break;
> +	case 'V':
> +		cluster_cmd_data.fixed_vnodes = true;
>  		break;
>  	}
>  
> diff --git a/dog/node.c b/dog/node.c
> index d4c8fe7..36141ad 100644
> --- a/dog/node.c
> +++ b/dog/node.c
> @@ -764,6 +764,71 @@ static int node_log(int argc, char **argv)
>  	return do_generic_subcommand(node_log_cmd, argc, argv);
>  }
>  
> +static int do_vnodes_set(const struct node_id *nid, int *nr_vnodes)
> +{
> +	int ret = 0;
> +	struct sd_req hdr;
> +	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> +
> +	sd_init_req(&hdr, SD_OP_SET_VNODES);
> +	hdr.flags = SD_FLAG_CMD_WRITE;
> +	hdr.data_length = sizeof(nr_vnodes);
> +
> +	ret = dog_exec_req(nid, &hdr, nr_vnodes);
> +	if (ret < 0)
> +		return EXIT_SYSFAIL;
> +
> +	if (rsp->result != SD_RES_SUCCESS)
> +		return EXIT_FAILURE;
> +
> +	return ret;
> +}
> +
> +static int node_vnodes_set(int argc, char **argv)
> +{
> +	int ret = 0;
> +	char *p;
> +	int32_t nr_vnodes = strtol(argv[optind], &p, 10);
> +
> +	if (argv[optind] == p || nr_vnodes < 1 || nr_vnodes > UINT16_MAX
> +		|| *p != '\0') {
> +		sd_err("Invalid number of vnodes '%s': must be an integer "
> +			"between 1 and %u",
> +			argv[optind], UINT16_MAX);
> +		exit(EXIT_USAGE);
> +	}
> +
> +	ret = do_vnodes_set(&sd_nid, &nr_vnodes);
> +
> +	switch (ret) {
> +	case EXIT_FAILURE:
> +	case EXIT_SYSFAIL:
> +		sd_err("Failed to execute request");
> +		ret = -1;
> +		break;
> +	case EXIT_SUCCESS:
> +		/* do nothing */
> +		break;
> +	default:
> +		sd_err("unknown return code of do_vnodes_set(): %d", ret);
> +		ret = -1;
> +		break;
> +	}
> +
> +	return ret;
> +}
> +
> +static struct subcommand node_vnodes_cmd[] = {
> +	{"set", "<num of vnodes>", NULL, "set new vnodes",
> +	 NULL, CMD_NEED_ARG, node_vnodes_set},
> +	{NULL},
> +};
> +
> +static int node_vnodes(int argc, char **argv)
> +{
> +	return do_generic_subcommand(node_vnodes_cmd, argc, argv);
> +}
> +
>  static struct subcommand node_cmd[] = {
>  	{"kill", "<node id>", "aprhlT", "kill node", NULL,
>  	 CMD_NEED_NODELIST, node_kill, node_options},
> @@ -780,6 +845,8 @@ static struct subcommand node_cmd[] = {
>  	 0, node_stat, node_options},
>  	{"log", NULL, "aphT", "show or set log level of the node", node_log_cmd,
>  	 CMD_NEED_ARG, node_log},
> +	{"vnodes", "<num of vnodes>", "aph", "set new vnodes", node_vnodes_cmd,
> +	 CMD_NEED_ARG, node_vnodes},
>  	{NULL,},
>  };
>  
> diff --git a/include/internal_proto.h b/include/internal_proto.h
> index c1ffe53..225cc28 100644
> --- a/include/internal_proto.h
> +++ b/include/internal_proto.h
> @@ -113,6 +113,8 @@
>  #define SD_OP_READ_DEL_VDIS  0xC9
>  #define SD_OP_GET_RECOVERY      0xCA
>  #define SD_OP_SET_RECOVERY      0xCB
> +#define SD_OP_SET_VNODES 0xCC
> +#define SD_OP_GET_VNODES 0xCD
>  
>  /* internal flags for hdr.flags, must be above 0x80 */
>  #define SD_FLAG_CMD_RECOVERY 0x0080
> @@ -145,6 +147,7 @@
>  
>  #define SD_CLUSTER_FLAG_STRICT		0x0001 /* Strict mode for write */
>  #define SD_CLUSTER_FLAG_DISKMODE	0x0002 /* Disk mode for cluster */
> +#define SD_CLUSTER_FLAG_AUTO_VNODES	0x0004 /* Cluster vnodes strategy */
>  
>  enum sd_status {
>  	SD_STATUS_OK = 1,
> diff --git a/include/sheep.h b/include/sheep.h
> index 22524c1..fe6f066 100644
> --- a/include/sheep.h
> +++ b/include/sheep.h
> @@ -149,6 +149,9 @@ static inline const char *sd_strerror(int err)
>  			"IO has halted as there are not enough living nodes",
>  		[SD_RES_READONLY] = "Object is read-only",
>  		[SD_RES_INODE_INVALIDATED] = "Inode object is invalidated",
> +		[SD_RES_INVALID_VNODES_STRATEGY] =
> +			"Invalid cluster vnodes strategy",
> +		[SD_RES_GATEWAY_MODE] = "Targeted node is gateway mode",
>  
>  		/* from internal_proto.h */
>  		[SD_RES_OLD_NODE_VER] = "Request has an old epoch",
> @@ -328,4 +331,9 @@ static inline bool is_cluster_diskmode(const struct cluster_info *cinfo)
>  	return (cinfo->flags & SD_CLUSTER_FLAG_DISKMODE) > 0;
>  }
>  
> +static inline bool is_cluster_autovnodes(const struct cluster_info *cinfo)
> +{
> +	return (cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES) > 0;
> +}
> +
>  #endif
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index 9495742..3910bd5 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -87,6 +87,8 @@
>  #define SD_RES_INCOMPLETE    0x1B /* Object (in kv) is incomplete uploading */
>  #define SD_RES_COLLECTING_CINFO 0x1C /* sheep is collecting cluster wide status, not ready for operation */
>  #define SD_RES_INODE_INVALIDATED 0x1D /* inode object in client is invalidated, refreshing is required */
> +#define SD_RES_GATEWAY_MODE  0x1E /* Target node is gateway mode */
> +#define SD_RES_INVALID_VNODES_STRATEGY 0x1F /* Invalid vnodes strategy */
>  
>  /* errors above 0x80 are sheepdog-internal */
>  
> diff --git a/sheep/config.c b/sheep/config.c
> index dfad5fd..9518109 100644
> --- a/sheep/config.c
> +++ b/sheep/config.c
> @@ -62,7 +62,11 @@ static int get_cluster_config(struct cluster_info *cinfo)
>  {
>  	cinfo->ctime = config.ctime;
>  	cinfo->nr_copies = config.copies;
> -	cinfo->flags = config.flags;
> +	if (config.ctime > 0)
> +		cinfo->flags = config.flags;
> +	else
> +		cinfo->flags = (config.flags & ~SD_CLUSTER_FLAG_AUTO_VNODES) |
> +			(cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES);
>  	cinfo->copy_policy = config.copy_policy;
>  	cinfo->block_size_shift = config.block_size_shift;
>  	memcpy(cinfo->store, config.store, sizeof(config.store));
> @@ -122,6 +126,14 @@ int init_config_file(void)
>  	}
>  
>  reload:
> +	if ((config.flags & SD_CLUSTER_FLAG_AUTO_VNODES) !=
> +			(sys->cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
> +		&& !sys->gateway_only
> +		&& config.ctime > 0) {
> +		sd_err("Designation of before a restart and a vnodes option is different.");
> +		return -1;
> +	}
> +
>  	ret = 0;
>  	get_cluster_config(&sys->cinfo);
>  	if ((config.flags & SD_CLUSTER_FLAG_DISKMODE) !=
> diff --git a/sheep/group.c b/sheep/group.c
> index 9462aa4..2034300 100644
> --- a/sheep/group.c
> +++ b/sheep/group.c
> @@ -145,7 +145,8 @@ struct vnode_info *alloc_vnode_info(const struct rb_root *nroot)
>  		vnode_info->nr_nodes++;
>  	}
>  
> -	recalculate_vnodes(&vnode_info->nroot);
> +	if (is_cluster_autovnodes(&sys->cinfo))
> +		recalculate_vnodes(&vnode_info->nroot);
>  
>  	if (is_cluster_diskmode(&sys->cinfo))
>  		disks_to_vnodes(&vnode_info->nroot, &vnode_info->vroot);
> @@ -1133,6 +1134,20 @@ static bool cluster_join_check(const struct cluster_info *cinfo)
>  	if (!cluster_ctime_check(cinfo))
>  		return false;
>  
> +	if (cinfo->ctime > 0 && sys->this_node.nr_vnodes != 0) {
> +		if (!is_cluster_autovnodes(&sys->cinfo)
> +			&& is_cluster_autovnodes(cinfo)) {
> +			sd_err("failed to join for vnodes strategy unmatch. "
> +				" cluster:fixed, joined:auto");
> +			return false;
> +		} else if (is_cluster_autovnodes(&sys->cinfo)
> +			&& !is_cluster_autovnodes(cinfo)) {
> +			sd_err("failed to join for vnodes strategy unmatch. "
> +				" cluster:auto, joined:fixed");
> +			return false;
> +		}
> +	}
> +
>  	/*
>  	 * Sheepdog's recovery code assumes every node have the same epoch
>  	 * history. But we don't check epoch history of joining node because:
> @@ -1155,6 +1170,13 @@ main_fn void sd_accept_handler(const struct sd_node *joined,
>  	const struct cluster_info *cinfo = opaque;
>  	struct sd_node *n;
>  	enum sd_status prev_status = sys->cinfo.status;
> +	uint16_t flags;
> +
> +	if (node_is_local(joined) && sys->gateway_only
> +		&& sys->cinfo.ctime <= 0)
> +		flags = cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES;
> +	else
> +		flags = sys->cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES;
>  
>  	if (node_is_local(joined) && !cluster_join_check(cinfo)) {
>  		sd_err("failed to join Sheepdog");
> @@ -1163,6 +1185,9 @@ main_fn void sd_accept_handler(const struct sd_node *joined,
>  
>  	cluster_info_copy(&sys->cinfo, cinfo);
>  
> +	sys->cinfo.flags &= ~SD_CLUSTER_FLAG_AUTO_VNODES;
> +	sys->cinfo.flags |= flags;
> +
>  	sd_debug("join %s", node_to_str(joined));
>  	rb_for_each_entry(n, nroot, rb) {
>  		sd_debug("%s", node_to_str(n));
> @@ -1244,7 +1269,7 @@ main_fn void sd_leave_handler(const struct sd_node *left,
>  	remove_node_from_participants(&left->nid);
>  }
>  
> -static void update_node_size(struct sd_node *node)
> +static void update_node_info(struct sd_node *node)
>  {
>  	struct vnode_info *cur_vinfo = get_vnode_info();
>  	struct sd_node *n = rb_search(&cur_vinfo->nroot, node, rb, node_cmp);
> @@ -1252,6 +1277,10 @@ static void update_node_size(struct sd_node *node)
>  	if (unlikely(!n))
>  		panic("can't find %s", node_to_str(node));
>  	n->space = node->space;
> +
> +	if (!is_cluster_autovnodes(&sys->cinfo))
> +		n->nr_vnodes = node->nr_vnodes;
> +
>  	if (is_cluster_diskmode(&sys->cinfo)) {
>  		memset(n->disks, 0, sizeof(struct disk_info) * DISK_MAX);
>  		for (int i = 0; i < DISK_MAX; i++)
> @@ -1280,14 +1309,14 @@ static void kick_node_recover(void)
>  
>  main_fn void sd_update_node_handler(struct sd_node *node)
>  {
> -	update_node_size(node);
> +	update_node_info(node);
>  	kick_node_recover();
>  }
>  
>  int create_cluster(int port, int64_t zone, int nr_vnodes,
>  		   bool explicit_addr)
>  {
> -	int nr_nodes = 0, ret;
> +	int nr_nodes = 0, ret, i, vnodes = 0;
>  
>  	if (!sys->cdrv) {
>  		sys->cdrv = find_cdrv(DEFAULT_CLUSTER_DRIVER);
> @@ -1323,11 +1352,32 @@ int create_cluster(int port, int64_t zone, int nr_vnodes,
>  	sys->cinfo.epoch = get_latest_epoch();
>  	if (sys->cinfo.epoch) {
>  		ret = epoch_log_read(sys->cinfo.epoch, sys->cinfo.nodes,
> -				sizeof(sys->cinfo.nodes), &nr_nodes);
> +			sizeof(sys->cinfo.nodes), &nr_nodes);
>  		if (ret != SD_RES_SUCCESS)
>  			return -1;
>  		sys->cinfo.nr_nodes = nr_nodes;
>  	}
> +
> +	if (!is_cluster_autovnodes(&sys->cinfo)) {
> +		for (i = 0; i < nr_nodes; i++) {
> +			if ((addr_to_str(sys->this_node.nid.addr,
> +					sys->this_node.nid.port)
> +				== addr_to_str(sys->cinfo.nodes[i].nid.addr,
> +					sys->cinfo.nodes[i].nid.port))
> +				&& (sys->this_node.nid.port
> +					== sys->cinfo.nodes[i].nid.port)) {
> +				vnodes = sys->cinfo.nodes[i].nr_vnodes;
> +				break;
> +			}
> +		}
> +		if (sys->cinfo.epoch != 0 && sys->this_node.nr_vnodes != vnodes
> +			&& !sys->gateway_only) {
> +			sd_err("mismatch specified vnodes is compared with the previous. "
> +				"previous vnodes:%d", vnodes);
> +			return -1;
> +		}
> +	}
> +
>  	sys->cinfo.status = SD_STATUS_WAIT;
>  
>  	main_thread_set(pending_block_list,
> diff --git a/sheep/ops.c b/sheep/ops.c
> index d097a15..dad03a6 100644
> --- a/sheep/ops.c
> +++ b/sheep/ops.c
> @@ -265,6 +265,29 @@ static int remove_epoch(uint32_t epoch)
>  	return SD_RES_SUCCESS;
>  }
>  
> +static int get_vnodes(struct vnode_info *vinfo, int *nr_vnodes)
> +{
> +	int ret;
> +	struct sd_node *node;
> +
> +	rb_for_each_entry(node, &vinfo->nroot, rb) {
> +		struct sd_req hdr;
> +		if (node_is_local(node))
> +			continue;
> +		if (node->nr_vnodes == 0)
> +			continue;
> +
> +		sd_init_req(&hdr, SD_OP_GET_VNODES);
> +		hdr.data_length = sizeof(*nr_vnodes);
> +		hdr.epoch = sys_epoch();
> +		ret = sheep_exec_req(&node->nid, &hdr, nr_vnodes);
> +		if (ret != SD_RES_SUCCESS)
> +			return ret;
> +		node->nr_vnodes = *nr_vnodes;
> +	}
> +	return SD_RES_SUCCESS;
> +}
> +
>  static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
>  			   void *data, const struct sd_node *sender)
>  {
> @@ -272,6 +295,8 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
>  	uint32_t latest_epoch;
>  	struct store_driver *driver;
>  	char *store_name = data;
> +	int32_t nr_vnodes;
> +	struct vnode_info *vinfo = get_vnode_info();
>  
>  	driver = find_store_driver(data);
>  	if (!driver)
> @@ -290,6 +315,12 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
>  	if (ret != SD_RES_SUCCESS)
>  		return ret;
>  
> +	if (sys->gateway_only) {
> +		ret = get_vnodes(vinfo, &nr_vnodes);
> +		if (ret != SD_RES_SUCCESS)
> +			return ret;
> +	}
> +
>  	sys->cinfo.nr_copies = req->cluster.copies;
>  	sys->cinfo.copy_policy = req->cluster.copy_policy;
>  	sys->cinfo.block_size_shift = req->cluster.block_size_shift;
> @@ -1479,6 +1510,45 @@ static int local_set_recovery(struct request *req)
>  	return SD_RES_SUCCESS;
>  }
>  
> +static int local_get_vnodes(struct request *req)
> +{
> +	int *nr_vnodes;
> +
> +	nr_vnodes = req->data;
> +	req->rp.data_length = sizeof(nr_vnodes);
> +	*nr_vnodes = sys->this_node.nr_vnodes;
> +
> +	return SD_RES_SUCCESS;
> +}
> +
> +static int local_set_vnodes(const struct sd_req *req,
> +				struct sd_rsp *rsp, void *data,
> +				const struct sd_node *sender)
> +{
> +	int ret;
> +	int *nr_vnodes = (int *)data;
> +
> +	if (sys->gateway_only) {
> +		sd_err("failed to set vnodes, cause operating in gateway mode.");
> +		return SD_RES_GATEWAY_MODE;
> +	}
> +	if (is_cluster_autovnodes(&sys->cinfo)) {
> +		sd_err("failed to set vnodes, cause operating in auto vnodes strategy.");
> +		return SD_RES_INVALID_VNODES_STRATEGY;
> +	}
> +
> +	if (1 > *nr_vnodes || *nr_vnodes > UINT16_MAX) {
> +		sd_err("invalid vnodes: %d", *nr_vnodes);
> +		return SD_RES_INVALID_PARMS;
> +	}
> +
> +	sys->this_node.nr_vnodes = *nr_vnodes;
> +
> +	ret = sys->cdrv->update_node(&sys->this_node);
> +
> +	return ret;
> +}
> +
>  static struct sd_op_template sd_ops[] = {
>  
>  	/* cluster operations */
> @@ -1872,6 +1942,18 @@ static struct sd_op_template sd_ops[] = {
>  		.process_main = local_get_cluster_default,
>  	},
>  
> +	[SD_OP_GET_VNODES] = {
> +		.name = "GET_VNODES",
> +		.type = SD_OP_TYPE_LOCAL,
> +		.process_work = local_get_vnodes,
> +	},
> +
> +	[SD_OP_SET_VNODES] = {
> +		.name = "SET_VNODES",
> +		.type = SD_OP_TYPE_LOCAL,
> +		.process_main = local_set_vnodes,
> +	},
> +
>  	/* gateway I/O operations */
>  	[SD_OP_CREATE_AND_WRITE_OBJ] = {
>  		.name = "CREATE_AND_WRITE_OBJ",
> diff --git a/sheep/sheep.c b/sheep/sheep.c
> index 9471a3b..e0a034f 100644
> --- a/sheep/sheep.c
> +++ b/sheep/sheep.c
> @@ -121,6 +121,10 @@ static const char recovery_help[] =
>  "\tinterval=: object recovery interval time (millisec)\n"
>  "Example:\n\t$ sheep -R max=50,interval=1000 ...\n";
>  
> +static const char vnodes_help[] =
> +"Example:\n\t$ sheep -V 128\n"
> +"\tset number of vnodes\n";
> +
>  static struct sd_option sheep_options[] = {
>  	{'b', "bindaddr", true, "specify IP address of interface to listen on",
>  	 bind_help},
> @@ -147,6 +151,7 @@ static struct sd_option sheep_options[] = {
>  	 recovery_help},
>  	{'u', "upgrade", false, "upgrade to the latest data layout"},
>  	{'v', "version", false, "show the version"},
> +	{'V', "vnodes", true, "set number of vnodes", vnodes_help},
>  	{'w', "cache", true, "enable object cache", cache_help},
>  	{'y', "myaddr", true, "specify the address advertised to other sheep",
>  	 myaddr_help},
> @@ -646,11 +651,12 @@ static void sighup_handler(int signum)
>  int main(int argc, char **argv)
>  {
>  	int ch, longindex, ret, port = SD_LISTEN_PORT, io_port = SD_LISTEN_PORT;
> -	int nr_vnodes = SD_DEFAULT_VNODES, rc = 1;
> +	int rc = 1;
>  	const char *dirp = DEFAULT_OBJECT_DIR, *short_options;
>  	char *dir, *p, *pid_file = NULL, *bindaddr = NULL, log_path[PATH_MAX],
>  	     *argp = NULL;
>  	bool explicit_addr = false;
> +	int32_t nr_vnodes = -1;
>  	int64_t zone = -1;
>  	struct cluster_driver *cdrv;
>  	struct option *long_options;
> @@ -659,6 +665,7 @@ int main(int argc, char **argv)
>  	struct stat logdir_st;
>  	enum log_dst_type log_dst_type;
>  
> +	sys->cinfo.flags |= SD_CLUSTER_FLAG_AUTO_VNODES;
>  	sys->node_status = SD_NODE_STATUS_INITIALIZATION;
>  
>  	sys->rthrottling.max_exec_count = 0;
> @@ -707,7 +714,10 @@ int main(int argc, char **argv)
>  			sys->backend_dio = true;
>  			break;
>  		case 'g':
> -			/* same as '-v 0' */
> +			if (nr_vnodes > 0) {
> +				sd_err("Options '-g' and '-V' can not be both specified");
> +				exit(1);
> +			}
>  			nr_vnodes = 0;
>  			break;
>  		case 'z':
> @@ -797,6 +807,21 @@ int main(int argc, char **argv)
>  				PACKAGE_VERSION);
>  			exit(0);
>  			break;
> +		case 'V':
> +			sys->cinfo.flags &= ~SD_CLUSTER_FLAG_AUTO_VNODES;
> +			if (nr_vnodes == 0) {
> +				sd_err("Options '-g' and '-V' can not be both specified");
> +				exit(1);
> +			}
> +			nr_vnodes = strtol(optarg, &p, 10);
> +			if (optarg == p || nr_vnodes < 1
> +				|| UINT16_MAX < nr_vnodes || *p != '\0') {
> +				sd_err("Invalid number of vnodes '%s': must be "
> +					"an integer between 1 and %u",
> +					optarg, UINT16_MAX);
> +				exit(1);
> +			}
> +			break;
>  		default:
>  			usage(1);
>  			break;
> @@ -813,7 +838,8 @@ int main(int argc, char **argv)
>  	if (nr_vnodes == 0) {
>  		sys->gateway_only = true;
>  		sys->disk_space = 0;
> -	}
> +	} else if (nr_vnodes == -1)
> +		nr_vnodes = SD_DEFAULT_VNODES;
>  
>  	if (optind != argc) {
>  		argp = strdup(argv[optind]);
> -- 
> 1.7.1
> 
> 
> -- 
> NTTソフトウェア株式会社
> クラウド事業部
> 第一事業ユニット
> 佐伯 昌樹
> TEL: 045-212-7393
> FAX: 045-662-7856
> Mail: saeki.masaki at po.ntts.co.jp
> --
> 



More information about the sheepdog mailing list