[sheepdog] [PATCH] sheep, dog: add vnodes fixed options.

Saeki Masaki saeki.masaki at po.ntts.co.jp
Thu Dec 25 11:18:02 CET 2014


On 2014/12/24 23:43, Hitoshi Mitake wrote:
> At Fri, 19 Dec 2014 10:07:23 +0900,
> Saeki Masaki wrote:
>>
>> In the current sheepdog, vnodes are recalculated whenever the
>> number of nodes increases or decreases.
>>
>> During auto recovery, a node first gets objects from the other nodes
>> and only at the end deletes the objects it no longer needs.
>> While auto recovery runs, the available disk space therefore decreases.
>> In the worst case, it exhausts the available disk space.
>>
>> Add the following new commands and options.
>> 1. option to specify vnodes in sheep. (-V, --vnodes)
>>    - In the past this was implemented as the -v (--vnodes) option.
>>    - The -v option is now used to print the version, so it is added as -V (capital).
>> $ sheep -V 100 /var/lib/sheepdog
>>    If -V is specified, vnodes strategy of sheep is 'fixed'
>>    (default value is 'auto')
>>
>> 2. option to dog cluster format with vnode fixed. (-V, --fixedvnodes)
>>    $ dog cluster format -V
>>    If sheep with the 'fixed' and 'auto' vnodes strategies are mixed,
>>    the cluster format command fails.
>>    (sheep with different vnodes strategies cannot be mixed in a cluster)
>>
>> 3. dog command to change the vnodes
>>    $ dog node vnodes set <vnodes>
>>    After changing the vnodes, a new epoch is created and auto recovery will start.
>>
>> If you want to operate with the fixed vnodes strategy,
>> you have to manage the number of vnodes yourself according to
>> the capacity of the data store on each node.
>> So you should use this option carefully.
>>
>> For example of using fixed vnodes strategy:
>>
>> 1) start sheep with fixed vnodes strategy.
>> $ sheep -V 100 /var/lib/sheepdog
>> $ sheep -V 120 /var/lib/sheepdog
>> $ dog node list
>>    Id   Host:Port         V-Nodes       Zone
>>     0   172.16.4.205:7000        100 1812140204
>>     1   172.16.4.206:7000        120 1828917420
>>
>> 2) format the cluster with fixed vnodes strategy.
>> $ dog cluster format -V
>>
>> 3) check vnodes strategy of cluster.
>> $ dog cluster info -v
>> Cluster status: running, auto-recovery enabled
>> Cluster store: plain with 3 redundancy policy
>> Cluster vnodes strategy: fixed
>> Cluster vnode mode: node
>> Cluster created at Wed Dec 17 18:20:10 2014
>>
>> Epoch Time           Version [Host:Port:V-Nodes,,,]
>> 2014-12-17 18:20:10      1 [172.16.4.205:7000:100, 172.16.4.206:7000:120]
>>
>> 4) change of vnodes.
>> $ dog node vnodes set 140
>> $ dog node list
>>    Id   Host:Port         V-Nodes       Zone
>>     0   172.16.4.205:7000        140 1812140204
>>     1   172.16.4.206:7000        120 1828917420
>
> Saeki-san, thanks a lot for this patch. The change seems good to me,
> but rebase is required for the latest master. Could you rebase and
> send v2? In addition, I have some comments, mainly related to minor
> coding style issues. I'd be glad if you could address them in v2.
>
>>
>> Signed-off-by Masaki Saeki <masaki.saeki at po.ntts.co.jp>
>
> You need ':' between Signed-off-by and your name.
>
>>
>> ---
>>   dog/cluster.c            |   82 ++++++++++++++++++++++++++++++++++++++--------
>>   dog/node.c               |   67 +++++++++++++++++++++++++++++++++++++
>>   include/internal_proto.h |    3 ++
>>   include/sheep.h          |    8 ++++
>>   include/sheepdog_proto.h |    2 +
>>   sheep/config.c           |   15 ++++++++-
>>   sheep/group.c            |   62 ++++++++++++++++++++++++++++++++---
>>   sheep/ops.c              |   82 ++++++++++++++++++++++++++++++++++++++++++++++
>>   sheep/sheep.c            |   31 ++++++++++++++++-
>>   9 files changed, 330 insertions(+), 22 deletions(-)
>>
>> diff --git a/dog/cluster.c b/dog/cluster.c
>> index 20f190b..c92141e 100644
>> --- a/dog/cluster.c
>> +++ b/dog/cluster.c
>> @@ -15,6 +15,7 @@
>>   #include <sys/time.h>
>>
>>   #include "dog.h"
>> +#include "sheep.h"
>>   #include "farm/farm.h"
>>
>>   static struct sd_option cluster_options[] = {
>> @@ -27,6 +28,7 @@ static struct sd_option cluster_options[] = {
>>   	 "do not serve write request if number of nodes is not sufficient"},
>>   	{'z', "block_size_shift", true, "specify the shift num of default"
>>   	      " data object size"},
>> +	{'V', "fixedvnodes", false, "disable automatic vnodes calculation"},
>>   	{ 0, NULL, false, NULL },
>>   };
>>
>> @@ -38,6 +40,7 @@ static struct cluster_cmd_data {
>>   	bool force;
>>   	bool strict;
>>   	char name[STORE_LEN];
>> +	bool fixed_vnodes;
>>   } cluster_cmd_data;
>>
>>   #define DEFAULT_STORE	"plain"
>> @@ -87,6 +90,41 @@ static int cluster_format(int argc, char **argv)
>>   	struct timeval tv;
>>   	char store_name[STORE_LEN];
>>   	static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
>> +	struct sd_node *n;
>> +
>> +	rb_for_each_entry(n, &sd_nroot, rb) {
>> +		struct sd_req info_req;
>> +		struct sd_rsp *info_rsp = (struct sd_rsp *)&info_req;
>> +		struct cluster_info cinfo;
>> +
>> +		sd_init_req(&info_req, SD_OP_CLUSTER_INFO);
>> +		info_req.data_length = sizeof(cinfo);
>> +		ret = dog_exec_req(&n->nid, &info_req, &cinfo);
>> +		if (ret < 0) {
>> +			sd_err("Fail to execute request");
>> +			return EXIT_FAILURE;
>> +		}
>> +		if (info_rsp->result != SD_RES_SUCCESS) {
>> +			sd_err("%s", sd_strerror(info_rsp->result));
>> +			return EXIT_FAILURE;
>> +		}
>> +
>> +		if (n->nr_vnodes != 0) {
>> +			if ((cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
>> +				&& cluster_cmd_data.fixed_vnodes) {
>> +				sd_err("Can not apply the option of '-V', "
>> +					"because there are vnode strategy of sheep "
>> +					"is auto in the cluster");
>> +				return EXIT_FAILURE;
>> +			} else if (!(cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
>> +				&& !cluster_cmd_data.fixed_vnodes) {
>> +				sd_err("Need to specify the option of '-V', "
>> +					"because there are vnode strategy of sheep "
>> +					"is fixed in the cluster");
>> +				return EXIT_FAILURE;
>> +			}
>> +		}
>> +	}
>>
>>   	if (cluster_cmd_data.copies > sd_nodes_nr) {
>>   		char info[1024];
>> @@ -132,6 +170,11 @@ static int cluster_format(int argc, char **argv)
>>   	hdr.cluster.flags |= SD_CLUSTER_FLAG_DISKMODE;
>>   #endif
>>
>> +	if (cluster_cmd_data.fixed_vnodes)
>> +		hdr.cluster.flags &= ~SD_CLUSTER_FLAG_AUTO_VNODES;
>> +	else
>> +		hdr.cluster.flags |= SD_CLUSTER_FLAG_AUTO_VNODES;
>> +
>>   	printf("using backend %s store\n", store_name);
>>   	ret = dog_exec_req(&sd_nid, &hdr, store_name);
>>   	if (ret < 0)
>> @@ -160,14 +203,15 @@ static void print_nodes(const struct epoch_log *logs, uint16_t flags)
>>   				if (entry->disks[nr_disk].disk_id == 0)
>>   					break;
>>   			}
>> -			printf("%s%s(%d)",
>> -			       (i == 0) ? "" : ", ",
>> -			       addr_to_str(entry->nid.addr, entry->nid.port),
>> -			       nr_disk);
>> +			printf("%s%s:%d(%d)",
>> +				(i == 0) ? "" : ", ",
>> +				addr_to_str(entry->nid.addr, entry->nid.port),
>> +					entry->nr_vnodes, nr_disk);
>>   		} else
>> -			printf("%s%s",
>> -			       (i == 0) ? "" : ", ",
>> -			       addr_to_str(entry->nid.addr, entry->nid.port));
>> +			printf("%s%s:%d",
>> +				(i == 0) ? "" : ", ",
>> +				addr_to_str(entry->nid.addr, entry->nid.port),
>> +					entry->nr_vnodes);
>>   	}
>>   }
>>
>> @@ -232,6 +276,15 @@ retry:
>>   			}
>>   			printf("%s with %s redundancy policy\n",
>>   			       logs->drv_name, copy);
>> +
>> +			/* show vnode strategy */
>> +			if (!raw_output)
>> +				printf("Cluster vnodes strategy: ");
>> +			if (logs->flags & SD_CLUSTER_FLAG_AUTO_VNODES)
>> +				printf("auto\n");
>> +			else
>> +				printf("fixed\n");
>> +
>>   		} else
>>   			printf("%s\n", sd_strerror(rsp->result));
>>
>> @@ -239,15 +292,16 @@ retry:
>>   		if (!raw_output)
>>   			printf("Cluster vnode mode: ");
>>   		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
>> -			printf("disk");
>> +			printf("disk\n");
>>   		else
>> -			printf("node");
>> +			printf("node\n");
>>   	}
>>
>>   	if (!raw_output && rsp->data_length > 0) {
>>   		ct = logs[0].ctime >> 32;
>> -		printf("\nCluster created at %s\n", ctime(&ct));
>> -		printf("Epoch Time           Version\n");
>> +		printf("Cluster created at %s\n", ctime(&ct));
>> +		printf("Epoch Time           Version
>>   		[Host:Port:V-Nodes,,,]");
>
> The above change will break existing tests. Could you create a patch
> for updating tests? You can send it as another patch.
>
>> +		printf("\n");
>>   	}
>>
>>   	nr_logs = rsp->data_length / (sizeof(struct epoch_log)
>> @@ -761,7 +815,7 @@ failure:
>>   static struct subcommand cluster_cmd[] = {
>>   	{"info", NULL, "aprhvT", "show cluster information",
>>   	 NULL, CMD_NEED_NODELIST, cluster_info, cluster_options},
>> -	{"format", NULL, "bctaphzT", "create a Sheepdog store",
>> +	{"format", NULL, "bctaphzTV", "create a Sheepdog store",
>>   	 NULL, CMD_NEED_NODELIST, cluster_format, cluster_options},
>>   	{"shutdown", NULL, "aphT", "stop Sheepdog",
>>   	 NULL, 0, cluster_shutdown, cluster_options},
>> @@ -823,9 +877,9 @@ static int cluster_parser(int ch, const char *opt)
>>   			" Please set shift bit larger than 20");
>>   			exit(EXIT_FAILURE);
>>   		}
>> -
>>   		cluster_cmd_data.block_size_shift = block_size_shift;
>> -
>> +	case 'V':
>> +		cluster_cmd_data.fixed_vnodes = true;
>>   		break;
>>   	}
>>
>> diff --git a/dog/node.c b/dog/node.c
>> index a4e9142..b9d441a 100644
>> --- a/dog/node.c
>> +++ b/dog/node.c
>> @@ -625,6 +625,71 @@ static int node_log(int argc, char **argv)
>>   	return do_generic_subcommand(node_log_cmd, argc, argv);
>>   }
>>
>> +static int do_vnodes_set(const struct node_id *nid, int *nr_vnodes)
>> +{
>> +	int ret = 0;
>> +	struct sd_req hdr;
>> +	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
>> +
>> +	sd_init_req(&hdr, SD_OP_SET_VNODES);
>> +	hdr.flags = SD_FLAG_CMD_WRITE;
>> +	hdr.data_length = sizeof(nr_vnodes);
>> +
>> +	ret = dog_exec_req(nid, &hdr, nr_vnodes);
>> +	if (ret < 0)
>> +		return EXIT_SYSFAIL;
>> +
>> +	if (rsp->result != SD_RES_SUCCESS)
>> +		return EXIT_FAILURE;
>> +
>> +	return ret;
>> +}
>> +
>> +static int node_vnodes_set(int argc, char **argv)
>> +{
>> +	int ret = 0;
>> +	char *p;
>> +	int32_t nr_vnodes = strtol(argv[optind], &p, 10);
>> +
>> +	if (argv[optind] == p || nr_vnodes < 1 || nr_vnodes > UINT16_MAX
>> +		|| *p != '\0') {
>> +		sd_err("Invalid number of vnodes '%s': must be an integer "
>> +			"between 1 and %u",
>> +			argv[optind], UINT16_MAX);
>> +		exit(EXIT_USAGE);
>> +	}
>> +
>> +	ret = do_vnodes_set(&sd_nid, &nr_vnodes);
>> +
>> +	switch (ret) {
>> +	case EXIT_FAILURE:
>> +	case EXIT_SYSFAIL:
>> +		sd_err("Failed to execute request");
>> +		ret = -1;
>> +		break;
>> +	case EXIT_SUCCESS:
>> +		/* do nothing */
>> +		break;
>> +	default:
>> +		sd_err("unknown return code of do_vnodes_set(): %d", ret);
>> +		ret = -1;
>> +		break;
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +static struct subcommand node_vnodes_cmd[] = {
>> +	{"set", "<num of vnodes>", NULL, "set new vnodes",
>> +	 NULL, CMD_NEED_ARG, node_vnodes_set},
>> +	{NULL},
>> +};
>> +
>> +static int node_vnodes(int argc, char **argv)
>> +{
>> +	return do_generic_subcommand(node_vnodes_cmd, argc, argv);
>> +}
>> +
>>   static struct subcommand node_cmd[] = {
>>   	{"kill", "<node id>", "aprhlT", "kill node", NULL,
>>   	 CMD_NEED_NODELIST, node_kill, node_options},
>> @@ -640,6 +705,8 @@ static struct subcommand node_cmd[] = {
>>   	 0, node_stat, node_options},
>>   	{"log", NULL, "aphT", "show or set log level of the node", node_log_cmd,
>>   	 CMD_NEED_ARG, node_log},
>> +	{"vnodes", "<num of vnodes>", "aph", "set new vnodes", node_vnodes_cmd,
>> +	 CMD_NEED_ARG, node_vnodes},
>>   	{NULL,},
>>   };
>>
>> diff --git a/include/internal_proto.h b/include/internal_proto.h
>> index 3f5d77f..f280d6d 100644
>> --- a/include/internal_proto.h
>> +++ b/include/internal_proto.h
>> @@ -111,6 +111,8 @@
>>   #define SD_OP_VDI_STATE_SNAPSHOT_CTL  0xC7
>>   #define SD_OP_INODE_COHERENCE 0xC8
>>   #define SD_OP_READ_DEL_VDIS  0xC9
>> +#define SD_OP_SET_VNODES 0xCC
>> +#define SD_OP_GET_VNODES 0xCD
>>
>>   /* internal flags for hdr.flags, must be above 0x80 */
>>   #define SD_FLAG_CMD_RECOVERY 0x0080
>> @@ -143,6 +145,7 @@
>>
>>   #define SD_CLUSTER_FLAG_STRICT		0x0001 /* Strict mode for write */
>>   #define SD_CLUSTER_FLAG_DISKMODE	0x0002 /* Disk mode for cluster */
>> +#define SD_CLUSTER_FLAG_AUTO_VNODES	0x0004 /* Cluster vnodes strategy */
>>
>>   enum sd_status {
>>   	SD_STATUS_OK = 1,
>> diff --git a/include/sheep.h b/include/sheep.h
>> index 22524c1..fe6f066 100644
>> --- a/include/sheep.h
>> +++ b/include/sheep.h
>> @@ -149,6 +149,9 @@ static inline const char *sd_strerror(int err)
>>   			"IO has halted as there are not enough living nodes",
>>   		[SD_RES_READONLY] = "Object is read-only",
>>   		[SD_RES_INODE_INVALIDATED] = "Inode object is invalidated",
>> +		[SD_RES_INVALID_VNODES_STRATEGY] =
>> +			"Invalid cluster vnodes strategy",
>> +		[SD_RES_GATEWAY_MODE] = "Targeted node is gateway mode",
>>
>>   		/* from internal_proto.h */
>>   		[SD_RES_OLD_NODE_VER] = "Request has an old epoch",
>> @@ -328,4 +331,9 @@ static inline bool is_cluster_diskmode(const struct cluster_info *cinfo)
>>   	return (cinfo->flags & SD_CLUSTER_FLAG_DISKMODE) > 0;
>>   }
>>
>> +static inline bool is_cluster_autovnodes(const struct cluster_info *cinfo)
>> +{
>> +	return (cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES) > 0;
>> +}
>> +
>>   #endif
>> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
>> index 4f0c48c..28ededd 100644
>> --- a/include/sheepdog_proto.h
>> +++ b/include/sheepdog_proto.h
>> @@ -86,6 +86,8 @@
>>   #define SD_RES_INCOMPLETE    0x1B /* Object (in kv) is incomplete uploading */
>>   #define SD_RES_COLLECTING_CINFO 0x1C /* sheep is collecting cluster wide status, not ready for operation */
>>   #define SD_RES_INODE_INVALIDATED 0x1D /* inode object in client is invalidated, refreshing is required */
>> +#define SD_RES_GATEWAY_MODE  0x1E /* Target node is gateway mode */
>> +#define SD_RES_INVALID_VNODES_STRATEGY 0x1F /* Invalid vnodes strategy */
>>
>>   /* errors above 0x80 are sheepdog-internal */
>>
>> diff --git a/sheep/config.c b/sheep/config.c
>> index 383a1ed..4a1e600 100644
>> --- a/sheep/config.c
>> +++ b/sheep/config.c
>> @@ -62,7 +62,12 @@ static int get_cluster_config(struct cluster_info *cinfo)
>>   {
>>   	cinfo->ctime = config.ctime;
>>   	cinfo->nr_copies = config.copies;
>> -	cinfo->flags = config.flags;
>> +	if (config.ctime > 0) {
>> +		cinfo->flags = config.flags;
>> +	} else {
>> +		cinfo->flags = (config.flags & ~SD_CLUSTER_FLAG_AUTO_VNODES) |
>> +			(cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES);
>> +	}
>
> Could you eliminate the braces? In sheepdog coding style, the braces of
> if, for, and while statements should be removed when the body consists
> of a single statement.
>
> # seems that script/checkpatch.pl doesn't work well for this style...
>
>>   	cinfo->copy_policy = config.copy_policy;
>>   	memcpy(cinfo->store, config.store, sizeof(config.store));
>>
>> @@ -121,6 +126,14 @@ int init_config_file(void)
>>   	}
>>
>>   reload:
>> +	if ((config.flags & SD_CLUSTER_FLAG_AUTO_VNODES) !=
>> +			(sys->cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
>> +		&& !sys->gateway_only
>> +		&& config.ctime > 0) {
>> +		sd_err("Designation of before a restart and a vnodes option is different.");
>> +		return -1;
>> +	}
>> +
>>   	ret = 0;
>>   	get_cluster_config(&sys->cinfo);
>>   	if ((config.flags & SD_CLUSTER_FLAG_DISKMODE) !=
>> diff --git a/sheep/group.c b/sheep/group.c
>> index 095b7c5..b33e514 100644
>> --- a/sheep/group.c
>> +++ b/sheep/group.c
>> @@ -145,7 +145,8 @@ struct vnode_info *alloc_vnode_info(const struct rb_root *nroot)
>>   		vnode_info->nr_nodes++;
>>   	}
>>
>> -	recalculate_vnodes(&vnode_info->nroot);
>> +	if (is_cluster_autovnodes(&sys->cinfo))
>> +		recalculate_vnodes(&vnode_info->nroot);
>>
>>   	if (is_cluster_diskmode(&sys->cinfo))
>>   		disks_to_vnodes(&vnode_info->nroot, &vnode_info->vroot);
>> @@ -1098,6 +1099,20 @@ static bool cluster_join_check(const struct cluster_info *cinfo)
>>   	if (!cluster_ctime_check(cinfo))
>>   		return false;
>>
>> +	if (cinfo->ctime > 0 && sys->this_node.nr_vnodes != 0) {
>> +		if (!is_cluster_autovnodes(&sys->cinfo)
>> +			&& is_cluster_autovnodes(cinfo)) {
>> +			sd_err("failed to join for vnodes strategy unmatch. "
>> +				" cluster:fixed, joined:auto");
>> +			return false;
>> +		} else if (is_cluster_autovnodes(&sys->cinfo)
>> +			&& !is_cluster_autovnodes(cinfo)) {
>> +			sd_err("failed to join for vnodes strategy unmatch. "
>> +				" cluster:auto, joined:fixed");
>> +			return false;
>> +		}
>> +	}
>> +
>>   	/*
>>   	 * Sheepdog's recovery code assumes every node have the same epoch
>>   	 * history. But we don't check epoch history of joining node because:
>> @@ -1119,6 +1134,14 @@ main_fn void sd_accept_handler(const struct sd_node *joined,
>>   {
>>   	const struct cluster_info *cinfo = opaque;
>>   	struct sd_node *n;
>> +	uint16_t flags;
>> +
>> +	if (node_is_local(joined) && sys->gateway_only
>> +		&& sys->cinfo.ctime <= 0) {
>> +		flags = cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES;
>> +	} else {
>> +		flags = sys->cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES;
>> +	}
>
> The brace problem, too.
>
>>
>>   	if (node_is_local(joined) && !cluster_join_check(cinfo)) {
>>   		sd_err("failed to join Sheepdog");
>> @@ -1127,6 +1150,9 @@ main_fn void sd_accept_handler(const struct sd_node *joined,
>>
>>   	cluster_info_copy(&sys->cinfo, cinfo);
>>
>> +	sys->cinfo.flags &= ~SD_CLUSTER_FLAG_AUTO_VNODES;
>> +	sys->cinfo.flags |= flags;
>> +
>>   	sd_debug("join %s", node_to_str(joined));
>>   	rb_for_each_entry(n, nroot, rb) {
>>   		sd_debug("%s", node_to_str(n));
>> @@ -1191,7 +1217,7 @@ main_fn void sd_leave_handler(const struct sd_node *left,
>>   	remove_node_from_participants(&left->nid);
>>   }
>>
>> -static void update_node_size(struct sd_node *node)
>> +static void update_node_info(struct sd_node *node)
>>   {
>>   	struct vnode_info *cur_vinfo = get_vnode_info();
>>   	struct sd_node *n = rb_search(&cur_vinfo->nroot, node, rb, node_cmp);
>> @@ -1199,6 +1225,11 @@ static void update_node_size(struct sd_node *node)
>>   	if (unlikely(!n))
>>   		panic("can't find %s", node_to_str(node));
>>   	n->space = node->space;
>> +
>> +	if (!is_cluster_autovnodes(&sys->cinfo)) {
>> +		n->nr_vnodes = node->nr_vnodes;
>> +	}
>
> The brace problem.
>
> That's all. Thanks.
> Hitoshi
>

Mitake-san, thank you for reviewing the patch.
I can't take the time to fix and test it right away; I will fix it later, sorry.

Regards,
Saeki.




More information about the sheepdog mailing list