[Sheepdog] [PATCH v2 5/5] sheep: use SD_STATUS_HALT to stop serving IO

Mon Oct 17 07:44:42 CEST 2011

At Sun, 16 Oct 2011 18:35:15 +0800,
Liu Yuan wrote:
> 
> From: Liu Yuan <tailai.ly at taobao.com>
> 
> We use SD_STATUS_HALT to identify the cluster state when it should not serve
> IO requests.
> 
> This is optional; users may turn off the HALT status at their own risk, using
> one of the commands below:
> 
> $ collie cluster format -H
> or
> $ collie cluster format --nohalt
> 
> By default, this is enabled.
> 
> [Test Case]
> 
> [1]
> steps:
> 
> for i in 0 1 2 3; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done
> ./collie/collie cluster format --copies=3;
> for i in 0 1; do pkill -f "sheep -d /store/$i"; sleep 1; done
> for i in 2 3; do ./collie/collie cluster info -p 700$i; done
> for i in 0 1; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done
> for i in 0 1 2 3; do ./collie/collie cluster info -p 700$i; done
> 
> output:
> 
> Cluster status: The sheepdog is stopped doing IO, short of living nodes
> 
> Creation time        Epoch Nodes
> 2011-10-11 16:26:02      3 [192.168.0.1:7002, 192.168.0.1:7003]
> 2011-10-11 16:26:02      2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
> 2011-10-11 16:26:02      1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
> Cluster status: The sheepdog is stopped doing IO, short of living nodes
> 
> Creation time        Epoch Nodes
> 2011-10-11 16:26:02      3 [192.168.0.1:7002, 192.168.0.1:7003]
> 2011-10-11 16:26:02      2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
> 2011-10-11 16:26:02      1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
> Cluster status: running
> 
> Creation time        Epoch Nodes
> 2011-10-11 16:26:02      5 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
> 2011-10-11 16:26:02      4 [192.168.0.1:7000, 192.168.0.1:7002, 192.168.0.1:7003]
> 2011-10-11 16:26:02      3 [192.168.0.1:7002, 192.168.0.1:7003]
> 2011-10-11 16:26:02      2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
> 2011-10-11 16:26:02      1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
> 
> ...
> 
> [2]
> steps:
> for i in 0 1; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done
> collie/collie cluster format
> for i in 0 1; do collie/collie cluster info -p 700$i;done
> for i in 0; do pkill -f "sheep/sheep -d /store/$i"; sleep 1; done
> for i in 2; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done
> for i in 1 2; do pkill -f "sheep/sheep -d /store/$i"; sleep 1; done
> for i in 0 1 2; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done
> for i in 0 1 2; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done
> for i in 0 1 2; do collie/collie cluster info -p 700$i;done
> 
> output:
> Cluster status: The sheepdog is stopped doing IO, short of living nodes
> 
> Creation time        Epoch Nodes
> 2011-10-16 18:11:07      1 [192.168.0.1:7000, 192.168.0.1:7001]
> Cluster status: The sheepdog is stopped doing IO, short of living nodes
> 
> Creation time        Epoch Nodes
> 2011-10-16 18:11:07      1 [192.168.0.1:7000, 192.168.0.1:7001]
> Cluster status: running
> 
> Creation time        Epoch Nodes
> 2011-10-16 18:11:07      6 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002]
> 2011-10-16 18:11:07      5 [192.168.0.1:7000, 192.168.0.1:7002]
> 2011-10-16 18:11:07      4 [192.168.0.1:7002]
> 2011-10-16 18:11:07      3 [192.168.0.1:7001, 192.168.0.1:7002]
> 2011-10-16 18:11:07      2 [192.168.0.1:7001]
> 2011-10-16 18:11:07      1 [192.168.0.1:7000, 192.168.0.1:7001]
> 
> ...
> 
> Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
> ---
>  collie/cluster.c   |   14 +++++++++++++-
>  collie/collie.c    |    1 +
>  sheep/group.c      |   30 +++++++++++++++++++++++++++++-
>  sheep/sheep_priv.h |    2 ++
>  4 files changed, 45 insertions(+), 2 deletions(-)
> 
> diff --git a/collie/cluster.c b/collie/cluster.c
> index 0d5dfbe..3b16308 100644
> --- a/collie/cluster.c
> +++ b/collie/cluster.c
> @@ -16,8 +16,15 @@
>  
>  struct cluster_cmd_data {
>  	int copies;
> +	int nohalt;
>  } cluster_cmd_data;
>  
> +static void set_nohalt(uint32_t *p)
> +{
> +	if (p)
> +		*p |= 1 << 31;
> +}
> +
>  static int cluster_format(int argc, char **argv)
>  {
>  	int fd, ret;
> @@ -36,6 +43,8 @@ static int cluster_format(int argc, char **argv)
>  
>  	hdr.opcode = SD_OP_MAKE_FS;
>  	hdr.copies = cluster_cmd_data.copies;
> +	if (cluster_cmd_data.nohalt)
> +		set_nohalt(&hdr.copies);

I think we should use hdr.flags to set the nohalt option.

>  	hdr.epoch = node_list_version;
>  	hdr.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
>  
> @@ -163,7 +172,7 @@ static int cluster_shutdown(int argc, char **argv)
>  static struct subcommand cluster_cmd[] = {
>  	{"info", NULL, "aprh", "show cluster information",
>  	 0, cluster_info},
> -	{"format", NULL, "caph", "create a Sheepdog storage",
> +	{"format", NULL, "cHaph", "create a Sheepdog storage",
>  	 0, cluster_format},
>  	{"shutdown", NULL, "aph", "stop Sheepdog",
>  	 SUBCMD_FLAG_NEED_NODELIST, cluster_shutdown},
> @@ -176,6 +185,9 @@ static int cluster_parser(int ch, char *opt)
>  	case 'c':
>  		cluster_cmd_data.copies = atoi(opt);
>  		break;
> +	case 'H':
> +		cluster_cmd_data.nohalt = 1;
> +		break;
>  	}
>  
>  	return 0;
> diff --git a/collie/collie.c b/collie/collie.c
> index e064a0a..df5dca4 100644
> --- a/collie/collie.c
> +++ b/collie/collie.c
> @@ -41,6 +41,7 @@ static const struct sd_option collie_options[] = {
>  
>  	/* cluster options */
>  	{'c', "copies", 1, "set the number of data redundancy"},
> +	{'H', "nohalt", 0, "serve the IO rquests even lack of enough redundant nodes"},
>  
>  	{ 0, NULL, 0, NULL },
>  };
> diff --git a/sheep/group.c b/sheep/group.c
> index 5d06745..103a647 100644
> --- a/sheep/group.c
> +++ b/sheep/group.c
> @@ -983,7 +983,16 @@ static void vdi_op_done(struct vdi_op_message *msg)
>  
>  		set_global_nr_copies(sys->nr_sobjs);
>  
> -		sys->status = SD_STATUS_OK;
> +		if (sys_nohalt())
> +			sys->status = SD_STATUS_OK;
> +		else {
> +			int nr_zones = get_zones_nr_from(&sys->sd_node_list);
> +
> +			if (nr_zones >= sys->nr_sobjs)
> +				sys->status = SD_STATUS_OK;
> +			else
> +				sys->status = SD_STATUS_HALT;
> +		}
>  		break;
>  	case SD_OP_SHUTDOWN:
>  		sys->status = SD_STATUS_SHUTDOWN;
> @@ -1210,6 +1219,13 @@ static void __sd_notify_done(struct cpg_event *cevent)
>  		}
>  		start_recovery(sys->epoch);
>  	}
> +
> +	if (sys->status == SD_STATUS_HALT) {
> +		int nr_zones = get_zones_nr_from(&sys->sd_node_list);
> +
> +		if (nr_zones >= sys->nr_sobjs)
> +			sys->status = SD_STATUS_OK;
> +	}
>  }
>  
>  static void sd_notify_handler(struct sheepid *sender, void *msg, size_t msg_len)
> @@ -1438,6 +1454,11 @@ static void __sd_join_done(struct cpg_event *cevent)
>  		send_join_request(&w->joined);
>  }
>  
> +int sys_nohalt()
> +{
> +	return sys->nr_sobjs & (1 << 31);

sys->nr_sobjs is used everywhere in the main thread as the number of
copies, so storing the nohalt flag in its top bit breaks every one of
those users. I think this doesn't work at all.

For example:

  $ sheep /store/0
  $ collie cluster format -H
  $ qemu-img create sheepdog:test 4G
  Formatting 'sheepdog:test', fmt=raw size=4294967296 
  qemu-img: Failed to write the requested VDI, test
  qemu-img: sheepdog:test: error while creating raw: Input/output error

Should we declare another field in struct cluster info for the nohalt
option instead?

Thanks,

Kazutaka