[sheepdog] [PATCH 1/2] sheep: introduce strict mode for write

Thu Dec 12 06:11:43 CET 2013

At Tue, 10 Dec 2013 15:10:36 +0800,
Liu Yuan wrote:
> 
> We make sure we write the exact number of copies to honor the promise of the
> redundancy for "strict mode". This means that after writing of targeted data,
> they are redundant as promised and can withstand the random node failures.
> 
> For example, with a 4:2 policy, we need at least write to 6 nodes with data
> strip and parity strips. For non-strict mode, we allow to write successfully
> only if the data are written fully with 4 nodes alive.
> 
> Signed-off-by: Liu Yuan <namei.unix at gmail.com>
> ---
>  dog/cluster.c            |   10 +++++++++-
>  include/internal_proto.h |    2 ++
>  include/sheep.h          |    3 ++-
>  include/sheepdog_proto.h |    2 +-
>  sheep/ops.c              |    2 +-
>  sheep/request.c          |   34 +++++++++++++++++++++++++++++++---
>  6 files changed, 46 insertions(+), 7 deletions(-)
> 
> diff --git a/dog/cluster.c b/dog/cluster.c
> index 611c91d..43df232 100644
> --- a/dog/cluster.c
> +++ b/dog/cluster.c
> @@ -21,6 +21,8 @@ static struct sd_option cluster_options[] = {
>  	{'b', "store", true, "specify backend store"},
>  	{'c', "copies", true, "specify the default data redundancy (number of copies)"},
>  	{'f', "force", false, "do not prompt for confirmation"},
> +	{'t', "strict", false,
> +	 "do not serve write request if number of nodes is not sufficient"},
>  	{'s', "backend", false, "show backend store information"},
>  	{ 0, NULL, false, NULL },
>  };
> @@ -30,6 +32,7 @@ static struct cluster_cmd_data {
>  	uint8_t copy_policy;
>  	bool force;
>  	bool show_store;
> +	bool strict;
>  	char name[STORE_LEN];
>  } cluster_cmd_data;
>  
> @@ -117,6 +120,8 @@ static int cluster_format(int argc, char **argv)
>  		pstrcpy(store_name, STORE_LEN, DEFAULT_STORE);
>  	hdr.data_length = strlen(store_name) + 1;
>  	hdr.flags |= SD_FLAG_CMD_WRITE;
> +	if (cluster_cmd_data.strict)
> +		hdr.cluster.flags |= SD_CLUSTER_FLAG_STRICT;
>  
>  	printf("using backend %s store\n", store_name);
>  	ret = dog_exec_req(&sd_nid, &hdr, store_name);
> @@ -552,7 +557,7 @@ static int cluster_check(int argc, char **argv)
>  static struct subcommand cluster_cmd[] = {
>  	{"info", NULL, "aprhs", "show cluster information",
>  	 NULL, CMD_NEED_NODELIST, cluster_info, cluster_options},
> -	{"format", NULL, "bcaph", "create a Sheepdog store",
> +	{"format", NULL, "bctaph", "create a Sheepdog store",
>  	 NULL, CMD_NEED_NODELIST, cluster_format, cluster_options},
>  	{"shutdown", NULL, "aph", "stop Sheepdog",
>  	 NULL, 0, cluster_shutdown, cluster_options},
> @@ -597,6 +602,9 @@ static int cluster_parser(int ch, const char *opt)
>  	case 's':
>  		cluster_cmd_data.show_store = true;
>  		break;
> +	case 't':
> +		cluster_cmd_data.strict = true;
> +		break;
>  	}
>  
>  	return 0;
> diff --git a/include/internal_proto.h b/include/internal_proto.h
> index b224c49..ac4e3f8 100644
> --- a/include/internal_proto.h
> +++ b/include/internal_proto.h
> @@ -126,6 +126,8 @@
>  #define SD_RES_CLUSTER_ERROR    0x91 /* Cluster driver error */
>  #define SD_RES_OBJ_TAKEN        0x92 /* Object ID is taken up */
>  
> +#define SD_CLUSTER_FLAG_STRICT  0x0001 /* Strict mode for write */
> +
>  enum sd_status {
>  	SD_STATUS_OK = 1,
>  	SD_STATUS_WAIT,
> diff --git a/include/sheep.h b/include/sheep.h
> index 293e057..d460d54 100644
> --- a/include/sheep.h
> +++ b/include/sheep.h
> @@ -160,7 +160,8 @@ static inline const char *sd_strerror(int err)
>  		[SD_RES_WAIT_FOR_FORMAT] = "Waiting for cluster to be formatted",
>  		[SD_RES_WAIT_FOR_JOIN] = "Waiting for other nodes to join cluster",
>  		[SD_RES_JOIN_FAILED] = "Node has failed to join cluster",
> -		[SD_RES_HALT] = "IO has halted as there are no living nodes",
> +		[SD_RES_HALT] =
> +			"IO has halted as there are not enough living nodes",
>  		[SD_RES_READONLY] = "Object is read-only",
>  
>  		/* from internal_proto.h */
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index cb47e3f..366499e 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -156,7 +156,7 @@ struct sd_req {
>  			uint64_t	ctime;
>  			uint8_t		copies;
>  			uint8_t		copy_policy;
> -			uint8_t		reserved[2];
> +			uint16_t	flags;
>  			uint32_t	tag;
>  		} cluster;
>  		struct {
> diff --git a/sheep/ops.c b/sheep/ops.c
> index 75a2565..1e9bc1e 100644
> --- a/sheep/ops.c
> +++ b/sheep/ops.c
> @@ -271,7 +271,7 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
>  
>  	sys->cinfo.nr_copies = req->cluster.copies;
>  	sys->cinfo.copy_policy = req->cluster.copy_policy;
> -	sys->cinfo.flags = req->flags;
> +	sys->cinfo.flags = req->cluster.flags;
>  	if (!sys->cinfo.nr_copies)
>  		sys->cinfo.nr_copies = SD_DEFAULT_COPIES;
>  	sys->cinfo.ctime = req->cluster.ctime;
> diff --git a/sheep/request.c b/sheep/request.c
> index 5113fca..fd54253 100644
> --- a/sheep/request.c
> +++ b/sheep/request.c
> @@ -284,6 +284,22 @@ static void queue_peer_request(struct request *req)
>  	queue_work(sys->io_wqueue, &req->work);
>  }
>  
> +/*
> + * We make sure we write the exact number of copies to honor the promise of the
> + * redundancy for strict mode. This means that after writing of targeted data,
> + * they are redundant as promised and can withstand the random node failures.
> + *
> + * For example, with a 4:2 policy, we need at least write to 6 nodes with data
> + * strip and parity strips. For non-strict mode, we allow to write successfully
> + * only if the data are written fully with 4 nodes alive.
> + */
> +static bool has_enough_zones(struct request *req)
> +{
> +	uint64_t oid = req->rq.obj.oid;
> +
> +	return req->vinfo->nr_zones >= get_vdi_copy_number(oid_to_vid(oid));
> +}
> +
>  static void queue_gateway_request(struct request *req)
>  {
>  	struct sd_req *hdr = &req->rq;
> @@ -310,13 +326,25 @@ static void queue_gateway_request(struct request *req)
>  queue_work:
>  	if (RB_EMPTY_ROOT(&req->vinfo->vroot)) {
>  		sd_err("there is no living nodes");
> -		req->rp.result = SD_RES_HALT;
> -		put_request(req);
> -		return;
> +		goto end_request;
> +	}
> +	if (sys->cinfo.flags & SD_CLUSTER_FLAG_STRICT &&
> +	    hdr->flags & SD_FLAG_CMD_WRITE &&
> +	    !(hdr->flags & SD_FLAG_CMD_RECOVERY) &&
> +	    !has_enough_zones(req)) {

I think the above condition is not correct.

1. hdr->flags & SD_FLAG_CMD_WRITE
The flag SD_FLAG_CMD_WRITE is used to indicate that a request carries
its own data. It shouldn't be treated as the condition for strict
mode; e.g. SD_OP_TRACE_ENABLE has this flag too.

This condition should be replaced with the one below:
(hdr->opcode == SD_OP_CREATE_AND_WRITE_OBJ || hdr->opcode == SD_OP_WRITE_OBJ)

2. !(hdr->flags & SD_FLAG_CMD_RECOVERY)

SD_FLAG_CMD_RECOVERY is used to indicate that a PEER_READ request is
for recovery. The flag is not related to strict mode, so I think it
can simply be removed.

Thanks,
Hitoshi