[sheepdog] [PATCH 1/2] sheep: introduce strict mode for write

Liu Yuan namei.unix at gmail.com
Tue Dec 10 08:10:36 CET 2013


In "strict mode" we make sure we write the exact number of copies to honor the
promised redundancy. This means that once the targeted data is written, it is
as redundant as promised and can withstand random node failures.

For example, with a 4:2 policy, we need to write to at least 6 nodes (data
strips plus parity strips). In non-strict mode, we allow the write to succeed
provided the data is fully written, which needs only 4 nodes alive.

Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
 dog/cluster.c            |   10 +++++++++-
 include/internal_proto.h |    2 ++
 include/sheep.h          |    3 ++-
 include/sheepdog_proto.h |    2 +-
 sheep/ops.c              |    2 +-
 sheep/request.c          |   34 +++++++++++++++++++++++++++++++---
 6 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/dog/cluster.c b/dog/cluster.c
index 611c91d..43df232 100644
--- a/dog/cluster.c
+++ b/dog/cluster.c
@@ -21,6 +21,8 @@ static struct sd_option cluster_options[] = {
 	{'b', "store", true, "specify backend store"},
 	{'c', "copies", true, "specify the default data redundancy (number of copies)"},
 	{'f', "force", false, "do not prompt for confirmation"},
+	{'t', "strict", false,
+	 "do not serve write request if number of nodes is not sufficient"},
 	{'s', "backend", false, "show backend store information"},
 	{ 0, NULL, false, NULL },
 };
@@ -30,6 +32,7 @@ static struct cluster_cmd_data {
 	uint8_t copy_policy;
 	bool force;
 	bool show_store;
+	bool strict;
 	char name[STORE_LEN];
 } cluster_cmd_data;
 
@@ -117,6 +120,8 @@ static int cluster_format(int argc, char **argv)
 		pstrcpy(store_name, STORE_LEN, DEFAULT_STORE);
 	hdr.data_length = strlen(store_name) + 1;
 	hdr.flags |= SD_FLAG_CMD_WRITE;
+	if (cluster_cmd_data.strict)
+		hdr.cluster.flags |= SD_CLUSTER_FLAG_STRICT;
 
 	printf("using backend %s store\n", store_name);
 	ret = dog_exec_req(&sd_nid, &hdr, store_name);
@@ -552,7 +557,7 @@ static int cluster_check(int argc, char **argv)
 static struct subcommand cluster_cmd[] = {
 	{"info", NULL, "aprhs", "show cluster information",
 	 NULL, CMD_NEED_NODELIST, cluster_info, cluster_options},
-	{"format", NULL, "bcaph", "create a Sheepdog store",
+	{"format", NULL, "bctaph", "create a Sheepdog store",
 	 NULL, CMD_NEED_NODELIST, cluster_format, cluster_options},
 	{"shutdown", NULL, "aph", "stop Sheepdog",
 	 NULL, 0, cluster_shutdown, cluster_options},
@@ -597,6 +602,9 @@ static int cluster_parser(int ch, const char *opt)
 	case 's':
 		cluster_cmd_data.show_store = true;
 		break;
+	case 't':
+		cluster_cmd_data.strict = true;
+		break;
 	}
 
 	return 0;
diff --git a/include/internal_proto.h b/include/internal_proto.h
index b224c49..ac4e3f8 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -126,6 +126,8 @@
 #define SD_RES_CLUSTER_ERROR    0x91 /* Cluster driver error */
 #define SD_RES_OBJ_TAKEN        0x92 /* Object ID is taken up */
 
+#define SD_CLUSTER_FLAG_STRICT  0x0001 /* Strict mode for write */
+
 enum sd_status {
 	SD_STATUS_OK = 1,
 	SD_STATUS_WAIT,
diff --git a/include/sheep.h b/include/sheep.h
index 293e057..d460d54 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -160,7 +160,8 @@ static inline const char *sd_strerror(int err)
 		[SD_RES_WAIT_FOR_FORMAT] = "Waiting for cluster to be formatted",
 		[SD_RES_WAIT_FOR_JOIN] = "Waiting for other nodes to join cluster",
 		[SD_RES_JOIN_FAILED] = "Node has failed to join cluster",
-		[SD_RES_HALT] = "IO has halted as there are no living nodes",
+		[SD_RES_HALT] =
+			"IO has halted as there are not enough living nodes",
 		[SD_RES_READONLY] = "Object is read-only",
 
 		/* from internal_proto.h */
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index cb47e3f..366499e 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -156,7 +156,7 @@ struct sd_req {
 			uint64_t	ctime;
 			uint8_t		copies;
 			uint8_t		copy_policy;
-			uint8_t		reserved[2];
+			uint16_t	flags;
 			uint32_t	tag;
 		} cluster;
 		struct {
diff --git a/sheep/ops.c b/sheep/ops.c
index 75a2565..1e9bc1e 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -271,7 +271,7 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
 
 	sys->cinfo.nr_copies = req->cluster.copies;
 	sys->cinfo.copy_policy = req->cluster.copy_policy;
-	sys->cinfo.flags = req->flags;
+	sys->cinfo.flags = req->cluster.flags;
 	if (!sys->cinfo.nr_copies)
 		sys->cinfo.nr_copies = SD_DEFAULT_COPIES;
 	sys->cinfo.ctime = req->cluster.ctime;
diff --git a/sheep/request.c b/sheep/request.c
index 5113fca..fd54253 100644
--- a/sheep/request.c
+++ b/sheep/request.c
@@ -284,6 +284,22 @@ static void queue_peer_request(struct request *req)
 	queue_work(sys->io_wqueue, &req->work);
 }
 
+/*
+ * In strict mode we make sure we write the exact number of copies to honor the
+ * promised redundancy. This means that once the targeted data is written, it
+ * is as redundant as promised and can withstand random node failures.
+ *
+ * For example, with a 4:2 policy, we need to write to at least 6 nodes (data
+ * strips plus parity strips). In non-strict mode, we allow the write to
+ * succeed provided the data is fully written, which needs only 4 nodes alive.
+ */
+static bool has_enough_zones(struct request *req)
+{
+	uint64_t oid = req->rq.obj.oid;
+
+	return req->vinfo->nr_zones >= get_vdi_copy_number(oid_to_vid(oid));
+}
+
 static void queue_gateway_request(struct request *req)
 {
 	struct sd_req *hdr = &req->rq;
@@ -310,13 +326,25 @@ static void queue_gateway_request(struct request *req)
 queue_work:
 	if (RB_EMPTY_ROOT(&req->vinfo->vroot)) {
 		sd_err("there is no living nodes");
-		req->rp.result = SD_RES_HALT;
-		put_request(req);
-		return;
+		goto end_request;
+	}
+	if (sys->cinfo.flags & SD_CLUSTER_FLAG_STRICT &&
+	    hdr->flags & SD_FLAG_CMD_WRITE &&
+	    !(hdr->flags & SD_FLAG_CMD_RECOVERY) &&
+	    !has_enough_zones(req)) {
+		sd_err("not enough zones available");
+		goto end_request;
 	}
+
 	req->work.fn = do_process_work;
 	req->work.done = gateway_op_done;
 	queue_work(sys->gateway_wqueue, &req->work);
+	return;
+
+end_request:
+	req->rp.result = SD_RES_HALT;
+	put_request(req);
+	return;
 }
 
 static void queue_local_request(struct request *req)
-- 
1.7.9.5




More information about the sheepdog mailing list