[sheepdog] [PATCH 1/2] sheep: remove SD_STATUS_HALT

Wed Aug 7 23:29:12 CEST 2013

From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>

SD_STATUS_HALT is used to avoid reading old data when there are too
few nodes and some of them fail.  However, the current implementation
already allows users to read old data when all the replicas are lost,
so it doesn't make sense to keep a halt status as a special case.

This patch removes SD_STATUS_HALT and drops the
"-m (safe|quorum|unsafe)" option from the collie "cluster format"
command.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 collie/cluster.c           |   25 +--------------------
 include/internal_proto.h   |    4 ----
 include/sheep.h            |    7 +++++-
 sheep/gateway.c            |   11 +++++++++
 sheep/group.c              |   53 ++++----------------------------------------
 sheep/ops.c                |   12 ++--------
 sheep/request.c            |    6 -----
 sheep/sheep_priv.h         |    1 -
 tests/functional/022.out   |    2 +-
 tests/functional/035       |    2 +-
 tests/functional/043       |    2 +-
 tests/functional/051       |    2 +-
 tests/functional/051.out   |    4 ++--
 tests/functional/common.rc |    2 +-
 14 files changed, 31 insertions(+), 102 deletions(-)

diff --git a/collie/cluster.c b/collie/cluster.c
index c0b3c92..b9d3cf8 100644
--- a/collie/cluster.c
+++ b/collie/cluster.c
@@ -20,7 +20,6 @@
 static struct sd_option cluster_options[] = {
 	{'b', "store", true, "specify backend store"},
 	{'c', "copies", true, "specify the default data redundancy (number of copies)"},
-	{'m', "mode", true, "mode (safe, quorum, unsafe)"},
 	{'f', "force", false, "do not prompt for confirmation"},
 
 	{ 0, NULL, false, NULL },
@@ -28,8 +27,6 @@ static struct sd_option cluster_options[] = {
 
 static struct cluster_cmd_data {
 	int copies;
-	bool nohalt;
-	bool quorum;
 	bool force;
 	char name[STORE_LEN];
 } cluster_cmd_data;
@@ -97,11 +94,6 @@ static int cluster_format(int argc, char **argv)
 
 	sd_init_req(&hdr, SD_OP_MAKE_FS);
 	hdr.cluster.copies = cluster_cmd_data.copies;
-	if (cluster_cmd_data.nohalt)
-		hdr.flags |= SD_FLAG_NOHALT;
-	if (cluster_cmd_data.quorum)
-		hdr.flags |= SD_FLAG_QUORUM;
-
 	hdr.cluster.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
 
 	if (strlen(cluster_cmd_data.name))
@@ -483,7 +475,7 @@ static int cluster_reweight(int argc, char **argv)
 static struct subcommand cluster_cmd[] = {
 	{"info", NULL, "aprh", "show cluster information",
 	 NULL, SUBCMD_FLAG_NEED_NODELIST, cluster_info, cluster_options},
-	{"format", NULL, "bcmaph", "create a Sheepdog store",
+	{"format", NULL, "bcaph", "create a Sheepdog store",
 	 NULL, 0, cluster_format, cluster_options},
 	{"shutdown", NULL, "aph", "stop Sheepdog",
 	 NULL, 0, cluster_shutdown, cluster_options},
@@ -521,21 +513,6 @@ static int cluster_parser(int ch, char *opt)
 		}
 		cluster_cmd_data.copies = copies;
 		break;
-	case 'm':
-		if (strcmp(opt, "safe") == 0) {
-			cluster_cmd_data.nohalt = false;
-			cluster_cmd_data.quorum = false;
-		} else if (strcmp(opt, "quorum") == 0) {
-			cluster_cmd_data.nohalt = false;
-			cluster_cmd_data.quorum = true;
-		} else if (strcmp(opt, "unsafe") == 0) {
-			cluster_cmd_data.nohalt = true;
-			cluster_cmd_data.quorum = false;
-		} else {
-			fprintf(stderr, "Unknown mode '%s'\n", opt);
-			exit(EXIT_FAILURE);
-		}
-		break;
 	case 'f':
 		cluster_cmd_data.force = true;
 		break;
diff --git a/include/internal_proto.h b/include/internal_proto.h
index b172065..7db954e 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -104,13 +104,9 @@
 #define SD_RES_STALE_OBJ        0x90 /* Object may be stale */
 #define SD_RES_CLUSTER_ERROR    0x91 /* Cluster driver error */
 
-#define SD_FLAG_NOHALT       0x0004 /* Serve the IO rquest even lack of nodes */
-#define SD_FLAG_QUORUM       0x0008 /* Serve the IO rquest as long we are quorate */
-
 #define SD_STATUS_OK                0x00000001
 #define SD_STATUS_WAIT              0x00000004
 #define SD_STATUS_SHUTDOWN          0x00000008
-#define SD_STATUS_HALT              0x00000020
 #define SD_STATUS_KILLED            0x00000040
 
 struct node_id {
diff --git a/include/sheep.h b/include/sheep.h
index e3f7755..0577ade 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -69,6 +69,8 @@ static inline int get_vnode_first_idx(const struct sd_vnode *entries,
 	uint64_t id = fnv_64a_buf(&oid, sizeof(oid), FNV1A_64_INIT);
 	int start, end, pos;
 
+	assert(nr_entries > 0);
+
 	start = 0;
 	end = nr_entries - 1;
 
@@ -160,6 +162,9 @@ static inline void oid_to_vnodes(const struct sd_vnode *entries, int nr_entries,
 {
 	int idx, idxs[SD_MAX_COPIES], i;
 
+	if (nr_entries == 0)
+		return;
+
 	idx = get_vnode_first_idx(entries, nr_entries, oid);
 	idxs[0] = idx;
 	vnodes[0] = &entries[idx];
@@ -213,7 +218,7 @@ static inline const char *sd_strerror(int err)
 		[SD_RES_WAIT_FOR_FORMAT] = "Waiting for cluster to be formatted",
 		[SD_RES_WAIT_FOR_JOIN] = "Waiting for other nodes to join cluster",
 		[SD_RES_JOIN_FAILED] = "Node has failed to join cluster",
-		[SD_RES_HALT] = "IO has halted as there are too few living nodes",
+		[SD_RES_HALT] = "IO has halted as there are no living nodes",
 		[SD_RES_READONLY] = "Object is read-only",
 
 		/* from internal_proto.h */
diff --git a/sheep/gateway.c b/sheep/gateway.c
index fb7e9a7..7cb6169 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -42,6 +42,12 @@ int gateway_read_obj(struct request *req)
 	}
 
 	nr_copies = get_req_copy_number(req);
+
+	if (nr_copies == 0) {
+		sd_dprintf("there is no living nodes");
+		return SD_RES_HALT;
+	}
+
 	oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid,
 		      nr_copies, obj_vnodes);
 	for (i = 0; i < nr_copies; i++) {
@@ -268,6 +274,11 @@ static int gateway_forward_request(struct request *req)
 	nr_to_send = init_target_nodes(req, oid, target_nodes);
 	write_info_init(&wi, nr_to_send);
 
+	if (nr_to_send == 0) {
+		sd_dprintf("there is no living nodes");
+		return SD_RES_HALT;
+	}
+
 	for (i = 0; i < nr_to_send; i++) {
 		struct sockfd *sfd;
 		const struct node_id *nid;
diff --git a/sheep/group.c b/sheep/group.c
index 474e48a..c049970 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -60,35 +60,6 @@ static int get_zones_nr_from(const struct sd_node *nodes, int nr_nodes)
 	return nr_zones;
 }
 
-bool have_enough_zones(void)
-{
-	int max_copies;
-	struct vnode_info *cur_vinfo = main_thread_get(current_vnode_info);
-
-	if (sys->cinfo.flags & SD_FLAG_NOHALT)
-		return true;
-
-	if (!cur_vinfo)
-		return false;
-
-	max_copies = get_max_copy_number();
-
-	sd_dprintf("flags %d, nr_zones %d, min copies %d",
-		   sys->cinfo.flags, cur_vinfo->nr_zones, max_copies);
-
-	if (!cur_vinfo->nr_zones)
-		return false;
-
-	if (sys->cinfo.flags & SD_FLAG_QUORUM) {
-		if (cur_vinfo->nr_zones > (max_copies/2))
-			return true;
-	} else {
-		if (cur_vinfo->nr_zones >= max_copies)
-			return true;
-	}
-	return false;
-}
-
 static int get_node_idx(struct vnode_info *vnode_info, struct sd_node *ent)
 {
 	ent = xbsearch(ent, vnode_info->nodes, vnode_info->nr_nodes, node_cmp);
@@ -687,17 +658,13 @@ static void update_cluster_info(const struct join_message *msg,
 
 	get_vdis(nodes, nr_nodes, joined);
 
-	switch (msg->cluster_status) {
-	case SD_STATUS_OK:
-	case SD_STATUS_HALT:
+	if (msg->cluster_status == SD_STATUS_OK) {
 		if (sys->status == SD_STATUS_WAIT) {
 			if (!is_cluster_formatted())
 				/* initialize config file */
 				set_cluster_config(&sys->cinfo);
 		}
 
-		sys->status = msg->cluster_status;
-
 		if (nr_nodes != msg->cinfo.nr_nodes) {
 			int ret = inc_and_log_epoch();
 			if (ret != 0)
@@ -715,15 +682,10 @@ static void update_cluster_info(const struct join_message *msg,
 			start_recovery(main_thread_get(current_vnode_info),
 				       main_thread_get(current_vnode_info),
 				       false);
-
-		if (have_enough_zones())
-			sys->status = SD_STATUS_OK;
-		break;
-	default:
-		sys->status = msg->cluster_status;
-		break;
 	}
 
+	sys->status = msg->cluster_status;
+
 	put_vnode_info(old_vnode_info);
 
 	sockfd_cache_add(&joined->nid);
@@ -981,19 +943,12 @@ void sd_leave_handler(const struct sd_node *left, const struct sd_node *members,
 	old_vnode_info = main_thread_get(current_vnode_info);
 	main_thread_set(current_vnode_info,
 			  alloc_vnode_info(members, nr_members));
-	switch (sys->status) {
-	case SD_STATUS_HALT:
-	case SD_STATUS_OK:
+	if (sys->status == SD_STATUS_OK) {
 		ret = inc_and_log_epoch();
 		if (ret != 0)
 			panic("cannot log current epoch %d", sys->cinfo.epoch);
 		start_recovery(main_thread_get(current_vnode_info),
 			       old_vnode_info, true);
-		if (!have_enough_zones())
-			sys->status = SD_STATUS_HALT;
-		break;
-	default:
-		break;
 	}
 
 	put_vnode_info(old_vnode_info);
diff --git a/sheep/ops.c b/sheep/ops.c
index c25aead..1f1f702 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -264,10 +264,7 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
 	if (ret)
 		return SD_RES_EIO;
 
-	if (have_enough_zones())
-		sys->status = SD_STATUS_OK;
-	else
-		sys->status = SD_STATUS_HALT;
+	sys->status = SD_STATUS_OK;
 
 	return SD_RES_SUCCESS;
 }
@@ -448,8 +445,6 @@ out:
 			return SD_RES_WAIT_FOR_JOIN;
 	case SD_STATUS_SHUTDOWN:
 		return SD_RES_SHUTDOWN;
-	case SD_STATUS_HALT:
-		return SD_RES_HALT;
 	default:
 		return SD_RES_SYSTEM_ERROR;
 	}
@@ -544,10 +539,7 @@ static int cluster_force_recover_main(const struct sd_req *req,
 		/* initialize config file */
 		set_cluster_config(&sys->cinfo);
 
-	if (have_enough_zones())
-		sys->status = SD_STATUS_OK;
-	else
-		sys->status = SD_STATUS_HALT;
+	sys->status = SD_STATUS_OK;
 
 	vnode_info = get_vnode_info();
 	old_vnode_info = alloc_vnode_info(nodes, nr_nodes);
diff --git a/sheep/request.c b/sheep/request.c
index 1cc5a9d..ee1e987 100644
--- a/sheep/request.c
+++ b/sheep/request.c
@@ -362,12 +362,6 @@ static void queue_request(struct request *req)
 			goto done;
 		}
 		break;
-	case SD_STATUS_HALT:
-		if (!is_force_op(req->op)) {
-			rsp->result = SD_RES_HALT;
-			goto done;
-		}
-		break;
 	default:
 		break;
 	}
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index d54b85e..1652218 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -269,7 +269,6 @@ int get_vdi_attr(struct sheepdog_vdi_attr *vattr, int data_len, uint32_t vid,
 int local_get_node_list(const struct sd_req *req, struct sd_rsp *rsp,
 		void *data);
 
-bool have_enough_zones(void);
 struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info);
 struct vnode_info *get_vnode_info(void);
 void put_vnode_info(struct vnode_info *vinfo);
diff --git a/tests/functional/022.out b/tests/functional/022.out
index fd78a89..e26e19b 100644
--- a/tests/functional/022.out
+++ b/tests/functional/022.out
@@ -1,4 +1,4 @@
 QA output created by 022
 using backend plain store
 creating a VDI should fail without data nodes available
-Failed to create VDI test: IO has halted as there are too few living nodes
+Failed to create VDI test: Failed to write to requested VDI
diff --git a/tests/functional/035 b/tests/functional/035
index bcc044a..2f3d236 100755
--- a/tests/functional/035
+++ b/tests/functional/035
@@ -32,7 +32,7 @@ done
 
 _wait_for_sheep 6
 
-_cluster_format -c 3 -m unsafe
+_cluster_format -c 3
 
 $COLLIE vdi create test 40M
 _random | $COLLIE vdi write test &
diff --git a/tests/functional/043 b/tests/functional/043
index 0d26638..73aec75 100755
--- a/tests/functional/043
+++ b/tests/functional/043
@@ -21,7 +21,7 @@ done
 
 _wait_for_sheep 5
 
-_cluster_format -m unsafe
+_cluster_format
 
 $COLLIE vdi create test 40M
 
diff --git a/tests/functional/051 b/tests/functional/051
index 799fe70..f9ca569 100755
--- a/tests/functional/051
+++ b/tests/functional/051
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Test cluster halt
+# Test cluster with too few living nodes
 
 seq=`basename $0`
 echo "QA output created by $seq"
diff --git a/tests/functional/051.out b/tests/functional/051.out
index e7b5411..417e60f 100644
--- a/tests/functional/051.out
+++ b/tests/functional/051.out
@@ -1,6 +1,6 @@
 QA output created by 051
 using backend plain store
-Cluster status: IO has halted as there are too few living nodes
+Cluster status: running, auto-recovery enabled
 
 Cluster created at DATE
 
@@ -13,7 +13,7 @@ Cluster created at DATE
 Epoch Time           Version
 DATE      2 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
 DATE      1 [127.0.0.1:7000, 127.0.0.1:7001]
-Cluster status: IO has halted as there are too few living nodes
+Cluster status: running, auto-recovery enabled
 
 Cluster created at DATE
 
diff --git a/tests/functional/common.rc b/tests/functional/common.rc
index 0ea0be0..50a2e69 100644
--- a/tests/functional/common.rc
+++ b/tests/functional/common.rc
@@ -470,7 +470,7 @@ _cluster_format()
     for port in $ports; do
 	local cnt
 	for cnt in `seq 10`; do # wait at most 10 seconds
-	    $COLLIE cluster info -p $port | grep -E running\|halt > /dev/null
+	    $COLLIE cluster info -p $port | grep -E running > /dev/null
 	    if [ $? == 0 ]; then
 		continue 2
 	    fi
-- 
1.7.9.5