[Sheepdog] [PATCH v2 3/5] sheep: introduce SD_STATUS_HALT

Liu Yuan namei.unix at gmail.com
Sun Oct 16 12:35:13 CEST 2011


From: Liu Yuan <tailai.ly at taobao.com>

Currently, sheepdog serves IO requests even if the number of nodes is less than 'copies'.

When the number of nodes (or zones) is less than the number of copies specified by
the 'collie cluster format' command, the sheepdog cluster should stop serving IO requests.
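
Just to illustrate the intent (this patch only introduces the new status; the
helper name get_zones_nr() and the exact call site below are assumptions, not
part of this series):

	/* Sketch: enter or leave SD_STATUS_HALT depending on whether the
	 * number of living zones still satisfies the redundancy level
	 * (sys->nr_sobjs).  get_zones_nr() is a hypothetical helper that
	 * counts the distinct zones in the current node list. */
	static void update_cluster_status(void)
	{
		int nr_zones = get_zones_nr();

		if (nr_zones < sys->nr_sobjs)
			sys->status = SD_STATUS_HALT;	/* too few zones, fence IO */
		else if (sys->status == SD_STATUS_HALT)
			sys->status = SD_STATUS_OK;	/* enough zones again, resume IO */
	}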

This is necessary to handle the following subtle case:

+: good node, -: failed node

stage:  0       1      2     3
        +       -      -     +
        +  -->  - -->  - --> +
        +       +      -     #  <-- permanently down
                ^
                |
        this node has the latest data

At stage 3, the cluster would recover without the latest data, which was held
only by the node that is now permanently down.

While the cluster is in SD_STATUS_HALT, sheepdog still serves configuration changes
(node joins and leaves) and performs recovery; only IO requests are rejected, with SD_RES_HALT.
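
For instance, the request dispatch only needs to fence IO up front (a sketch;
is_io_op() and the hdr variable are placeholders -- the actual gating added by
this patch is the SD_STATUS_HALT case in cluster_queue_request() below):

	/* Sketch: while halted, IO requests get SD_RES_HALT immediately,
	 * whereas membership changes and recovery keep running. */
	if (sys->status == SD_STATUS_HALT && is_io_op(hdr->opcode)) {
		ret = SD_RES_HALT;
		goto done;
	}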

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 include/sheep.h          |    1 +
 include/sheepdog_proto.h |    1 +
 sheep/group.c            |   34 +++++++++++++++++++++++-----------
 sheep/sheep_priv.h       |    1 +
 4 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/include/sheep.h b/include/sheep.h
index 31516d9..230917f 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -254,6 +254,7 @@ static inline const char *sd_strerror(int err)
 		{SD_RES_WAIT_FOR_FORMAT, "Waiting for a format operation"},
 		{SD_RES_WAIT_FOR_JOIN, "Waiting for other nodes joining"},
 		{SD_RES_JOIN_FAILED, "The node had failed to join sheepdog"},
+		{SD_RES_HALT, "IO has halted as there are too few living nodes"},
 
 		{SD_RES_OLD_NODE_VER, "Remote node has an old epoch"},
 		{SD_RES_NEW_NODE_VER, "Remote node has a new epoch"},
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 2b042f4..9467c44 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -58,6 +58,7 @@
 #define SD_RES_WAIT_FOR_FORMAT  0x16 /* Sheepdog is waiting for a format operation */
 #define SD_RES_WAIT_FOR_JOIN    0x17 /* Sheepdog is waiting for other nodes joining */
 #define SD_RES_JOIN_FAILED   0x18 /* Target node had failed to join sheepdog */
+#define SD_RES_HALT 0x19 /* Sheepdog has stopped serving IO */
 
 /*
  * Object ID rules
diff --git a/sheep/group.c b/sheep/group.c
index 6a94f01..b905c5d 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -335,6 +335,9 @@ void cluster_queue_request(struct work *work, int idx)
 		case SD_STATUS_JOIN_FAILED:
 			ret = SD_RES_JOIN_FAILED;
 			break;
+		case SD_STATUS_HALT:
+			ret = SD_RES_HALT;
+			break;
 		default:
 			ret = SD_RES_SYSTEM_ERROR;
 			break;
@@ -534,7 +537,7 @@ static int cluster_sanity_check(struct sheepdog_node_list_entry *entries,
 	uint32_t lepoch;
 
 	if (sys->status == SD_STATUS_WAIT_FOR_FORMAT ||
-	    sys->status == SD_STATUS_SHUTDOWN ||
+	    sys->status == SD_STATUS_SHUTDOWN)
 		goto out;
 	/* When the joinning node is newly created, we need to check nothing. */
 	if (nr_entries == 0)
@@ -551,7 +554,7 @@ static int cluster_sanity_check(struct sheepdog_node_list_entry *entries,
 		goto out;
 	}
 
-	if (sys->status == SD_STATUS_OK)
+	if (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT)
 		goto out;
 
 	if (epoch < lepoch) {
@@ -593,6 +596,7 @@ static int get_cluster_status(struct sheepdog_node_list_entry *from,
 		goto out;
 
 	switch (sys->status) {
+	case SD_STATUS_HALT:
 	case SD_STATUS_OK:
 		if (inc_epoch)
 			*inc_epoch = 1;
@@ -778,7 +782,7 @@ static void update_cluster_info(struct join_message *msg)
 	int i;
 	int ret, nr_nodes = msg->nr_nodes;
 
-	eprintf("status = %d, epoch = %d, %d, %d\n", msg->cluster_status, msg->epoch, msg->result, sys->join_finished);
+	eprintf("status = %d, epoch = %d, %x, %d\n", msg->cluster_status, msg->epoch, msg->result, sys->join_finished);
 	if (msg->result != SD_RES_SUCCESS) {
 		if (is_myself(msg->header.from.addr, msg->header.from.port)) {
 			eprintf("failed to join sheepdog, %d\n", msg->result);
@@ -812,12 +816,13 @@ static void update_cluster_info(struct join_message *msg)
 				sheepid_to_str(&msg->nodes[i].sheepid));
 	}
 
-	if (msg->cluster_status != SD_STATUS_OK)
+	if (msg->cluster_status == SD_STATUS_WAIT_FOR_JOIN)
 		add_node_to_leave_list((struct message_header *)msg);
 
 	sys->join_finished = 1;
 
-	if (msg->cluster_status == SD_STATUS_OK && msg->inc_epoch)
+	if ((msg->cluster_status == SD_STATUS_OK || msg->cluster_status == SD_STATUS_HALT)
+	     && msg->inc_epoch)
 		update_epoch_log(sys->epoch);
 
 join_finished:
@@ -830,13 +835,16 @@ join_finished:
 		vprintf(SDOG_ERR, "%s has gone\n",
 			sheepid_to_str(&msg->header.sheepid));
 
-	if (msg->cluster_status == SD_STATUS_OK) {
+	if (msg->cluster_status == SD_STATUS_OK ||
+	    msg->cluster_status == SD_STATUS_HALT) {
 		if (msg->inc_epoch) {
 			sys->epoch++;
 			update_epoch_log(sys->epoch);
 			update_epoch_store(sys->epoch);
 		}
-		if (sys->status != SD_STATUS_OK) {
+
+		if (sys->status != SD_STATUS_OK &&
+		    sys->status != SD_STATUS_HALT) {
 			set_global_nr_copies(sys->nr_sobjs);
 			set_cluster_ctime(msg->ctime);
 		}
@@ -1079,7 +1087,8 @@ static void send_join_response(struct work_notify *w)
 	m->state = DM_FIN;
 
 	dprintf("%d, %d\n", jm->result, jm->cluster_status);
-	if (jm->result == SD_RES_SUCCESS && jm->cluster_status != SD_STATUS_OK) {
+	if (jm->result == SD_RES_SUCCESS &&
+	    jm->cluster_status == SD_STATUS_WAIT_FOR_JOIN) {
 		jm->nr_leave_nodes = 0;
 		list_for_each_entry(node, &sys->leave_list, list) {
 			jm->leave_nodes[jm->nr_leave_nodes].sheepid = node->sheepid;
@@ -1185,7 +1194,8 @@ static void __sd_notify_done(struct cpg_event *cevent)
 		}
 	}
 
-	if (do_recovery && sys->status == SD_STATUS_OK) {
+	if (do_recovery &&
+	    (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT)) {
 		list_for_each_entry_safe(node, t, &sys->leave_list, list) {
 			list_del(&node->list);
 		}
@@ -1257,7 +1267,8 @@ static int del_node(struct sheepid *id)
 		list_del(&node->list);
 		free(node);
 
-		if (sys->status == SD_STATUS_OK) {
+		if (sys->status == SD_STATUS_OK ||
+		    sys->status == SD_STATUS_HALT) {
 			nr = get_ordered_sd_node_list(e);
 			dprintf("update epoch, %d, %d\n", sys->epoch + 1, nr);
 			epoch_log_write(sys->epoch + 1, (char *)e,
@@ -1427,7 +1438,8 @@ static void __sd_leave_done(struct cpg_event *cevent)
 
 	print_node_list(&sys->sd_node_list);
 
-	if (node_left && sys->status == SD_STATUS_OK)
+	if (node_left &&
+	    (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT))
 		start_recovery(sys->epoch);
 }
 
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index e2fcb40..355cd93 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -37,6 +37,7 @@
 #define SD_STATUS_WAIT_FOR_JOIN     0x02
 #define SD_STATUS_SHUTDOWN          0x03
 #define SD_STATUS_JOIN_FAILED       0x04
+#define SD_STATUS_HALT              0x05
 
 #define SD_RES_NETWORK_ERROR    0x81 /* Network error between sheeps */
 
-- 
1.7.6.1



