[Sheepdog] [PATCH 1/2] sheep: introduce SD_STATUS_HALT

Tue Oct 11 11:06:16 CEST 2011

From: Liu Yuan <tailai.ly at taobao.com>

Currently, sheepdog will serve IO requests even if the number of nodes is less than 'copies'.

When the number of nodes (or zones) is less than the number of copies specified by
the collie-cluster-format command, the sheepdog cluster should stop serving IO requests.

This is necessary to handle the following subtle case:

+ good nodes, - failed nodes.

0       1      2     3
+       -      -     +
+  -->  - -->  - --> +
+       +      -     # <-- permanently down.
        ^
        |
this node has the latest data

At stage 3, we will have a cluster recovered without the data tracked at stage 1.

When the nodes are in the SD_STATUS_HALT state, sheepdog can still serve configuration
changes and do the recovery job.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 include/sheep.h          |    1 +
 include/sheepdog_proto.h |    1 +
 sheep/group.c            |   27 ++++++++++++++++++++++-----
 sheep/sheep_priv.h       |    1 +
 4 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/include/sheep.h b/include/sheep.h
index 31516d9..943cdf7 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -254,6 +254,7 @@ static inline const char *sd_strerror(int err)
 		{SD_RES_WAIT_FOR_FORMAT, "Waiting for a format operation"},
 		{SD_RES_WAIT_FOR_JOIN, "Waiting for other nodes joining"},
 		{SD_RES_JOIN_FAILED, "The node had failed to join sheepdog"},
+		{SD_RES_HALT, "The node is stopped doing IO, short of living nodes"},
 
 		{SD_RES_OLD_NODE_VER, "Remote node has an old epoch"},
 		{SD_RES_NEW_NODE_VER, "Remote node has a new epoch"},
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 2b042f4..a5a41d0 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -58,6 +58,7 @@
 #define SD_RES_WAIT_FOR_FORMAT  0x16 /* Sheepdog is waiting for a format operation */
 #define SD_RES_WAIT_FOR_JOIN    0x17 /* Sheepdog is waiting for other nodes joining */
 #define SD_RES_JOIN_FAILED   0x18 /* Target node had failed to join sheepdog */
+#define SD_RES_HALT 0x19 /* Target node is stopped doing IO */
 
 /*
  * Object ID rules
diff --git a/sheep/group.c b/sheep/group.c
index f6743f5..59293b2 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -335,6 +335,9 @@ void cluster_queue_request(struct work *work, int idx)
 		case SD_STATUS_JOIN_FAILED:
 			ret = SD_RES_JOIN_FAILED;
 			break;
+		case SD_STATUS_HALT:
+			ret = SD_RES_HALT;
+			break;
 		default:
 			ret = SD_RES_SYSTEM_ERROR;
 			break;
@@ -639,6 +642,10 @@ static int get_cluster_status(struct sheepdog_node_list_entry *from,
 		break;
 	case SD_STATUS_SHUTDOWN:
 		return SD_RES_SHUTDOWN;
+	case SD_STATUS_HALT:
+		if (inc_epoch)
+			*inc_epoch = 1;
+		break;
 	default:
 		break;
 	}
@@ -810,12 +817,13 @@ static void update_cluster_info(struct join_message *msg)
 				sheepid_to_str(&msg->nodes[i].sheepid));
 	}
 
-	if (msg->cluster_status != SD_STATUS_OK)
+	if (msg->cluster_status == SD_STATUS_WAIT_FOR_JOIN)
 		add_node_to_leave_list((struct message_header *)msg);
 
 	sys->join_finished = 1;
 
-	if (msg->cluster_status == SD_STATUS_OK && msg->inc_epoch)
+	if ((msg->cluster_status == SD_STATUS_OK || msg->cluster_status == SD_STATUS_HALT)
+	     && msg->inc_epoch)
 		update_epoch_log(sys->epoch);
 
 join_finished:
@@ -840,6 +848,12 @@ join_finished:
 		}
 	}
 
+	if (msg->cluster_status == SD_STATUS_HALT && msg->inc_epoch) {
+		sys->epoch++;
+		update_epoch_log(sys->epoch);
+		update_epoch_store(sys->epoch);
+	}
+
 	print_node_list(&sys->sd_node_list);
 
 	sys->status = msg->cluster_status;
@@ -1077,7 +1091,8 @@ static void send_join_response(struct work_notify *w)
 	m->state = DM_FIN;
 
 	dprintf("%d, %d\n", jm->result, jm->cluster_status);
-	if (jm->result == SD_RES_SUCCESS && jm->cluster_status != SD_STATUS_OK) {
+	if (jm->result == SD_RES_SUCCESS &&
+	    jm->cluster_status == SD_STATUS_WAIT_FOR_JOIN) {
 		jm->nr_leave_nodes = 0;
 		list_for_each_entry(node, &sys->leave_list, list) {
 			jm->leave_nodes[jm->nr_leave_nodes].sheepid = node->sheepid;
@@ -1181,7 +1196,8 @@ static void __sd_notify_done(struct cpg_event *cevent)
 		}
 	}
 
-	if (do_recovery && sys->status == SD_STATUS_OK) {
+	if (do_recovery &&
+	    (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT)) {
 		list_for_each_entry_safe(node, t, &sys->leave_list, list) {
 			list_del(&node->list);
 		}
@@ -1423,7 +1439,8 @@ static void __sd_leave_done(struct cpg_event *cevent)
 
 	print_node_list(&sys->sd_node_list);
 
-	if (node_left && sys->status == SD_STATUS_OK)
+	if (node_left &&
+	    (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT))
 		start_recovery(sys->epoch);
 }
 
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index e2fcb40..355cd93 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -37,6 +37,7 @@
 #define SD_STATUS_WAIT_FOR_JOIN     0x02
 #define SD_STATUS_SHUTDOWN          0x03
 #define SD_STATUS_JOIN_FAILED       0x04
+#define SD_STATUS_HALT              0x05
 
 #define SD_RES_NETWORK_ERROR    0x81 /* Network error between sheeps */
 
-- 
1.7.6.1