[Sheepdog] [PATCH 1/4] sheep: add transient failure detection

Thu Mar 15 10:46:47 CET 2012

When a node leaves the cluster, data recovery is triggered at once.

Sometimes we need to restart the sheepdog daemon or the machine for an
upgrade or some other purpose, and sometimes the machine simply crashes.

Sometimes the synchronization driver used by sheepdog does not work
well.  For example, the corosync driver sometimes loses its token and
tells sheepdog that some nodes have left although they never did.

These events can be regarded as transient failures. Sheepdog should
tolerate these events. For these events, sheepdog will not start recovery
but will wait for the node to join again. If the node joins again, its
role in the group should not change.

If the node does not join again for some time, sheepdog will start
recovery.

The basic idea is to start a timer when a node leaves the cluster.
After the timer expires, check the status of the node that left. When a
node joins the cluster, check whether it is a newcomer or an existing one.

Here are the changes to the header files.

Signed-off-by: HaiTing Yao <wujue.yht at taobao.com>
---
 include/sheep.h    |   12 ++++++++++++
 sheep/cluster.h    |    8 ++++++++
 sheep/sheep_priv.h |   10 ++++++++++
 3 files changed, 30 insertions(+), 0 deletions(-)

diff --git a/include/sheep.h b/include/sheep.h
index e435b63..2b9260a 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -62,6 +62,8 @@
 #define SD_RES_NOT_FORMATTED 0x43 /* Sheepdog is not formatted yet */
 #define SD_RES_INVALID_CTIME 0x44 /* Creation time of sheepdog is different */
 #define SD_RES_INVALID_EPOCH 0x45 /* Invalid epoch */
+#define SD_RES_NODE_COME_BACK 0x46 /* node came back again after temporary failure */
+#define SD_RES_CLUSTER_TEMP_FAILURE 0x47 /* cluster has temporarily failed node */
 
 #define SD_FLAG_NOHALT       0x0004 /* Serve the IO rquest even lack of nodes */
 
@@ -141,11 +143,17 @@ struct sd_node_rsp {
 	uint64_t	store_free;
 };
 
+enum node_vnode_status {
+	NODE_STATUS_NORMAL,
+	NODE_STATUS_FAIL,
+};
+
 struct sd_node {
 	uint8_t         addr[16];
 	uint16_t        port;
 	uint16_t	nr_vnodes;
 	uint32_t	zone;
+	uint32_t	status;
 };
 
 struct sd_vnode {
@@ -154,6 +162,7 @@ struct sd_vnode {
 	uint16_t        port;
 	uint16_t	node_idx;
 	uint32_t	zone;
+	uint32_t	status;
 };
 
 struct epoch_log {
@@ -275,6 +284,7 @@ static inline const char *sd_strerror(int err)
 		{SD_RES_NOT_FORMATTED, "Cluster has not been formatted"},
 		{SD_RES_INVALID_CTIME, "Creation times differ"},
 		{SD_RES_INVALID_EPOCH, "Invalid epoch"},
+		{SD_RES_CLUSTER_TEMP_FAILURE, "Nodes occur temporary failure"},
 	};
 
 	for (i = 0; i < ARRAY_SIZE(errors); ++i)
@@ -348,4 +358,6 @@ static inline int nodes_to_vnodes(struct sd_node *nodes, int nr,
 	return nr_vnodes;
 }
 
+int temp_failure_enabled(void);
+
 #endif
diff --git a/sheep/cluster.h b/sheep/cluster.h
index b50dbb2..58811e6 100644
--- a/sheep/cluster.h
+++ b/sheep/cluster.h
@@ -29,6 +29,7 @@ enum cluster_join_result {
 	CJ_RES_MASTER_TRANSFER, /* Transfer mastership.  The joining
 				 * node has a newer epoch, so this node
 				 * will leave the cluster (restart later). */
+	CJ_RES_COME_BACK, /* node come back after reboot */
 };
 
 struct cdrv_handlers {
@@ -114,6 +115,13 @@ struct cluster_driver {
 	 */
 	int (*dispatch)(void);
 
+	/*
+	 * remove one useless node
+	 *
+	 * Returns zero on success, -1 on error
+	 */
+	int (*remove_node)(void *node);
+
 	struct list_head list;
 };
 
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index db70c57..fa2eec7 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -48,6 +48,14 @@ struct cpg_event {
 	struct list_head cpg_event_list;
 };
 
+#define SD_MAX_NODES_TEMP_FAIL (SD_MAX_NODES - 1)
+
+struct temp_failure {
+	int busy;
+	struct sd_node node;
+	struct timer leave_timer;
+};
+
 struct client_info {
 	struct connection conn;
 
@@ -147,6 +155,8 @@ struct cluster_info {
 
 	int use_directio;
 
+	uint32_t templeft_time;
+
 	struct work_queue *cpg_wqueue;
 	struct work_queue *gateway_wqueue;
 	struct work_queue *io_wqueue;
-- 
1.7.1