[Sheepdog] [PATCH v2] sheep: tame sheep to recover the crash cluster

Sun Sep 25 06:00:20 CEST 2011

From: Liu Yuan <tailai.ly at taobao.com>

Hi Kazum,
        Would this solve the data loss problem you mentioned, which occurs when there is no epoch overlap? This patch allows the cluster to recover as if the master (the last failed node) had never crashed.

Thanks,
Yuan

8>---------------8>---------------
[Rationale]

Currently, we have to start up either the first failed node or the last failed one
to recover a crashed cluster (nodes with different epoch histories). This patch
simply removes this disgusting constraint.

This patch adds a new concept to the crashed cluster recovery phase: mastership transfer.

The master node in sheepdog is supposed to reply to the nodes' join requests, and hence manages
how other nodes join. It is the first node in the join stage and the last node in the crashed
cluster.

When we recover the crashed cluster, the mastership is transferred from one node to another until
the last failed node is started up. When the node is not the last failed one, one of the two nodes
simply exits, so that only one node is left in the recovery stage, holding the mastership.

After the last failed node is started up, we can join others safely with consistent epoch histories.

With this patch, no start-up order is imposed for the crashed cluster to recover. We can
do this because the epoch stored on each node contains the node with the highest epoch number.

As a side effect, the epoch transfer of leave nodes is removed. The concept of a leave node changes
slightly during crashed cluster recovery: it is now defined as "nodes that are supposed to leave and
are contained in the latest epoch".

[Test Cases]

The methods that I have tried to test this idea:

$ for i in 0 1 2; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done
$ collie/collie cluster format
$ for i in 0 1 2; do pkill -f "sheep /store/$i"; sleep 1; done
$ for i in 1 0 2; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done
$ for i in 0 2; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done
$ for i in 0 1 2; do ./collie/collie cluster info -p 700$i; done

...
2011-09-25 11:38:17      5 [192.168.0.4:7000, 192.168.0.4:7001, 192.168.0.4:7002]
2011-09-25 11:38:17      4 [192.168.0.4:7000, 192.168.0.4:7002]
2011-09-25 11:38:17      3 [192.168.0.4:7002]
2011-09-25 11:38:17      2 [192.168.0.4:7001, 192.168.0.4:7002]
2011-09-25 11:38:17      1 [192.168.0.4:7000, 192.168.0.4:7001, 192.168.0.4:7002]
...

And Kazutaka's gorgeous test case:

    for i in 0 1; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done
    ./collie/collie cluster format
    for i in 2 3 4; do
        pkill -f "sheep /store/$((i - 2))"
        ./sheep/sheep /store/$i -z $i -p 700$i
        sleep 1
    done
    for i in 3 4; do pkill -f "sheep /store/$i"; sleep 1; done
    for i in 0 1 2 3 4; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done # now master 4 is recovered
    for i in 0 1 2 3; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done
    for i in 0 1 2 3 4; do ./collie/collie cluster info -p 700$i; done

Cluster status: running

Creation time        Epoch Nodes
2011-09-25 11:47:49     12 [192.168.0.4:7000, 192.168.0.4:7001, 192.168.0.4:7002, 192.168.0.4:7003, 192.168.0.4:7004]
2011-09-25 11:47:49     11 [192.168.0.4:7000, 192.168.0.4:7001, 192.168.0.4:7002, 192.168.0.4:7004]
2011-09-25 11:47:49     10 [192.168.0.4:7000, 192.168.0.4:7001, 192.168.0.4:7004]
2011-09-25 11:47:49      9 [192.168.0.4:7000, 192.168.0.4:7004]
2011-09-25 11:47:49      8 [192.168.0.4:7004]
2011-09-25 11:47:49      7 [192.168.0.4:7003, 192.168.0.4:7004]
2011-09-25 11:47:49      6 [192.168.0.4:7003]
2011-09-25 11:47:49      5 [192.168.0.4:7002, 192.168.0.4:7003]
...

All is done with good and consistent epoch histories as expected.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 sheep/group.c      |   63 ++++++++++++++++++++++++++++++++++-----------------
 sheep/sheep_priv.h |    1 -
 2 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/sheep/group.c b/sheep/group.c
index 812f6a0..e0fc61e 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -435,10 +435,27 @@ static struct sheepdog_node_list_entry *find_entry_list(struct sheepdog_node_lis
 	return NULL;
 
 }
+
+static struct sheepdog_node_list_entry *find_entry_epoch(struct sheepdog_node_list_entry *entry,
+							 int epoch)
+{
+	struct sheepdog_node_list_entry nodes[SD_MAX_NODES];
+	int nr, i;
+
+	nr = epoch_log_read(epoch, (char *)nodes, sizeof(nodes));
+	nr /= sizeof(nodes[0]);
+
+	for (i = 0; i < nr; i++)
+		if (node_cmp(&nodes[i], entry) == 0)
+			return entry;
+
+	return NULL;
+}
+
 static int add_node_to_leave_list(struct message_header *msg)
 {
 	int ret = SD_RES_SUCCESS;
-	int nr, i;
+	int nr, i, le = get_latest_epoch();
 	LIST_HEAD(tmp_list);
 	struct node *n, *t;
 	struct join_message *jm;
@@ -450,7 +467,8 @@ static int add_node_to_leave_list(struct message_header *msg)
 			goto err;
 		}
 
-		if (find_entry_list(&msg->from, &sys->leave_list)) {
+		if (find_entry_list(&msg->from, &sys->leave_list)
+		    || !find_entry_epoch(&msg->from, le)) {
 			free(n);
 			goto ret;
 		}
@@ -471,7 +489,8 @@ static int add_node_to_leave_list(struct message_header *msg)
 				goto free;
 			}
 
-			if (find_entry_list(&jm->leave_nodes[i].ent, &sys->leave_list)) {
+			if (find_entry_list(&jm->leave_nodes[i].ent, &sys->leave_list)
+			    || !find_entry_epoch(&jm->leave_nodes[i].ent, le)) {
 				free(n);
 				continue;
 			}
@@ -638,7 +657,6 @@ static void join(struct join_message *msg)
 					 msg->epoch, &msg->cluster_status,
 					 &msg->inc_epoch);
 	msg->nr_sobjs = sys->nr_sobjs;
-	msg->epoch = sys->epoch;
 	msg->ctime = get_cluster_ctime();
 	msg->nr_nodes = 0;
 	list_for_each_entry(node, &sys->sd_node_list, list) {
@@ -756,8 +774,7 @@ static void update_cluster_info(struct join_message *msg)
 		if (is_myself(msg->header.from.addr, msg->header.from.port)) {
 			eprintf("failed to join sheepdog, %d\n", msg->result);
 			leave_cluster();
-			eprintf("I am really hurt and gonna leave cluster.\n");
-			eprintf("Fix yourself and restart me later, pleaseeeee...Bye.\n");
+			eprintf("Restart me later when master is up, please.Bye.\n");
 			exit(1);
 			/* sys->status = SD_STATUS_JOIN_FAILED; */
 		}
@@ -975,17 +992,6 @@ static void __sd_deliver(struct cpg_event *cevent)
 		addr_to_str(name, sizeof(name), m->from.addr, m->from.port),
 		m->pid);
 
-	/*
-	 * we don't want to perform any deliver events until we
-	 * join; we wait for our JOIN message.
-	 */
-	if (!sys->join_finished) {
-		if (m->pid != sys->this_pid || m->nodeid != sys->this_nodeid) {
-			cevent->skip = 1;
-			return;
-		}
-	}
-
 	if (m->op == SD_MSG_JOIN) {
 		uint32_t nodeid = m->nodeid;
 		uint32_t pid = m->pid;
@@ -1052,7 +1058,15 @@ static void send_join_response(struct work_deliver *w)
 			jm->nr_leave_nodes++;
 		}
 		print_node_list(&sys->leave_list);
+	} else if (jm->result != SD_RES_SUCCESS &&
+			jm->epoch > sys->epoch &&
+			jm->cluster_status == SD_STATUS_WAIT_FOR_JOIN) {
+		eprintf("Transfer mastership.\n");
+		leave_cluster();
+		eprintf("Restart me later when master is up, please.Bye.\n");
+		exit(1);
 	}
+	jm->epoch = sys->epoch;
 	send_message(sys->handle, m);
 }
 
@@ -1090,15 +1104,23 @@ static void __sd_deliver_done(struct cpg_event *cevent)
 				lm = (struct leave_message *)m;
 				add_node_to_leave_list(m);
 
-				if (lm->epoch > sys->leave_epoch)
-					sys->leave_epoch = lm->epoch;
+				/* Sheep needs this to identify itself as master.
+				 * Now mastership transfer is done.
+				 */
+				if (!sys->join_finished) {
+					sys->join_finished = 1;
+					move_node_to_sd_list(sys->this_nodeid, sys->this_pid, sys->this_node);
+					sys->epoch = get_latest_epoch();
+				}
 
 				nr_local = get_nodes_nr_epoch(sys->epoch);
 				nr = get_nodes_nr_from(&sys->sd_node_list);
 				nr_leave = get_nodes_nr_from(&sys->leave_list);
+
+				dprintf("%d == %d + %d \n", nr_local, nr, nr_leave);
 				if (nr_local == nr + nr_leave) {
 					sys->status = SD_STATUS_OK;
-					sys->epoch = sys->leave_epoch + 1;
+					sys->epoch = sys->epoch;
 					update_epoch_log(sys->epoch);
 					update_epoch_store(sys->epoch);
 				}
@@ -1931,7 +1953,6 @@ join_retry:
 	sys->handle = cpg_handle;
 	sys->this_nodeid = nodeid;
 	sys->this_pid = getpid();
-	sys->leave_epoch = 0;
 
 	ret = set_addr(nodeid, port);
 	if (ret)
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 6680f79..4711cdd 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -144,7 +144,6 @@ struct cluster_info {
 	int nr_outstanding_reqs;
 
 	uint32_t recovered_epoch;
-	uint32_t leave_epoch; /* The highest number in the clsuter */
 
 	int use_directio;
 
-- 
1.7.6.1