From: Liu Yuan <tailai.ly at taobao.com> [Rationale] Currently, we have to start up the first failed node or the last failed one to recover a crashed cluster (nodes with different epoch histories). This patch simply removes this disgusting constraint. This patch adds a new concept to the crashed cluster recovery phase: mastership transfer. The master node in sheepdog is supposed to reply to the nodes' join requests, hence managing other nodes' joins. It is the first node in the join stage and the last node in the crashed cluster. When we recover the crashed cluster, the mastership is transferred from one node to another until the last failed node is started up. When the node is not the last failed one, one of the two nodes simply exits, so only one node will be left in the recovery stage with the mastership. After the last failed node is started up, we can join the others safely with consistent epoch histories. With this patch, there is no start-up order imposed for the crashed cluster to recover. We can do this because the epoch on each node contains the node with the highest epoch number. As a side effect, epoch transfer of leave nodes is removed. The leave node concept is changed a bit during crashed cluster recovery: it is now defined as "nodes that are supposed to leave and are contained in the latest epoch". 
[Test Cases] The methods that I have tried to test this idea: $ for i in 0 1 2; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done $ collie/collie cluster format $ for i in 0 1 2; do pkill -f "sheep /store/$i"; sleep 1; done $ for i in 1 0 2; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done $ for i in 0 2; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done $ for i in 0 1 2; do ./collie/collie cluster info -p 700$i; done And Kazutaka's gorgeous test case: for i in 0 1; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done ./collie/collie cluster format for i in 2 3 4; do pkill -f "sheep /store/$((i - 2))" ./sheep/sheep /store/$i -z $i -p 700$i sleep 1 done for i in 3 4; do pkill -f "sheep /store/$i"; sleep 1; done for i in 0 1 2 3 4; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done # now master 4 is recovered for i in 0 1 2 3; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done for i in 0 1 2 3 4; do ./collie/collie cluster info -p 700$i; done Cluster status: running Creation time Epoch Nodes 2011-09-24 19:06:02 13 [192.168.0.4:7000, 192.168.0.4:7001, 192.168.0.4:7002, 192.168.0.4:7003, 192.168.0.4:7004] 2011-09-24 19:06:02 12 [192.168.0.4:7000, 192.168.0.4:7001, 192.168.0.4:7002, 192.168.0.4:7004] 2011-09-24 19:06:02 11 [192.168.0.4:7000, 192.168.0.4:7001, 192.168.0.4:7004] 2011-09-24 19:06:02 10 [192.168.0.4:7000, 192.168.0.4:7004] 2011-09-24 19:06:02 9 [192.168.0.4:7004] 2011-09-24 19:06:02 8 [192.168.0.4:7004] 2011-09-24 19:06:02 7 [192.168.0.4:7003, 192.168.0.4:7004] 2011-09-24 19:06:02 6 [192.168.0.4:7003] ... All is done with good and consistent epoch histories as expected. 
Signed-off-by: Liu Yuan <tailai.ly at taobao.com> --- sheep/group.c | 63 ++++++++++++++++++++++++++++++++++----------------- sheep/sheep_priv.h | 1 - 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/sheep/group.c b/sheep/group.c index 812f6a0..ade1c38 100644 --- a/sheep/group.c +++ b/sheep/group.c @@ -435,10 +435,27 @@ static struct sheepdog_node_list_entry *find_entry_list(struct sheepdog_node_lis return NULL; } + +static struct sheepdog_node_list_entry *find_entry_epoch(struct sheepdog_node_list_entry *entry, + int epoch) +{ + struct sheepdog_node_list_entry nodes[SD_MAX_NODES]; + int nr, i; + + nr = epoch_log_read(epoch, (char *)nodes, sizeof(nodes)); + nr /= sizeof(nodes[0]); + + for (i = 0; i < nr; i++) + if (node_cmp(&nodes[i], entry) == 0) + return entry; + + return NULL; +} + static int add_node_to_leave_list(struct message_header *msg) { int ret = SD_RES_SUCCESS; - int nr, i; + int nr, i, le = get_latest_epoch(); LIST_HEAD(tmp_list); struct node *n, *t; struct join_message *jm; @@ -450,7 +467,8 @@ static int add_node_to_leave_list(struct message_header *msg) goto err; } - if (find_entry_list(&msg->from, &sys->leave_list)) { + if (find_entry_list(&msg->from, &sys->leave_list) + || !find_entry_epoch(&msg->from, le)) { free(n); goto ret; } @@ -471,7 +489,8 @@ static int add_node_to_leave_list(struct message_header *msg) goto free; } - if (find_entry_list(&jm->leave_nodes[i].ent, &sys->leave_list)) { + if (find_entry_list(&jm->leave_nodes[i].ent, &sys->leave_list) + || !find_entry_epoch(&jm->leave_nodes[i].ent, le)) { free(n); continue; } @@ -638,7 +657,6 @@ static void join(struct join_message *msg) msg->epoch, &msg->cluster_status, &msg->inc_epoch); msg->nr_sobjs = sys->nr_sobjs; - msg->epoch = sys->epoch; msg->ctime = get_cluster_ctime(); msg->nr_nodes = 0; list_for_each_entry(node, &sys->sd_node_list, list) { @@ -756,8 +774,7 @@ static void update_cluster_info(struct join_message *msg) if (is_myself(msg->header.from.addr, 
msg->header.from.port)) { eprintf("failed to join sheepdog, %d\n", msg->result); leave_cluster(); - eprintf("I am really hurt and gonna leave cluster.\n"); - eprintf("Fix yourself and restart me later, pleaseeeee...Bye.\n"); + eprintf("Restart me later when master is up, please.Bye.\n"); exit(1); /* sys->status = SD_STATUS_JOIN_FAILED; */ } @@ -975,17 +992,6 @@ static void __sd_deliver(struct cpg_event *cevent) addr_to_str(name, sizeof(name), m->from.addr, m->from.port), m->pid); - /* - * we don't want to perform any deliver events until we - * join; we wait for our JOIN message. - */ - if (!sys->join_finished) { - if (m->pid != sys->this_pid || m->nodeid != sys->this_nodeid) { - cevent->skip = 1; - return; - } - } - if (m->op == SD_MSG_JOIN) { uint32_t nodeid = m->nodeid; uint32_t pid = m->pid; @@ -1052,7 +1058,15 @@ static void send_join_response(struct work_deliver *w) jm->nr_leave_nodes++; } print_node_list(&sys->leave_list); + } else if (jm->result != SD_RES_SUCCESS && + jm->epoch > sys->epoch && + jm->cluster_status == SD_STATUS_WAIT_FOR_JOIN) { + eprintf("Transfer mastership.\n"); + leave_cluster(); + eprintf("Restart me later when master is up, please.Bye.\n"); + exit(1); } + jm->epoch = sys->epoch; send_message(sys->handle, m); } @@ -1090,15 +1104,23 @@ static void __sd_deliver_done(struct cpg_event *cevent) lm = (struct leave_message *)m; add_node_to_leave_list(m); - if (lm->epoch > sys->leave_epoch) - sys->leave_epoch = lm->epoch; + /* Sheep needs this to identify itself as master. + * Now mastership transfer is done. 
+ */ + if (!sys->join_finished) { + sys->join_finished = 1; + move_node_to_sd_list(sys->this_nodeid, sys->this_pid, sys->this_node); + sys->epoch = get_latest_epoch(); + } nr_local = get_nodes_nr_epoch(sys->epoch); nr = get_nodes_nr_from(&sys->sd_node_list); nr_leave = get_nodes_nr_from(&sys->leave_list); + + dprintf("%d == %d + %d \n", nr_local, nr, nr_leave); if (nr_local == nr + nr_leave) { sys->status = SD_STATUS_OK; - sys->epoch = sys->leave_epoch + 1; + sys->epoch = sys->epoch + 1; update_epoch_log(sys->epoch); update_epoch_store(sys->epoch); } @@ -1931,7 +1953,6 @@ join_retry: sys->handle = cpg_handle; sys->this_nodeid = nodeid; sys->this_pid = getpid(); - sys->leave_epoch = 0; ret = set_addr(nodeid, port); if (ret) diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h index 6680f79..4711cdd 100644 --- a/sheep/sheep_priv.h +++ b/sheep/sheep_priv.h @@ -144,7 +144,6 @@ struct cluster_info { int nr_outstanding_reqs; uint32_t recovered_epoch; - uint32_t leave_epoch; /* The highest number in the clsuter */ int use_directio; -- 1.7.6.1 |