From: Liu Yuan <tailai.ly at taobao.com>

Currently, sheepdog will serve IO requests even if the number of nodes is less than 'copies'. When the number of nodes (or zones) is less than the number of copies specified by the collie-cluster-format command, the sheepdog cluster should stop serving IO requests.

This is necessary to solve the subtle case below:

    + good nodes, - failed nodes.

    0           1           2           3
    +           -           -           +
    +    -->    -    -->    -    -->    +
    +           +           -           # <-- permanently down.
                ^
                |
    this node has the latest data

At stage 3, we will have a cluster recovered without the data tracked at stage 1.

While the nodes are in the SD_STATUS_HALT state, sheepdog can still serve configuration changes and do the recovery job.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
--- include/sheep.h | 1 + include/sheepdog_proto.h | 1 + sheep/group.c | 27 ++++++++++++++++++++++----- sheep/sheep_priv.h | 1 + 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/include/sheep.h b/include/sheep.h index 31516d9..943cdf7 100644 --- a/include/sheep.h +++ b/include/sheep.h @@ -254,6 +254,7 @@ static inline const char *sd_strerror(int err) {SD_RES_WAIT_FOR_FORMAT, "Waiting for a format operation"}, {SD_RES_WAIT_FOR_JOIN, "Waiting for other nodes joining"}, {SD_RES_JOIN_FAILED, "The node had failed to join sheepdog"}, + {SD_RES_HALT, "The node is stopped doing IO, short of living nodes"}, {SD_RES_OLD_NODE_VER, "Remote node has an old epoch"}, {SD_RES_NEW_NODE_VER, "Remote node has a new epoch"}, diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h index 2b042f4..a5a41d0 100644 --- a/include/sheepdog_proto.h +++ b/include/sheepdog_proto.h @@ -58,6 +58,7 @@ #define SD_RES_WAIT_FOR_FORMAT 0x16 /* Sheepdog is waiting for a format operation */ #define SD_RES_WAIT_FOR_JOIN 0x17 /* Sheepdog is waiting for other nodes joining */ #define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */ +#define SD_RES_HALT 0x19 /* Target node is stopped doing IO */ /* * Object ID rules diff --git a/sheep/group.c b/sheep/group.c 
index f6743f5..59293b2 100644 --- a/sheep/group.c +++ b/sheep/group.c @@ -335,6 +335,9 @@ void cluster_queue_request(struct work *work, int idx) case SD_STATUS_JOIN_FAILED: ret = SD_RES_JOIN_FAILED; break; + case SD_STATUS_HALT: + ret = SD_RES_HALT; + break; default: ret = SD_RES_SYSTEM_ERROR; break; @@ -639,6 +642,10 @@ static int get_cluster_status(struct sheepdog_node_list_entry *from, break; case SD_STATUS_SHUTDOWN: return SD_RES_SHUTDOWN; + case SD_STATUS_HALT: + if (inc_epoch) /* write the flag only when the caller passed one */ + *inc_epoch = 1; + break; default: break; } @@ -810,12 +817,13 @@ static void update_cluster_info(struct join_message *msg) sheepid_to_str(&msg->nodes[i].sheepid)); } - if (msg->cluster_status != SD_STATUS_OK) + if (msg->cluster_status == SD_STATUS_WAIT_FOR_JOIN) add_node_to_leave_list((struct message_header *)msg); sys->join_finished = 1; - if (msg->cluster_status == SD_STATUS_OK && msg->inc_epoch) + if ((msg->cluster_status == SD_STATUS_OK || msg->cluster_status == SD_STATUS_HALT) + && msg->inc_epoch) update_epoch_log(sys->epoch); join_finished: @@ -840,6 +848,12 @@ join_finished: } } + if (msg->cluster_status == SD_STATUS_HALT && msg->inc_epoch) { + sys->epoch++; + update_epoch_log(sys->epoch); + update_epoch_store(sys->epoch); + } + print_node_list(&sys->sd_node_list); sys->status = msg->cluster_status; @@ -1077,7 +1091,8 @@ static void send_join_response(struct work_notify *w) m->state = DM_FIN; dprintf("%d, %d\n", jm->result, jm->cluster_status); - if (jm->result == SD_RES_SUCCESS && jm->cluster_status != SD_STATUS_OK) { + if (jm->result == SD_RES_SUCCESS && + jm->cluster_status == SD_STATUS_WAIT_FOR_JOIN) { jm->nr_leave_nodes = 0; list_for_each_entry(node, &sys->leave_list, list) { jm->leave_nodes[jm->nr_leave_nodes].sheepid = node->sheepid; @@ -1181,7 +1196,8 @@ static void __sd_notify_done(struct cpg_event *cevent) } } - if (do_recovery && sys->status == SD_STATUS_OK) { + if (do_recovery && + (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT)) { 
list_for_each_entry_safe(node, t, &sys->leave_list, list) { list_del(&node->list); } @@ -1423,7 +1439,8 @@ static void __sd_leave_done(struct cpg_event *cevent) print_node_list(&sys->sd_node_list); - if (node_left && sys->status == SD_STATUS_OK) + if (node_left && + (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT)) start_recovery(sys->epoch); } diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h index e2fcb40..355cd93 100644 --- a/sheep/sheep_priv.h +++ b/sheep/sheep_priv.h @@ -37,6 +37,7 @@ #define SD_STATUS_WAIT_FOR_JOIN 0x02 #define SD_STATUS_SHUTDOWN 0x03 #define SD_STATUS_JOIN_FAILED 0x04 +#define SD_STATUS_HALT 0x05 #define SD_RES_NETWORK_ERROR 0x81 /* Network error between sheeps */ -- 1.7.6.1 |