[Sheepdog] [PATCH S003 v2] sheep: handle master crashing before sending JOIN request
Shevek
shevek at anarres.org
Fri May 11 00:40:59 CEST 2012
There are a number of races around join/leave.
This patch fixes a case described by Huxinwei and Liu Yuan where
sheepdog fails to elect a master. A longer description is in the patch.
A problem arises if a node joins the cluster and generates a
confchg event, then crashes or leaves without sending a join
request or receiving a join response. In that case, the second node
to join never becomes master, and the entire cluster hangs.
This patch allows a node to detect whether it should promote itself
to master after an arbitrary confchg event. Every node except the
master creates a blocked JOIN event for every node that joined
after itself; therefore, the master is the node that has a JOIN
event for every node in the member list.
A following patch will handle the case where a join request
is sent, but the master crashes before sending a join response.
Changes to this patch have been made as requested by Liu Yuan.
Signed-off-by: Shevek <shevek at anarres.org>
---
sheep/cluster/corosync.c | 57 +++++++++++++++++++++++++++++++++------------
1 files changed, 42 insertions(+), 15 deletions(-)
diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
index 4a588e9..a889f1d 100644
--- a/sheep/cluster/corosync.c
+++ b/sheep/cluster/corosync.c
@@ -36,6 +36,8 @@ static LIST_HEAD(corosync_block_list);
static struct cpg_node cpg_nodes[SD_MAX_NODES];
static size_t nr_cpg_nodes;
+static int self_elect;
+static int join_finished;
/* event types which are dispatched in corosync_dispatch() */
enum corosync_event_type {
@@ -67,7 +69,6 @@ struct corosync_event {
int blocked;
int callbacked;
- int first_node;
struct list_head list;
};
@@ -373,7 +374,6 @@ static int __corosync_dispatch_one(struct corosync_event *cevent)
static void __corosync_dispatch(void)
{
struct corosync_event *cevent;
- static int join_finished;
int done;
while (!list_empty(&corosync_event_list)) {
@@ -381,7 +381,7 @@ static void __corosync_dispatch(void)
/* update join status */
if (!join_finished && cevent->type == COROSYNC_EVENT_TYPE_JOIN) {
- if (cevent->first_node) {
+ if (cevent->blocked && self_elect) {
join_finished = 1;
nr_cpg_nodes = 0;
}
@@ -528,6 +528,17 @@ static void cdrv_cpg_deliver(cpg_handle_t handle,
__corosync_dispatch();
}
+static void build_cpg_node_list(struct cpg_node *nodes,
+ const struct cpg_address *list, size_t nr)
+{
+ int i;
+ /* Copy the nodeid and pid of each of the nr entries of list into nodes. */
+ for (i = 0; i < nr; i++) {
+ nodes[i].nodeid = list[i].nodeid;
+ nodes[i].pid = list[i].pid;
+ }
+}
+
static void cdrv_cpg_confchg(cpg_handle_t handle,
const struct cpg_name *group_name,
const struct cpg_address *member_list,
@@ -539,22 +550,19 @@ static void cdrv_cpg_confchg(cpg_handle_t handle,
{
struct corosync_event *cevent;
int i;
+ struct cpg_node member_sheep[SD_MAX_NODES];
struct cpg_node joined_sheep[SD_MAX_NODES];
struct cpg_node left_sheep[SD_MAX_NODES];
+ int promote = 1;
dprintf("mem:%zu, joined:%zu, left:%zu\n",
member_list_entries, joined_list_entries,
left_list_entries);
/* convert cpg_address to cpg_node */
- for (i = 0; i < left_list_entries; i++) {
- left_sheep[i].nodeid = left_list[i].nodeid;
- left_sheep[i].pid = left_list[i].pid;
- }
- for (i = 0; i < joined_list_entries; i++) {
- joined_sheep[i].nodeid = joined_list[i].nodeid;
- joined_sheep[i].pid = joined_list[i].pid;
- }
+ build_cpg_node_list(member_sheep, member_list, member_list_entries);
+ build_cpg_node_list(left_sheep, left_list, left_list_entries);
+ build_cpg_node_list(joined_sheep, joined_list, joined_list_entries);
/* dispatch leave_handler */
for (i = 0; i < left_list_entries; i++) {
@@ -604,13 +612,32 @@ static void cdrv_cpg_confchg(cpg_handle_t handle,
cevent->type = COROSYNC_EVENT_TYPE_JOIN;
cevent->sender = joined_sheep[i];
cevent->blocked = 1; /* FIXME: add explanation */
- if (member_list_entries == joined_list_entries - left_list_entries &&
- cpg_node_equal(&joined_sheep[0], &this_node))
- cevent->first_node = 1;
-
list_add_tail(&cevent->list, &corosync_event_list);
}
+ if (!join_finished) {
+ /*
+ * Exactly one non-master member has seen join events for all other
+ * members, because events are ordered.
+ */
+ for (i = 0; i < member_list_entries; i++) {
+ cevent = find_block_event(COROSYNC_EVENT_TYPE_JOIN,
+ &member_sheep[i]);
+ if (!cevent) {
+ dprintf("Not promoting because member is not in our "
+ "event list.\n");
+ promote = 0;
+ break;
+ }
+ }
+
+ /*
+ * If we see the join events for all nodes promote ourself to master
+ * right here.
+ */
+ if (promote)
+ self_elect = 1;
+ }
__corosync_dispatch();
}
--
1.7.5.4
More information about the sheepdog
mailing list