[Sheepdog] [PATCH] fix master selection race

Thu Apr 8 13:03:08 CEST 2010

We wrongly assume that the first node to join corosync sends Sheepdog's
JOIN message before the other nodes do. When that assumption does not
hold, two nodes can temporarily act as the master node at the same time.

This patch makes sure that the first node to join corosync always
becomes the master.

Signed-off-by: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>
---
 collie/collie.h |    2 +
 collie/group.c  |   94 +++++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 80 insertions(+), 16 deletions(-)

diff --git a/collie/collie.h b/collie/collie.h
index fac6809..048cc7b 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -74,6 +74,8 @@ struct cluster_info {
 	DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
 
 	int nr_sobjs;
+
+	struct list_head work_deliver_siblings;
 };
 
 struct cluster_info *sys;
diff --git a/collie/group.c b/collie/group.c
index d659485..56e0fe9 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -71,6 +71,7 @@ struct work_deliver {
 	struct message_header *msg;
 
 	struct work work;
+	struct list_head work_deliver_list;
 };
 
 struct work_confch {
@@ -402,9 +403,6 @@ static void join(struct join_message *msg)
 	if (!sys->synchronized)
 		return;
 
-	if (!is_master())
-		return;
-
 	if (msg->nr_sobjs)
 		sys->nr_sobjs = msg->nr_sobjs;
 
@@ -414,12 +412,8 @@ static void join(struct join_message *msg)
 		msg->epoch = sys->epoch;
 	else
 		msg->epoch = 0;
-	list_for_each_entry(node, &sys->cpg_node_list, list) {
-		if (node->nodeid == msg->nodeid && node->pid == msg->pid)
-			continue;
-		if (node->ent.id == 0)
-			continue;
 
+	list_for_each_entry(node, &sys->sd_node_list, list) {
 		msg->nodes[msg->nr_nodes].nodeid = node->nodeid;
 		msg->nodes[msg->nr_nodes].pid = node->pid;
 		msg->nodes[msg->nr_nodes].ent = node->ent;
@@ -747,6 +741,8 @@ static void __sd_deliver(struct work *work, int idx)
 			break;
 		}
 
+		vprintf(SDOG_DEBUG "will send\n");
+
 		m->done = 1;
 		send_message(sys->handle, m);
 	} else {
@@ -766,8 +762,23 @@ static void __sd_deliver(struct work *work, int idx)
 
 static void __sd_deliver_done(struct work *work, int idx)
 {
-	struct work_deliver *w = container_of(work, struct work_deliver, work);
-	struct message_header *m = w->msg;
+	struct work_deliver *w, *n = NULL;
+	struct message_header *m;
+
+	w = container_of(work, struct work_deliver, work);
+	m = w->msg;
+
+	list_del(&w->work_deliver_list);
+
+	/*
+	 * When I finish one message, if I have pending messages, I
+	 * need to perform the first of them now.
+	 */
+	if (m->done && !list_empty(&sys->work_deliver_siblings)) {
+
+		n = list_first_entry(&sys->work_deliver_siblings,
+				     struct work_deliver, work_deliver_list);
+	}
 
 	/*
 	 * FIXME: we want to recover only after all nodes are fully
@@ -779,6 +790,9 @@ static void __sd_deliver_done(struct work *work, int idx)
 
 	free(w->msg);
 	free(w);
+
+	if (n)
+		queue_work(dobj_queue, &n->work);
 }
 
 static void sd_deliver(cpg_handle_t handle, const struct cpg_name *group_name,
@@ -800,12 +814,34 @@ static void sd_deliver(cpg_handle_t handle, const struct cpg_name *group_name,
 	if (!w->msg)
 		return;
 	memcpy(w->msg, msg, msg_len);
+	INIT_LIST_HEAD(&w->work_deliver_list);
 
 	w->work.fn = __sd_deliver;
 	w->work.done = __sd_deliver_done;
 
-	if (m->op == SD_MSG_JOIN)
-		w->work.attr = WORK_ORDERED;
+	if (is_master()) {
+		if (!m->done) {
+			int run = 0;
+
+			/*
+			 * I can broadcast this message if there are no
+			 * outstanding messages.
+			 */
+			if (list_empty(&sys->work_deliver_siblings))
+				run = 1;
+
+			list_add_tail(&w->work_deliver_list,
+				      &sys->work_deliver_siblings);
+			if (run) {
+				vprintf(SDOG_DEBUG "%u\n", pid);
+				queue_work(dobj_queue, &w->work);
+			} else
+				vprintf(SDOG_DEBUG "%u\n", pid);
+
+			return;
+		}
+	} else if (m->op == SD_MSG_JOIN)
+		  w->work.attr = WORK_ORDERED;
 
 	queue_work(dobj_queue, &w->work);
 }
@@ -815,6 +851,7 @@ static void __sd_confch(struct work *work, int idx)
 	struct work_confch *w = container_of(work, struct work_confch, work);
 	struct node *node;
 	int i;
+	int init = 0;
 
 	const struct cpg_address *member_list = w->member_list;
 	size_t member_list_entries = w->member_list_entries;
@@ -825,8 +862,10 @@ static void __sd_confch(struct work *work, int idx)
 
 	if (member_list_entries == joined_list_entries - left_list_entries &&
 	    sys->this_nodeid == member_list[0].nodeid &&
-	    sys->this_pid == member_list[0].pid)
+	    sys->this_pid == member_list[0].pid){
 		sys->synchronized = 1;
+		init = 1;
+	}
 
 	if (list_empty(&sys->cpg_node_list)) {
 		for (i = 0; i < member_list_entries; i++)
@@ -865,6 +904,28 @@ static void __sd_confch(struct work *work, int idx)
 		}
 	}
 
+	if (init) {
+		struct join_message msg;
+
+		/*
+		 * If I'm the first collie to join corosync, I become
+		 * the master without sending JOIN.
+		 */
+
+		vprintf(SDOG_DEBUG "%d %x\n", sys->this_pid, sys->this_nodeid);
+
+		memset(&msg, 0, sizeof(msg));
+
+		msg.header.from = sys->this_node;
+		msg.nodeid = sys->this_nodeid;
+		msg.pid = sys->this_pid;
+		msg.cluster_status = get_cluster_status(&msg.header.from);
+
+		update_cluster_info(&msg);
+
+		return;
+	}
+
 	for (i = 0; i < joined_list_entries; i++) {
 		if (sys->this_nodeid == joined_list[i].nodeid &&
 		    sys->this_pid == joined_list[i].pid) {
@@ -993,7 +1054,7 @@ int build_node_list(struct list_head *node_list,
 	return nr;
 }
 
-static void set_addr(unsigned int nodeid)
+static void set_addr(unsigned int nodeid, int port)
 {
 	int ret, nr;
 	corosync_cfg_handle_t handle;
@@ -1036,7 +1097,7 @@ static void set_addr(unsigned int nodeid)
 
 	inet_ntop(ss->ss_family, saddr, tmp, sizeof(tmp));
 
-	vprintf(SDOG_INFO "addr = %s\n", tmp);
+	vprintf(SDOG_INFO "addr = %s, port = %d\n", tmp, port);
 }
 
 int create_cluster(int port)
@@ -1082,7 +1143,7 @@ join_retry:
 	sys->this_nodeid = nodeid;
 	sys->this_pid = getpid();
 
-	set_addr(nodeid);
+	set_addr(nodeid, port);
 	sys->this_node.port = port;
 
 	ret = get_nodeid(&sys->this_node.id);
@@ -1104,6 +1165,7 @@ join_retry:
 	INIT_LIST_HEAD(&sys->cpg_node_list);
 	INIT_LIST_HEAD(&sys->vm_list);
 	INIT_LIST_HEAD(&sys->pending_list);
+	INIT_LIST_HEAD(&sys->work_deliver_siblings);
 	cpg_context_set(cpg_handle, sys);
 
 	cpg_fd_get(cpg_handle, &fd);
-- 
1.7.0