[sheepdog] [PATCH 1/2] corosync: handle network partition

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Sun Sep 9 19:06:27 CEST 2012


This patch reintroduces network partition detection, this time in a
more efficient way.

Even when many nodes go down at the same time, corosync dispatches
leave messages one by one, which makes it difficult to detect a
network partition.  To determine whether more leave events are about
to follow, this patch keeps the cpg file descriptor, on which corosync
notifies sheep of pending messages.  If the descriptor is ready for
reading, sheep skips processing the queued events and tries to receive
the next ones first.  If so many nodes go down at the same time that
the remaining members no longer form a majority of the cluster, sheep
determines that a network partition has occurred and stops serving.

With this patch, sheep can also detect a NIC failure and avoid
updating the epoch wrongly.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 sheep/cluster/corosync.c |   41 ++++++++++++++++++++++++++++++++++++++---
 1 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
index a2b18b1..3972013 100644
--- a/sheep/cluster/corosync.c
+++ b/sheep/cluster/corosync.c
@@ -10,6 +10,7 @@
  */
 #include <stdio.h>
 #include <unistd.h>
+#include <poll.h>
 #include <sys/epoll.h>
 #include <corosync/cpg.h>
 #include <corosync/cfg.h>
@@ -40,6 +41,8 @@ static struct cpg_node cpg_nodes[SD_MAX_NODES];
 static size_t nr_cpg_nodes;
 static int self_elect;
 static int join_finished;
+static int cpg_fd;
+static size_t nr_majority; /* used for network partition detection */
 
 /* event types which are dispatched in corosync_dispatch() */
 enum corosync_event_type {
@@ -353,6 +356,21 @@ static int __corosync_dispatch_one(struct corosync_event *cevent)
 static void __corosync_dispatch(void)
 {
 	struct corosync_event *cevent;
+	struct pollfd pfd = {
+		.fd = cpg_fd,
+		.events = POLLIN,
+	};
+
+	if (poll(&pfd, 1, 0)) {
+		/* Corosync dispatches leave events one by one even
+		 * when a network partition has occurred.  To count the
+		 * number of alive nodes correctly, we postpone
+		 * processing events if there are incoming ones. */
+		dprintf("wait for a next dispatch event\n");
+		return;
+	}
+
+	nr_majority = 0;
 
 	while (!list_empty(&corosync_block_event_list) ||
 	       !list_empty(&corosync_nonblock_event_list)) {
@@ -564,6 +582,23 @@ static void cdrv_cpg_confchg(cpg_handle_t handle,
 		member_list_entries, joined_list_entries,
 		left_list_entries);
 
+	/* check network partition */
+	if (left_list_entries) {
+		if (nr_majority == 0) {
+			size_t total = member_list_entries + left_list_entries;
+
+			/* we need at least 3 nodes to handle network
+			 * partition failure */
+			if (total > 2)
+				nr_majority = total / 2 + 1;
+		}
+
+		if (member_list_entries == 0)
+			panic("NIC failure?\n");
+		if (member_list_entries < nr_majority)
+			panic("Network partition is detected\n");
+	}
+
 	/* convert cpg_address to cpg_node */
 	build_cpg_node_list(member_sheep, member_list, member_list_entries);
 	build_cpg_node_list(left_sheep, left_list, left_list_entries);
@@ -721,7 +756,7 @@ out:
 
 static int corosync_init(const char *option)
 {
-	int ret, fd, retry_cnt = 0;
+	int ret, retry_cnt = 0;
 	uint32_t nodeid;
 	cpg_callbacks_t cb = {
 		.cpg_deliver_fn = cdrv_cpg_deliver,
@@ -764,13 +799,13 @@ again:
 	this_node.nodeid = nodeid;
 	this_node.pid = getpid();
 
-	ret = cpg_fd_get(cpg_handle, &fd);
+	ret = cpg_fd_get(cpg_handle, &cpg_fd);
 	if (ret != CS_OK) {
 		eprintf("failed to get cpg file descriptor (%d)\n", ret);
 		return -1;
 	}
 
-	ret = register_event(fd, corosync_handler, NULL);
+	ret = register_event(cpg_fd, corosync_handler, NULL);
 	if (ret) {
 		eprintf("failed to register corosync event handler (%d)\n",
 			ret);
-- 
1.7.2.5




More information about the sheepdog mailing list