[sheepdog] [PATCH 1/2] Revert "corosync: handle network partition"

Hitoshi Mitake mitake.hitoshi at lab.ntt.co.jp
Tue Sep 30 10:03:47 CEST 2014


At Tue, 30 Sep 2014 17:01:25 +0900,
Hitoshi Mitake wrote:
> 
> This reverts commit 15df161958a38cf3f7bc83b5bc2c8a1817b3072e.
> 
> The commit introduced a better method of handling network partition,
> but the method seems to have a possibility of losing events from
> corosync. According to an experiment by Tsuji-san, dispatching of
> leave events can be delayed because of the poll(2) in
> __corosync_dispatch() with corosync < 2.x.
> 
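
For reference, the deferral that Tsuji-san's experiment hit boils down
to a zero-timeout poll(2) on the fd returned by cpg_fd_get(3), as in
the removed hunk below.  A minimal standalone sketch of that check
(the helper name cpg_input_pending() is mine, not from the tree):

#include <poll.h>
#include <stdbool.h>

/*
 * Peek at the cpg fd without blocking.  The reverted code postponed
 * processing of already-queued events whenever this returned true,
 * which is how the dispatch of a leave event can be delayed when the
 * fd keeps reporting POLLIN.
 */
static bool cpg_input_pending(int cpg_fd)
{
        struct pollfd pfd = {
                .fd = cpg_fd,
                .events = POLLIN,
        };

        /* timeout 0: do not wait, only report readability */
        return poll(&pfd, 1, 0) > 0;
}
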
> The next commit will implement a simpler method for detecting network
> partition, based on a parameter of the cpg callback function. The
> reverted method based on poll(2) depends on the semantics of the fd
> obtained by cpg_fd_get(3) of corosync. The poll(2) based method worked
> well on corosync 1.x, but it doesn't work well on 2.x. A big internal
> change in corosync (perhaps the adoption of libqb) seems to have
> changed those semantics, so we can no longer use poll(2) to detect
> queued messages.
> 
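
To make that direction concrete: everything needed for the check is
already passed to cdrv_cpg_confchg() as parameters, so the majority
test can be computed from those parameters alone, independent of how
corosync's IPC layer makes the cpg fd readable.  A rough sketch of
such a check, reusing the majority rule of the reverted code (only an
illustration of the idea, not the actual patch 2/2; panic() is
sheepdog's existing helper, as seen in the removed hunk):

#include <stddef.h>

/*
 * Partition check driven purely by the confchg callback parameters
 * (member_list_entries / left_list_entries), with no dependency on
 * the poll(2) semantics of the cpg fd.
 */
static void check_majority(size_t member_list_entries,
                           size_t left_list_entries)
{
        size_t total = member_list_entries + left_list_entries;

        /* a majority is only meaningful with at least 3 nodes */
        if (left_list_entries == 0 || total <= 2)
                return;

        if (member_list_entries == 0)
                panic("NIC failure?");
        if (member_list_entries < total / 2 + 1)
                panic("Network partition is detected");
}
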
> With this change, sheepdog can no longer use corosync 1.x as its
> cluster manager. But that version has a critical bug in its message
> delivery mechanism which can produce a broken epoch log, so it is
> reasonable to force users to stop using it and to introduce this
> change.
> 
> Reported bug: https://bugs.launchpad.net/sheepdog-project/+bug/1368503
> 
> Conflicts:
> 	sheep/cluster/corosync.c
> [ Conflicts were resolved by Hitoshi Mitake ]
> 
> Reported-by: Masahiro Tsuji <tuji at atworks.co.jp>
> Cc: Masahiro Tsuji <tuji at atworks.co.jp>
> Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
> ---
>  sheep/cluster/corosync.c | 45 +++------------------------------------------
>  1 file changed, 3 insertions(+), 42 deletions(-)

Tsuji-san, if the change looks good to you, could you add your
Tested-by: tag?

Thanks,
Hitoshi

> 
> diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
> index cd00733..477e442 100644
> --- a/sheep/cluster/corosync.c
> +++ b/sheep/cluster/corosync.c
> @@ -10,7 +10,6 @@
>   */
>  #include <stdio.h>
>  #include <unistd.h>
> -#include <poll.h>
>  #include <sys/epoll.h>
>  #include <corosync/cpg.h>
>  #include <corosync/cfg.h>
> @@ -41,8 +40,6 @@ static struct cpg_node cpg_nodes[COROSYNC_MAX_NODES];
>  static size_t nr_cpg_nodes;
>  static bool self_elect;
>  static bool join_finished;
> -static int cpg_fd;
> -static size_t nr_majority; /* used for network partition detection */
>  
>  /* event types which are dispatched in corosync_dispatch() */
>  enum corosync_event_type {
> @@ -358,23 +355,6 @@ static bool update_join_status(struct corosync_event *cevent)
>  static void __corosync_dispatch(void)
>  {
>  	struct corosync_event *cevent;
> -	struct pollfd pfd = {
> -		.fd = cpg_fd,
> -		.events = POLLIN,
> -	};
> -
> -	if (poll(&pfd, 1, 0)) {
> -		/*
> -		 * Corosync dispatches leave events one by one even
> -		 * when network partition has occurred.  To count the
> -		 * number of alive nodes correctly, we postpone
> -		 * processing events if there are incoming ones.
> -		 */
> -		sd_debug("wait for a next dispatch event");
> -		return;
> -	}
> -
> -	nr_majority = 0;
>  
>  	while (!list_empty(&corosync_block_event_list) ||
>  	       !list_empty(&corosync_nonblock_event_list)) {
> @@ -554,25 +534,6 @@ static void cdrv_cpg_confchg(cpg_handle_t handle,
>  	sd_debug("mem:%zu, joined:%zu, left:%zu", member_list_entries,
>  		 joined_list_entries, left_list_entries);
>  
> -	/* check network partition */
> -	if (left_list_entries) {
> -		if (nr_majority == 0) {
> -			size_t total = member_list_entries + left_list_entries;
> -
> -			/*
> -			 * we need at least 3 nodes to handle network
> -			 * partition failure
> -			 */
> -			if (total > 2)
> -				nr_majority = total / 2 + 1;
> -		}
> -
> -		if (member_list_entries == 0)
> -			panic("NIC failure?");
> -		if (member_list_entries < nr_majority)
> -			panic("Network partition is detected");
> -	}
> -
>  	/* convert cpg_address to cpg_node */
>  	build_cpg_node_list(member_sheep, member_list, member_list_entries);
>  	build_cpg_node_list(left_sheep, left_list, left_list_entries);
> @@ -714,7 +675,7 @@ out:
>  
>  static int corosync_init(const char *option)
>  {
> -	int ret, retry_cnt = 0;
> +	int ret, fd, retry_cnt = 0;
>  	uint32_t nodeid;
>  	cpg_callbacks_t cb = {
>  		.cpg_deliver_fn = cdrv_cpg_deliver,
> @@ -759,13 +720,13 @@ again:
>  	this_node.nodeid = nodeid;
>  	this_node.pid = getpid();
>  
> -	ret = cpg_fd_get(cpg_handle, &cpg_fd);
> +	ret = cpg_fd_get(cpg_handle, &fd);
>  	if (ret != CS_OK) {
>  		sd_err("failed to get cpg file descriptor (%d)", ret);
>  		return -1;
>  	}
>  
> -	ret = register_event(cpg_fd, corosync_handler, NULL);
> +	ret = register_event(fd, corosync_handler, NULL);
>  	if (ret) {
>  		sd_err("failed to register corosync event handler (%d)", ret);
>  		return -1;
> -- 
> 1.8.3.2
> 


