[sheepdog] [PATCH 3/3] sheep: start cluster if all the nodes in the previous epoch are gathered
Liu Yuan
namei.unix at gmail.com
Thu Jul 4 08:42:44 CEST 2013
On Wed, Jul 03, 2013 at 03:49:23PM +0900, MORITA Kazutaka wrote:
> From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
>
> The current rules to start sheepdog automatically are too complex:
>
> - One of the nodes in the latest epoch becomes the master. If a
> joining node has a larger epoch, that node becomes the new master
> and the current master must exit and rejoin later.
>
> - If a joining node has a smaller epoch than the existing nodes,
> the node is linked to the delayed node list and rejoins
> automatically after sheepdog starts.
>
> - Sheepdog starts when the current node list becomes the same as
> the one in the previous epoch.
>
> After this patch, we have only one rule:
>
> - Sheepdog starts if all the nodes in the previous epoch are
> gathered.
>
> Nodes with a smaller or larger epoch can also join, and all the
> joined nodes share the same (latest) cluster_info. Since every node
> then has the latest epoch, any node can be the master. Once all the
> nodes in the previous epoch have joined Sheepdog, the cluster starts
> automatically.
>
> This change also brings the following benefit:
>
> - We can remove the failed node list, the delayed node list, and the
> CJ_RES_JOIN_LATER status, since a node with a smaller epoch can now
> join Sheepdog.
>
> - The CJ_RES_MASTER_TRANSFER status can be removed because any node
> can be the master, so there is no need to transfer mastership even
> if a node with a larger epoch joins Sheepdog.
>
> Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> ---
> include/internal_proto.h | 16 +--
> include/shepherd.h | 5 +-
> sheep/cluster.h | 2 +
> sheep/cluster/corosync.c | 16 +--
> sheep/cluster/local.c | 21 +--
> sheep/cluster/shepherd.c | 32 ++---
> sheep/cluster/zookeeper.c | 20 +--
> sheep/group.c | 313 ++++++++-------------------------------------
> shepherd/shepherd.c | 8 +-
> tests/functional/001 | 13 --
> tests/functional/001.out | 19 +--
> tests/functional/002 | 14 --
> tests/functional/002.out | 18 +--
> tests/functional/003 | 14 --
> tests/functional/003.out | 18 +--
> tests/functional/004 | 16 ---
> tests/functional/004.out | 39 +-----
> tests/functional/005 | 17 ---
> tests/functional/005.out | 45 +------
> tests/functional/060 | 17 +--
> 20 files changed, 105 insertions(+), 558 deletions(-)
>
> diff --git a/include/internal_proto.h b/include/internal_proto.h
> index f0c55c0..c5cd76d 100644
> --- a/include/internal_proto.h
> +++ b/include/internal_proto.h
> @@ -155,9 +155,7 @@ struct epoch_log {
>
> struct join_message {
> uint8_t proto_ver;
> - uint8_t __pad1[3];
> - uint16_t nr_failed_nodes;
> - uint16_t nr_delayed_nodes;
> + uint8_t __pad1[7];
> uint32_t cluster_status;
> uint8_t inc_epoch; /* set non-zero when we increment epoch of all nodes */
> uint8_t __pad2[3];
> @@ -195,18 +193,6 @@ enum cluster_join_result {
>
> /* Fail to join. The joining node has an invalid epoch. */
> CJ_RES_FAIL,
> -
> - /*
> - * Fail to join. The joining node should be added after the cluster
> - * start working.
> - */
> - CJ_RES_JOIN_LATER,
> -
> - /*
> - * Transfer mastership. The joining node has a newer epoch, so this
> - * node will leave the cluster (restart later).
> - */
> - CJ_RES_MASTER_TRANSFER,
> };
>
> static inline __attribute__((used)) void __sd_epoch_format_build_bug_ons(void)
> diff --git a/include/shepherd.h b/include/shepherd.h
> index 5dbc64b..e1fbac1 100644
> --- a/include/shepherd.h
> +++ b/include/shepherd.h
> @@ -39,8 +39,11 @@ struct sph_msg {
>
> struct sph_msg_join {
> uint32_t res; /* original type: enum cluster_join_result */
> - struct sd_node node;
> + struct sd_node new_node;
> uint8_t master_elected;
> +
> + struct sd_node nodes[SD_MAX_NODES];
> + uint32_t nr_nodes;
> uint8_t opaque[0];
> };
>
> diff --git a/sheep/cluster.h b/sheep/cluster.h
> index 80a701b..7665d75 100644
> --- a/sheep/cluster.h
> +++ b/sheep/cluster.h
> @@ -158,6 +158,8 @@ void sd_notify_handler(const struct sd_node *sender, void *msg, size_t msg_len);
> bool sd_block_handler(const struct sd_node *sender);
> int sd_reconnect_handler(void);
> enum cluster_join_result sd_check_join_cb(const struct sd_node *joining,
> + const struct sd_node *members,
> + size_t nr_members,
> void *opaque);
> void recalculate_vnodes(struct sd_node *nodes, int nr_nodes);
>
> diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
> index 2be978b..bf90209 100644
> --- a/sheep/cluster/corosync.c
> +++ b/sheep/cluster/corosync.c
> @@ -295,28 +295,18 @@ static bool __corosync_dispatch_one(struct corosync_event *cevent)
> /* check_join() must be called only once */
> return false;
>
> - res = sd_check_join_cb(&cevent->sender.ent,
> - cevent->msg);
> - if (res == CJ_RES_MASTER_TRANSFER)
> - nr_cpg_nodes = 0;
> -
> + build_node_list(cpg_nodes, nr_cpg_nodes, entries);
> + res = sd_check_join_cb(&cevent->sender.ent, entries,
> + nr_cpg_nodes, cevent->msg);
> send_message(COROSYNC_MSG_TYPE_JOIN_RESPONSE, res,
> &cevent->sender, cpg_nodes, nr_cpg_nodes,
> cevent->msg, cevent->msg_len);
>
> - if (res == CJ_RES_MASTER_TRANSFER) {
> - sd_eprintf("failed to join sheepdog cluster:"
> - " please retry when master is up");
> - exit(1);
> - }
> -
> cevent->callbacked = true;
> return false;
> case COROSYNC_EVENT_TYPE_JOIN_RESPONSE:
> switch (cevent->result) {
> case CJ_RES_SUCCESS:
> - case CJ_RES_MASTER_TRANSFER:
> - case CJ_RES_JOIN_LATER:
> add_cpg_node(cpg_nodes, nr_cpg_nodes, &cevent->sender);
> nr_cpg_nodes++;
> /* fall through */
> diff --git a/sheep/cluster/local.c b/sheep/cluster/local.c
> index 491e3ea..572aa19 100644
> --- a/sheep/cluster/local.c
> +++ b/sheep/cluster/local.c
> @@ -441,31 +441,18 @@ static bool local_process_event(void)
>
> switch (ev->type) {
> case EVENT_JOIN_REQUEST:
> - res = sd_check_join_cb(&ev->sender.node, ev->buf);
> + /* nodes[nr_nodes - 1] is a sender, so don't include it */
> + assert(node_eq(&ev->sender.node, &nodes[nr_nodes - 1]));
> + res = sd_check_join_cb(&ev->sender.node, nodes, nr_nodes - 1,
> + ev->buf);
> ev->join_result = res;
> ev->type = EVENT_JOIN_RESPONSE;
> msync(ev, sizeof(*ev), MS_SYNC);
>
> shm_queue_notify();
>
> - if (res == CJ_RES_MASTER_TRANSFER) {
> - sd_eprintf("failed to join sheepdog cluster: "
> - "please retry when master is up");
> - shm_queue_unlock();
> - exit(1);
> - }
> return false;
> case EVENT_JOIN_RESPONSE:
> - if (ev->join_result == CJ_RES_MASTER_TRANSFER) {
> - /* FIXME: This code is tricky, but Sheepdog assumes that */
> - /* nr_nodes = 1 when join_result = MASTER_TRANSFER... */
> - ev->nr_lnodes = 1;
> - ev->lnodes[0] = this_node;
> - nr_nodes = 1;
> - nodes[0] = this_node.node;
> - msync(ev, sizeof(*ev), MS_SYNC);
> - }
> -
> sd_join_handler(&ev->sender.node, nodes, nr_nodes,
> ev->join_result, ev->buf);
> break;
> diff --git a/sheep/cluster/shepherd.c b/sheep/cluster/shepherd.c
> index 26fb005..db8336e 100644
> --- a/sheep/cluster/shepherd.c
> +++ b/sheep/cluster/shepherd.c
> @@ -58,7 +58,7 @@ static int do_shepherd_join(void)
> msg.body_len = msg_join_len;
>
> msg_join = xzalloc(msg_join_len);
> - msg_join->node = this_node;
> + msg_join->new_node = this_node;
> memcpy(msg_join->opaque, kept_opaque, kept_opaque_len);
>
> ret = writev2(sph_comm_fd, &msg, msg_join, msg_join_len);
> @@ -115,7 +115,7 @@ retry:
> * FIXME: member change events must be ordered with nonblocked
> * events
> */
> - res = sd_check_join_cb(&join->node, join->opaque);
> + res = sd_check_join_cb(&join->new_node, NULL, 0, join->opaque);
> if (res == CJ_RES_FAIL) {
> sd_eprintf("sd_check_join_cb() failed");
> exit(1);
> @@ -161,19 +161,9 @@ retry:
>
> sd_iprintf("join reply arrived, nr_nodes: %d", join_reply->nr_nodes);
>
> - if (join_reply->res == CJ_RES_MASTER_TRANSFER) {
> - is_master = true;
> -
> - /* FIXME: This code is tricky, but Sheepdog assumes that */
> - /* nr_nodes = 1 when join_result = MASTER_TRANSFER... */
> - nr_nodes = 1;
> - nodes[0] = this_node;
> - } else {
> - memcpy(nodes, join_reply->nodes,
> - join_reply->nr_nodes * sizeof(struct sd_node));
> -
> - nr_nodes = join_reply->nr_nodes;
> - }
> + memcpy(nodes, join_reply->nodes,
> + join_reply->nr_nodes * sizeof(struct sd_node));
> + nr_nodes = join_reply->nr_nodes;
>
> /* FIXME: member change events must be ordered with nonblocked events */
> sd_join_handler(&this_node, nodes, nr_nodes,
> @@ -343,7 +333,7 @@ static void msg_new_node(struct sph_msg *rcv)
> }
>
> /* FIXME: member change events must be ordered with nonblocked events */
> - res = sd_check_join_cb(&join->node, join->opaque);
> + res = sd_check_join_cb(&join->new_node, nodes, nr_nodes, join->opaque);
>
> join->res = res;
>
> @@ -357,12 +347,6 @@ static void msg_new_node(struct sph_msg *rcv)
> exit(1);
> }
> free(join);
> -
> - if (res == CJ_RES_MASTER_TRANSFER) {
> - sd_eprintf("failed to join sheepdog cluster: "
> - "please retry when master is up");
> - exit(1);
> - }
> }
>
> static void msg_new_node_finish(struct sph_msg *rcv)
> @@ -380,11 +364,11 @@ static void msg_new_node_finish(struct sph_msg *rcv)
>
> jm = (struct join_message *)join_node_finish->opaque;
> memcpy(nodes, join_node_finish->nodes,
> - join_node_finish->nr_nodes * sizeof(struct sd_node));
> + join_node_finish->nr_nodes * sizeof(struct sd_node));
> nr_nodes = join_node_finish->nr_nodes;
>
> sd_iprintf("new node: %s",
> - node_to_str(&join_node_finish->new_node));
> + node_to_str(&join_node_finish->new_node));
>
> /* FIXME: member change events must be ordered with nonblocked events */
> sd_join_handler(&join_node_finish->new_node, nodes, nr_nodes,
> diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
> index 5ed16cf..aafc18d 100644
> --- a/sheep/cluster/zookeeper.c
> +++ b/sheep/cluster/zookeeper.c
> @@ -845,15 +845,11 @@ static void zk_handle_join_request(struct zk_event *ev)
> return;
> }
>
> - res = sd_check_join_cb(&ev->sender.node, ev->buf);
> + res = sd_check_join_cb(&ev->sender.node, sd_nodes, nr_sd_nodes,
> + ev->buf);
> ev->join_result = res;
> push_join_response(ev);
> - if (res == CJ_RES_MASTER_TRANSFER) {
> - sd_eprintf("failed to join sheepdog cluster: "
> - "please retry when master is up");
> - zk_leave();
> - exit(1);
> - }
> +
> sd_dprintf("I'm the master now");
> }
>
> @@ -896,19 +892,9 @@ static void zk_handle_join_response(struct zk_event *ev)
> /* newly joined node */
> init_node_list(ev);
>
> - if (ev->join_result == CJ_RES_MASTER_TRANSFER)
> - /*
> - * Sheepdog assumes that only one sheep is alive in
> - * MASTER_TRANSFER scenario. So only the joining sheep is
> - * supposed to return single node view to sd_join_handler().
> - */
> - zk_tree_destroy();
> -
> sd_dprintf("%s, %d", node_to_str(&ev->sender.node), ev->join_result);
> switch (ev->join_result) {
> case CJ_RES_SUCCESS:
> - case CJ_RES_JOIN_LATER:
> - case CJ_RES_MASTER_TRANSFER:
> snprintf(path, sizeof(path), MEMBER_ZNODE"/%s",
> node_to_str(&ev->sender.node));
> if (node_eq(&ev->sender.node, &this_node.node)) {
> diff --git a/sheep/group.c b/sheep/group.c
> index cb87711..743b4fb 100644
> --- a/sheep/group.c
> +++ b/sheep/group.c
> @@ -50,18 +50,6 @@ static main_thread(struct vnode_info *) current_vnode_info;
> static main_thread(struct list_head *) pending_block_list;
> static main_thread(struct list_head *) pending_notify_list;
>
> -/*
> - * List of nodes that were part of the last epoch before a shutdown,
> - * but failed to join.
> - */
> -static main_thread(struct list_head *) failed_nodes;
> -
> -/*
> - * List of nodes that weren't part of the last epoch, but joined
> - * before restarting the cluster.
> - */
> -static main_thread(struct list_head *) delayed_nodes;
> -
> static int get_zones_nr_from(const struct sd_node *nodes, int nr_nodes)
> {
> int nr_zones = 0, i, j;
> @@ -341,120 +329,6 @@ static inline int get_nodes_nr_from(struct list_head *l)
> return nr;
> }
>
> -static int get_nodes_nr_epoch(uint32_t epoch)
> -{
> - struct sd_node nodes[SD_MAX_NODES];
> -
> - return epoch_log_read(epoch, nodes, sizeof(nodes));
> -}
> -
> -static const struct sd_node *find_entry_list(const struct sd_node *entry,
> - struct list_head *head)
> -{
> - struct node *n;
> - list_for_each_entry(n, head, list)
> - if (node_eq(&n->ent, entry))
> - return entry;
> -
> - return NULL;
> -
> -}
> -
> -static const struct sd_node *find_entry_epoch(const struct sd_node *entry,
> - uint32_t epoch)
> -{
> - struct sd_node nodes[SD_MAX_NODES];
> - int nr;
> -
> - if (!epoch)
> - return NULL;
> -
> - nr = epoch_log_read(epoch, nodes, sizeof(nodes));
> -
> - return xlfind(entry, nodes, nr, node_cmp);
> -}
> -
> -/*
> - * Add a node to the list of nodes that weren't part of the cluster before
> - * it shut down, and thus do not count toward the nodes required to allow
> - * an automated restart. These nodes will become part of the cluster by
> - * the time it does get restarted.
> - */
> -static bool add_delayed_node(uint32_t epoch, const struct sd_node *node)
> -{
> - struct node *n;
> -
> - if (find_entry_list(node, main_thread_get(delayed_nodes)))
> - return false;
> - assert(!find_entry_epoch(node, epoch));
> -
> - n = xmalloc(sizeof(*n));
> - n->ent = *node;
> - list_add_tail(&n->list, main_thread_get(delayed_nodes));
> - return true;
> -}
> -
> -/*
> - * For a node that failed to join check if was part of the original
> - * epoch, and if so add it to the list of node expected to be present
> - * but failing to join.
> - */
> -static bool add_failed_node(uint32_t epoch, const struct sd_node *node)
> -{
> - struct node *n;
> -
> - if (find_entry_list(node, main_thread_get(failed_nodes)))
> - return false;
> - if (!find_entry_epoch(node, epoch))
> - return false;
> -
> - n = xmalloc(sizeof(*n));
> - n->ent = *node;
> - list_add_tail(&n->list, main_thread_get(failed_nodes));
> - return true;
> -}
> -
> -/*
> - * Add the failed and delayed nodes in a join message to the local
> - * lists of such nodes.
> - */
> -static void update_exceptional_node_list(uint32_t epoch,
> - const struct join_message *jm)
> -{
> - int i;
> -
> - for (i = 0; i < jm->nr_failed_nodes; i++)
> - add_failed_node(epoch, &jm->cinfo.nodes[i]);
> - for ( ; i < jm->nr_failed_nodes + jm->nr_delayed_nodes; i++)
> - add_delayed_node(epoch, &jm->cinfo.nodes[i]);
> -}
> -
> -/* Format the lists of failed or delayed nodes into the join message. */
> -static void format_exceptional_node_list(struct join_message *jm)
> -{
> - struct node *n;
> -
> - list_for_each_entry(n, main_thread_get(failed_nodes), list)
> - jm->cinfo.nodes[jm->nr_failed_nodes++] = n->ent;
> - list_for_each_entry(n, main_thread_get(delayed_nodes), list)
> - jm->cinfo.nodes[jm->nr_failed_nodes +
> - jm->nr_delayed_nodes++] = n->ent;
> -}
> -
> -static void clear_exceptional_node_lists(void)
> -{
> - struct node *n, *t;
> -
> - list_for_each_entry_safe(n, t, main_thread_get(failed_nodes), list) {
> - list_del(&n->list);
> - free(n);
> - }
> - list_for_each_entry_safe(n, t, main_thread_get(delayed_nodes), list) {
> - list_del(&n->list);
> - free(n);
> - }
> -}
> -
> int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
> time_t *timestamp, struct vnode_info *vinfo)
> {
> @@ -496,8 +370,6 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
>
> static int cluster_sanity_check(struct join_message *jm)
> {
> - uint32_t local_epoch = get_latest_epoch();
> -
> if (jm->cinfo.ctime != sys->cinfo.ctime) {
> sd_eprintf("joining node ctime doesn't match: %"
> PRIu64 " vs %" PRIu64, jm->cinfo.ctime,
> @@ -505,12 +377,6 @@ static int cluster_sanity_check(struct join_message *jm)
> return CJ_RES_FAIL;
> }
>
> - if (jm->cinfo.epoch > local_epoch) {
> - sd_eprintf("joining node epoch too large: %"
> - PRIu32 " vs %" PRIu32, jm->cinfo.epoch, local_epoch);
> - return CJ_RES_FAIL;
> - }
> -
> if (jm->cinfo.nr_copies != sys->cinfo.nr_copies) {
> sd_eprintf("joining node nr_copies doesn't match: %u vs %u",
> jm->cinfo.nr_copies, sys->cinfo.nr_copies);
> @@ -526,88 +392,71 @@ static int cluster_sanity_check(struct join_message *jm)
> return CJ_RES_SUCCESS;
> }
>
> -static int cluster_wait_for_join_check(const struct sd_node *joined,
> - struct join_message *jm)
> +/*
> + * Check whether enough node members are gathered.
> + *
> + * Sheepdog can start automatically if and only if all the members in the latest
> + * epoch are gathered.
> + */
> +static bool enough_nodes_gathered(struct join_message *jm,
> + const struct sd_node *joining,
> + const struct sd_node *members,
> + size_t nr_members)
> {
> - struct sd_node local_entries[SD_MAX_NODES];
> - int nr, nr_local_entries, nr_failed_entries, nr_delayed_nodes;
> - uint32_t local_epoch = get_latest_epoch();
> - int ret;
> - struct vnode_info *cur_vinfo;
> + for (int i = 0; i < jm->cinfo.nr_nodes; i++) {
> + const struct sd_node *key = jm->cinfo.nodes + i, *n;
>
> - if (jm->cinfo.nr_nodes == 0)
> - return CJ_RES_JOIN_LATER;
> -
> - ret = cluster_sanity_check(jm);
> - if (ret != CJ_RES_SUCCESS) {
> - if (jm->cinfo.epoch > sys->cinfo.epoch) {
> - sd_eprintf("transfer mastership (%d, %d)", jm->cinfo.epoch,
> - sys->cinfo.epoch);
> - return CJ_RES_MASTER_TRANSFER;
> + n = xlfind(key, members, nr_members, node_cmp);
> + if (n == NULL && !node_eq(key, joining)) {
> + sd_dprintf("%s doesn't join yet", node_to_str(key));
> + return false;
> }
> - return ret;
> }
>
> - nr_local_entries = epoch_log_read(jm->cinfo.epoch, local_entries,
> - sizeof(local_entries));
> - if (nr_local_entries == -1)
> - return CJ_RES_FAIL;
> -
> - if (jm->cinfo.epoch < local_epoch) {
> - sd_eprintf("joining node epoch too small: %"
> - PRIu32 " vs %" PRIu32, jm->cinfo.epoch, local_epoch);
> + sd_dprintf("all the nodes are gathered, %d, %zd", jm->cinfo.nr_nodes,
> + nr_members);
> + return true;
> +}
>
> - if (xbsearch(joined, local_entries, nr_local_entries, node_cmp))
> - return CJ_RES_FAIL;
> - return CJ_RES_JOIN_LATER;
> - }
> +static int cluster_wait_for_join_check(const struct sd_node *joined,
> + const struct sd_node *members,
> + size_t nr_members,
> + struct join_message *jm)
> +{
> + int ret;
>
> - if (jm->cinfo.nr_nodes != nr_local_entries) {
> - sd_eprintf("epoch log entries do not match: %d vs %d",
> - jm->cinfo.nr_nodes, nr_local_entries);
> - return CJ_RES_FAIL;
> + if (jm->cinfo.epoch != 0 && sys->cinfo.epoch != 0) {
> + /* check whether joining node is valid or not */
> + ret = cluster_sanity_check(jm);
> + if (ret != CJ_RES_SUCCESS)
> + return ret;
> }
>
> -
> - if (memcmp(jm->cinfo.nodes, local_entries,
> - sizeof(jm->cinfo.nodes[0]) * jm->cinfo.nr_nodes) != 0) {
> + if (jm->cinfo.epoch > sys->cinfo.epoch)
> + sys->cinfo = jm->cinfo;
> + else if (jm->cinfo.epoch < sys->cinfo.epoch) {
> + sd_dprintf("joining node has a smaller epoch, %" PRIu32 ", %"
> + PRIu32, jm->cinfo.epoch, sys->cinfo.epoch);
> + jm->cinfo = sys->cinfo;
> + } else if (memcmp(jm->cinfo.nodes, sys->cinfo.nodes,
> + sizeof(jm->cinfo.nodes[0]) * jm->cinfo.nr_nodes) != 0) {
> sd_eprintf("epoch log entries does not match");
> return CJ_RES_FAIL;
> }
>
> - cur_vinfo = main_thread_get(current_vnode_info);
> - if (!cur_vinfo)
> - nr = 1;
> - else
> - nr = cur_vinfo->nr_nodes + 1;
> -
> - nr_delayed_nodes = get_nodes_nr_from(main_thread_get(delayed_nodes));
> -
> /*
> * If we have all members from the last epoch log in the in-memory
> - * node list, and no new nodes joining we can set the cluster live
> - * now without incrementing the epoch.
> - */
> - if (nr == nr_local_entries && !nr_delayed_nodes) {
> - jm->cluster_status = SD_STATUS_OK;
> - return CJ_RES_SUCCESS;
> - }
> -
> - /*
> - * If we reach the old node count, but some node failed we have to
> - * update the epoch before setting the cluster live.
> + * node list, we can set the cluster live now.
> */
> - nr_failed_entries = get_nodes_nr_from(main_thread_get(failed_nodes));
> - if (nr_local_entries == nr + nr_failed_entries - nr_delayed_nodes) {
> - jm->inc_epoch = 1;
> + if (sys->cinfo.epoch > 0 &&
> + enough_nodes_gathered(jm, joined, members, nr_members)) {
> + if (jm->cinfo.nr_nodes < nr_members + 1)
I'd suggest adding a comment here explaining why you add 1 to nr_members, or
better, a note that nr_members means the number of nodes *excluding* the
joining (or leaving) node. Probably we should rename nr_members to nr_nodes
too.
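
To make the point concrete, here is a throwaway standalone sketch of how I
read the new check (not sheep code: the struct and helper names are simplified
stand-ins, and the inc_epoch handling is only my guess at the intent of the
truncated hunk above):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* simplified stand-in for struct sd_node, just for this example */
struct node {
        char name[16];
};

static bool node_eq(const struct node *a, const struct node *b)
{
        return strcmp(a->name, b->name) == 0;
}

/*
 * Every node recorded in the previous epoch must be either an existing
 * member or the node that is joining right now.
 */
static bool enough_nodes_gathered(const struct node *prev, int nr_prev,
                                  const struct node *joining,
                                  const struct node *members, int nr_members)
{
        for (int i = 0; i < nr_prev; i++) {
                bool found = node_eq(&prev[i], joining);

                for (int j = 0; j < nr_members && !found; j++)
                        found = node_eq(&prev[i], &members[j]);
                if (!found)
                        return false; /* still waiting for this node */
        }
        return true;
}

int main(void)
{
        struct node prev[] = { {"A"}, {"B"}, {"C"} };  /* previous epoch */
        struct node members[] = { {"A"}, {"B"} };      /* already joined */
        struct node joining = { "C" };                 /* joining now */
        int nr_prev = 3, nr_members = 2;

        if (enough_nodes_gathered(prev, nr_prev, &joining,
                                  members, nr_members)) {
                /*
                 * nr_members excludes the joining node, so the number of
                 * gathered nodes is nr_members + 1.  If the previous epoch
                 * had fewer nodes than that, a new node has joined and the
                 * epoch presumably has to be incremented before the cluster
                 * goes live.
                 */
                bool inc_epoch = nr_prev < nr_members + 1;

                printf("start cluster, inc_epoch = %d\n", inc_epoch);
        } else {
                printf("keep waiting for the previous epoch members\n");
        }
        return 0;
}

If the comment stated the "members excludes the joining node" convention
explicitly, the + 1 would be self-explanatory.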
Thanks
Yuan