[sheepdog] [PATCH 3/3] sheep: start cluster if all the nodes in the previous epoch are gathered

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Thu Jul 4 09:18:46 CEST 2013


At Thu, 4 Jul 2013 14:42:44 +0800,
Liu Yuan wrote:
> 
> On Wed, Jul 03, 2013 at 03:49:23PM +0900, MORITA Kazutaka wrote:
> > From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> > 
> > The current rules to start sheepdog automatically are too complex:
> > 
> >  - One of the nodes in the latest epoch becomes the master.  If a
> >    joining node has a larger epoch, that node becomes the new master
> >    and the current master must exit and rejoin later.
> > 
> >  - If the joining node has a smaller epoch than the existing nodes,
> >    it is added to the delayed node list and rejoins automatically
> >    after sheepdog starts.
> > 
> >  - Sheepdog starts once the current node list becomes identical to
> >    the one in the previous epoch.
> > 
> > After this patch, we have only one rule:
> > 
> >  - Sheepdog starts if all the nodes in the previous epoch are
> >    gathered.
> > 
> > Nodes with a smaller or larger epoch can also join, and all the
> > joined nodes share the same (latest) cluster_info among them (see
> > the sketch below).  Since every node then holds the latest epoch,
> > any node can be the master.  Once all the nodes from the previous
> > epoch have joined, the cluster starts automatically.
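> > 
> > In other words, the join check reconciles cluster_info between the
> > joining node and the cluster roughly like this (a simplified sketch
> > of the group.c hunk below; error handling omitted):
> > 
> > 	if (jm->cinfo.epoch > sys->cinfo.epoch)
> > 		/* the joining node is newer: adopt its cluster_info */
> > 		sys->cinfo = jm->cinfo;
> > 	else if (jm->cinfo.epoch < sys->cinfo.epoch)
> > 		/* the joining node is older: hand it the latest info */
> > 		jm->cinfo = sys->cinfo;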
> > 
> > This change also brings the following benefit:
> > 
> >  - We can remove the failed node list, the delayed node list, and the
> >    CJ_RES_JOIN_LATER status, since nodes with a smaller epoch can now
> >    join Sheepdog directly.
> > 
> >  - The CJ_RES_MASTER_TRANSFER status can be removed because any node
> >    can be the master, so there is no need to transfer mastership even
> >    when a node with a larger epoch joins Sheepdog.
> > 
> > Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> > ---
> >  include/internal_proto.h  |   16 +--
> >  include/shepherd.h        |    5 +-
> >  sheep/cluster.h           |    2 +
> >  sheep/cluster/corosync.c  |   16 +--
> >  sheep/cluster/local.c     |   21 +--
> >  sheep/cluster/shepherd.c  |   32 ++---
> >  sheep/cluster/zookeeper.c |   20 +--
> >  sheep/group.c             |  313 ++++++++-------------------------------------
> >  shepherd/shepherd.c       |    8 +-
> >  tests/functional/001      |   13 --
> >  tests/functional/001.out  |   19 +--
> >  tests/functional/002      |   14 --
> >  tests/functional/002.out  |   18 +--
> >  tests/functional/003      |   14 --
> >  tests/functional/003.out  |   18 +--
> >  tests/functional/004      |   16 ---
> >  tests/functional/004.out  |   39 +-----
> >  tests/functional/005      |   17 ---
> >  tests/functional/005.out  |   45 +------
> >  tests/functional/060      |   17 +--
> >  20 files changed, 105 insertions(+), 558 deletions(-)
> > 
> > diff --git a/include/internal_proto.h b/include/internal_proto.h
> > index f0c55c0..c5cd76d 100644
> > --- a/include/internal_proto.h
> > +++ b/include/internal_proto.h
> > @@ -155,9 +155,7 @@ struct epoch_log {
> >  
> >  struct join_message {
> >  	uint8_t proto_ver;
> > -	uint8_t __pad1[3];
> > -	uint16_t nr_failed_nodes;
> > -	uint16_t nr_delayed_nodes;
> > +	uint8_t __pad1[7];
> >  	uint32_t cluster_status;
> >  	uint8_t inc_epoch; /* set non-zero when we increment epoch of all nodes */
> >  	uint8_t __pad2[3];
> > @@ -195,18 +193,6 @@ enum cluster_join_result {
> >  
> >  	/* Fail to join. The joining node has an invalid epoch. */
> >  	CJ_RES_FAIL,
> > -
> > -	/*
> > -	 * Fail to join. The joining node should be added after the cluster
> > -	 * start working.
> > -	 */
> > -	CJ_RES_JOIN_LATER,
> > -
> > -	/*
> > -	 * Transfer mastership.  The joining node has a newer epoch, so this
> > -	 * node will leave the cluster (restart later).
> > -	 */
> > -	CJ_RES_MASTER_TRANSFER,
> >  };
> >  
> >  static inline __attribute__((used)) void __sd_epoch_format_build_bug_ons(void)
> > diff --git a/include/shepherd.h b/include/shepherd.h
> > index 5dbc64b..e1fbac1 100644
> > --- a/include/shepherd.h
> > +++ b/include/shepherd.h
> > @@ -39,8 +39,11 @@ struct sph_msg {
> >  
> >  struct sph_msg_join {
> >  	uint32_t res;		/* original type: enum cluster_join_result */
> > -	struct sd_node node;
> > +	struct sd_node new_node;
> >  	uint8_t master_elected;
> > +
> > +	struct sd_node nodes[SD_MAX_NODES];
> > +	uint32_t nr_nodes;
> >  	uint8_t opaque[0];
> >  };
> >  
> > diff --git a/sheep/cluster.h b/sheep/cluster.h
> > index 80a701b..7665d75 100644
> > --- a/sheep/cluster.h
> > +++ b/sheep/cluster.h
> > @@ -158,6 +158,8 @@ void sd_notify_handler(const struct sd_node *sender, void *msg, size_t msg_len);
> >  bool sd_block_handler(const struct sd_node *sender);
> >  int sd_reconnect_handler(void);
> >  enum cluster_join_result sd_check_join_cb(const struct sd_node *joining,
> > +					  const struct sd_node *members,
> > +					  size_t nr_members,
> >  					  void *opaque);
> >  void recalculate_vnodes(struct sd_node *nodes, int nr_nodes);
> >  
> > diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
> > index 2be978b..bf90209 100644
> > --- a/sheep/cluster/corosync.c
> > +++ b/sheep/cluster/corosync.c
> > @@ -295,28 +295,18 @@ static bool __corosync_dispatch_one(struct corosync_event *cevent)
> >  			/* check_join() must be called only once */
> >  			return false;
> >  
> > -		res = sd_check_join_cb(&cevent->sender.ent,
> > -						     cevent->msg);
> > -		if (res == CJ_RES_MASTER_TRANSFER)
> > -			nr_cpg_nodes = 0;
> > -
> > +		build_node_list(cpg_nodes, nr_cpg_nodes, entries);
> > +		res = sd_check_join_cb(&cevent->sender.ent, entries,
> > +				       nr_cpg_nodes, cevent->msg);
> >  		send_message(COROSYNC_MSG_TYPE_JOIN_RESPONSE, res,
> >  			     &cevent->sender, cpg_nodes, nr_cpg_nodes,
> >  			     cevent->msg, cevent->msg_len);
> >  
> > -		if (res == CJ_RES_MASTER_TRANSFER) {
> > -			sd_eprintf("failed to join sheepdog cluster:"
> > -				   " please retry when master is up");
> > -			exit(1);
> > -		}
> > -
> >  		cevent->callbacked = true;
> >  		return false;
> >  	case COROSYNC_EVENT_TYPE_JOIN_RESPONSE:
> >  		switch (cevent->result) {
> >  		case CJ_RES_SUCCESS:
> > -		case CJ_RES_MASTER_TRANSFER:
> > -		case CJ_RES_JOIN_LATER:
> >  			add_cpg_node(cpg_nodes, nr_cpg_nodes, &cevent->sender);
> >  			nr_cpg_nodes++;
> >  			/* fall through */
> > diff --git a/sheep/cluster/local.c b/sheep/cluster/local.c
> > index 491e3ea..572aa19 100644
> > --- a/sheep/cluster/local.c
> > +++ b/sheep/cluster/local.c
> > @@ -441,31 +441,18 @@ static bool local_process_event(void)
> >  
> >  	switch (ev->type) {
> >  	case EVENT_JOIN_REQUEST:
> > -		res = sd_check_join_cb(&ev->sender.node, ev->buf);
> > +		/* nodes[nr_nodes - 1] is a sender, so don't include it */
> > +		assert(node_eq(&ev->sender.node, &nodes[nr_nodes - 1]));
> > +		res = sd_check_join_cb(&ev->sender.node, nodes, nr_nodes - 1,
> > +				       ev->buf);
> >  		ev->join_result = res;
> >  		ev->type = EVENT_JOIN_RESPONSE;
> >  		msync(ev, sizeof(*ev), MS_SYNC);
> >  
> >  		shm_queue_notify();
> >  
> > -		if (res == CJ_RES_MASTER_TRANSFER) {
> > -			sd_eprintf("failed to join sheepdog cluster: "
> > -				   "please retry when master is up");
> > -			shm_queue_unlock();
> > -			exit(1);
> > -		}
> >  		return false;
> >  	case EVENT_JOIN_RESPONSE:
> > -		if (ev->join_result == CJ_RES_MASTER_TRANSFER) {
> > -			/* FIXME: This code is tricky, but Sheepdog assumes that */
> > -			/* nr_nodes = 1 when join_result = MASTER_TRANSFER... */
> > -			ev->nr_lnodes = 1;
> > -			ev->lnodes[0] = this_node;
> > -			nr_nodes = 1;
> > -			nodes[0] = this_node.node;
> > -			msync(ev, sizeof(*ev), MS_SYNC);
> > -		}
> > -
> >  		sd_join_handler(&ev->sender.node, nodes, nr_nodes,
> >  				ev->join_result, ev->buf);
> >  		break;
> > diff --git a/sheep/cluster/shepherd.c b/sheep/cluster/shepherd.c
> > index 26fb005..db8336e 100644
> > --- a/sheep/cluster/shepherd.c
> > +++ b/sheep/cluster/shepherd.c
> > @@ -58,7 +58,7 @@ static int do_shepherd_join(void)
> >  	msg.body_len = msg_join_len;
> >  
> >  	msg_join = xzalloc(msg_join_len);
> > -	msg_join->node = this_node;
> > +	msg_join->new_node = this_node;
> >  	memcpy(msg_join->opaque, kept_opaque, kept_opaque_len);
> >  
> >  	ret = writev2(sph_comm_fd, &msg, msg_join, msg_join_len);
> > @@ -115,7 +115,7 @@ retry:
> >  		 * FIXME: member change events must be ordered with nonblocked
> >  		 *        events
> >  		 */
> > -		res = sd_check_join_cb(&join->node, join->opaque);
> > +		res = sd_check_join_cb(&join->new_node, NULL, 0, join->opaque);
> >  		if (res == CJ_RES_FAIL) {
> >  			sd_eprintf("sd_check_join_cb() failed");
> >  			exit(1);
> > @@ -161,19 +161,9 @@ retry:
> >  
> >  	sd_iprintf("join reply arrived, nr_nodes: %d", join_reply->nr_nodes);
> >  
> > -	if (join_reply->res == CJ_RES_MASTER_TRANSFER) {
> > -		is_master = true;
> > -
> > -		/* FIXME: This code is tricky, but Sheepdog assumes that */
> > -		/* nr_nodes = 1 when join_result = MASTER_TRANSFER... */
> > -		nr_nodes = 1;
> > -		nodes[0] = this_node;
> > -	} else {
> > -		memcpy(nodes, join_reply->nodes,
> > -			join_reply->nr_nodes * sizeof(struct sd_node));
> > -
> > -		nr_nodes = join_reply->nr_nodes;
> > -	}
> > +	memcpy(nodes, join_reply->nodes,
> > +	       join_reply->nr_nodes * sizeof(struct sd_node));
> > +	nr_nodes = join_reply->nr_nodes;
> >  
> >  	/* FIXME: member change events must be ordered with nonblocked events */
> >  	sd_join_handler(&this_node, nodes, nr_nodes,
> > @@ -343,7 +333,7 @@ static void msg_new_node(struct sph_msg *rcv)
> >  	}
> >  
> >  	/* FIXME: member change events must be ordered with nonblocked events */
> > -	res = sd_check_join_cb(&join->node, join->opaque);
> > +	res = sd_check_join_cb(&join->new_node, nodes, nr_nodes, join->opaque);
> >  
> >  	join->res = res;
> >  
> > @@ -357,12 +347,6 @@ static void msg_new_node(struct sph_msg *rcv)
> >  		exit(1);
> >  	}
> >  	free(join);
> > -
> > -	if (res == CJ_RES_MASTER_TRANSFER) {
> > -		sd_eprintf("failed to join sheepdog cluster: "
> > -			"please retry when master is up");
> > -		exit(1);
> > -	}
> >  }
> >  
> >  static void msg_new_node_finish(struct sph_msg *rcv)
> > @@ -380,11 +364,11 @@ static void msg_new_node_finish(struct sph_msg *rcv)
> >  
> >  	jm = (struct join_message *)join_node_finish->opaque;
> >  	memcpy(nodes, join_node_finish->nodes,
> > -		join_node_finish->nr_nodes * sizeof(struct sd_node));
> > +	       join_node_finish->nr_nodes * sizeof(struct sd_node));
> >  	nr_nodes = join_node_finish->nr_nodes;
> >  
> >  	sd_iprintf("new node: %s",
> > -		node_to_str(&join_node_finish->new_node));
> > +		   node_to_str(&join_node_finish->new_node));
> >  
> >  	/* FIXME: member change events must be ordered with nonblocked events */
> >  	sd_join_handler(&join_node_finish->new_node, nodes, nr_nodes,
> > diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
> > index 5ed16cf..aafc18d 100644
> > --- a/sheep/cluster/zookeeper.c
> > +++ b/sheep/cluster/zookeeper.c
> > @@ -845,15 +845,11 @@ static void zk_handle_join_request(struct zk_event *ev)
> >  		return;
> >  	}
> >  
> > -	res = sd_check_join_cb(&ev->sender.node, ev->buf);
> > +	res = sd_check_join_cb(&ev->sender.node, sd_nodes, nr_sd_nodes,
> > +			       ev->buf);
> >  	ev->join_result = res;
> >  	push_join_response(ev);
> > -	if (res == CJ_RES_MASTER_TRANSFER) {
> > -		sd_eprintf("failed to join sheepdog cluster: "
> > -			   "please retry when master is up");
> > -		zk_leave();
> > -		exit(1);
> > -	}
> > +
> >  	sd_dprintf("I'm the master now");
> >  }
> >  
> > @@ -896,19 +892,9 @@ static void zk_handle_join_response(struct zk_event *ev)
> >  		/* newly joined node */
> >  		init_node_list(ev);
> >  
> > -	if (ev->join_result == CJ_RES_MASTER_TRANSFER)
> > -		/*
> > -		 * Sheepdog assumes that only one sheep is alive in
> > -		 * MASTER_TRANSFER scenario. So only the joining sheep is
> > -		 * supposed to return single node view to sd_join_handler().
> > -		 */
> > -		zk_tree_destroy();
> > -
> >  	sd_dprintf("%s, %d", node_to_str(&ev->sender.node), ev->join_result);
> >  	switch (ev->join_result) {
> >  	case CJ_RES_SUCCESS:
> > -	case CJ_RES_JOIN_LATER:
> > -	case CJ_RES_MASTER_TRANSFER:
> >  		snprintf(path, sizeof(path), MEMBER_ZNODE"/%s",
> >  			 node_to_str(&ev->sender.node));
> >  		if (node_eq(&ev->sender.node, &this_node.node)) {
> > diff --git a/sheep/group.c b/sheep/group.c
> > index cb87711..743b4fb 100644
> > --- a/sheep/group.c
> > +++ b/sheep/group.c
> > @@ -50,18 +50,6 @@ static main_thread(struct vnode_info *) current_vnode_info;
> >  static main_thread(struct list_head *) pending_block_list;
> >  static main_thread(struct list_head *) pending_notify_list;
> >  
> > -/*
> > - * List of nodes that were part of the last epoch before a shutdown,
> > - * but failed to join.
> > - */
> > -static main_thread(struct list_head *) failed_nodes;
> > -
> > -/*
> > - * List of nodes that weren't part of the last epoch, but joined
> > - * before restarting the cluster.
> > - */
> > -static main_thread(struct list_head *) delayed_nodes;
> > -
> >  static int get_zones_nr_from(const struct sd_node *nodes, int nr_nodes)
> >  {
> >  	int nr_zones = 0, i, j;
> > @@ -341,120 +329,6 @@ static inline int get_nodes_nr_from(struct list_head *l)
> >  	return nr;
> >  }
> >  
> > -static int get_nodes_nr_epoch(uint32_t epoch)
> > -{
> > -	struct sd_node nodes[SD_MAX_NODES];
> > -
> > -	return epoch_log_read(epoch, nodes, sizeof(nodes));
> > -}
> > -
> > -static const struct sd_node *find_entry_list(const struct sd_node *entry,
> > -					     struct list_head *head)
> > -{
> > -	struct node *n;
> > -	list_for_each_entry(n, head, list)
> > -		if (node_eq(&n->ent, entry))
> > -			return entry;
> > -
> > -	return NULL;
> > -
> > -}
> > -
> > -static const struct sd_node *find_entry_epoch(const struct sd_node *entry,
> > -					      uint32_t epoch)
> > -{
> > -	struct sd_node nodes[SD_MAX_NODES];
> > -	int nr;
> > -
> > -	if (!epoch)
> > -		return NULL;
> > -
> > -	nr = epoch_log_read(epoch, nodes, sizeof(nodes));
> > -
> > -	return xlfind(entry, nodes, nr, node_cmp);
> > -}
> > -
> > -/*
> > - * Add a node to the list of nodes that weren't part of the cluster before
> > - * it shut down, and thus do not count toward the nodes required to allow
> > - * an automated restart.  These nodes will become part of the cluster by
> > - * the time it does get restarted.
> > - */
> > -static bool add_delayed_node(uint32_t epoch, const struct sd_node *node)
> > -{
> > -	struct node *n;
> > -
> > -	if (find_entry_list(node, main_thread_get(delayed_nodes)))
> > -		return false;
> > -	assert(!find_entry_epoch(node, epoch));
> > -
> > -	n = xmalloc(sizeof(*n));
> > -	n->ent = *node;
> > -	list_add_tail(&n->list, main_thread_get(delayed_nodes));
> > -	return true;
> > -}
> > -
> > -/*
> > - * For a node that failed to join check if was part of the original
> > - * epoch, and if so add it to the list of node expected to be present
> > - * but failing to join.
> > - */
> > -static bool add_failed_node(uint32_t epoch, const struct sd_node *node)
> > -{
> > -	struct node *n;
> > -
> > -	if (find_entry_list(node, main_thread_get(failed_nodes)))
> > -		return false;
> > -	if (!find_entry_epoch(node, epoch))
> > -		return false;
> > -
> > -	n = xmalloc(sizeof(*n));
> > -	n->ent = *node;
> > -	list_add_tail(&n->list, main_thread_get(failed_nodes));
> > -	return true;
> > -}
> > -
> > -/*
> > - * Add the failed and delayed nodes in a join message to the local
> > - * lists of such nodes.
> > - */
> > -static void update_exceptional_node_list(uint32_t epoch,
> > -					 const struct join_message *jm)
> > -{
> > -	int i;
> > -
> > -	for (i = 0; i < jm->nr_failed_nodes; i++)
> > -		add_failed_node(epoch, &jm->cinfo.nodes[i]);
> > -	for ( ; i < jm->nr_failed_nodes + jm->nr_delayed_nodes; i++)
> > -		add_delayed_node(epoch, &jm->cinfo.nodes[i]);
> > -}
> > -
> > -/* Format the lists of failed or delayed nodes into the join message. */
> > -static void format_exceptional_node_list(struct join_message *jm)
> > -{
> > -	struct node *n;
> > -
> > -	list_for_each_entry(n, main_thread_get(failed_nodes), list)
> > -		jm->cinfo.nodes[jm->nr_failed_nodes++] = n->ent;
> > -	list_for_each_entry(n, main_thread_get(delayed_nodes), list)
> > -		jm->cinfo.nodes[jm->nr_failed_nodes +
> > -				jm->nr_delayed_nodes++] = n->ent;
> > -}
> > -
> > -static void clear_exceptional_node_lists(void)
> > -{
> > -	struct node *n, *t;
> > -
> > -	list_for_each_entry_safe(n, t, main_thread_get(failed_nodes), list) {
> > -		list_del(&n->list);
> > -		free(n);
> > -	}
> > -	list_for_each_entry_safe(n, t, main_thread_get(delayed_nodes), list) {
> > -		list_del(&n->list);
> > -		free(n);
> > -	}
> > -}
> > -
> >  int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
> >  			  time_t *timestamp, struct vnode_info *vinfo)
> >  {
> > @@ -496,8 +370,6 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
> >  
> >  static int cluster_sanity_check(struct join_message *jm)
> >  {
> > -	uint32_t local_epoch = get_latest_epoch();
> > -
> >  	if (jm->cinfo.ctime != sys->cinfo.ctime) {
> >  		sd_eprintf("joining node ctime doesn't match: %"
> >  			   PRIu64 " vs %" PRIu64, jm->cinfo.ctime,
> > @@ -505,12 +377,6 @@ static int cluster_sanity_check(struct join_message *jm)
> >  		return CJ_RES_FAIL;
> >  	}
> >  
> > -	if (jm->cinfo.epoch > local_epoch) {
> > -		sd_eprintf("joining node epoch too large: %"
> > -			   PRIu32 " vs %" PRIu32, jm->cinfo.epoch, local_epoch);
> > -		return CJ_RES_FAIL;
> > -	}
> > -
> >  	if (jm->cinfo.nr_copies != sys->cinfo.nr_copies) {
> >  		sd_eprintf("joining node nr_copies doesn't match: %u vs %u",
> >  			   jm->cinfo.nr_copies, sys->cinfo.nr_copies);
> > @@ -526,88 +392,71 @@ static int cluster_sanity_check(struct join_message *jm)
> >  	return CJ_RES_SUCCESS;
> >  }
> >  
> > -static int cluster_wait_for_join_check(const struct sd_node *joined,
> > -				       struct join_message *jm)
> > +/*
> > + * Check whether enough node members are gathered.
> > + *
> > + * Sheepdog can start automatically if and only if all the members in the latest
> > + * epoch are gathered.
> > + */
> > +static bool enough_nodes_gathered(struct join_message *jm,
> > +				  const struct sd_node *joining,
> > +				  const struct sd_node *members,
> > +				  size_t nr_members)
> >  {
> > -	struct sd_node local_entries[SD_MAX_NODES];
> > -	int nr, nr_local_entries, nr_failed_entries, nr_delayed_nodes;
> > -	uint32_t local_epoch = get_latest_epoch();
> > -	int ret;
> > -	struct vnode_info *cur_vinfo;
> > +	for (int i = 0; i < jm->cinfo.nr_nodes; i++) {
> > +		const struct sd_node *key = jm->cinfo.nodes + i, *n;
> >  
> > -	if (jm->cinfo.nr_nodes == 0)
> > -		return CJ_RES_JOIN_LATER;
> > -
> > -	ret = cluster_sanity_check(jm);
> > -	if (ret != CJ_RES_SUCCESS)  {
> > -		if (jm->cinfo.epoch > sys->cinfo.epoch) {
> > -			sd_eprintf("transfer mastership (%d, %d)", jm->cinfo.epoch,
> > -				   sys->cinfo.epoch);
> > -			return CJ_RES_MASTER_TRANSFER;
> > +		n = xlfind(key, members, nr_members, node_cmp);
> > +		if (n == NULL && !node_eq(key, joining)) {
> > +			sd_dprintf("%s doesn't join yet", node_to_str(key));
> > +			return false;
> >  		}
> > -		return ret;
> >  	}
> >  
> > -	nr_local_entries = epoch_log_read(jm->cinfo.epoch, local_entries,
> > -					  sizeof(local_entries));
> > -	if (nr_local_entries == -1)
> > -		return CJ_RES_FAIL;
> > -
> > -	if (jm->cinfo.epoch < local_epoch) {
> > -		sd_eprintf("joining node epoch too small: %"
> > -			   PRIu32 " vs %" PRIu32, jm->cinfo.epoch, local_epoch);
> > +	sd_dprintf("all the nodes are gathered, %d, %zd", jm->cinfo.nr_nodes,
> > +		   nr_members);
> > +	return true;
> > +}
> >  
> > -		if (xbsearch(joined, local_entries, nr_local_entries, node_cmp))
> > -			return CJ_RES_FAIL;
> > -		return CJ_RES_JOIN_LATER;
> > -	}
> > +static int cluster_wait_for_join_check(const struct sd_node *joined,
> > +				       const struct sd_node *members,
> > +				       size_t nr_members,
> > +				       struct join_message *jm)
> > +{
> > +	int ret;
> >  
> > -	if (jm->cinfo.nr_nodes != nr_local_entries) {
> > -		sd_eprintf("epoch log entries do not match: %d vs %d",
> > -			   jm->cinfo.nr_nodes, nr_local_entries);
> > -		return CJ_RES_FAIL;
> > +	if (jm->cinfo.epoch != 0 && sys->cinfo.epoch != 0) {
> > +		/* check whether joining node is valid or not */
> > +		ret = cluster_sanity_check(jm);
> > +		if (ret != CJ_RES_SUCCESS)
> > +			return ret;
> >  	}
> >  
> > -
> > -	if (memcmp(jm->cinfo.nodes, local_entries,
> > -		   sizeof(jm->cinfo.nodes[0]) * jm->cinfo.nr_nodes) != 0) {
> > +	if (jm->cinfo.epoch > sys->cinfo.epoch)
> > +		sys->cinfo = jm->cinfo;
> > +	else if (jm->cinfo.epoch < sys->cinfo.epoch) {
> > +		sd_dprintf("joining node has a smaller epoch, %" PRIu32 ", %"
> > +			   PRIu32, jm->cinfo.epoch, sys->cinfo.epoch);
> > +		jm->cinfo = sys->cinfo;
> > +	} else if (memcmp(jm->cinfo.nodes, sys->cinfo.nodes,
> > +			  sizeof(jm->cinfo.nodes[0]) * jm->cinfo.nr_nodes) != 0) {
> >  		sd_eprintf("epoch log entries does not match");
> >  		return CJ_RES_FAIL;
> >  	}
> >  
> > -	cur_vinfo = main_thread_get(current_vnode_info);
> > -	if (!cur_vinfo)
> > -		nr = 1;
> > -	else
> > -		nr = cur_vinfo->nr_nodes + 1;
> > -
> > -	nr_delayed_nodes = get_nodes_nr_from(main_thread_get(delayed_nodes));
> > -
> >  	/*
> >  	 * If we have all members from the last epoch log in the in-memory
> > -	 * node list, and no new nodes joining we can set the cluster live
> > -	 * now without incrementing the epoch.
> > -	 */
> > -	if (nr == nr_local_entries && !nr_delayed_nodes) {
> > -		jm->cluster_status = SD_STATUS_OK;
> > -		return CJ_RES_SUCCESS;
> > -	}
> > -
> > -	/*
> > -	 * If we reach the old node count, but some node failed we have to
> > -	 * update the epoch before setting the cluster live.
> > +	 * node list, we can set the cluster live now.
> >  	 */
> > -	nr_failed_entries = get_nodes_nr_from(main_thread_get(failed_nodes));
> > -	if (nr_local_entries == nr + nr_failed_entries - nr_delayed_nodes) {
> > -		jm->inc_epoch = 1;
> > +	if (sys->cinfo.epoch > 0 &&
> > +	    enough_nodes_gathered(jm, joined,  members, nr_members)) {
> > +		if (jm->cinfo.nr_nodes < nr_members + 1)
> 
> I'd suggest adding a comment here explaining why you add 1 to
> nr_members, or better, a note that nr_members means the number of
> nodes *excluding* the joining or leaving node.  Probably we should
> also rename nr_members to nr_nodes.

Okay, I'll address it in the next version.
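
Something like the following, perhaps (just a sketch of the comment,
not the final patch):

	/*
	 * nr_members doesn't include the joining node itself, so the
	 * number of nodes gathered so far is nr_members + 1.
	 */
	if (jm->cinfo.nr_nodes < nr_members + 1)
		...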

Thanks,

Kazutaka


