[sheepdog] [PATCH 3/3] sheep: start cluster if all the nodes in the previous epoch are gathered

MORITA Kazutaka morita.kazutaka at gmail.com
Wed Jul 3 08:49:23 CEST 2013


From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>

The current rules for starting Sheepdog automatically are too complex:

 - One of the nodes in the latest epoch becomes the master.  If a
   joining node has a larger epoch, that node becomes the new master
   and the current master must exit and rejoin later.

 - If a joining node has a smaller epoch than the existing nodes, it
   is added to the delayed node list and rejoins automatically after
   Sheepdog starts.

 - Sheepdog starts when the current node list matches the one from
   the previous epoch.

After this patch, we have only one rule:

 - Sheepdog starts if all the nodes in the previous epoch are
   gathered.

Nodes with a smaller or larger epoch can also join, and all the
joined nodes share the same (latest) cluster_info.  Since every node
holds the latest epoch, any node can be the master.  Once all the
nodes from the previous epoch have joined, the cluster starts
automatically.
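
For reference, the start condition reduces to the following sketch
(modeled on the enough_nodes_gathered() helper this patch adds to
sheep/group.c; the function name and arguments here simply mirror
that code and are not new API):

  /*
   * The cluster may go live only when every node recorded in the
   * latest epoch is either already a member or is the node that is
   * joining right now.
   */
  static bool all_previous_nodes_gathered(const struct join_message *jm,
                                          const struct sd_node *joining,
                                          const struct sd_node *members,
                                          size_t nr_members)
  {
          for (int i = 0; i < jm->cinfo.nr_nodes; i++) {
                  const struct sd_node *key = &jm->cinfo.nodes[i];

                  if (!xlfind(key, members, nr_members, node_cmp) &&
                      !node_eq(key, joining))
                          return false; /* an epoch member hasn't joined yet */
          }
          return true;
  }

When this holds, cluster_wait_for_join_check() sets the cluster
status to SD_STATUS_OK, incrementing the epoch only if extra nodes
(ones not in the previous epoch) have joined as well.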

This change also brings the following benefits:

 - We can remove the failed node list, the delayed node list, and the
   CJ_RES_JOIN_LATER status, since nodes with a smaller epoch can now
   join Sheepdog directly.

 - The CJ_RES_MASTER_TRANSFER status can be removed because any node
   can be the master; there is no need to transfer mastership even
   when a node with a larger epoch joins Sheepdog.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 include/internal_proto.h  |   16 +--
 include/shepherd.h        |    5 +-
 sheep/cluster.h           |    2 +
 sheep/cluster/corosync.c  |   16 +--
 sheep/cluster/local.c     |   21 +--
 sheep/cluster/shepherd.c  |   32 ++---
 sheep/cluster/zookeeper.c |   20 +--
 sheep/group.c             |  313 ++++++++-------------------------------------
 shepherd/shepherd.c       |    8 +-
 tests/functional/001      |   13 --
 tests/functional/001.out  |   19 +--
 tests/functional/002      |   14 --
 tests/functional/002.out  |   18 +--
 tests/functional/003      |   14 --
 tests/functional/003.out  |   18 +--
 tests/functional/004      |   16 ---
 tests/functional/004.out  |   39 +-----
 tests/functional/005      |   17 ---
 tests/functional/005.out  |   45 +------
 tests/functional/060      |   17 +--
 20 files changed, 105 insertions(+), 558 deletions(-)

diff --git a/include/internal_proto.h b/include/internal_proto.h
index f0c55c0..c5cd76d 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -155,9 +155,7 @@ struct epoch_log {
 
 struct join_message {
 	uint8_t proto_ver;
-	uint8_t __pad1[3];
-	uint16_t nr_failed_nodes;
-	uint16_t nr_delayed_nodes;
+	uint8_t __pad1[7];
 	uint32_t cluster_status;
 	uint8_t inc_epoch; /* set non-zero when we increment epoch of all nodes */
 	uint8_t __pad2[3];
@@ -195,18 +193,6 @@ enum cluster_join_result {
 
 	/* Fail to join. The joining node has an invalid epoch. */
 	CJ_RES_FAIL,
-
-	/*
-	 * Fail to join. The joining node should be added after the cluster
-	 * start working.
-	 */
-	CJ_RES_JOIN_LATER,
-
-	/*
-	 * Transfer mastership.  The joining node has a newer epoch, so this
-	 * node will leave the cluster (restart later).
-	 */
-	CJ_RES_MASTER_TRANSFER,
 };
 
 static inline __attribute__((used)) void __sd_epoch_format_build_bug_ons(void)
diff --git a/include/shepherd.h b/include/shepherd.h
index 5dbc64b..e1fbac1 100644
--- a/include/shepherd.h
+++ b/include/shepherd.h
@@ -39,8 +39,11 @@ struct sph_msg {
 
 struct sph_msg_join {
 	uint32_t res;		/* original type: enum cluster_join_result */
-	struct sd_node node;
+	struct sd_node new_node;
 	uint8_t master_elected;
+
+	struct sd_node nodes[SD_MAX_NODES];
+	uint32_t nr_nodes;
 	uint8_t opaque[0];
 };
 
diff --git a/sheep/cluster.h b/sheep/cluster.h
index 80a701b..7665d75 100644
--- a/sheep/cluster.h
+++ b/sheep/cluster.h
@@ -158,6 +158,8 @@ void sd_notify_handler(const struct sd_node *sender, void *msg, size_t msg_len);
 bool sd_block_handler(const struct sd_node *sender);
 int sd_reconnect_handler(void);
 enum cluster_join_result sd_check_join_cb(const struct sd_node *joining,
+					  const struct sd_node *members,
+					  size_t nr_members,
 					  void *opaque);
 void recalculate_vnodes(struct sd_node *nodes, int nr_nodes);
 
diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
index 2be978b..bf90209 100644
--- a/sheep/cluster/corosync.c
+++ b/sheep/cluster/corosync.c
@@ -295,28 +295,18 @@ static bool __corosync_dispatch_one(struct corosync_event *cevent)
 			/* check_join() must be called only once */
 			return false;
 
-		res = sd_check_join_cb(&cevent->sender.ent,
-						     cevent->msg);
-		if (res == CJ_RES_MASTER_TRANSFER)
-			nr_cpg_nodes = 0;
-
+		build_node_list(cpg_nodes, nr_cpg_nodes, entries);
+		res = sd_check_join_cb(&cevent->sender.ent, entries,
+				       nr_cpg_nodes, cevent->msg);
 		send_message(COROSYNC_MSG_TYPE_JOIN_RESPONSE, res,
 			     &cevent->sender, cpg_nodes, nr_cpg_nodes,
 			     cevent->msg, cevent->msg_len);
 
-		if (res == CJ_RES_MASTER_TRANSFER) {
-			sd_eprintf("failed to join sheepdog cluster:"
-				   " please retry when master is up");
-			exit(1);
-		}
-
 		cevent->callbacked = true;
 		return false;
 	case COROSYNC_EVENT_TYPE_JOIN_RESPONSE:
 		switch (cevent->result) {
 		case CJ_RES_SUCCESS:
-		case CJ_RES_MASTER_TRANSFER:
-		case CJ_RES_JOIN_LATER:
 			add_cpg_node(cpg_nodes, nr_cpg_nodes, &cevent->sender);
 			nr_cpg_nodes++;
 			/* fall through */
diff --git a/sheep/cluster/local.c b/sheep/cluster/local.c
index 491e3ea..572aa19 100644
--- a/sheep/cluster/local.c
+++ b/sheep/cluster/local.c
@@ -441,31 +441,18 @@ static bool local_process_event(void)
 
 	switch (ev->type) {
 	case EVENT_JOIN_REQUEST:
-		res = sd_check_join_cb(&ev->sender.node, ev->buf);
+		/* nodes[nr_nodes - 1] is a sender, so don't include it */
+		assert(node_eq(&ev->sender.node, &nodes[nr_nodes - 1]));
+		res = sd_check_join_cb(&ev->sender.node, nodes, nr_nodes - 1,
+				       ev->buf);
 		ev->join_result = res;
 		ev->type = EVENT_JOIN_RESPONSE;
 		msync(ev, sizeof(*ev), MS_SYNC);
 
 		shm_queue_notify();
 
-		if (res == CJ_RES_MASTER_TRANSFER) {
-			sd_eprintf("failed to join sheepdog cluster: "
-				   "please retry when master is up");
-			shm_queue_unlock();
-			exit(1);
-		}
 		return false;
 	case EVENT_JOIN_RESPONSE:
-		if (ev->join_result == CJ_RES_MASTER_TRANSFER) {
-			/* FIXME: This code is tricky, but Sheepdog assumes that */
-			/* nr_nodes = 1 when join_result = MASTER_TRANSFER... */
-			ev->nr_lnodes = 1;
-			ev->lnodes[0] = this_node;
-			nr_nodes = 1;
-			nodes[0] = this_node.node;
-			msync(ev, sizeof(*ev), MS_SYNC);
-		}
-
 		sd_join_handler(&ev->sender.node, nodes, nr_nodes,
 				ev->join_result, ev->buf);
 		break;
diff --git a/sheep/cluster/shepherd.c b/sheep/cluster/shepherd.c
index 26fb005..db8336e 100644
--- a/sheep/cluster/shepherd.c
+++ b/sheep/cluster/shepherd.c
@@ -58,7 +58,7 @@ static int do_shepherd_join(void)
 	msg.body_len = msg_join_len;
 
 	msg_join = xzalloc(msg_join_len);
-	msg_join->node = this_node;
+	msg_join->new_node = this_node;
 	memcpy(msg_join->opaque, kept_opaque, kept_opaque_len);
 
 	ret = writev2(sph_comm_fd, &msg, msg_join, msg_join_len);
@@ -115,7 +115,7 @@ retry:
 		 * FIXME: member change events must be ordered with nonblocked
 		 *        events
 		 */
-		res = sd_check_join_cb(&join->node, join->opaque);
+		res = sd_check_join_cb(&join->new_node, NULL, 0, join->opaque);
 		if (res == CJ_RES_FAIL) {
 			sd_eprintf("sd_check_join_cb() failed");
 			exit(1);
@@ -161,19 +161,9 @@ retry:
 
 	sd_iprintf("join reply arrived, nr_nodes: %d", join_reply->nr_nodes);
 
-	if (join_reply->res == CJ_RES_MASTER_TRANSFER) {
-		is_master = true;
-
-		/* FIXME: This code is tricky, but Sheepdog assumes that */
-		/* nr_nodes = 1 when join_result = MASTER_TRANSFER... */
-		nr_nodes = 1;
-		nodes[0] = this_node;
-	} else {
-		memcpy(nodes, join_reply->nodes,
-			join_reply->nr_nodes * sizeof(struct sd_node));
-
-		nr_nodes = join_reply->nr_nodes;
-	}
+	memcpy(nodes, join_reply->nodes,
+	       join_reply->nr_nodes * sizeof(struct sd_node));
+	nr_nodes = join_reply->nr_nodes;
 
 	/* FIXME: member change events must be ordered with nonblocked events */
 	sd_join_handler(&this_node, nodes, nr_nodes,
@@ -343,7 +333,7 @@ static void msg_new_node(struct sph_msg *rcv)
 	}
 
 	/* FIXME: member change events must be ordered with nonblocked events */
-	res = sd_check_join_cb(&join->node, join->opaque);
+	res = sd_check_join_cb(&join->new_node, nodes, nr_nodes, join->opaque);
 
 	join->res = res;
 
@@ -357,12 +347,6 @@ static void msg_new_node(struct sph_msg *rcv)
 		exit(1);
 	}
 	free(join);
-
-	if (res == CJ_RES_MASTER_TRANSFER) {
-		sd_eprintf("failed to join sheepdog cluster: "
-			"please retry when master is up");
-		exit(1);
-	}
 }
 
 static void msg_new_node_finish(struct sph_msg *rcv)
@@ -380,11 +364,11 @@ static void msg_new_node_finish(struct sph_msg *rcv)
 
 	jm = (struct join_message *)join_node_finish->opaque;
 	memcpy(nodes, join_node_finish->nodes,
-		join_node_finish->nr_nodes * sizeof(struct sd_node));
+	       join_node_finish->nr_nodes * sizeof(struct sd_node));
 	nr_nodes = join_node_finish->nr_nodes;
 
 	sd_iprintf("new node: %s",
-		node_to_str(&join_node_finish->new_node));
+		   node_to_str(&join_node_finish->new_node));
 
 	/* FIXME: member change events must be ordered with nonblocked events */
 	sd_join_handler(&join_node_finish->new_node, nodes, nr_nodes,
diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index 5ed16cf..aafc18d 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -845,15 +845,11 @@ static void zk_handle_join_request(struct zk_event *ev)
 		return;
 	}
 
-	res = sd_check_join_cb(&ev->sender.node, ev->buf);
+	res = sd_check_join_cb(&ev->sender.node, sd_nodes, nr_sd_nodes,
+			       ev->buf);
 	ev->join_result = res;
 	push_join_response(ev);
-	if (res == CJ_RES_MASTER_TRANSFER) {
-		sd_eprintf("failed to join sheepdog cluster: "
-			   "please retry when master is up");
-		zk_leave();
-		exit(1);
-	}
+
 	sd_dprintf("I'm the master now");
 }
 
@@ -896,19 +892,9 @@ static void zk_handle_join_response(struct zk_event *ev)
 		/* newly joined node */
 		init_node_list(ev);
 
-	if (ev->join_result == CJ_RES_MASTER_TRANSFER)
-		/*
-		 * Sheepdog assumes that only one sheep is alive in
-		 * MASTER_TRANSFER scenario. So only the joining sheep is
-		 * supposed to return single node view to sd_join_handler().
-		 */
-		zk_tree_destroy();
-
 	sd_dprintf("%s, %d", node_to_str(&ev->sender.node), ev->join_result);
 	switch (ev->join_result) {
 	case CJ_RES_SUCCESS:
-	case CJ_RES_JOIN_LATER:
-	case CJ_RES_MASTER_TRANSFER:
 		snprintf(path, sizeof(path), MEMBER_ZNODE"/%s",
 			 node_to_str(&ev->sender.node));
 		if (node_eq(&ev->sender.node, &this_node.node)) {
diff --git a/sheep/group.c b/sheep/group.c
index cb87711..743b4fb 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -50,18 +50,6 @@ static main_thread(struct vnode_info *) current_vnode_info;
 static main_thread(struct list_head *) pending_block_list;
 static main_thread(struct list_head *) pending_notify_list;
 
-/*
- * List of nodes that were part of the last epoch before a shutdown,
- * but failed to join.
- */
-static main_thread(struct list_head *) failed_nodes;
-
-/*
- * List of nodes that weren't part of the last epoch, but joined
- * before restarting the cluster.
- */
-static main_thread(struct list_head *) delayed_nodes;
-
 static int get_zones_nr_from(const struct sd_node *nodes, int nr_nodes)
 {
 	int nr_zones = 0, i, j;
@@ -341,120 +329,6 @@ static inline int get_nodes_nr_from(struct list_head *l)
 	return nr;
 }
 
-static int get_nodes_nr_epoch(uint32_t epoch)
-{
-	struct sd_node nodes[SD_MAX_NODES];
-
-	return epoch_log_read(epoch, nodes, sizeof(nodes));
-}
-
-static const struct sd_node *find_entry_list(const struct sd_node *entry,
-					     struct list_head *head)
-{
-	struct node *n;
-	list_for_each_entry(n, head, list)
-		if (node_eq(&n->ent, entry))
-			return entry;
-
-	return NULL;
-
-}
-
-static const struct sd_node *find_entry_epoch(const struct sd_node *entry,
-					      uint32_t epoch)
-{
-	struct sd_node nodes[SD_MAX_NODES];
-	int nr;
-
-	if (!epoch)
-		return NULL;
-
-	nr = epoch_log_read(epoch, nodes, sizeof(nodes));
-
-	return xlfind(entry, nodes, nr, node_cmp);
-}
-
-/*
- * Add a node to the list of nodes that weren't part of the cluster before
- * it shut down, and thus do not count toward the nodes required to allow
- * an automated restart.  These nodes will become part of the cluster by
- * the time it does get restarted.
- */
-static bool add_delayed_node(uint32_t epoch, const struct sd_node *node)
-{
-	struct node *n;
-
-	if (find_entry_list(node, main_thread_get(delayed_nodes)))
-		return false;
-	assert(!find_entry_epoch(node, epoch));
-
-	n = xmalloc(sizeof(*n));
-	n->ent = *node;
-	list_add_tail(&n->list, main_thread_get(delayed_nodes));
-	return true;
-}
-
-/*
- * For a node that failed to join check if was part of the original
- * epoch, and if so add it to the list of node expected to be present
- * but failing to join.
- */
-static bool add_failed_node(uint32_t epoch, const struct sd_node *node)
-{
-	struct node *n;
-
-	if (find_entry_list(node, main_thread_get(failed_nodes)))
-		return false;
-	if (!find_entry_epoch(node, epoch))
-		return false;
-
-	n = xmalloc(sizeof(*n));
-	n->ent = *node;
-	list_add_tail(&n->list, main_thread_get(failed_nodes));
-	return true;
-}
-
-/*
- * Add the failed and delayed nodes in a join message to the local
- * lists of such nodes.
- */
-static void update_exceptional_node_list(uint32_t epoch,
-					 const struct join_message *jm)
-{
-	int i;
-
-	for (i = 0; i < jm->nr_failed_nodes; i++)
-		add_failed_node(epoch, &jm->cinfo.nodes[i]);
-	for ( ; i < jm->nr_failed_nodes + jm->nr_delayed_nodes; i++)
-		add_delayed_node(epoch, &jm->cinfo.nodes[i]);
-}
-
-/* Format the lists of failed or delayed nodes into the join message. */
-static void format_exceptional_node_list(struct join_message *jm)
-{
-	struct node *n;
-
-	list_for_each_entry(n, main_thread_get(failed_nodes), list)
-		jm->cinfo.nodes[jm->nr_failed_nodes++] = n->ent;
-	list_for_each_entry(n, main_thread_get(delayed_nodes), list)
-		jm->cinfo.nodes[jm->nr_failed_nodes +
-				jm->nr_delayed_nodes++] = n->ent;
-}
-
-static void clear_exceptional_node_lists(void)
-{
-	struct node *n, *t;
-
-	list_for_each_entry_safe(n, t, main_thread_get(failed_nodes), list) {
-		list_del(&n->list);
-		free(n);
-	}
-	list_for_each_entry_safe(n, t, main_thread_get(delayed_nodes), list) {
-		list_del(&n->list);
-		free(n);
-	}
-}
-
 int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
 			  time_t *timestamp, struct vnode_info *vinfo)
 {
@@ -496,8 +370,6 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
 
 static int cluster_sanity_check(struct join_message *jm)
 {
-	uint32_t local_epoch = get_latest_epoch();
-
 	if (jm->cinfo.ctime != sys->cinfo.ctime) {
 		sd_eprintf("joining node ctime doesn't match: %"
 			   PRIu64 " vs %" PRIu64, jm->cinfo.ctime,
@@ -505,12 +377,6 @@ static int cluster_sanity_check(struct join_message *jm)
 		return CJ_RES_FAIL;
 	}
 
-	if (jm->cinfo.epoch > local_epoch) {
-		sd_eprintf("joining node epoch too large: %"
-			   PRIu32 " vs %" PRIu32, jm->cinfo.epoch, local_epoch);
-		return CJ_RES_FAIL;
-	}
-
 	if (jm->cinfo.nr_copies != sys->cinfo.nr_copies) {
 		sd_eprintf("joining node nr_copies doesn't match: %u vs %u",
 			   jm->cinfo.nr_copies, sys->cinfo.nr_copies);
@@ -526,88 +392,71 @@ static int cluster_sanity_check(struct join_message *jm)
 	return CJ_RES_SUCCESS;
 }
 
-static int cluster_wait_for_join_check(const struct sd_node *joined,
-				       struct join_message *jm)
+/*
+ * Check whether enough node members are gathered.
+ *
+ * Sheepdog can start automatically if and only if all the members in the latest
+ * epoch are gathered.
+ */
+static bool enough_nodes_gathered(struct join_message *jm,
+				  const struct sd_node *joining,
+				  const struct sd_node *members,
+				  size_t nr_members)
 {
-	struct sd_node local_entries[SD_MAX_NODES];
-	int nr, nr_local_entries, nr_failed_entries, nr_delayed_nodes;
-	uint32_t local_epoch = get_latest_epoch();
-	int ret;
-	struct vnode_info *cur_vinfo;
+	for (int i = 0; i < jm->cinfo.nr_nodes; i++) {
+		const struct sd_node *key = jm->cinfo.nodes + i, *n;
 
-	if (jm->cinfo.nr_nodes == 0)
-		return CJ_RES_JOIN_LATER;
-
-	ret = cluster_sanity_check(jm);
-	if (ret != CJ_RES_SUCCESS)  {
-		if (jm->cinfo.epoch > sys->cinfo.epoch) {
-			sd_eprintf("transfer mastership (%d, %d)", jm->cinfo.epoch,
-				   sys->cinfo.epoch);
-			return CJ_RES_MASTER_TRANSFER;
+		n = xlfind(key, members, nr_members, node_cmp);
+		if (n == NULL && !node_eq(key, joining)) {
+			sd_dprintf("%s doesn't join yet", node_to_str(key));
+			return false;
 		}
-		return ret;
 	}
 
-	nr_local_entries = epoch_log_read(jm->cinfo.epoch, local_entries,
-					  sizeof(local_entries));
-	if (nr_local_entries == -1)
-		return CJ_RES_FAIL;
-
-	if (jm->cinfo.epoch < local_epoch) {
-		sd_eprintf("joining node epoch too small: %"
-			   PRIu32 " vs %" PRIu32, jm->cinfo.epoch, local_epoch);
+	sd_dprintf("all the nodes are gathered, %d, %zd", jm->cinfo.nr_nodes,
+		   nr_members);
+	return true;
+}
 
-		if (xbsearch(joined, local_entries, nr_local_entries, node_cmp))
-			return CJ_RES_FAIL;
-		return CJ_RES_JOIN_LATER;
-	}
+static int cluster_wait_for_join_check(const struct sd_node *joined,
+				       const struct sd_node *members,
+				       size_t nr_members,
+				       struct join_message *jm)
+{
+	int ret;
 
-	if (jm->cinfo.nr_nodes != nr_local_entries) {
-		sd_eprintf("epoch log entries do not match: %d vs %d",
-			   jm->cinfo.nr_nodes, nr_local_entries);
-		return CJ_RES_FAIL;
+	if (jm->cinfo.epoch != 0 && sys->cinfo.epoch != 0) {
+		/* check whether joining node is valid or not */
+		ret = cluster_sanity_check(jm);
+		if (ret != CJ_RES_SUCCESS)
+			return ret;
 	}
 
-
-	if (memcmp(jm->cinfo.nodes, local_entries,
-		   sizeof(jm->cinfo.nodes[0]) * jm->cinfo.nr_nodes) != 0) {
+	if (jm->cinfo.epoch > sys->cinfo.epoch)
+		sys->cinfo = jm->cinfo;
+	else if (jm->cinfo.epoch < sys->cinfo.epoch) {
+		sd_dprintf("joining node has a smaller epoch, %" PRIu32 ", %"
+			   PRIu32, jm->cinfo.epoch, sys->cinfo.epoch);
+		jm->cinfo = sys->cinfo;
+	} else if (memcmp(jm->cinfo.nodes, sys->cinfo.nodes,
+			  sizeof(jm->cinfo.nodes[0]) * jm->cinfo.nr_nodes) != 0) {
 		sd_eprintf("epoch log entries does not match");
 		return CJ_RES_FAIL;
 	}
 
-	cur_vinfo = main_thread_get(current_vnode_info);
-	if (!cur_vinfo)
-		nr = 1;
-	else
-		nr = cur_vinfo->nr_nodes + 1;
-
-	nr_delayed_nodes = get_nodes_nr_from(main_thread_get(delayed_nodes));
-
 	/*
 	 * If we have all members from the last epoch log in the in-memory
-	 * node list, and no new nodes joining we can set the cluster live
-	 * now without incrementing the epoch.
-	 */
-	if (nr == nr_local_entries && !nr_delayed_nodes) {
-		jm->cluster_status = SD_STATUS_OK;
-		return CJ_RES_SUCCESS;
-	}
-
-	/*
-	 * If we reach the old node count, but some node failed we have to
-	 * update the epoch before setting the cluster live.
+	 * node list, we can set the cluster live now.
 	 */
-	nr_failed_entries = get_nodes_nr_from(main_thread_get(failed_nodes));
-	if (nr_local_entries == nr + nr_failed_entries - nr_delayed_nodes) {
-		jm->inc_epoch = 1;
+	if (sys->cinfo.epoch > 0 &&
+	    enough_nodes_gathered(jm, joined,  members, nr_members)) {
+		if (jm->cinfo.nr_nodes < nr_members + 1)
+			/* There are nodes which didn't exist in the previous
+			 * epoch, so we have to increment epoch. */
+			jm->inc_epoch = 1;
 		jm->cluster_status = SD_STATUS_OK;
-		return CJ_RES_SUCCESS;
 	}
 
-	/*
-	 * The join was successful, but we don't have enough nodes yet to set
-	 * the cluster live.
-	 */
 	return CJ_RES_SUCCESS;
 }
 
@@ -767,9 +616,6 @@ static void finish_join(const struct join_message *msg,
 	sys->join_finished = true;
 	sys->cinfo.epoch = msg->cinfo.epoch;
 
-	if (msg->cluster_status != SD_STATUS_OK)
-		update_exceptional_node_list(get_latest_epoch(), msg);
-
 	if (msg->cinfo.store[0]) {
 		if (!sys->gateway_only)
 			setup_backend_store((char *)msg->cinfo.store,
@@ -878,7 +724,6 @@ static void update_cluster_info(const struct join_message *msg,
 		if (msg->inc_epoch) {
 			uatomic_inc(&sys->cinfo.epoch);
 			log_current_epoch();
-			clear_exceptional_node_lists();
 
 			if (!old_vnode_info) {
 				old_vnode_info = alloc_old_vnode_info(joined,
@@ -949,6 +794,8 @@ void sd_notify_handler(const struct sd_node *sender, void *data,
 }
 
 enum cluster_join_result sd_check_join_cb(const struct sd_node *joining,
+					  const struct sd_node *members,
+					  size_t nr_members,
 					  void *opaque)
 {
 	struct join_message *jm = opaque;
@@ -1014,7 +861,8 @@ enum cluster_join_result sd_check_join_cb(const struct sd_node *joining,
 		ret = CJ_RES_SUCCESS;
 		break;
 	case SD_STATUS_WAIT_FOR_JOIN:
-		ret = cluster_wait_for_join_check(joining, jm);
+		ret = cluster_wait_for_join_check(joining, members, nr_members,
+						  jm);
 		break;
 	case SD_STATUS_OK:
 	case SD_STATUS_HALT:
@@ -1029,13 +877,8 @@ enum cluster_join_result sd_check_join_cb(const struct sd_node *joining,
 			       joining->nid.port),
 		   ret, jm->cluster_status);
 
-	jm->nr_failed_nodes = 0;
-
 	jm->cinfo = sys->cinfo;
 
-	if (jm->cluster_status != SD_STATUS_OK &&
-	    (ret == CJ_RES_SUCCESS || ret == CJ_RES_JOIN_LATER))
-		format_exceptional_node_list(jm);
 	return ret;
 }
 
@@ -1075,9 +918,7 @@ void sd_join_handler(const struct sd_node *joined,
 		     const void *opaque)
 {
 	int i;
-	int nr, nr_local, nr_failed, nr_delayed;
 	const struct join_message *jm = opaque;
-	uint32_t le = get_latest_epoch();
 
 	sys->cinfo = jm->cinfo;
 
@@ -1090,9 +931,6 @@ void sd_join_handler(const struct sd_node *joined,
 	}
 
 	switch (result) {
-	case CJ_RES_JOIN_LATER:
-		add_delayed_node(le, joined);
-		/*FALLTHRU*/
 	case CJ_RES_SUCCESS:
 		sd_dprintf("join %s", node_to_str(joined));
 		for (i = 0; i < nr_members; i++)
@@ -1108,53 +946,6 @@ void sd_join_handler(const struct sd_node *joined,
 			sd_printf(SDOG_DEBUG, "join Sheepdog cluster");
 		break;
 	case CJ_RES_FAIL:
-		if (sys->status != SD_STATUS_WAIT_FOR_JOIN)
-			break;
-
-		if (!add_failed_node(le, joined))
-			break;
-
-		nr_local = get_nodes_nr_epoch(sys->cinfo.epoch);
-		nr = nr_members;
-		nr_failed = get_nodes_nr_from(main_thread_get(failed_nodes));
-		nr_delayed = get_nodes_nr_from(main_thread_get(delayed_nodes));
-
-		sd_dprintf("%d == %d + %d", nr_local, nr, nr_failed);
-		if (nr_local == nr + nr_failed - nr_delayed) {
-			sys->status = SD_STATUS_OK;
-			log_current_epoch();
-		}
-		break;
-	case CJ_RES_MASTER_TRANSFER:
-		update_exceptional_node_list(le, jm);
-
-		/*
-		 * Sheep needs this to identify itself as master.
-		 * Now mastership transfer is done.
-		 */
-		if (!sys->join_finished) {
-			sys->join_finished = true;
-			sys->cinfo.epoch = get_latest_epoch();
-
-			put_vnode_info(main_thread_get(current_vnode_info));
-			main_thread_set(current_vnode_info,
-					  alloc_vnode_info(&sys->this_node, 1));
-		}
-
-		nr_local = get_nodes_nr_epoch(sys->cinfo.epoch);
-		nr = nr_members;
-		nr_failed = get_nodes_nr_from(main_thread_get(failed_nodes));
-		nr_delayed = get_nodes_nr_from(main_thread_get(delayed_nodes));
-
-		sd_dprintf("%d == %d + %d", nr_local, nr, nr_failed);
-		if (nr_local == nr + nr_failed - nr_delayed) {
-			sys->status = SD_STATUS_OK;
-			log_current_epoch();
-		}
-
-		if (node_is_local(joined))
-			/* this output is used for testing */
-			sd_printf(SDOG_DEBUG, "join Sheepdog cluster");
 		break;
 	default:
 		/* this means sd_check_join_cb() is buggy */
@@ -1281,10 +1072,6 @@ int create_cluster(int port, int64_t zone, int nr_vnodes,
 	main_thread_set(pending_notify_list,
 			  xzalloc(sizeof(struct list_head)));
 	INIT_LIST_HEAD(main_thread_get(pending_notify_list));
-	main_thread_set(failed_nodes, xzalloc(sizeof(struct list_head)));
-	INIT_LIST_HEAD(main_thread_get(failed_nodes));
-	main_thread_set(delayed_nodes, xzalloc(sizeof(struct list_head)));
-	INIT_LIST_HEAD(main_thread_get(delayed_nodes));
 
 	INIT_LIST_HEAD(&sys->local_req_queue);
 	INIT_LIST_HEAD(&sys->req_wait_queue);
diff --git a/shepherd/shepherd.c b/shepherd/shepherd.c
index ffe48f3..6e102b3 100644
--- a/shepherd/shepherd.c
+++ b/shepherd/shepherd.c
@@ -344,7 +344,7 @@ static void sph_handle_join(struct sph_msg *msg, struct sheep *sheep)
 		goto purge_current_sheep;
 	}
 
-	sheep->node = join->node;
+	sheep->node = join->new_node;
 
 	snd.type = SPH_SRV_MSG_NEW_NODE;
 	snd.body_len = msg->body_len;
@@ -412,13 +412,13 @@ static void sph_handle_new_node_reply(struct sph_msg *msg, struct sheep *sheep)
 
 	join_result = join->res;
 
-	sd_dprintf("joining node is %s", node_to_str(&join->node));
+	sd_dprintf("joining node is %s", node_to_str(&join->new_node));
 
-	joining_sheep = find_sheep_by_nid(&join->node.nid);
+	joining_sheep = find_sheep_by_nid(&join->new_node.nid);
 	if (!joining_sheep) {
 		/* master is broken */
 		sd_eprintf("invalid nid is required, %s",
-			node_to_str(&join->node));
+			node_to_str(&join->new_node));
 		sd_eprintf("purging master sheep: %s and joining one",
 			node_to_str(&master_sheep->node));
 
diff --git a/tests/functional/001 b/tests/functional/001
index 2c3fbdb..b06c13a 100755
--- a/tests/functional/001
+++ b/tests/functional/001
@@ -44,19 +44,6 @@ for i in 0 1 2; do
     sleep 1
 done
 
-_wait_for_sheep_stop 0
-_wait_for_sheep_stop 1
-
-_wait_for_sheep 1 2
-echo check whether Sheepdog is running with only one node
-$COLLIE cluster info -p 7002 | _filter_cluster_info
-
-# add the other nodes
-for i in 0 1; do
-    _start_sheep $i
-    sleep 1
-done
-
 _wait_for_sheep 3
 echo check whether all nodes have the same cluster info
 for i in 0 1 2; do
diff --git a/tests/functional/001.out b/tests/functional/001.out
index 60b5c88..82d27da 100644
--- a/tests/functional/001.out
+++ b/tests/functional/001.out
@@ -1,23 +1,12 @@
 QA output created by 001
 using backend plain store
-check whether Sheepdog is running with only one node
-Cluster status: running, auto-recovery enabled
-
-Cluster created at DATE
-
-Epoch Time           Version
-DATE      4 [127.0.0.1:7002]
-DATE      3 [127.0.0.1:7001, 127.0.0.1:7002]
-DATE      2 []
-DATE      1 []
 check whether all nodes have the same cluster info
 Cluster status: running, auto-recovery enabled
 
 Cluster created at DATE
 
 Epoch Time           Version
-DATE      6 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
-DATE      5 [127.0.0.1:7000, 127.0.0.1:7002]
+DATE      5 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
 DATE      4 [127.0.0.1:7002]
 DATE      3 [127.0.0.1:7001, 127.0.0.1:7002]
 DATE      2 [127.0.0.1:7001]
@@ -27,8 +16,7 @@ Cluster status: running, auto-recovery enabled
 Cluster created at DATE
 
 Epoch Time           Version
-DATE      6 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
-DATE      5 [127.0.0.1:7000, 127.0.0.1:7002]
+DATE      5 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
 DATE      4 [127.0.0.1:7002]
 DATE      3 [127.0.0.1:7001, 127.0.0.1:7002]
 DATE      2 [127.0.0.1:7001]
@@ -38,8 +26,7 @@ Cluster status: running, auto-recovery enabled
 Cluster created at DATE
 
 Epoch Time           Version
-DATE      6 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
-DATE      5 [127.0.0.1:7000, 127.0.0.1:7002]
+DATE      5 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
 DATE      4 [127.0.0.1:7002]
 DATE      3 [127.0.0.1:7001, 127.0.0.1:7002]
 DATE      2 [127.0.0.1:7001]
diff --git a/tests/functional/002 b/tests/functional/002
index 67227c9..a649b27 100755
--- a/tests/functional/002
+++ b/tests/functional/002
@@ -38,20 +38,6 @@ for i in 1 0 2; do
     sleep 1
 done
 
-_wait_for_sheep_stop 0
-_wait_for_sheep_stop 1
-
-_wait_for_sheep 1 2
-
-echo check whether Sheepdog is running with only one node
-$COLLIE cluster info -p 7002 | _filter_cluster_info
-
-# add the other nodes
-for i in 0 1; do
-    _start_sheep $i
-    sleep 1
-done
-
 _wait_for_sheep 3
 
 echo check whether all nodes have the same cluster info
diff --git a/tests/functional/002.out b/tests/functional/002.out
index 07e4cb7..ce99957 100644
--- a/tests/functional/002.out
+++ b/tests/functional/002.out
@@ -1,22 +1,12 @@
 QA output created by 002
 using backend plain store
-check whether Sheepdog is running with only one node
-Cluster status: running, auto-recovery enabled
-
-Cluster created at DATE
-
-Epoch Time           Version
-DATE      3 [127.0.0.1:7002]
-DATE      2 [127.0.0.1:7001, 127.0.0.1:7002]
-DATE      1 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
 check whether all nodes have the same cluster info
 Cluster status: running, auto-recovery enabled
 
 Cluster created at DATE
 
 Epoch Time           Version
-DATE      5 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
-DATE      4 [127.0.0.1:7000, 127.0.0.1:7002]
+DATE      4 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
 DATE      3 [127.0.0.1:7002]
 DATE      2 [127.0.0.1:7001, 127.0.0.1:7002]
 DATE      1 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
@@ -25,8 +15,7 @@ Cluster status: running, auto-recovery enabled
 Cluster created at DATE
 
 Epoch Time           Version
-DATE      5 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
-DATE      4 [127.0.0.1:7000, 127.0.0.1:7002]
+DATE      4 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
 DATE      3 [127.0.0.1:7002]
 DATE      2 [127.0.0.1:7001, 127.0.0.1:7002]
 DATE      1 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
@@ -35,8 +24,7 @@ Cluster status: running, auto-recovery enabled
 Cluster created at DATE
 
 Epoch Time           Version
-DATE      5 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
-DATE      4 [127.0.0.1:7000, 127.0.0.1:7002]
+DATE      4 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
 DATE      3 [127.0.0.1:7002]
 DATE      2 [127.0.0.1:7001, 127.0.0.1:7002]
 DATE      1 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
diff --git a/tests/functional/003 b/tests/functional/003
index 05522fe..31c8abf 100755
--- a/tests/functional/003
+++ b/tests/functional/003
@@ -37,20 +37,6 @@ for i in 0 1 2; do
     sleep 1
 done
 
-_wait_for_sheep_stop 0
-_wait_for_sheep_stop 1
-
-_wait_for_sheep 1 2
-
-echo check whether Sheepdog is running with only one node
-$COLLIE cluster info -p 7002 | _filter_cluster_info
-
-# add the other nodes
-for i in 0 1; do
-    _start_sheep $i
-    sleep 1
-done
-
 _wait_for_sheep 3
 echo check whether all nodes have the same cluster info
 for i in 0 1 2; do
diff --git a/tests/functional/003.out b/tests/functional/003.out
index f093582..e6d0a23 100644
--- a/tests/functional/003.out
+++ b/tests/functional/003.out
@@ -1,22 +1,12 @@
 QA output created by 003
 using backend plain store
-check whether Sheepdog is running with only one node
-Cluster status: running, auto-recovery enabled
-
-Cluster created at DATE
-
-Epoch Time           Version
-DATE      3 [127.0.0.1:7002]
-DATE      2 [127.0.0.1:7001, 127.0.0.1:7002]
-DATE      1 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
 check whether all nodes have the same cluster info
 Cluster status: running, auto-recovery enabled
 
 Cluster created at DATE
 
 Epoch Time           Version
-DATE      5 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
-DATE      4 [127.0.0.1:7000, 127.0.0.1:7002]
+DATE      4 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
 DATE      3 [127.0.0.1:7002]
 DATE      2 [127.0.0.1:7001, 127.0.0.1:7002]
 DATE      1 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
@@ -25,8 +15,7 @@ Cluster status: running, auto-recovery enabled
 Cluster created at DATE
 
 Epoch Time           Version
-DATE      5 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
-DATE      4 [127.0.0.1:7000, 127.0.0.1:7002]
+DATE      4 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
 DATE      3 [127.0.0.1:7002]
 DATE      2 [127.0.0.1:7001, 127.0.0.1:7002]
 DATE      1 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
@@ -35,8 +24,7 @@ Cluster status: running, auto-recovery enabled
 Cluster created at DATE
 
 Epoch Time           Version
-DATE      5 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
-DATE      4 [127.0.0.1:7000, 127.0.0.1:7002]
+DATE      4 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
 DATE      3 [127.0.0.1:7002]
 DATE      2 [127.0.0.1:7001, 127.0.0.1:7002]
 DATE      1 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
diff --git a/tests/functional/004 b/tests/functional/004
index 240884d..ece9aa3 100755
--- a/tests/functional/004
+++ b/tests/functional/004
@@ -45,22 +45,6 @@ for i in 0 1 2 3 4; do
     sleep 1
 done
 
-_wait_for_sheep_stop 0
-_wait_for_sheep_stop 1
-_wait_for_sheep_stop 2
-_wait_for_sheep_stop 3
-
-_wait_for_sheep 1 4
-
-echo check whether Sheepdog is running with only one node
-$COLLIE cluster info -p 7004 | _filter_cluster_info
-
-# add the other nodes
-for i in 0 1 2 3; do
-    _start_sheep $i
-    sleep 1
-done
-
 _wait_for_sheep 5
 echo check whether all nodes have the same cluster info
 for i in 0 1 2 3 4; do
diff --git a/tests/functional/004.out b/tests/functional/004.out
index 3aa7c2b..e249487 100644
--- a/tests/functional/004.out
+++ b/tests/functional/004.out
@@ -1,29 +1,12 @@
 QA output created by 004
 using backend plain store
-check whether Sheepdog is running with only one node
-Cluster status: running, auto-recovery enabled
-
-Cluster created at DATE
-
-Epoch Time           Version
-DATE      8 [127.0.0.1:7004]
-DATE      7 [127.0.0.1:7003, 127.0.0.1:7004]
-DATE      6 []
-DATE      5 []
-DATE      4 []
-DATE      3 []
-DATE      2 []
-DATE      1 []
 check whether all nodes have the same cluster info
 Cluster status: running, auto-recovery enabled
 
 Cluster created at DATE
 
 Epoch Time           Version
-DATE     12 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
-DATE     11 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7004]
-DATE     10 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7004]
-DATE      9 [127.0.0.1:7000, 127.0.0.1:7004]
+DATE      9 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
 DATE      8 [127.0.0.1:7004]
 DATE      7 [127.0.0.1:7003, 127.0.0.1:7004]
 DATE      6 [127.0.0.1:7003]
@@ -37,10 +20,7 @@ Cluster status: running, auto-recovery enabled
 Cluster created at DATE
 
 Epoch Time           Version
-DATE     12 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
-DATE     11 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7004]
-DATE     10 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7004]
-DATE      9 [127.0.0.1:7000, 127.0.0.1:7004]
+DATE      9 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
 DATE      8 [127.0.0.1:7004]
 DATE      7 [127.0.0.1:7003, 127.0.0.1:7004]
 DATE      6 [127.0.0.1:7003]
@@ -54,10 +34,7 @@ Cluster status: running, auto-recovery enabled
 Cluster created at DATE
 
 Epoch Time           Version
-DATE     12 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
-DATE     11 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7004]
-DATE     10 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7004]
-DATE      9 [127.0.0.1:7000, 127.0.0.1:7004]
+DATE      9 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
 DATE      8 [127.0.0.1:7004]
 DATE      7 [127.0.0.1:7003, 127.0.0.1:7004]
 DATE      6 [127.0.0.1:7003]
@@ -71,10 +48,7 @@ Cluster status: running, auto-recovery enabled
 Cluster created at DATE
 
 Epoch Time           Version
-DATE     12 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
-DATE     11 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7004]
-DATE     10 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7004]
-DATE      9 [127.0.0.1:7000, 127.0.0.1:7004]
+DATE      9 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
 DATE      8 [127.0.0.1:7004]
 DATE      7 [127.0.0.1:7003, 127.0.0.1:7004]
 DATE      6 [127.0.0.1:7003]
@@ -88,10 +62,7 @@ Cluster status: running, auto-recovery enabled
 Cluster created at DATE
 
 Epoch Time           Version
-DATE     12 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
-DATE     11 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7004]
-DATE     10 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7004]
-DATE      9 [127.0.0.1:7000, 127.0.0.1:7004]
+DATE      9 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
 DATE      8 [127.0.0.1:7004]
 DATE      7 [127.0.0.1:7003, 127.0.0.1:7004]
 DATE      6 [127.0.0.1:7003]
diff --git a/tests/functional/005 b/tests/functional/005
index 1e70669..fc055f9 100755
--- a/tests/functional/005
+++ b/tests/functional/005
@@ -41,23 +41,6 @@ for i in 0 1 2 3 4; do
     sleep 1
 done
 
-_wait_for_sheep_stop 0
-_wait_for_sheep_stop 1
-_wait_for_sheep_stop 2
-
-_wait_for_sheep 2 4
-
-echo check whether Sheepdog is working with two nodes
-for i in 3 4; do
-    $COLLIE cluster info -p 700$i | _filter_cluster_info
-done
-
-# add the other nodes
-for i in 0 1 2; do
-    _start_sheep $i
-    sleep 1
-done
-
 _wait_for_sheep 5
 echo check whether all nodes have the same cluster info
 for i in 0 1 2 3 4; do
diff --git a/tests/functional/005.out b/tests/functional/005.out
index f97f048..9a5cbe9 100644
--- a/tests/functional/005.out
+++ b/tests/functional/005.out
@@ -1,39 +1,12 @@
 QA output created by 005
 using backend plain store
-check whether Sheepdog is working with two nodes
-Cluster status: running, auto-recovery enabled
-
-Cluster created at DATE
-
-Epoch Time           Version
-DATE      7 [127.0.0.1:7003, 127.0.0.1:7004]
-DATE      6 [127.0.0.1:7003]
-DATE      5 [127.0.0.1:7002, 127.0.0.1:7003]
-DATE      4 []
-DATE      3 []
-DATE      2 []
-DATE      1 []
-Cluster status: running, auto-recovery enabled
-
-Cluster created at DATE
-
-Epoch Time           Version
-DATE      7 [127.0.0.1:7003, 127.0.0.1:7004]
-DATE      6 [127.0.0.1:7003]
-DATE      5 [127.0.0.1:7002, 127.0.0.1:7003]
-DATE      4 []
-DATE      3 []
-DATE      2 []
-DATE      1 []
 check whether all nodes have the same cluster info
 Cluster status: running, auto-recovery enabled
 
 Cluster created at DATE
 
 Epoch Time           Version
-DATE     10 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
-DATE      9 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7003, 127.0.0.1:7004]
-DATE      8 [127.0.0.1:7000, 127.0.0.1:7003, 127.0.0.1:7004]
+DATE      8 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
 DATE      7 [127.0.0.1:7003, 127.0.0.1:7004]
 DATE      6 [127.0.0.1:7003]
 DATE      5 [127.0.0.1:7002, 127.0.0.1:7003]
@@ -46,9 +19,7 @@ Cluster status: running, auto-recovery enabled
 Cluster created at DATE
 
 Epoch Time           Version
-DATE     10 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
-DATE      9 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7003, 127.0.0.1:7004]
-DATE      8 [127.0.0.1:7000, 127.0.0.1:7003, 127.0.0.1:7004]
+DATE      8 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
 DATE      7 [127.0.0.1:7003, 127.0.0.1:7004]
 DATE      6 [127.0.0.1:7003]
 DATE      5 [127.0.0.1:7002, 127.0.0.1:7003]
@@ -61,9 +32,7 @@ Cluster status: running, auto-recovery enabled
 Cluster created at DATE
 
 Epoch Time           Version
-DATE     10 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
-DATE      9 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7003, 127.0.0.1:7004]
-DATE      8 [127.0.0.1:7000, 127.0.0.1:7003, 127.0.0.1:7004]
+DATE      8 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
 DATE      7 [127.0.0.1:7003, 127.0.0.1:7004]
 DATE      6 [127.0.0.1:7003]
 DATE      5 [127.0.0.1:7002, 127.0.0.1:7003]
@@ -76,9 +45,7 @@ Cluster status: running, auto-recovery enabled
 Cluster created at DATE
 
 Epoch Time           Version
-DATE     10 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
-DATE      9 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7003, 127.0.0.1:7004]
-DATE      8 [127.0.0.1:7000, 127.0.0.1:7003, 127.0.0.1:7004]
+DATE      8 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
 DATE      7 [127.0.0.1:7003, 127.0.0.1:7004]
 DATE      6 [127.0.0.1:7003]
 DATE      5 [127.0.0.1:7002, 127.0.0.1:7003]
@@ -91,9 +58,7 @@ Cluster status: running, auto-recovery enabled
 Cluster created at DATE
 
 Epoch Time           Version
-DATE     10 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
-DATE      9 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7003, 127.0.0.1:7004]
-DATE      8 [127.0.0.1:7000, 127.0.0.1:7003, 127.0.0.1:7004]
+DATE      8 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002, 127.0.0.1:7003, 127.0.0.1:7004]
 DATE      7 [127.0.0.1:7003, 127.0.0.1:7004]
 DATE      6 [127.0.0.1:7003]
 DATE      5 [127.0.0.1:7002, 127.0.0.1:7003]
diff --git a/tests/functional/060 b/tests/functional/060
index 7683285..21257c4 100755
--- a/tests/functional/060
+++ b/tests/functional/060
@@ -38,19 +38,16 @@ done
 _reboot_without_collie_shutdown()
 {
     local i
-    for i in `seq 1 7`; do
-	_kill_sheep $i
+    for i in `seq 0 7`; do
+	_kill_sheep $i &
     done
+    wait
 
-    _wait_for_sheep 1
-    _kill_sheep 0
-    _wait_for_sheep_stop
-    _start_sheep 0
-    _wait_for_sheep 1
-
-    for i in `seq 1 7`; do
-	_start_sheep $i
+    for i in `seq 0 7`; do
+	_start_sheep $i &
     done
+    wait
+
     _wait_for_sheep 8
 }
 
-- 
1.7.9.5



