[sheepdog] [PATCH v2] sheep: fix incorrect status transition during joining cluster

Wed Jun 12 18:02:29 CEST 2013

Current sheep set jm->cluster_status SD_STATUS_OK if
1. itself is a first node to join the cluster, and
2. a number of nodes of latest epoch is 1
in sd_check_join_cb().

This is an invalid behavior. For example, the behavior allows such a
situation:
1. create a cluster with 2 nodes
2. format the cluster with --copies 2
3. kill single sheep
4. shutdown the cluster
5. launch single sheep
6. the status of the cluster is SD_STATUS_OK

This patch solves the problem. The second condition is now checked
with have_enough_zones() and the check is done in sd_join_handler().

Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---

v2:
 - call have_enough_zones() in sd_join_handler(), not in
   sd_check_join_cb(). The current_vnode_info isn't ready when
   sd_check_join_cb() called first.
 - do master transfer in cluster_running_check().

 sheep/group.c |   47 +++++++++++++++++++++++++++++------------------
 1 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/sheep/group.c b/sheep/group.c
index f74ef10..f0be421 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -633,6 +633,11 @@ static int cluster_running_check(struct join_message *jm)
 {
 	int ret;
 
+	if (sys->epoch < jm->epoch) {
+		sd_iprintf("master transfer, %d vs %d", sys->epoch, jm->epoch);
+		return CJ_RES_MASTER_TRANSFER;
+	}
+
 	/*
 	 * When the joining node is newly created and we are not waiting for
 	 * join we do not need to check anything.
@@ -994,38 +999,40 @@ enum cluster_join_result sd_check_join_cb(const struct sd_node *joining,
 	}
 
 	if (node_is_local(joining)) {
-		struct sd_node entries[SD_MAX_NODES];
-		int nr_entries;
-		uint32_t epoch;
-
 		/*
-		 * If I'm the first sheep joins in corosync, I
+		 * If I'm the first sheep joins in the cluster, I
 		 * becomes the master without sending JOIN.
 		 */
-
-		sd_printf(SDOG_DEBUG, "%s", node_to_str(&sys->this_node));
+		sd_dprintf("%s", node_to_str(&sys->this_node));
 
 		jm->cluster_status = sys->status;
 
-		epoch = get_latest_epoch();
+		uint32_t epoch = get_latest_epoch();
 		if (!epoch)
 			return CJ_RES_SUCCESS;
 
 		if (sys->status != SD_STATUS_WAIT_FOR_JOIN) {
-			sd_eprintf("unexpected cluster status 0x%x",
-				   sys->status);
-			return CJ_RES_FAIL;
+			if (sys->status == SD_STATUS_KILLED ||
+				sys->status == SD_STATUS_SHUTDOWN)
+				return CJ_RES_FAIL;
+
+			/*
+			 * This condition is not an error, a bug.
+			 *
+			 * sys->status is set as SD_STATUS_WAIT_FOR_JOIN in
+			 * create_cluster() if epoch is not zero. And the status
+			 * cannot be changed between create_cluster() and
+			 * sd_check_join_cb() (exceptions are KILLED and
+			 * SHUTDOWN).
+			 */
+			panic("unexpected cluster status 0x%x"
+				" (expected one is 0x%x, wait for join)",
+				sys->status, SD_STATUS_WAIT_FOR_JOIN);
 		}
 
-		nr_entries = epoch_log_read(epoch, entries, sizeof(entries));
-		if (nr_entries == -1)
-			return CJ_RES_FAIL;
-
 		sys->epoch = epoch;
 		jm->ctime = get_cluster_ctime();
 
-		if (nr_entries == 1)
-			jm->cluster_status = SD_STATUS_OK;
 		return CJ_RES_SUCCESS;
 	}
 
@@ -1139,7 +1146,11 @@ void sd_join_handler(const struct sd_node *joined,
 
 		if (node_is_local(joined))
 			/* this output is used for testing */
-			sd_printf(SDOG_DEBUG, "join Sheepdog cluster");
+			sd_dprintf("join Sheepdog cluster");
+
+		if (sys->epoch && have_enough_zones())
+			sys->status = SD_STATUS_OK;
+
 		break;
 	case CJ_RES_FAIL:
 		if (sys->status != SD_STATUS_WAIT_FOR_JOIN)
-- 
1.7.5.1