[Sheepdog] [PATCH v4 7/7] sheep: use SD_STATUS_HALT to stop serving IO

Tue Oct 18 10:58:58 CEST 2011

From: Liu Yuan <tailai.ly at taobao.com>

We use SD_STATUS_HALT to identify the cluster state when it should not serve
IO requests.

This is optional: users may turn the HALT behavior off at their own risk by
passing either of the following options when formatting the cluster:

$ collie cluster format -H
or
$ collie cluster format --nohalt

By default, the HALT behavior is enabled.

[Test Case]

[1]
steps:

for i in 0 1 2 3; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done
./collie/collie cluster format --copies=3;
for i in 0 1; do pkill -f "sheep -d /store/$i"; sleep 1; done
for i in 2 3; do ./collie/collie cluster info -p 700$i; done
for i in 0 1; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done
for i in 0 1 2 3; do ./collie/collie cluster info -p 700$i; done

output:

Cluster status: The sheepdog is stopped doing IO, short of living nodes

Creation time        Epoch Nodes
2011-10-11 16:26:02      3 [192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
Cluster status: The sheepdog is stopped doing IO, short of living nodes

Creation time        Epoch Nodes
2011-10-11 16:26:02      3 [192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
Cluster status: running

Creation time        Epoch Nodes
2011-10-11 16:26:02      5 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      4 [192.168.0.1:7000, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      3 [192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]

...

[2]
steps:
for i in 0 1; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done
collie/collie cluster format
for i in 0 1; do collie/collie cluster info -p 700$i;done
for i in 0; do pkill -f "sheep/sheep -d /store/$i"; sleep 1; done
for i in 2; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done
for i in 1 2; do pkill -f "sheep/sheep -d /store/$i"; sleep 1; done
for i in 0 1 2; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done
for i in 0 1 2; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done
for i in 0 1 2; do collie/collie cluster info -p 700$i;done

output:
Cluster status: The sheepdog is stopped doing IO, short of living nodes

Creation time        Epoch Nodes
2011-10-16 18:11:07      1 [192.168.0.1:7000, 192.168.0.1:7001]
Cluster status: The sheepdog is stopped doing IO, short of living nodes

Creation time        Epoch Nodes
2011-10-16 18:11:07      1 [192.168.0.1:7000, 192.168.0.1:7001]
Cluster status: running

Creation time        Epoch Nodes
2011-10-16 18:11:07      6 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002]
2011-10-16 18:11:07      5 [192.168.0.1:7000, 192.168.0.1:7002]
2011-10-16 18:11:07      4 [192.168.0.1:7002]
2011-10-16 18:11:07      3 [192.168.0.1:7001, 192.168.0.1:7002]
2011-10-16 18:11:07      2 [192.168.0.1:7001]
2011-10-16 18:11:07      1 [192.168.0.1:7000, 192.168.0.1:7001]

...

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 collie/cluster.c   |   14 +++++++++++++-
 collie/collie.c    |    1 +
 sheep/group.c      |   45 ++++++++++++++++++++++++++++++++++++++++-----
 sheep/sheep_priv.h |    5 +++++
 4 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/collie/cluster.c b/collie/cluster.c
index e0ab37f..fd40f6e 100644
--- a/collie/cluster.c
+++ b/collie/cluster.c
@@ -16,8 +16,15 @@
 
 struct cluster_cmd_data {
 	int copies;
+	int nohalt;
 } cluster_cmd_data;
 
+static void set_nohalt(uint32_t *p)	/* p may be NULL (no-op) */
+{
+	if (p)
+		*p = *p | 1;	/* bit 0, same value as SD_FLAG_NOHALT */
+}
+
 static int cluster_format(int argc, char **argv)
 {
 	int fd, ret;
@@ -36,6 +43,8 @@ static int cluster_format(int argc, char **argv)
 
 	hdr.opcode = SD_OP_MAKE_FS;
 	hdr.copies = cluster_cmd_data.copies;
+	if (cluster_cmd_data.nohalt)
+		set_nohalt(&hdr.flags);
 	hdr.epoch = node_list_version;
 	hdr.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
 
@@ -163,7 +172,7 @@ static int cluster_shutdown(int argc, char **argv)
 static struct subcommand cluster_cmd[] = {
 	{"info", NULL, "aprh", "show cluster information",
 	 0, cluster_info},
-	{"format", NULL, "caph", "create a Sheepdog storage",
+	{"format", NULL, "cHaph", "create a Sheepdog storage",
 	 0, cluster_format},
 	{"shutdown", NULL, "aph", "stop Sheepdog",
 	 SUBCMD_FLAG_NEED_NODELIST, cluster_shutdown},
@@ -185,6 +194,9 @@ static int cluster_parser(int ch, char *opt)
 		}
 		cluster_cmd_data.copies = copies;
 		break;
+	case 'H':
+		cluster_cmd_data.nohalt = 1;
+		break;
 	}
 
 	return 0;
diff --git a/collie/collie.c b/collie/collie.c
index ce51599..456f2cd 100644
--- a/collie/collie.c
+++ b/collie/collie.c
@@ -41,6 +41,7 @@ static const struct sd_option collie_options[] = {
 
 	/* cluster options */
 	{'c', "copies", 1, "set the number of data redundancy"},
+	{'H', "nohalt", 0, "serve IO requests even when there are not enough redundant nodes"},
 
 	{ 0, NULL, 0, NULL },
 };
diff --git a/sheep/group.c b/sheep/group.c
index 3406672..51c0bce 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -51,6 +51,7 @@ struct join_message {
 	uint32_t nr_nodes;
 	uint32_t nr_sobjs;
 	uint32_t cluster_status;
+	uint32_t cluster_flags;
 	uint32_t epoch;
 	uint64_t ctime;
 	uint32_t result;
@@ -678,6 +679,7 @@ static void join(struct join_message *msg)
 					 msg->epoch, &msg->cluster_status,
 					 &msg->inc_epoch);
 	msg->nr_sobjs = sys->nr_sobjs;
+	msg->cluster_flags = sys->flags;
 	msg->ctime = get_cluster_ctime();
 	msg->nr_nodes = 0;
 	list_for_each_entry(node, &sys->sd_node_list, list) {
@@ -792,7 +794,7 @@ static void update_cluster_info(struct join_message *msg)
 	eprintf("status = %d, epoch = %d, %x, %d\n", msg->cluster_status, msg->epoch, msg->result, sys->join_finished);
 	if (msg->result != SD_RES_SUCCESS) {
 		if (is_myself(msg->header.from.addr, msg->header.from.port)) {
-			eprintf("failed to join sheepdog, %d\n", msg->result);
+			eprintf("failed to join sheepdog, %x\n", msg->result);
 			leave_cluster();
 			eprintf("Restart me later when master is up, please.Bye.\n");
 			exit(1);
@@ -804,13 +806,13 @@ static void update_cluster_info(struct join_message *msg)
 	if (sys->status == SD_STATUS_JOIN_FAILED)
 		return;
 
-	if (!sys->nr_sobjs)
-		sys->nr_sobjs = msg->nr_sobjs;
-
 	if (sys->join_finished)
 		goto join_finished;
 
+	sys->nr_sobjs = msg->nr_sobjs;
 	sys->epoch = msg->epoch;
+	sys->flags = msg->cluster_flags;	/* flags travel in cluster_flags, not cluster_status */
+
 	for (i = 0; i < nr_nodes; i++) {
 		ret = move_node_to_sd_list(&msg->nodes[i].sheepid,
 					   msg->nodes[i].ent);
@@ -853,6 +855,7 @@ join_finished:
 		if (sys->status != SD_STATUS_OK ||
 		    sys->status != SD_STATUS_HALT) {
 			set_global_nr_copies(sys->nr_sobjs);
+			set_cluster_flags(sys->flags);
 			set_cluster_ctime(msg->ctime);
 		}
 	}
@@ -957,6 +960,7 @@ static void vdi_op_done(struct vdi_op_message *msg)
 		break;
 	case SD_OP_MAKE_FS:
 		sys->nr_sobjs = ((struct sd_so_req *)hdr)->copies;
+		sys->flags = ((struct sd_so_req *)hdr)->flags;
 		if (!sys->nr_sobjs)
 			sys->nr_sobjs = SD_DEFAULT_REDUNDANCY;
 
@@ -980,8 +984,18 @@ static void vdi_op_done(struct vdi_op_message *msg)
 		update_epoch_store(sys->epoch);
 
 		set_global_nr_copies(sys->nr_sobjs);
+		set_cluster_flags(sys->flags);
 
-		sys->status = SD_STATUS_OK;
+		if (sys_flag_nohalt())
+			sys->status = SD_STATUS_OK;
+		else {
+			int nr_zones = get_zones_nr_from(&sys->sd_node_list);
+
+			if (nr_zones >= sys->nr_sobjs)
+				sys->status = SD_STATUS_OK;
+			else
+				sys->status = SD_STATUS_HALT;
+		}
 		break;
 	case SD_OP_SHUTDOWN:
 		sys->status = SD_STATUS_SHUTDOWN;
@@ -1208,6 +1222,13 @@ static void __sd_notify_done(struct cpg_event *cevent)
 		}
 		start_recovery(sys->epoch);
 	}
+
+	if (sys->status == SD_STATUS_HALT) {
+		int nr_zones = get_zones_nr_from(&sys->sd_node_list);
+
+		if (nr_zones >= sys->nr_sobjs)
+			sys->status = SD_STATUS_OK;
+	}
 }
 
 static void sd_notify_handler(struct sheepid *sender, void *msg, size_t msg_len)
@@ -1360,6 +1381,7 @@ static void send_join_request(struct sheepid *id)
 	msg.header.sheepid = sys->this_sheepid;
 
 	get_global_nr_copies(&msg.nr_sobjs);
+	get_cluster_flags(&msg.cluster_flags);
 
 	nr_entries = ARRAY_SIZE(entries);
 	ret = read_epoch(&msg.epoch, &msg.ctime, entries, &nr_entries);
@@ -1384,6 +1406,7 @@ static void __sd_join_done(struct cpg_event *cevent)
 	    sheepid_cmp(&w->joined, &sys->this_sheepid) == 0) {
 		sys->join_finished = 1;
 		get_global_nr_copies(&sys->nr_sobjs);
+		get_cluster_flags(&sys->flags);
 		first_cpg_node = 1;
 	}
 
@@ -1436,6 +1459,11 @@ static void __sd_join_done(struct cpg_event *cevent)
 		send_join_request(&w->joined);
 }
 
+int sys_flag_nohalt(void)	/* nonzero iff SD_FLAG_NOHALT is set in sys->flags */
+{
+	return sys->flags & SD_FLAG_NOHALT;
+}
+
 static void __sd_leave_done(struct cpg_event *cevent)
 {
 	struct work_leave *w = container_of(cevent, struct work_leave, cev);
@@ -1448,6 +1476,13 @@ static void __sd_leave_done(struct cpg_event *cevent)
 	if (node_left &&
 	    (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT))
 		start_recovery(sys->epoch);
+
+	if (sys->status == SD_STATUS_OK && !sys_flag_nohalt()) {
+		int nr_zones = get_zones_nr_from(&sys->sd_node_list);
+
+		if (nr_zones < sys->nr_sobjs)
+			sys->status = SD_STATUS_HALT;
+	}
 }
 
 static void cpg_event_free(struct cpg_event *cevent)
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 62ec9d2..6b46f0f 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -41,6 +41,8 @@
 
 #define SD_RES_NETWORK_ERROR    0x81 /* Network error between sheeps */
 
+#define SD_FLAG_NOHALT          0x00000001
+
 enum cpg_event_type {
 	CPG_EVENT_JOIN,
 	CPG_EVENT_LEAVE,
@@ -116,6 +118,7 @@ struct cluster_info {
 
 	uint32_t epoch;
 	uint32_t status;
+	uint32_t flags;
 
 	/*
 	 * we add a node to cpg_node_list in confchg then move it to
@@ -214,6 +217,8 @@ int get_global_nr_copies(uint32_t *copies);
 int set_cluster_flags(uint32_t flags);
 int get_cluster_flags(uint32_t *flags);
 
+int sys_flag_nohalt(void);
+
 #define NR_GW_WORKER_THREAD 4
 #define NR_IO_WORKER_THREAD 4
 
-- 
1.7.6.1