[Sheepdog] [PATCH v2 3/3] sheep: use SD_STATUS_HALT to stop serving IO

Tue Oct 11 11:27:13 CEST 2011

From: Liu Yuan <tailai.ly at taobao.com>

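We use SD_STATUS_HALT to mark the cluster state in which it should not serve
IO requests. When a node leaves and the number of zones drops below
sys->nr_sobjs, a cluster in SD_STATUS_OK switches to SD_STATUS_HALT; when a
later notify event finds that the number of zones is at least sys->nr_sobjs
again, the cluster switches back to SD_STATUS_OK.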

[Test Case]

steps:

for i in 0 1 2 3; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done
./collie/collie cluster format --copies=3;
for i in 0 1; do pkill -f "sheep -d /store/$i"; sleep 1; done
for i in 2 3; do ./collie/collie cluster info -p 700$i; done
for i in 0 1; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done
for i in 0 1 2 3; do ./collie/collie cluster info -p 700$i; done

output:

Cluster status: The node is stopped doing IO, short of living nodes

Creation time        Epoch Nodes
2011-10-11 16:26:02      3 [192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
Cluster status: The node is stopped doing IO, short of living nodes

Creation time        Epoch Nodes
2011-10-11 16:26:02      3 [192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
Cluster status: running

Creation time        Epoch Nodes
2011-10-11 16:26:02      5 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      4 [192.168.0.1:7000, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      3 [192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]

...

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 sheep/group.c |   14 ++++++++++++++
 1 files changed, 14 insertions(+), 0 deletions(-)

diff --git a/sheep/group.c b/sheep/group.c
index 2871e97..756f8a6 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -1212,6 +1212,13 @@ static void __sd_notify_done(struct cpg_event *cevent)
 		}
 		start_recovery(sys->epoch);
 	}
+
+	if (sys->status == SD_STATUS_HALT) {
+		int nr_zones = get_zones_nr_from(&sys->sd_node_list);
+
+		if (nr_zones >= sys->nr_sobjs)
+			sys->status = SD_STATUS_OK;
+	}
 }
 
 static void sd_notify_handler(struct sheepid *sender, void *msg, size_t msg_len)
@@ -1451,6 +1458,13 @@ static void __sd_leave_done(struct cpg_event *cevent)
 	if (node_left &&
 	    (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT))
 		start_recovery(sys->epoch);
+
+	if (sys->status == SD_STATUS_OK) {
+		int nr_zones = get_zones_nr_from(&sys->sd_node_list);
+
+		if (nr_zones < sys->nr_sobjs)
+			sys->status = SD_STATUS_HALT;
+	}
 }
 
 static void cpg_event_free(struct cpg_event *cevent)
-- 
1.7.6.1