From: Liu Yuan <tailai.ly at taobao.com>

We use SD_STATUS_HALT to mark the cluster state in which it must not serve I/O requests. After a node leaves, if the cluster status is SD_STATUS_OK and the number of zones has dropped below sys->nr_sobjs, the status is changed to SD_STATUS_HALT. When the status is SD_STATUS_HALT and the number of zones is again at least sys->nr_sobjs, the status is set back to SD_STATUS_OK.

[Test Case]

steps:
for i in 0 1 2 3; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done
./collie/collie cluster format --copies=3;
for i in 0 1; do pkill -f "sheep -d /store/$i"; sleep 1; done
for i in 2 3; do ./collie/collie cluster info -p 700$i; done
for i in 0 1; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done
for i in 0 1 2 3; do ./collie/collie cluster info -p 700$i; done

output:
Cluster status: The node is stopped doing IO, short of living nodes
Creation time Epoch Nodes
2011-10-11 16:26:02 3 [192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
Cluster status: The node is stopped doing IO, short of living nodes
Creation time Epoch Nodes
2011-10-11 16:26:02 3 [192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
Cluster status: running
Creation time Epoch Nodes
2011-10-11 16:26:02 5 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 4 [192.168.0.1:7000, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 3 [192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
...
Signed-off-by: Liu Yuan <tailai.ly at taobao.com> --- sheep/group.c | 14 ++++++++++++++ 1 files changed, 14 insertions(+), 0 deletions(-) diff --git a/sheep/group.c b/sheep/group.c index 2871e97..756f8a6 100644 --- a/sheep/group.c +++ b/sheep/group.c @@ -1212,6 +1212,13 @@ static void __sd_notify_done(struct cpg_event *cevent) } start_recovery(sys->epoch); } + + if (sys->status == SD_STATUS_HALT) { + int nr_zones = get_zones_nr_from(&sys->sd_node_list); + + if (nr_zones >= sys->nr_sobjs) + sys->status = SD_STATUS_OK; + } } static void sd_notify_handler(struct sheepid *sender, void *msg, size_t msg_len) @@ -1451,6 +1458,13 @@ static void __sd_leave_done(struct cpg_event *cevent) if (node_left && (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT)) start_recovery(sys->epoch); + + if (sys->status == SD_STATUS_OK) { + int nr_zones = get_zones_nr_from(&sys->sd_node_list); + + if (nr_zones < sys->nr_sobjs) + sys->status = SD_STATUS_HALT; + } } static void cpg_event_free(struct cpg_event *cevent) -- 1.7.6.1 |