From: Liu Yuan <tailai.ly at taobao.com> We use SD_STATUS_HALT to identify the cluster state when it should not serve IO requests. [Test Case] steps: for i in 0 1 2 3; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done ./collie/collie cluster format --copies=3; for i in 0 1; do pkill -f "sheep -d /store/$i"; sleep 1; done for i in 2 3; do ./collie/collie cluster info -p 700$i; done for i in 0 1; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done for i in 0 1 2 3; do ./collie/collie cluster info -p 700$i; done output: Cluster status: The node is stopped doing IO, short of living nodes Creation time Epoch Nodes 2011-10-11 16:26:02 3 [192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003] Cluster status: The node is stopped doing IO, short of living nodes Creation time Epoch Nodes 2011-10-11 16:26:02 3 [192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003] Cluster status: running Creation time Epoch Nodes 2011-10-11 16:26:02 5 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 4 [192.168.0.1:7000, 192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 3 [192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003] ... 
Signed-off-by: Liu Yuan <tailai.ly at taobao.com> --- sheep/group.c | 18 ++++++++++++++++++ 1 files changed, 18 insertions(+), 0 deletions(-) diff --git a/sheep/group.c b/sheep/group.c index 59293b2..7ed6b59 100644 --- a/sheep/group.c +++ b/sheep/group.c @@ -1203,6 +1203,15 @@ static void __sd_notify_done(struct cpg_event *cevent) } start_recovery(sys->epoch); } + + if (sys->status == SD_STATUS_HALT) { + int nr_dummy, nr_zones; + struct sheepdog_node_list_entry nodes[SD_MAX_NODES]; + + build_node_list(&sys->sd_node_list, nodes, &nr_dummy, &nr_zones); + if (nr_zones >= sys->nr_sobjs) + sys->status = SD_STATUS_OK; + } } static void sd_notify_handler(struct sheepid *sender, void *msg, size_t msg_len) @@ -1442,6 +1451,15 @@ static void __sd_leave_done(struct cpg_event *cevent) if (node_left && (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT)) start_recovery(sys->epoch); + + if (sys->status == SD_STATUS_OK) { + int nr_dummy, nr_zones; + struct sheepdog_node_list_entry nodes[SD_MAX_NODES]; + + build_node_list(&sys->sd_node_list, nodes, &nr_dummy, &nr_zones); + if (nr_zones < sys->nr_sobjs) + sys->status = SD_STATUS_HALT; + } } static void cpg_event_free(struct cpg_event *cevent) -- 1.7.6.1 |