From: Liu Yuan <tailai.ly at taobao.com>

We use SD_STATUS_HALT to identify the cluster state in which it should not
serve IO requests. This is optional; users may take the risk of turning
off the HALT status with either of the following commands:

$ collie cluster format -H
or
$ collie cluster format --nohalt

By default, the HALT status is enabled.

[Test Case]

[1]
steps:

for i in 0 1 2 3; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done
./collie/collie cluster format --copies=3
for i in 0 1; do pkill -f "sheep -d /store/$i"; sleep 1; done
for i in 2 3; do ./collie/collie cluster info -p 700$i; done
for i in 0 1; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done
for i in 0 1 2 3; do ./collie/collie cluster info -p 700$i; done

output:

Cluster status: The sheepdog is stopped doing IO, short of living nodes

Creation time        Epoch Nodes
2011-10-11 16:26:02      3 [192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]

Cluster status: The sheepdog is stopped doing IO, short of living nodes

Creation time        Epoch Nodes
2011-10-11 16:26:02      3 [192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]

Cluster status: running

Creation time        Epoch Nodes
2011-10-11 16:26:02      5 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      4 [192.168.0.1:7000, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      3 [192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02      1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
...

[2]
steps:

for i in 0 1; do sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done
collie/collie cluster format
for i in 0 1; do collie/collie cluster info -p 700$i; done
for i in 0; do pkill -f "sheep/sheep -d /store/$i"; sleep 1; done
for i in 2; do sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done
for i in 1 2; do pkill -f "sheep/sheep -d /store/$i"; sleep 1; done
for i in 0 1 2; do sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done
for i in 0 1 2; do collie/collie cluster info -p 700$i; done

output:

Cluster status: The sheepdog is stopped doing IO, short of living nodes

Creation time        Epoch Nodes
2011-10-16 18:11:07      1 [192.168.0.1:7000, 192.168.0.1:7001]

Cluster status: The sheepdog is stopped doing IO, short of living nodes

Creation time        Epoch Nodes
2011-10-16 18:11:07      1 [192.168.0.1:7000, 192.168.0.1:7001]

Cluster status: running

Creation time        Epoch Nodes
2011-10-16 18:11:07      6 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002]
2011-10-16 18:11:07      5 [192.168.0.1:7000, 192.168.0.1:7002]
2011-10-16 18:11:07      4 [192.168.0.1:7002]
2011-10-16 18:11:07      3 [192.168.0.1:7001, 192.168.0.1:7002]
2011-10-16 18:11:07      2 [192.168.0.1:7001]
2011-10-16 18:11:07      1 [192.168.0.1:7000, 192.168.0.1:7001]
...
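To make the policy explicit: with HALT enabled, the cluster serves IO only
while the living nodes span at least as many zones as there are data copies;
the nohalt flag simply short-circuits that check. A minimal sketch of the
decision, assuming the helpers this patch relies on (get_zones_nr_from() and
sys_flag_nohalt()); the wrapper name halt_aware_status() is made up for
illustration — the patch itself open-codes this check at its three call
sites:

static uint32_t halt_aware_status(void)
{
	int nr_zones;

	/* 'cluster format --nohalt' opts out of halting entirely */
	if (sys_flag_nohalt())
		return SD_STATUS_OK;

	/* need one failure zone per redundant copy, else stop serving IO */
	nr_zones = get_zones_nr_from(&sys->sd_node_list);
	return nr_zones >= sys->nr_sobjs ? SD_STATUS_OK : SD_STATUS_HALT;
}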
Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 collie/cluster.c   |   14 +++++++++++++-
 collie/collie.c    |    1 +
 include/sheep.h    |    2 ++
 sheep/group.c      |   45 ++++++++++++++++++++++++++++++++++++++++-----
 sheep/sheep_priv.h |    3 +++
 5 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/collie/cluster.c b/collie/cluster.c
index 0d5dfbe..8240277 100644
--- a/collie/cluster.c
+++ b/collie/cluster.c
@@ -16,8 +16,15 @@
 
 struct cluster_cmd_data {
 	int copies;
+	int nohalt;
 } cluster_cmd_data;
 
+static void set_nohalt(uint16_t *p)
+{
+	if (p)
+		*p |= SD_FLAG_NOHALT;
+}
+
 static int cluster_format(int argc, char **argv)
 {
 	int fd, ret;
@@ -36,6 +43,8 @@ static int cluster_format(int argc, char **argv)
 
 	hdr.opcode = SD_OP_MAKE_FS;
 	hdr.copies = cluster_cmd_data.copies;
+	if (cluster_cmd_data.nohalt)
+		set_nohalt(&hdr.flags);
 	hdr.epoch = node_list_version;
 
 	hdr.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
@@ -163,7 +172,7 @@ static int cluster_shutdown(int argc, char **argv)
 static struct subcommand cluster_cmd[] = {
 	{"info", NULL, "aprh", "show cluster information",
 	 0, cluster_info},
-	{"format", NULL, "caph", "create a Sheepdog storage",
+	{"format", NULL, "cHaph", "create a Sheepdog storage",
 	 0, cluster_format},
 	{"shutdown", NULL, "aph", "stop Sheepdog",
 	 SUBCMD_FLAG_NEED_NODELIST, cluster_shutdown},
@@ -176,6 +185,9 @@ static int cluster_parser(int ch, char *opt)
 	case 'c':
 		cluster_cmd_data.copies = atoi(opt);
 		break;
+	case 'H':
+		cluster_cmd_data.nohalt = 1;
+		break;
 	}
 
 	return 0;
diff --git a/collie/collie.c b/collie/collie.c
index e064a0a..df5dca4 100644
--- a/collie/collie.c
+++ b/collie/collie.c
@@ -41,6 +41,7 @@ static const struct sd_option collie_options[] = {
 
 	/* cluster options */
 	{'c', "copies", 1, "set the number of data redundancy"},
+	{'H', "nohalt", 0, "serve IO requests even without enough redundant nodes"},
 
 	{ 0, NULL, 0, NULL },
 };
diff --git a/include/sheep.h b/include/sheep.h
index 230917f..05dd246 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -48,6 +48,8 @@
 #define SD_RES_INVALID_CTIME 0x44 /* Creation time of sheepdog is different */
 #define SD_RES_INVALID_EPOCH 0x45 /* Invalid epoch */
 
+#define SD_FLAG_NOHALT 0x0001 /* Serve IO requests even without enough nodes */
+
 struct sd_so_req {
 	uint8_t proto_ver;
 	uint8_t opcode;
diff --git a/sheep/group.c b/sheep/group.c
index 3406672..4671e0a 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -51,6 +51,7 @@ struct join_message {
 	uint32_t nr_nodes;
 	uint32_t nr_sobjs;
 	uint32_t cluster_status;
+	uint16_t cluster_flags;
 	uint32_t epoch;
 	uint64_t ctime;
 	uint32_t result;
@@ -678,6 +679,7 @@ static void join(struct join_message *msg)
 		   msg->epoch, &msg->cluster_status, &msg->inc_epoch);
 
 	msg->nr_sobjs = sys->nr_sobjs;
+	msg->cluster_flags = sys->flags;
 	msg->ctime = get_cluster_ctime();
 	msg->nr_nodes = 0;
 	list_for_each_entry(node, &sys->sd_node_list, list) {
@@ -792,7 +794,7 @@ static void update_cluster_info(struct join_message *msg)
 	eprintf("status = %d, epoch = %d, %x, %d\n", msg->cluster_status, msg->epoch, msg->result, sys->join_finished);
 	if (msg->result != SD_RES_SUCCESS) {
 		if (is_myself(msg->header.from.addr, msg->header.from.port)) {
-			eprintf("failed to join sheepdog, %d\n", msg->result);
+			eprintf("failed to join sheepdog, %x\n", msg->result);
 			leave_cluster();
 			eprintf("Restart me later when master is up, please.Bye.\n");
 			exit(1);
@@ -804,13 +806,13 @@ static void update_cluster_info(struct join_message *msg)
 	if (sys->status == SD_STATUS_JOIN_FAILED)
 		return;
 
-	if (!sys->nr_sobjs)
-		sys->nr_sobjs = msg->nr_sobjs;
-
 	if (sys->join_finished)
 		goto join_finished;
 
+	sys->nr_sobjs = msg->nr_sobjs;
 	sys->epoch = msg->epoch;
+	sys->flags = msg->cluster_flags;
+
 	for (i = 0; i < nr_nodes; i++) {
 		ret = move_node_to_sd_list(&msg->nodes[i].sheepid,
 					   msg->nodes[i].ent);
@@ -853,6 +855,7 @@ join_finished:
 
 	if (sys->status != SD_STATUS_OK || sys->status != SD_STATUS_HALT) {
 		set_global_nr_copies(sys->nr_sobjs);
+		set_cluster_flags(sys->flags);
 		set_cluster_ctime(msg->ctime);
 	}
 }
@@ -957,6 +960,7 @@ static void vdi_op_done(struct vdi_op_message *msg)
 		break;
 	case SD_OP_MAKE_FS:
 		sys->nr_sobjs = ((struct sd_so_req *)hdr)->copies;
+		sys->flags = ((struct sd_so_req *)hdr)->flags;
 		if (!sys->nr_sobjs)
 			sys->nr_sobjs = SD_DEFAULT_REDUNDANCY;
 
@@ -980,8 +984,18 @@ static void vdi_op_done(struct vdi_op_message *msg)
 		update_epoch_store(sys->epoch);
 
 		set_global_nr_copies(sys->nr_sobjs);
+		set_cluster_flags(sys->flags);
 
-		sys->status = SD_STATUS_OK;
+		if (sys_flag_nohalt())
+			sys->status = SD_STATUS_OK;
+		else {
+			int nr_zones = get_zones_nr_from(&sys->sd_node_list);
+
+			if (nr_zones >= sys->nr_sobjs)
+				sys->status = SD_STATUS_OK;
+			else
+				sys->status = SD_STATUS_HALT;
+		}
 		break;
 	case SD_OP_SHUTDOWN:
 		sys->status = SD_STATUS_SHUTDOWN;
@@ -1208,6 +1222,13 @@ static void __sd_notify_done(struct cpg_event *cevent)
 		}
 		start_recovery(sys->epoch);
 	}
+
+	if (sys->status == SD_STATUS_HALT) {
+		int nr_zones = get_zones_nr_from(&sys->sd_node_list);
+
+		if (nr_zones >= sys->nr_sobjs)
+			sys->status = SD_STATUS_OK;
+	}
 }
 
 static void sd_notify_handler(struct sheepid *sender, void *msg, size_t msg_len)
@@ -1360,6 +1381,7 @@ static void send_join_request(struct sheepid *id)
 	msg.header.sheepid = sys->this_sheepid;
 
 	get_global_nr_copies(&msg.nr_sobjs);
+	get_cluster_flags(&msg.cluster_flags);
 
 	nr_entries = ARRAY_SIZE(entries);
 	ret = read_epoch(&msg.epoch, &msg.ctime, entries, &nr_entries);
@@ -1384,6 +1406,7 @@ static void __sd_join_done(struct cpg_event *cevent)
 	    sheepid_cmp(&w->joined, &sys->this_sheepid) == 0) {
 		sys->join_finished = 1;
 		get_global_nr_copies(&sys->nr_sobjs);
+		get_cluster_flags(&sys->flags);
 		first_cpg_node = 1;
 	}
 
@@ -1436,6 +1459,11 @@ static void __sd_join_done(struct cpg_event *cevent)
 		send_join_request(&w->joined);
 }
 
+int sys_flag_nohalt(void)
+{
+	return sys->flags & SD_FLAG_NOHALT;
+}
+
 static void __sd_leave_done(struct cpg_event *cevent)
 {
 	struct work_leave *w = container_of(cevent, struct work_leave, cev);
@@ -1448,6 +1476,13 @@ static void __sd_leave_done(struct cpg_event *cevent)
 	if (node_left &&
 	    (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT))
 		start_recovery(sys->epoch);
+
+	if (sys->status == SD_STATUS_OK && !sys_flag_nohalt()) {
+		int nr_zones = get_zones_nr_from(&sys->sd_node_list);
+
+		if (nr_zones < sys->nr_sobjs)
+			sys->status = SD_STATUS_HALT;
+	}
 }
 
 static void cpg_event_free(struct cpg_event *cevent)
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 24d6a7d..09da642 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -116,6 +116,7 @@ struct cluster_info {
 
 	uint32_t epoch;
 	uint32_t status;
+	uint16_t flags;
 
 	/*
 	 * we add a node to cpg_node_list in confchg then move it to
@@ -214,6 +215,8 @@ int get_global_nr_copies(uint32_t *copies);
 int set_cluster_flags(uint16_t flags);
 int get_cluster_flags(uint16_t *flags);
 
+int sys_flag_nohalt(void);
+
 #define NR_GW_WORKER_THREAD 4
 #define NR_IO_WORKER_THREAD 4
 
-- 
1.7.6.1
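P.S. For anyone tracing the wire format: the flag is carried in the flags
field of the SD_OP_MAKE_FS request header, which vdi_op_done() copies into
sys->flags on the sheep side. A self-contained round-trip sketch
(SD_FLAG_NOHALT and set_nohalt() are taken from the patch; the main()
harness around them is made up for illustration):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define SD_FLAG_NOHALT 0x0001

static void set_nohalt(uint16_t *p)
{
	if (p)
		*p |= SD_FLAG_NOHALT;
}

int main(void)
{
	uint16_t flags = 0;	/* stands in for hdr.flags of sd_so_req */

	set_nohalt(&flags);	/* collie: 'cluster format -H' path */
	set_nohalt(NULL);	/* the NULL guard makes this call a no-op */

	/* sheep: sys->flags = hdr->flags; sys_flag_nohalt() tests this bit */
	assert(flags & SD_FLAG_NOHALT);
	return 0;
}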