[Sheepdog] [PATCH v4 7/7] sheep: use SD_STATUS_HALT to stop serving IO
Liu Yuan
namei.unix at gmail.com
Tue Oct 18 10:58:58 CEST 2011
From: Liu Yuan <tailai.ly at taobao.com>
We use SD_STATUS_HALT to identify the cluster state in which the cluster
should not serve IO requests.
This is optional; users may disable this HALT behavior at their own risk
with the command below:
$ collie cluster format -H
or
$ collie cluster format --nohalt
By default, this is enabled.
[Test Case]
[1]
steps:
for i in 0 1 2 3; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done
./collie/collie cluster format --copies=3;
for i in 0 1; do pkill -f "sheep -d /store/$i"; sleep 1; done
for i in 2 3; do ./collie/collie cluster info -p 700$i; done
for i in 0 1; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done
for i in 0 1 2 3; do ./collie/collie cluster info -p 700$i; done
output:
Cluster status: The sheepdog is stopped doing IO, short of living nodes
Creation time Epoch Nodes
2011-10-11 16:26:02 3 [192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
Cluster status: The sheepdog is stopped doing IO, short of living nodes
Creation time Epoch Nodes
2011-10-11 16:26:02 3 [192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
Cluster status: running
Creation time Epoch Nodes
2011-10-11 16:26:02 5 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 4 [192.168.0.1:7000, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 3 [192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
2011-10-11 16:26:02 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003]
...
[2]
steps:
for i in 0 1; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done
collie/collie cluster format
for i in 0 1; do collie/collie cluster info -p 700$i;done
for i in 0; do pkill -f "sheep/sheep -d /store/$i"; sleep 1; done
for i in 2; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done
for i in 1 2; do pkill -f "sheep/sheep -d /store/$i"; sleep 1; done
for i in 0 1 2; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done
for i in 0 1 2; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done
for i in 0 1 2; do collie/collie cluster info -p 700$i;done
output:
Cluster status: The sheepdog is stopped doing IO, short of living nodes
Creation time Epoch Nodes
2011-10-16 18:11:07 1 [192.168.0.1:7000, 192.168.0.1:7001]
Cluster status: The sheepdog is stopped doing IO, short of living nodes
Creation time Epoch Nodes
2011-10-16 18:11:07 1 [192.168.0.1:7000, 192.168.0.1:7001]
Cluster status: running
Creation time Epoch Nodes
2011-10-16 18:11:07 6 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002]
2011-10-16 18:11:07 5 [192.168.0.1:7000, 192.168.0.1:7002]
2011-10-16 18:11:07 4 [192.168.0.1:7002]
2011-10-16 18:11:07 3 [192.168.0.1:7001, 192.168.0.1:7002]
2011-10-16 18:11:07 2 [192.168.0.1:7001]
2011-10-16 18:11:07 1 [192.168.0.1:7000, 192.168.0.1:7001]
...
Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
collie/cluster.c | 14 +++++++++++++-
collie/collie.c | 1 +
sheep/group.c | 45 ++++++++++++++++++++++++++++++++++++++++-----
sheep/sheep_priv.h | 5 +++++
4 files changed, 59 insertions(+), 6 deletions(-)
diff --git a/collie/cluster.c b/collie/cluster.c
index e0ab37f..fd40f6e 100644
--- a/collie/cluster.c
+++ b/collie/cluster.c
@@ -16,8 +16,15 @@
struct cluster_cmd_data {
int copies;
+ int nohalt;
} cluster_cmd_data;
+static void set_nohalt(uint32_t *p)
+{
+ if (p)
+ *p |= 1;
+}
+
static int cluster_format(int argc, char **argv)
{
int fd, ret;
@@ -36,6 +43,8 @@ static int cluster_format(int argc, char **argv)
hdr.opcode = SD_OP_MAKE_FS;
hdr.copies = cluster_cmd_data.copies;
+ if (cluster_cmd_data.nohalt)
+ set_nohalt(&hdr.flags);
hdr.epoch = node_list_version;
hdr.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
@@ -163,7 +172,7 @@ static int cluster_shutdown(int argc, char **argv)
static struct subcommand cluster_cmd[] = {
{"info", NULL, "aprh", "show cluster information",
0, cluster_info},
- {"format", NULL, "caph", "create a Sheepdog storage",
+ {"format", NULL, "cHaph", "create a Sheepdog storage",
0, cluster_format},
{"shutdown", NULL, "aph", "stop Sheepdog",
SUBCMD_FLAG_NEED_NODELIST, cluster_shutdown},
@@ -185,6 +194,9 @@ static int cluster_parser(int ch, char *opt)
}
cluster_cmd_data.copies = copies;
break;
+ case 'H':
+ cluster_cmd_data.nohalt = 1;
+ break;
}
return 0;
diff --git a/collie/collie.c b/collie/collie.c
index ce51599..456f2cd 100644
--- a/collie/collie.c
+++ b/collie/collie.c
@@ -41,6 +41,7 @@ static const struct sd_option collie_options[] = {
/* cluster options */
{'c', "copies", 1, "set the number of data redundancy"},
+ {'H', "nohalt", 0, "serve the IO rquests even lack of enough redundant nodes"},
{ 0, NULL, 0, NULL },
};
diff --git a/sheep/group.c b/sheep/group.c
index 3406672..51c0bce 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -51,6 +51,7 @@ struct join_message {
uint32_t nr_nodes;
uint32_t nr_sobjs;
uint32_t cluster_status;
+ uint32_t cluster_flags;
uint32_t epoch;
uint64_t ctime;
uint32_t result;
@@ -678,6 +679,7 @@ static void join(struct join_message *msg)
msg->epoch, &msg->cluster_status,
&msg->inc_epoch);
msg->nr_sobjs = sys->nr_sobjs;
+ msg->cluster_flags = sys->flags;
msg->ctime = get_cluster_ctime();
msg->nr_nodes = 0;
list_for_each_entry(node, &sys->sd_node_list, list) {
@@ -792,7 +794,7 @@ static void update_cluster_info(struct join_message *msg)
eprintf("status = %d, epoch = %d, %x, %d\n", msg->cluster_status, msg->epoch, msg->result, sys->join_finished);
if (msg->result != SD_RES_SUCCESS) {
if (is_myself(msg->header.from.addr, msg->header.from.port)) {
- eprintf("failed to join sheepdog, %d\n", msg->result);
+ eprintf("failed to join sheepdog, %x\n", msg->result);
leave_cluster();
eprintf("Restart me later when master is up, please.Bye.\n");
exit(1);
@@ -804,13 +806,13 @@ static void update_cluster_info(struct join_message *msg)
if (sys->status == SD_STATUS_JOIN_FAILED)
return;
- if (!sys->nr_sobjs)
- sys->nr_sobjs = msg->nr_sobjs;
-
if (sys->join_finished)
goto join_finished;
+ sys->nr_sobjs = msg->nr_sobjs;
sys->epoch = msg->epoch;
+ sys->flags = msg->cluster_status;
+
for (i = 0; i < nr_nodes; i++) {
ret = move_node_to_sd_list(&msg->nodes[i].sheepid,
msg->nodes[i].ent);
@@ -853,6 +855,7 @@ join_finished:
if (sys->status != SD_STATUS_OK ||
sys->status != SD_STATUS_HALT) {
set_global_nr_copies(sys->nr_sobjs);
+ set_cluster_flags(sys->flags);
set_cluster_ctime(msg->ctime);
}
}
@@ -957,6 +960,7 @@ static void vdi_op_done(struct vdi_op_message *msg)
break;
case SD_OP_MAKE_FS:
sys->nr_sobjs = ((struct sd_so_req *)hdr)->copies;
+ sys->flags = ((struct sd_so_req *)hdr)->flags;
if (!sys->nr_sobjs)
sys->nr_sobjs = SD_DEFAULT_REDUNDANCY;
@@ -980,8 +984,18 @@ static void vdi_op_done(struct vdi_op_message *msg)
update_epoch_store(sys->epoch);
set_global_nr_copies(sys->nr_sobjs);
+ set_cluster_flags(sys->flags);
- sys->status = SD_STATUS_OK;
+ if (sys_flag_nohalt())
+ sys->status = SD_STATUS_OK;
+ else {
+ int nr_zones = get_zones_nr_from(&sys->sd_node_list);
+
+ if (nr_zones >= sys->nr_sobjs)
+ sys->status = SD_STATUS_OK;
+ else
+ sys->status = SD_STATUS_HALT;
+ }
break;
case SD_OP_SHUTDOWN:
sys->status = SD_STATUS_SHUTDOWN;
@@ -1208,6 +1222,13 @@ static void __sd_notify_done(struct cpg_event *cevent)
}
start_recovery(sys->epoch);
}
+
+ if (sys->status == SD_STATUS_HALT) {
+ int nr_zones = get_zones_nr_from(&sys->sd_node_list);
+
+ if (nr_zones >= sys->nr_sobjs)
+ sys->status = SD_STATUS_OK;
+ }
}
static void sd_notify_handler(struct sheepid *sender, void *msg, size_t msg_len)
@@ -1360,6 +1381,7 @@ static void send_join_request(struct sheepid *id)
msg.header.sheepid = sys->this_sheepid;
get_global_nr_copies(&msg.nr_sobjs);
+ get_cluster_flags(&msg.cluster_flags);
nr_entries = ARRAY_SIZE(entries);
ret = read_epoch(&msg.epoch, &msg.ctime, entries, &nr_entries);
@@ -1384,6 +1406,7 @@ static void __sd_join_done(struct cpg_event *cevent)
sheepid_cmp(&w->joined, &sys->this_sheepid) == 0) {
sys->join_finished = 1;
get_global_nr_copies(&sys->nr_sobjs);
+ get_cluster_flags(&sys->flags);
first_cpg_node = 1;
}
@@ -1436,6 +1459,11 @@ static void __sd_join_done(struct cpg_event *cevent)
send_join_request(&w->joined);
}
+int sys_flag_nohalt()
+{
+ return sys->flags & SD_FLAG_NOHALT;
+}
+
static void __sd_leave_done(struct cpg_event *cevent)
{
struct work_leave *w = container_of(cevent, struct work_leave, cev);
@@ -1448,6 +1476,13 @@ static void __sd_leave_done(struct cpg_event *cevent)
if (node_left &&
(sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT))
start_recovery(sys->epoch);
+
+ if (sys->status == SD_STATUS_OK && !sys_flag_nohalt()) {
+ int nr_zones = get_zones_nr_from(&sys->sd_node_list);
+
+ if (nr_zones < sys->nr_sobjs)
+ sys->status = SD_STATUS_HALT;
+ }
}
static void cpg_event_free(struct cpg_event *cevent)
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 62ec9d2..6b46f0f 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -41,6 +41,8 @@
#define SD_RES_NETWORK_ERROR 0x81 /* Network error between sheeps */
+#define SD_FLAG_NOHALT 0x00000001
+
enum cpg_event_type {
CPG_EVENT_JOIN,
CPG_EVENT_LEAVE,
@@ -116,6 +118,7 @@ struct cluster_info {
uint32_t epoch;
uint32_t status;
+ uint32_t flags;
/*
* we add a node to cpg_node_list in confchg then move it to
@@ -214,6 +217,8 @@ int get_global_nr_copies(uint32_t *copies);
int set_cluster_flags(uint32_t flags);
int get_cluster_flags(uint32_t *flags);
+int sys_flag_nohalt(void);
+
#define NR_GW_WORKER_THREAD 4
#define NR_IO_WORKER_THREAD 4
--
1.7.6.1
More information about the sheepdog
mailing list