[Sheepdog] [PATCH v2 3/5] sheep: introduce SD_STATUS_HALT
Liu Yuan
namei.unix at gmail.com
Sun Oct 16 12:35:13 CEST 2011
From: Liu Yuan <tailai.ly at taobao.com>
Currently, sheepdog will serve IO requests even if the number of nodes is less than 'copies'.
When the number of nodes (or zones) is less than the number of copies specified by
the collie-cluster-format command, the sheepdog cluster should stop serving IO requests.
This is necessary to handle the subtle case below:
+ good nodes, - failed nodes.
  0        1        2        3
  +        -        -        +
  +  -->   -  -->   -  -->   +
  +        +        -        #  <-- permanently down.
           ^
           |
  this node has the latest data
At stage 3, we would have a cluster recovered without the data tracked at stage 1.
While the nodes are in the SD_STATUS_HALT state, sheepdog can still serve configuration changes
and do the recovery job.
Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
include/sheep.h | 1 +
include/sheepdog_proto.h | 1 +
sheep/group.c | 34 +++++++++++++++++++++++-----------
sheep/sheep_priv.h | 1 +
4 files changed, 26 insertions(+), 11 deletions(-)
diff --git a/include/sheep.h b/include/sheep.h
index 31516d9..230917f 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -254,6 +254,7 @@ static inline const char *sd_strerror(int err)
{SD_RES_WAIT_FOR_FORMAT, "Waiting for a format operation"},
{SD_RES_WAIT_FOR_JOIN, "Waiting for other nodes joining"},
{SD_RES_JOIN_FAILED, "The node had failed to join sheepdog"},
+ {SD_RES_HALT, "The sheepdog is stopped doing IO, short of living nodes"},
{SD_RES_OLD_NODE_VER, "Remote node has an old epoch"},
{SD_RES_NEW_NODE_VER, "Remote node has a new epoch"},
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 2b042f4..9467c44 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -58,6 +58,7 @@
#define SD_RES_WAIT_FOR_FORMAT 0x16 /* Sheepdog is waiting for a format operation */
#define SD_RES_WAIT_FOR_JOIN 0x17 /* Sheepdog is waiting for other nodes joining */
#define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */
+#define SD_RES_HALT 0x19 /* Sheepdog is stopped doing IO */
/*
* Object ID rules
diff --git a/sheep/group.c b/sheep/group.c
index 6a94f01..b905c5d 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -335,6 +335,9 @@ void cluster_queue_request(struct work *work, int idx)
case SD_STATUS_JOIN_FAILED:
ret = SD_RES_JOIN_FAILED;
break;
+ case SD_STATUS_HALT:
+ ret = SD_RES_HALT;
+ break;
default:
ret = SD_RES_SYSTEM_ERROR;
break;
@@ -534,7 +537,7 @@ static int cluster_sanity_check(struct sheepdog_node_list_entry *entries,
uint32_t lepoch;
if (sys->status == SD_STATUS_WAIT_FOR_FORMAT ||
- sys->status == SD_STATUS_SHUTDOWN ||
+ sys->status == SD_STATUS_SHUTDOWN)
goto out;
/* When the joinning node is newly created, we need to check nothing. */
if (nr_entries == 0)
@@ -551,7 +554,7 @@ static int cluster_sanity_check(struct sheepdog_node_list_entry *entries,
goto out;
}
- if (sys->status == SD_STATUS_OK)
+ if (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT)
goto out;
if (epoch < lepoch) {
@@ -593,6 +596,7 @@ static int get_cluster_status(struct sheepdog_node_list_entry *from,
goto out;
switch (sys->status) {
+ case SD_STATUS_HALT:
case SD_STATUS_OK:
if (inc_epoch)
*inc_epoch = 1;
@@ -778,7 +782,7 @@ static void update_cluster_info(struct join_message *msg)
int i;
int ret, nr_nodes = msg->nr_nodes;
- eprintf("status = %d, epoch = %d, %d, %d\n", msg->cluster_status, msg->epoch, msg->result, sys->join_finished);
+ eprintf("status = %d, epoch = %d, %x, %d\n", msg->cluster_status, msg->epoch, msg->result, sys->join_finished);
if (msg->result != SD_RES_SUCCESS) {
if (is_myself(msg->header.from.addr, msg->header.from.port)) {
eprintf("failed to join sheepdog, %d\n", msg->result);
@@ -812,12 +816,13 @@ static void update_cluster_info(struct join_message *msg)
sheepid_to_str(&msg->nodes[i].sheepid));
}
- if (msg->cluster_status != SD_STATUS_OK)
+ if (msg->cluster_status == SD_STATUS_WAIT_FOR_JOIN)
add_node_to_leave_list((struct message_header *)msg);
sys->join_finished = 1;
- if (msg->cluster_status == SD_STATUS_OK && msg->inc_epoch)
+ if ((msg->cluster_status == SD_STATUS_OK || msg->cluster_status == SD_STATUS_HALT)
+ && msg->inc_epoch)
update_epoch_log(sys->epoch);
join_finished:
@@ -830,13 +835,16 @@ join_finished:
vprintf(SDOG_ERR, "%s has gone\n",
sheepid_to_str(&msg->header.sheepid));
- if (msg->cluster_status == SD_STATUS_OK) {
+ if (msg->cluster_status == SD_STATUS_OK ||
+ msg->cluster_status == SD_STATUS_HALT) {
if (msg->inc_epoch) {
sys->epoch++;
update_epoch_log(sys->epoch);
update_epoch_store(sys->epoch);
}
- if (sys->status != SD_STATUS_OK) {
+
+ if (sys->status != SD_STATUS_OK ||
+ sys->status != SD_STATUS_HALT) {
set_global_nr_copies(sys->nr_sobjs);
set_cluster_ctime(msg->ctime);
}
@@ -1079,7 +1087,8 @@ static void send_join_response(struct work_notify *w)
m->state = DM_FIN;
dprintf("%d, %d\n", jm->result, jm->cluster_status);
- if (jm->result == SD_RES_SUCCESS && jm->cluster_status != SD_STATUS_OK) {
+ if (jm->result == SD_RES_SUCCESS &&
+ jm->cluster_status == SD_STATUS_WAIT_FOR_JOIN) {
jm->nr_leave_nodes = 0;
list_for_each_entry(node, &sys->leave_list, list) {
jm->leave_nodes[jm->nr_leave_nodes].sheepid = node->sheepid;
@@ -1185,7 +1194,8 @@ static void __sd_notify_done(struct cpg_event *cevent)
}
}
- if (do_recovery && sys->status == SD_STATUS_OK) {
+ if (do_recovery &&
+ (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT)) {
list_for_each_entry_safe(node, t, &sys->leave_list, list) {
list_del(&node->list);
}
@@ -1257,7 +1267,8 @@ static int del_node(struct sheepid *id)
list_del(&node->list);
free(node);
- if (sys->status == SD_STATUS_OK) {
+ if (sys->status == SD_STATUS_OK ||
+ sys->status == SD_STATUS_HALT) {
nr = get_ordered_sd_node_list(e);
dprintf("update epoch, %d, %d\n", sys->epoch + 1, nr);
epoch_log_write(sys->epoch + 1, (char *)e,
@@ -1427,7 +1438,8 @@ static void __sd_leave_done(struct cpg_event *cevent)
print_node_list(&sys->sd_node_list);
- if (node_left && sys->status == SD_STATUS_OK)
+ if (node_left &&
+ (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT))
start_recovery(sys->epoch);
}
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index e2fcb40..355cd93 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -37,6 +37,7 @@
#define SD_STATUS_WAIT_FOR_JOIN 0x02
#define SD_STATUS_SHUTDOWN 0x03
#define SD_STATUS_JOIN_FAILED 0x04
+#define SD_STATUS_HALT 0x05
#define SD_RES_NETWORK_ERROR 0x81 /* Network error between sheeps */
--
1.7.6.1
More information about the sheepdog
mailing list