[sheepdog] [PATCH 1/3] sheep: split cluster_info into system_info and cluster_info
MORITA Kazutaka
morita.kazutaka at gmail.com
Wed Jul 3 08:49:21 CEST 2013
From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
The current cluster_info contains two different things:
- The global variables which are specific to the local sheep.
- The cluster-wide variables which must be kept in the same state
among existing sheep nodes (e.g. epoch).
This patch moves most of the fields of cluster_info into system_info.
After this patch, cluster_info only contains the cluster-wide variables.
The join process will change a bit. The joining sheep has to send its
cluster_info in the join_message and will receive the latest cluster_info
from the master node. If the joining node sends an invalid
cluster_info, the node will fail to join.
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
include/internal_proto.h | 34 ++++++----
sheep/group.c | 158 ++++++++++++++++++++++------------------------
sheep/ops.c | 40 ++++++------
sheep/recovery.c | 7 +-
sheep/request.c | 24 +++----
sheep/sheep.c | 4 +-
sheep/sheep_priv.h | 11 ++--
sheep/store.c | 4 +-
sheep/vdi.c | 2 +-
9 files changed, 140 insertions(+), 144 deletions(-)
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 24d5519..f0c55c0 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -129,6 +129,20 @@ struct sd_node {
uint64_t space;
};
+struct cluster_info {
+ uint8_t nr_copies;
+ uint8_t disable_recovery;
+ int16_t nr_nodes;
+ uint32_t epoch;
+ uint64_t ctime;
+ uint16_t flags;
+ uint16_t __pad[3];
+ uint8_t store[STORE_LEN];
+
+ /* node list at cluster_info->epoch */
+ struct sd_node nodes[SD_MAX_NODES];
+};
+
struct epoch_log {
uint64_t ctime;
uint64_t time; /* treated as time_t */
@@ -141,27 +155,19 @@ struct epoch_log {
struct join_message {
uint8_t proto_ver;
- uint8_t nr_copies;
- int16_t nr_nodes;
+ uint8_t __pad1[3];
uint16_t nr_failed_nodes;
uint16_t nr_delayed_nodes;
uint32_t cluster_status;
- uint32_t epoch;
- uint64_t ctime;
uint8_t inc_epoch; /* set non-zero when we increment epoch of all nodes */
- uint8_t disable_recovery;
- uint16_t cluster_flags;
- uint32_t __pad;
- uint8_t store[STORE_LEN];
+ uint8_t __pad2[3];
/*
- * A joining sheep puts the local node list here, which is nr_nodes
- * entries long. After the master replies it will contain the list of
- * nodes that attempted to join but failed the join process. The
- * number of entries in that case is nr_failed_nodes, which by
- * defintion must be smaller than nr_nodes.
+ * A joining sheep puts the local cluster info here. After the master
+ * replies it will contain the latest cluster info which is shared among
+ * the existing nodes.
*/
- struct sd_node nodes[];
+ struct cluster_info cinfo;
};
struct vdi_op_message {
diff --git a/sheep/group.c b/sheep/group.c
index 1b1cea5..4c0cc42 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -62,15 +62,6 @@ static main_thread(struct list_head *) failed_nodes;
*/
static main_thread(struct list_head *) delayed_nodes;
-static size_t get_join_message_size(struct join_message *jm)
-{
- /*
- * jm->nr_nodes is guaranteed to be larger than jm->nr_failed_nodes,
- * so it is safe to unconditionally use jm->nr_nodes here.
- */
- return sizeof(*jm) + jm->nr_nodes * sizeof(jm->nodes[0]);
-}
-
static int get_zones_nr_from(const struct sd_node *nodes, int nr_nodes)
{
int nr_zones = 0, i, j;
@@ -104,7 +95,7 @@ bool have_enough_zones(void)
int max_copies;
struct vnode_info *cur_vinfo = main_thread_get(current_vnode_info);
- if (sys->flags & SD_FLAG_NOHALT)
+ if (sys->cinfo.flags & SD_FLAG_NOHALT)
return true;
if (!cur_vinfo)
@@ -113,12 +104,12 @@ bool have_enough_zones(void)
max_copies = get_max_copy_number();
sd_dprintf("flags %d, nr_zones %d, min copies %d",
- sys->flags, cur_vinfo->nr_zones, max_copies);
+ sys->cinfo.flags, cur_vinfo->nr_zones, max_copies);
if (!cur_vinfo->nr_zones)
return false;
- if (sys->flags & SD_FLAG_QUORUM) {
+ if (sys->cinfo.flags & SD_FLAG_QUORUM) {
if (cur_vinfo->nr_zones > (max_copies/2))
return true;
} else {
@@ -433,9 +424,9 @@ static void update_exceptional_node_list(uint32_t epoch,
int i;
for (i = 0; i < jm->nr_failed_nodes; i++)
- add_failed_node(epoch, &jm->nodes[i]);
+ add_failed_node(epoch, &jm->cinfo.nodes[i]);
for ( ; i < jm->nr_failed_nodes + jm->nr_delayed_nodes; i++)
- add_delayed_node(epoch, &jm->nodes[i]);
+ add_delayed_node(epoch, &jm->cinfo.nodes[i]);
}
/* Format the lists of failed or delayed nodes into the join message. */
@@ -444,9 +435,10 @@ static void format_exceptional_node_list(struct join_message *jm)
struct node *n;
list_for_each_entry(n, main_thread_get(failed_nodes), list)
- jm->nodes[jm->nr_failed_nodes++] = n->ent;
+ jm->cinfo.nodes[jm->nr_failed_nodes++] = n->ent;
list_for_each_entry(n, main_thread_get(delayed_nodes), list)
- jm->nodes[jm->nr_failed_nodes + jm->nr_delayed_nodes++] = n->ent;
+ jm->cinfo.nodes[jm->nr_failed_nodes +
+ jm->nr_delayed_nodes++] = n->ent;
}
static void clear_exceptional_node_lists(void)
@@ -513,27 +505,27 @@ static int cluster_sanity_check(struct join_message *jm)
return CJ_RES_FAIL;
}
- if (jm->ctime != local_ctime) {
+ if (jm->cinfo.ctime != local_ctime) {
sd_eprintf("joining node ctime doesn't match: %"
- PRIu64 " vs %" PRIu64, jm->ctime, local_ctime);
+ PRIu64 " vs %" PRIu64, jm->cinfo.ctime, local_ctime);
return CJ_RES_FAIL;
}
- if (jm->epoch > local_epoch) {
+ if (jm->cinfo.epoch > local_epoch) {
sd_eprintf("joining node epoch too large: %"
- PRIu32 " vs %" PRIu32, jm->epoch, local_epoch);
+ PRIu32 " vs %" PRIu32, jm->cinfo.epoch, local_epoch);
return CJ_RES_FAIL;
}
- if (jm->nr_copies != local_nr_copies) {
+ if (jm->cinfo.nr_copies != local_nr_copies) {
sd_eprintf("joining node nr_copies doesn't match: %u vs %u",
- jm->nr_copies, local_nr_copies);
+ jm->cinfo.nr_copies, local_nr_copies);
return CJ_RES_FAIL;
}
- if (jm->cluster_flags != sys->flags) {
+ if (jm->cinfo.flags != sys->cinfo.flags) {
sd_eprintf("joining node cluster_flags don't match: %u vs %u",
- jm->cluster_flags, sys->flags);
+ jm->cinfo.flags, sys->cinfo.flags);
return CJ_RES_FAIL;
}
@@ -549,42 +541,42 @@ static int cluster_wait_for_join_check(const struct sd_node *joined,
int ret;
struct vnode_info *cur_vinfo;
- if (jm->nr_nodes == 0)
+ if (jm->cinfo.nr_nodes == 0)
return CJ_RES_JOIN_LATER;
ret = cluster_sanity_check(jm);
if (ret != CJ_RES_SUCCESS) {
- if (jm->epoch > sys->epoch) {
- sd_eprintf("transfer mastership (%d, %d)", jm->epoch,
- sys->epoch);
+ if (jm->cinfo.epoch > sys->cinfo.epoch) {
+ sd_eprintf("transfer mastership (%d, %d)", jm->cinfo.epoch,
+ sys->cinfo.epoch);
return CJ_RES_MASTER_TRANSFER;
}
return ret;
}
- nr_local_entries = epoch_log_read(jm->epoch, local_entries,
+ nr_local_entries = epoch_log_read(jm->cinfo.epoch, local_entries,
sizeof(local_entries));
if (nr_local_entries == -1)
return CJ_RES_FAIL;
- if (jm->epoch < local_epoch) {
+ if (jm->cinfo.epoch < local_epoch) {
sd_eprintf("joining node epoch too small: %"
- PRIu32 " vs %" PRIu32, jm->epoch, local_epoch);
+ PRIu32 " vs %" PRIu32, jm->cinfo.epoch, local_epoch);
if (xbsearch(joined, local_entries, nr_local_entries, node_cmp))
return CJ_RES_FAIL;
return CJ_RES_JOIN_LATER;
}
- if (jm->nr_nodes != nr_local_entries) {
+ if (jm->cinfo.nr_nodes != nr_local_entries) {
sd_eprintf("epoch log entries do not match: %d vs %d",
- jm->nr_nodes, nr_local_entries);
+ jm->cinfo.nr_nodes, nr_local_entries);
return CJ_RES_FAIL;
}
- if (memcmp(jm->nodes, local_entries,
- sizeof(jm->nodes[0]) * jm->nr_nodes) != 0) {
+ if (memcmp(jm->cinfo.nodes, local_entries,
+ sizeof(jm->cinfo.nodes[0]) * jm->cinfo.nr_nodes) != 0) {
sd_eprintf("epoch log entries does not match");
return CJ_RES_FAIL;
}
@@ -633,7 +625,7 @@ static int cluster_running_check(struct join_message *jm)
* When the joining node is newly created and we are not waiting for
* join we do not need to check anything.
*/
- if (jm->nr_nodes != 0) {
+ if (jm->cinfo.nr_nodes != 0) {
ret = cluster_sanity_check(jm);
if (ret != CJ_RES_SUCCESS)
return ret;
@@ -726,8 +718,13 @@ int log_current_epoch(void)
struct vnode_info *cur_vinfo = main_thread_get(current_vnode_info);
if (!cur_vinfo)
- return update_epoch_log(sys->epoch, NULL, 0);
- return update_epoch_log(sys->epoch, cur_vinfo->nodes,
+ return update_epoch_log(sys->cinfo.epoch, NULL, 0);
+
+ /* update cluster info to the latest state */
+ sys->cinfo.nr_nodes = cur_vinfo->nr_nodes;
+ memcpy(sys->cinfo.nodes, cur_vinfo->nodes,
+ sizeof(cur_vinfo->nodes[0]) * cur_vinfo->nr_nodes);
+ return update_epoch_log(sys->cinfo.epoch, cur_vinfo->nodes,
cur_vinfo->nr_nodes);
}
@@ -780,23 +777,23 @@ static void finish_join(const struct join_message *msg,
int ret;
sys->join_finished = true;
- sys->epoch = msg->epoch;
+ sys->cinfo.epoch = msg->cinfo.epoch;
if (msg->cluster_status != SD_STATUS_OK)
update_exceptional_node_list(get_latest_epoch(), msg);
- if (msg->store[0]) {
+ if (msg->cinfo.store[0]) {
/*
* We don't need backend for gateway-only node, but need to save
* store name. Otherwise, the node cannot notify the store name
* when it become master
*/
if (sys->gateway_only) {
- ret = set_cluster_store((char *)msg->store);
+ ret = set_cluster_store((char *)msg->cinfo.store);
if (ret != SD_RES_SUCCESS)
panic("failed to store into config file");
} else
- setup_backend_store((char *)msg->store,
+ setup_backend_store((char *)msg->cinfo.store,
!!msg->inc_epoch);
}
@@ -868,7 +865,7 @@ static void update_cluster_info(const struct join_message *msg,
struct vnode_info *old_vnode_info;
sd_dprintf("status = %d, epoch = %d, finished: %d",
- msg->cluster_status, msg->epoch, sys->join_finished);
+ msg->cluster_status, msg->cinfo.epoch, sys->join_finished);
if (!sys->join_finished)
finish_join(msg, joined, nodes, nr_nodes);
@@ -882,15 +879,16 @@ static void update_cluster_info(const struct join_message *msg,
case SD_STATUS_HALT:
switch (sys->status) {
case SD_STATUS_WAIT_FOR_FORMAT:
- sys->nr_copies = msg->nr_copies;
- sys->flags = msg->cluster_flags;
+ sys->cinfo.nr_copies = msg->cinfo.nr_copies;
+ sys->cinfo.flags = msg->cinfo.flags;
- set_cluster_copies(sys->nr_copies);
- set_cluster_flags(sys->flags);
- set_cluster_ctime(msg->ctime);
+ set_cluster_copies(sys->cinfo.nr_copies);
+ set_cluster_flags(sys->cinfo.flags);
+ set_cluster_ctime(msg->cinfo.ctime);
/*FALLTHROUGH*/
case SD_STATUS_WAIT_FOR_JOIN:
- sys->disable_recovery = msg->disable_recovery;
+ sys->cinfo.disable_recovery =
+ msg->cinfo.disable_recovery;
break;
default:
break;
@@ -901,7 +899,7 @@ static void update_cluster_info(const struct join_message *msg,
sys->status = msg->cluster_status;
if (msg->inc_epoch) {
- uatomic_inc(&sys->epoch);
+ uatomic_inc(&sys->cinfo.epoch);
log_current_epoch();
clear_exceptional_node_lists();
@@ -1015,8 +1013,8 @@ enum cluster_join_result sd_check_join_cb(const struct sd_node *joining,
if (nr_entries == -1)
return CJ_RES_FAIL;
- sys->epoch = epoch;
- jm->ctime = get_cluster_ctime();
+ sys->cinfo.epoch = epoch;
+ jm->cinfo.ctime = get_cluster_ctime();
if (nr_entries == 1)
jm->cluster_status = SD_STATUS_OK;
@@ -1031,7 +1029,7 @@ enum cluster_join_result sd_check_join_cb(const struct sd_node *joining,
ret = CJ_RES_FAIL;
break;
case SD_STATUS_WAIT_FOR_FORMAT:
- if (jm->nr_nodes != 0) {
+ if (jm->cinfo.nr_nodes != 0) {
ret = CJ_RES_FAIL;
break;
}
@@ -1054,15 +1052,9 @@ enum cluster_join_result sd_check_join_cb(const struct sd_node *joining,
joining->nid.port),
ret, jm->cluster_status);
- jm->nr_copies = sys->nr_copies;
- jm->cluster_flags = sys->flags;
- jm->epoch = sys->epoch;
- jm->ctime = get_cluster_ctime();
jm->nr_failed_nodes = 0;
- jm->disable_recovery = sys->disable_recovery;
- if (sd_store)
- pstrcpy((char *)jm->store, sizeof(jm->store), sd_store->name);
+ jm->cinfo = sys->cinfo;
if (jm->cluster_status != SD_STATUS_OK &&
(ret == CJ_RES_SUCCESS || ret == CJ_RES_JOIN_LATER))
@@ -1075,22 +1067,11 @@ static int send_join_request(struct sd_node *ent)
struct join_message *msg;
int ret;
- msg = xzalloc(sizeof(*msg) + SD_MAX_NODES * sizeof(msg->nodes[0]));
+ msg = xzalloc(sizeof(*msg));
msg->proto_ver = SD_SHEEP_PROTO_VER;
- msg->nr_copies = sys->nr_copies;
- msg->cluster_flags = sys->flags;
- msg->epoch = sys->epoch;
- msg->ctime = get_cluster_ctime();
-
- if (msg->epoch) {
- msg->nr_nodes = epoch_log_read(msg->epoch, msg->nodes,
- sizeof(struct sd_node) *
- SD_MAX_NODES);
- if (msg->nr_nodes == -1)
- return SD_RES_EIO;
- }
+ msg->cinfo = sys->cinfo;
- ret = sys->cdrv->join(ent, msg, get_join_message_size(msg));
+ ret = sys->cdrv->join(ent, msg, sizeof(*msg));
sd_printf(SDOG_INFO, "%s", node_to_str(&sys->this_node));
@@ -1121,6 +1102,8 @@ void sd_join_handler(const struct sd_node *joined,
const struct join_message *jm = opaque;
uint32_t le = get_latest_epoch();
+ sys->cinfo = jm->cinfo;
+
if (node_is_local(joined)) {
if (result == CJ_RES_FAIL) {
sd_eprintf("Failed to join, exiting.");
@@ -1154,7 +1137,7 @@ void sd_join_handler(const struct sd_node *joined,
if (!add_failed_node(le, joined))
break;
- nr_local = get_nodes_nr_epoch(sys->epoch);
+ nr_local = get_nodes_nr_epoch(sys->cinfo.epoch);
nr = nr_members;
nr_failed = get_nodes_nr_from(main_thread_get(failed_nodes));
nr_delayed = get_nodes_nr_from(main_thread_get(delayed_nodes));
@@ -1174,14 +1157,14 @@ void sd_join_handler(const struct sd_node *joined,
*/
if (!sys->join_finished) {
sys->join_finished = true;
- sys->epoch = get_latest_epoch();
+ sys->cinfo.epoch = get_latest_epoch();
put_vnode_info(main_thread_get(current_vnode_info));
main_thread_set(current_vnode_info,
alloc_vnode_info(&sys->this_node, 1));
}
- nr_local = get_nodes_nr_epoch(sys->epoch);
+ nr_local = get_nodes_nr_epoch(sys->cinfo.epoch);
nr = nr_members;
nr_failed = get_nodes_nr_from(main_thread_get(failed_nodes));
nr_delayed = get_nodes_nr_from(main_thread_get(delayed_nodes));
@@ -1226,7 +1209,7 @@ void sd_leave_handler(const struct sd_node *left, const struct sd_node *members,
switch (sys->status) {
case SD_STATUS_HALT:
case SD_STATUS_OK:
- uatomic_inc(&sys->epoch);
+ uatomic_inc(&sys->cinfo.epoch);
log_current_epoch();
start_recovery(main_thread_get(current_vnode_info),
old_vnode_info, true);
@@ -1257,7 +1240,7 @@ void kick_node_recover(void)
main_thread_set(current_vnode_info,
alloc_vnode_info(old->nodes, old->nr_nodes));
- uatomic_inc(&sys->epoch);
+ uatomic_inc(&sys->cinfo.epoch);
log_current_epoch();
start_recovery(main_thread_get(current_vnode_info), old, true);
put_vnode_info(old);
@@ -1302,12 +1285,19 @@ int create_cluster(int port, int64_t zone, int nr_vnodes,
sys->this_node.space = sys->disk_space;
- sys->epoch = get_latest_epoch();
- if (sys->epoch) {
+ sys->cinfo.epoch = get_latest_epoch();
+ if (sys->cinfo.epoch) {
sys->status = SD_STATUS_WAIT_FOR_JOIN;
- get_cluster_copies(&sys->nr_copies);
- get_cluster_flags(&sys->flags);
-
+ get_cluster_copies(&sys->cinfo.nr_copies);
+ get_cluster_flags(&sys->cinfo.flags);
+ sys->cinfo.ctime = get_cluster_ctime();
+ get_cluster_store((char *)sys->cinfo.store);
+
+ sys->cinfo.nr_nodes = epoch_log_read(sys->cinfo.epoch,
+ sys->cinfo.nodes,
+ sizeof(sys->cinfo.nodes));
+ if (sys->cinfo.nr_nodes == -1)
+ return -1;
} else {
sys->status = SD_STATUS_WAIT_FOR_FORMAT;
}
diff --git a/sheep/ops.c b/sheep/ops.c
index 5641e81..86df149 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -88,7 +88,8 @@ static int cluster_new_vdi(struct request *req)
.size = hdr->vdi.vdi_size,
.base_vid = hdr->vdi.base_vdi_id,
.create_snapshot = !!hdr->vdi.snapid,
- .nr_copies = hdr->vdi.copies ? hdr->vdi.copies : sys->nr_copies,
+ .nr_copies = hdr->vdi.copies ? hdr->vdi.copies :
+ sys->cinfo.nr_copies,
};
if (hdr->data_length != SD_MAX_VDI_LEN)
@@ -239,7 +240,6 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
{
int i, ret;
uint32_t latest_epoch;
- uint64_t created_time;
struct store_driver *driver;
char *store_name = data;
@@ -247,6 +247,8 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
if (!driver)
return SD_RES_NO_STORE;
+ pstrcpy((char *)sys->cinfo.store, sizeof(sys->cinfo.store),
+ store_name);
sd_store = driver;
latest_epoch = get_latest_epoch();
@@ -260,15 +262,15 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
if (ret != SD_RES_SUCCESS)
return ret;
- sys->nr_copies = req->cluster.copies;
- sys->flags = req->flags;
- if (!sys->nr_copies)
- sys->nr_copies = SD_DEFAULT_COPIES;
+ sys->cinfo.nr_copies = req->cluster.copies;
+ sys->cinfo.flags = req->flags;
+ if (!sys->cinfo.nr_copies)
+ sys->cinfo.nr_copies = SD_DEFAULT_COPIES;
+ sys->cinfo.ctime = req->cluster.ctime;
- created_time = req->cluster.ctime;
- set_cluster_ctime(created_time);
- set_cluster_copies(sys->nr_copies);
- set_cluster_flags(sys->flags);
+ set_cluster_ctime(sys->cinfo.ctime);
+ set_cluster_copies(sys->cinfo.nr_copies);
+ set_cluster_flags(sys->cinfo.flags);
for (i = 1; i <= latest_epoch; i++)
remove_epoch(i);
@@ -276,7 +278,7 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
memset(sys->vdi_inuse, 0, sizeof(sys->vdi_inuse));
clean_vdi_state();
- sys->epoch = 1;
+ sys->cinfo.epoch = 1;
ret = log_current_epoch();
if (ret)
@@ -300,7 +302,7 @@ static int cluster_shutdown(const struct sd_req *req, struct sd_rsp *rsp,
static int cluster_enable_recover(const struct sd_req *req,
struct sd_rsp *rsp, void *data)
{
- sys->disable_recovery = false;
+ sys->cinfo.disable_recovery = false;
resume_suspended_recovery();
return SD_RES_SUCCESS;
}
@@ -308,7 +310,7 @@ static int cluster_enable_recover(const struct sd_req *req,
static int cluster_disable_recover(const struct sd_req *req,
struct sd_rsp *rsp, void *data)
{
- sys->disable_recovery = true;
+ sys->cinfo.disable_recovery = true;
return SD_RES_SUCCESS;
}
@@ -450,7 +452,7 @@ static int local_stat_cluster(struct request *req)
assert(nr_nodes <= SD_MAX_NODES);
log->nr_nodes = nr_nodes;
- log->disable_recovery = sys->disable_recovery;
+ log->disable_recovery = sys->cinfo.disable_recovery;
rsp->data_length += sizeof(*log);
epoch--;
@@ -548,7 +550,7 @@ static int cluster_force_recover_main(const struct sd_req *req,
struct sd_node *nodes = data;
size_t nr_nodes = rsp->data_length / sizeof(*nodes);
- if (rsp->epoch != sys->epoch) {
+ if (rsp->epoch != sys->cinfo.epoch) {
sd_eprintf("epoch was incremented while cluster_force_recover");
return SD_RES_FORCE_RECOVER;
}
@@ -564,10 +566,10 @@ static int cluster_force_recover_main(const struct sd_req *req,
goto err;
}
- sys->nr_copies = c;
- sys->flags = f;
+ sys->cinfo.nr_copies = c;
+ sys->cinfo.flags = f;
- sys->epoch++; /* some nodes are left, so we get a new epoch */
+ sys->cinfo.epoch++; /* some nodes are left, so we get a new epoch */
ret = log_current_epoch();
if (ret) {
sd_printf(SDOG_EMERG, "cannot update epoch log");
@@ -673,7 +675,7 @@ static int cluster_recovery_completion(const struct sd_req *req,
for (i = 0; i < nr_recovereds; i++)
sd_dprintf("[%x] %s", i, node_to_str(recovereds + i));
- if (sys->epoch != latest_epoch)
+ if (sys->cinfo.epoch != latest_epoch)
return SD_RES_SUCCESS;
vnode_info = get_vnode_info();
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 63877b1..f751bfd 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -575,7 +575,7 @@ static void recover_next_object(struct recovery_info *rinfo)
if (rinfo->nr_prio_oids)
finish_schedule_oids(rinfo);
- if (sys->disable_recovery && !has_scheduled_objects(rinfo)) {
+ if (sys->cinfo.disable_recovery && !has_scheduled_objects(rinfo)) {
sd_dprintf("suspended");
rinfo->suspended = true;
/* suspend until resume_suspended_recovery() is called */
@@ -796,8 +796,9 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo,
rinfo = xzalloc(sizeof(struct recovery_info));
rinfo->state = RW_PREPARE_LIST;
- rinfo->epoch = sys->epoch;
- rinfo->tgt_epoch = epoch_lifted ? sys->epoch - 1 : sys->epoch;
+ rinfo->epoch = sys->cinfo.epoch;
+ rinfo->tgt_epoch = epoch_lifted ? sys->cinfo.epoch - 1 :
+ sys->cinfo.epoch;
rinfo->count = 0;
if (epoch_lifted)
rinfo->notify_complete = true; /* Reweight or node recovery */
diff --git a/sheep/request.c b/sheep/request.c
index 76d97e5..a6000d3 100644
--- a/sheep/request.c
+++ b/sheep/request.c
@@ -106,7 +106,7 @@ static void gateway_op_done(struct work *work)
switch (req->rp.result) {
case SD_RES_OLD_NODE_VER:
- if (req->rp.epoch > sys->epoch) {
+ if (req->rp.epoch > sys->cinfo.epoch) {
/*
* Gateway of this node is expected to process this
* request later when epoch is lifted.
@@ -122,7 +122,7 @@ static void gateway_op_done(struct work *work)
case SD_RES_KILLED:
sd_dprintf("retrying failed I/O request op %s result %x epoch %"
PRIu32", sys epoch %"PRIu32, op_name(req->op),
- req->rp.result, req->rq.epoch, sys->epoch);
+ req->rp.result, req->rq.epoch, sys->cinfo.epoch);
goto retry;
case SD_RES_EIO:
if (is_access_local(req, hdr->obj.oid)) {
@@ -158,17 +158,17 @@ static void local_op_done(struct work *work)
static int check_request_epoch(struct request *req)
{
- if (before(req->rq.epoch, sys->epoch)) {
+ if (before(req->rq.epoch, sys->cinfo.epoch)) {
sd_eprintf("old node version %u, %u (%s)",
- sys->epoch, req->rq.epoch, op_name(req->op));
+ sys->cinfo.epoch, req->rq.epoch, op_name(req->op));
/* Ask for sleeping req on requester's wait queue */
req->rp.result = SD_RES_OLD_NODE_VER;
- req->rp.epoch = sys->epoch;
+ req->rp.epoch = sys->cinfo.epoch;
put_request(req);
return -1;
- } else if (after(req->rq.epoch, sys->epoch)) {
+ } else if (after(req->rq.epoch, sys->cinfo.epoch)) {
sd_eprintf("new node version %u, %u (%s)",
- sys->epoch, req->rq.epoch, op_name(req->op));
+ sys->cinfo.epoch, req->rq.epoch, op_name(req->op));
/* Wait for local epoch to be lifted */
req->rp.result = SD_RES_NEW_NODE_VER;
sleep_on_wait_queue(req);
@@ -221,7 +221,7 @@ void wakeup_requests_on_epoch(void)
*/
assert(is_gateway_op(req->op));
sd_dprintf("gateway %"PRIx64, req->rq.obj.oid);
- req->rq.epoch = sys->epoch;
+ req->rq.epoch = sys->cinfo.epoch;
del_requeue_request(req);
break;
case SD_RES_NEW_NODE_VER:
@@ -390,13 +390,13 @@ static void queue_request(struct request *req)
if (is_peer_op(req->op)) {
queue_peer_request(req);
} else if (is_gateway_op(req->op)) {
- hdr->epoch = sys->epoch;
+ hdr->epoch = sys->cinfo.epoch;
queue_gateway_request(req);
} else if (is_local_op(req->op)) {
- hdr->epoch = sys->epoch;
+ hdr->epoch = sys->cinfo.epoch;
queue_local_request(req);
} else if (is_cluster_op(req->op)) {
- hdr->epoch = sys->epoch;
+ hdr->epoch = sys->cinfo.epoch;
queue_cluster_request(req);
} else {
sd_eprintf("unknown operation %d", hdr->opcode);
@@ -645,7 +645,7 @@ static void init_tx_hdr(struct client_info *ci)
/* use cpu_to_le */
memcpy(rsp, &req->rp, sizeof(*rsp));
- rsp->epoch = sys->epoch;
+ rsp->epoch = sys->cinfo.epoch;
rsp->opcode = req->rq.opcode;
rsp->id = req->rq.id;
}
diff --git a/sheep/sheep.c b/sheep/sheep.c
index 84bd269..3955c73 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -186,8 +186,8 @@ static void crash_handler(int signo)
reraise_crash_signal(signo, 1);
}
-static struct cluster_info __sys;
-struct cluster_info *sys = &__sys;
+static struct system_info __sys;
+struct system_info *sys = &__sys;
static void parse_arg(char *arg, const char *delim, void (*fn)(char *))
{
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index cdf8b7a..804aa1c 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -62,7 +62,7 @@ struct request {
struct work work;
};
-struct cluster_info {
+struct system_info {
struct cluster_driver *cdrv;
const char *cdrv_option;
@@ -70,15 +70,13 @@ struct cluster_info {
bool join_finished;
struct sd_node this_node;
- uint32_t epoch;
+ struct cluster_info cinfo;
uint32_t status;
- uint16_t flags;
uint64_t disk_space;
DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
- uint8_t nr_copies;
int local_req_efd;
pthread_mutex_t local_req_lock;
@@ -87,7 +85,6 @@ struct cluster_info {
int nr_outstanding_reqs;
bool gateway_only;
- bool disable_recovery;
bool nosync;
struct work_queue *gateway_wqueue;
@@ -200,7 +197,7 @@ static inline struct store_driver *find_store_driver(const char *name)
return NULL;
}
-extern struct cluster_info *sys;
+extern struct system_info *sys;
extern struct store_driver *sd_store;
extern char *obj_path;
extern char *epoch_path;
@@ -208,7 +205,7 @@ extern char *epoch_path;
/* One should call this function to get sys->epoch outside main thread */
static inline uint32_t sys_epoch(void)
{
- return uatomic_read(&sys->epoch);
+ return uatomic_read(&sys->cinfo.epoch);
}
static inline bool is_aligned_to_pagesize(void *p)
diff --git a/sheep/store.c b/sheep/store.c
index a804d0d..f233f35 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -270,10 +270,10 @@ int init_store_driver(bool is_gateway)
char driver_name[STORE_LEN], *p;
int ret;
- memset(driver_name, '\0', sizeof(driver_name));
- ret = get_cluster_store(driver_name);
+ ret = get_cluster_store((char *)sys->cinfo.store);
if (ret != SD_RES_SUCCESS)
return ret;
+ pstrcpy(driver_name, sizeof(driver_name), (char *)sys->cinfo.store);
p = memchr(driver_name, '\0', STORE_LEN);
if (!p) {
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 096244a..41e451e 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -134,7 +134,7 @@ int get_max_copy_number(void)
int nr_copies = uatomic_read(&max_copies);
if (nr_copies == 0)
- nr_copies = sys->nr_copies;
+ nr_copies = sys->cinfo.nr_copies;
return nr_copies;
}
--
1.7.9.5
More information about the sheepdog
mailing list