[Sheepdog] [PATCH v2 6/7] sheep: remove node_list
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Thu Oct 20 09:24:31 CEST 2011
join_handler() and leave_handler() now receive the current node list,
which can be used directly for consistent hashing, so we no longer need
to manage node lists in sheep/group.c.
With this patch, 'collie node list' no longer shows the master node.
I think this is acceptable because there may be no master node when a
cluster driver other than corosync is used.
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
sheep/group.c | 297 ++++++++++++++--------------------------------------
sheep/sheep_priv.h | 11 +--
2 files changed, 80 insertions(+), 228 deletions(-)
diff --git a/sheep/group.c b/sheep/group.c
index e965be9..f76dc01 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -85,16 +85,16 @@ struct work_leave {
struct sheepdog_node_list_entry left;
};
-#define print_node_list(node_list) \
+#define print_node_list(nodes, nr_nodes) \
({ \
- struct node *__node; \
char __name[128]; \
- list_for_each_entry(__node, node_list, list) { \
+ int __i; \
+ for (__i = 0; __i < (nr_nodes); __i++) { \
dprintf("%c ip: %s, port: %d\n", \
- is_myself(__node->ent.addr, __node->ent.port) ? 'l' : ' ', \
+ is_myself(nodes[__i].addr, nodes[__i].port) ? 'l' : ' ', \
addr_to_str(__name, sizeof(__name), \
- __node->ent.addr, __node->ent.port), \
- __node->ent.port); \
+ nodes[__i].addr, nodes[__i].port), \
+ nodes[__i].port); \
} \
})
@@ -110,65 +110,30 @@ static int get_node_idx(struct sheepdog_node_list_entry *ent,
return ent - entries;
}
-static void build_node_list(struct list_head *node_list,
- struct sheepdog_node_list_entry *entries,
- int *nr_nodes, int *nr_zones)
+static int get_zones_nr_from(struct sheepdog_node_list_entry *nodes, int nr_nodes)
{
- struct node *node;
- int nr = 0, i;
+ int nr_zones = 0, i, j;
uint32_t zones[SD_MAX_REDUNDANCY];
- if (nr_zones)
- *nr_zones = 0;
-
- list_for_each_entry(node, node_list, list) {
- if (entries)
- memcpy(entries + nr, &node->ent, sizeof(*entries));
- nr++;
-
- if (nr_zones && *nr_zones < ARRAY_SIZE(zones)) {
- for (i = 0; i < *nr_zones; i++) {
- if (zones[i] == node->ent.zone)
- break;
- }
- if (i == *nr_zones)
- zones[(*nr_zones)++] = node->ent.zone;
+ for (i = 0; i < nr_nodes; i++) {
+ for (j = 0; j < nr_zones; j++) {
+ if (nodes[i].zone == zones[j])
+ break;
}
- }
- if (entries)
- qsort(entries, nr, sizeof(*entries), node_cmp);
- if (nr_nodes)
- *nr_nodes = nr;
-}
+ if (j == nr_zones)
+ zones[nr_zones++] = nodes[i].zone;
-static int get_zones_nr_from(struct list_head *list)
-{
- int nr_dummpy, nr_zones;
- struct sheepdog_node_list_entry nodes[SD_MAX_NODES];
+ if (nr_zones == ARRAY_SIZE(zones))
+ break;
+ }
- build_node_list(list, nodes, &nr_dummpy, &nr_zones);
return nr_zones;
}
-int get_ordered_sd_node_list(struct sheepdog_node_list_entry *entries)
-{
- int nr_nodes;
-
- build_node_list(&sys->sd_node_list, entries, &nr_nodes, NULL);
-
- return nr_nodes;
-}
-
void get_ordered_sd_vnode_list(struct sheepdog_vnode_list_entry *entries,
int *nr_vnodes, int *nr_zones)
{
- struct sheepdog_node_list_entry nodes[SD_MAX_NODES];
- int nr;
-
- build_node_list(&sys->sd_node_list, nodes, &nr, nr_zones);
-
- if (sys->nr_vnodes == 0)
- sys->nr_vnodes = nodes_to_vnodes(nodes, nr, sys->vnodes);
+ *nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes);
memcpy(entries, sys->vnodes, sizeof(*entries) * sys->nr_vnodes);
@@ -184,19 +149,13 @@ static void get_node_list(struct sd_node_req *req,
struct sd_node_rsp *rsp, void *data)
{
int nr_nodes;
- struct node *node;
- nr_nodes = get_ordered_sd_node_list(data);
+ nr_nodes = sys->nr_nodes;
+ memcpy(data, sys->nodes, sizeof(*sys->nodes) * nr_nodes);
rsp->data_length = nr_nodes * sizeof(struct sheepdog_node_list_entry);
rsp->nr_nodes = nr_nodes;
rsp->local_idx = get_node_idx(&sys->this_node, data, nr_nodes);
-
- if (!nr_nodes) {
- rsp->master_idx = -1;
- return;
- }
- node = list_first_entry(&sys->sd_node_list, struct node, list);
- rsp->master_idx = get_node_idx(&node->ent, data, nr_nodes);
+ rsp->master_idx = -1;
}
static int get_epoch(struct sd_obj_req *req,
@@ -332,19 +291,6 @@ out:
exit(1);
}
-static struct node *find_node(struct list_head *node_list,
- struct sheepdog_node_list_entry *ent)
-{
- struct node *node;
-
- list_for_each_entry(node, node_list, list) {
- if (node_cmp(&node->ent, ent) == 0)
- return node;
- }
-
- return NULL;
-}
-
static inline int get_nodes_nr_from(struct list_head *l)
{
struct node *node;
@@ -443,10 +389,9 @@ static int get_cluster_status(struct sheepdog_node_list_entry *from,
int nr_entries, uint64_t ctime, uint32_t epoch,
uint32_t *status, uint8_t *inc_epoch)
{
- int i, ret = SD_RES_SUCCESS;
+ int i, j, ret = SD_RES_SUCCESS;
int nr, nr_local_entries, nr_leave_entries;
struct sheepdog_node_list_entry local_entries[SD_MAX_NODES];
- struct node *node;
char str[256];
*status = sys->status;
@@ -468,7 +413,7 @@ static int get_cluster_status(struct sheepdog_node_list_entry *from,
ret = SD_RES_NOT_FORMATTED;
break;
case SD_STATUS_WAIT_FOR_JOIN:
- nr = get_nodes_nr_from(&sys->sd_node_list) + 1;
+ nr = sys->nr_nodes + 1;
nr_local_entries = epoch_log_read_nr(epoch, (char *)local_entries,
sizeof(local_entries));
@@ -488,8 +433,8 @@ static int get_cluster_status(struct sheepdog_node_list_entry *from,
for (i = 0; i < nr_local_entries; i++) {
if (node_cmp(local_entries + i, from) == 0)
goto next;
- list_for_each_entry(node, &sys->sd_node_list, list) {
- if (node_cmp(local_entries + i, &node->ent) == 0)
+ for (j = 0; j < sys->nr_nodes; j++) {
+ if (node_cmp(local_entries + i, sys->nodes + j) == 0)
goto next;
}
break;
@@ -579,46 +524,20 @@ out:
static void get_vdi_bitmap_from_sd_list(void)
{
- int i, nr_nodes;
+ int i;
/* fixme: we need this until starting up. */
- struct sheepdog_node_list_entry nodes[SD_MAX_NODES];
-
- /*
- * we don't need the proper order but this is the simplest
- * way.
- */
- nr_nodes = get_ordered_sd_node_list(nodes);
- for (i = 0; i < nr_nodes; i++)
- get_vdi_bitmap_from(&nodes[i]);
-}
-
-static int move_node_to_sd_list(struct sheepdog_node_list_entry ent)
-{
- struct node *node;
-
- node = zalloc(sizeof(*node));
- if (!node)
- panic("failed to alloc memory for a new node\n");
-
- node->ent = ent;
-
- list_add_tail(&node->list, &sys->sd_node_list);
- sys->nr_vnodes = 0;
-
- return 0;
+ for (i = 0; i < sys->nr_nodes; i++)
+ get_vdi_bitmap_from(sys->nodes + i);
}
static int update_epoch_log(int epoch)
{
- int ret, nr_nodes;
- struct sheepdog_node_list_entry entry[SD_MAX_NODES];
-
- nr_nodes = get_ordered_sd_node_list(entry);
+ int ret;
- dprintf("update epoch, %d, %d\n", epoch, nr_nodes);
- ret = epoch_log_write(epoch, (char *)entry,
- nr_nodes * sizeof(struct sheepdog_node_list_entry));
+ dprintf("update epoch, %d, %d\n", epoch, sys->nr_nodes);
+ ret = epoch_log_write(epoch, (char *)sys->nodes,
+ sys->nr_nodes * sizeof(struct sheepdog_node_list_entry));
if (ret < 0)
eprintf("can't write epoch %u\n", epoch);
@@ -630,7 +549,7 @@ static void update_cluster_info(struct join_message *msg,
size_t nr_nodes)
{
int i, le;
- int ret, nr_leave_nodes;
+ int nr_leave_nodes;
struct node *n;
eprintf("status = %d, epoch = %d, %x, %d\n", msg->cluster_status, msg->epoch, msg->result, sys->join_finished);
@@ -643,20 +562,15 @@ static void update_cluster_info(struct join_message *msg,
sys->nr_sobjs = msg->nr_sobjs;
sys->epoch = msg->epoch;
- sys->flags = msg->cluster_status;
+ /* add nodes except for the newly joined one */
for (i = 0; i < nr_nodes; i++) {
if (node_cmp(nodes + i, &msg->header.from) == 0)
continue;
- ret = move_node_to_sd_list(nodes[i]);
- /*
- * the node belonged to sheepdog when the master build
- * the JOIN response however it has gone.
- */
- if (ret)
- vprintf(SDOG_INFO, "%s has gone\n",
- node_to_str(&nodes[i]));
+
+ sys->nodes[sys->nr_nodes++] = nodes[i];
}
+ qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp);
if (msg->cluster_status != SD_STATUS_OK) {
nr_leave_nodes = msg->nr_leave_nodes;
@@ -685,15 +599,10 @@ static void update_cluster_info(struct join_message *msg,
update_epoch_log(sys->epoch);
join_finished:
- ret = move_node_to_sd_list(msg->header.from);
- /*
- * this should not happen since __sd_deliver() checks if the
- * host from msg on cpg_node_list.
- */
- if (ret)
- vprintf(SDOG_ERR, "%s has gone\n",
- node_to_str(&msg->header.from));
-
+ sys->nodes[sys->nr_nodes++] = msg->header.from;
+ qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp);
+ sys->nr_vnodes = nodes_to_vnodes(sys->nodes, sys->nr_nodes,
+ sys->vnodes);
if (msg->cluster_status == SD_STATUS_OK ||
msg->cluster_status == SD_STATUS_HALT) {
if (msg->inc_epoch) {
@@ -710,7 +619,7 @@ join_finished:
}
}
- print_node_list(&sys->sd_node_list);
+ print_node_list(sys->nodes, sys->nr_nodes);
sys->status = msg->cluster_status;
return;
@@ -804,8 +713,7 @@ static void __sd_notify_done(struct cpg_event *cevent)
void *data = msg->data;
struct request *req;
int ret = msg->rsp.result;
- int i, latest_epoch, nr_nodes;
- struct sheepdog_node_list_entry entry[SD_MAX_NODES];
+ int i, latest_epoch;
uint64_t ctime;
if (ret != SD_RES_SUCCESS)
@@ -842,11 +750,10 @@ static void __sd_notify_done(struct cpg_event *cevent)
sys->epoch = 1;
sys->recovered_epoch = 1;
- nr_nodes = get_ordered_sd_node_list(entry);
- dprintf("write epoch log, %d, %d\n", sys->epoch, nr_nodes);
- ret = epoch_log_write(sys->epoch, (char *)entry,
- nr_nodes * sizeof(struct sheepdog_node_list_entry));
+ dprintf("write epoch log, %d, %d\n", sys->epoch, sys->nr_nodes);
+ ret = epoch_log_write(sys->epoch, (char *)sys->nodes,
+ sys->nr_nodes * sizeof(struct sheepdog_node_list_entry));
if (ret < 0)
eprintf("can't write epoch %u\n", sys->epoch);
update_epoch_store(sys->epoch);
@@ -857,7 +764,7 @@ static void __sd_notify_done(struct cpg_event *cevent)
if (sys_flag_nohalt())
sys->status = SD_STATUS_OK;
else {
- int nr_zones = get_zones_nr_from(&sys->sd_node_list);
+ int nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes);
if (nr_zones >= sys->nr_sobjs)
sys->status = SD_STATUS_OK;
@@ -920,66 +827,14 @@ static void sd_notify_handler(struct sheepdog_node_list_entry *sender,
start_cpg_event_work();
}
-static void add_node(struct sheepdog_node_list_entry *ent)
-{
- struct node *node;
-
- node = zalloc(sizeof(*node));
- if (!node)
- panic("failed to alloc memory for a new node\n");
-
- node->ent = *ent;
-
- list_add_tail(&node->list, &sys->cpg_node_list);
-}
-
-static int del_node(struct sheepdog_node_list_entry *ent)
-{
- struct node *node;
-
- node = find_node(&sys->sd_node_list, ent);
- if (node) {
- int nr;
- struct sheepdog_node_list_entry e[SD_MAX_NODES];
-
- sys->nr_vnodes = 0;
-
- list_del(&node->list);
- free(node);
-
- if (sys->status == SD_STATUS_OK ||
- sys->status == SD_STATUS_HALT) {
- nr = get_ordered_sd_node_list(e);
- dprintf("update epoch, %d, %d\n", sys->epoch + 1, nr);
- epoch_log_write(sys->epoch + 1, (char *)e,
- nr * sizeof(struct sheepdog_node_list_entry));
-
- sys->epoch++;
-
- update_epoch_store(sys->epoch);
- }
- return 1;
- }
-
- node = find_node(&sys->cpg_node_list, ent);
- if (node) {
- list_del(&node->list);
- free(node);
- }
-
- return 0;
-}
-
/*
* Check whether the majority of Sheepdog nodes are still alive or not
*/
-static int check_majority(struct sheepdog_node_list_entry *left)
+static int check_majority(struct sheepdog_node_list_entry *nodes, int nr_nodes)
{
- int nr_nodes = 0, nr_majority, nr_reachable = 0, fd;
- struct node *node;
+ int nr_majority, nr_reachable = 0, fd, i;
char name[INET6_ADDRSTRLEN];
- nr_nodes = get_nodes_nr_from(&sys->sd_node_list);
nr_majority = nr_nodes / 2 + 1;
/* we need at least 3 nodes to handle network partition
@@ -987,12 +842,9 @@ static int check_majority(struct sheepdog_node_list_entry *left)
if (nr_nodes < 3)
return 1;
- list_for_each_entry(node, &sys->sd_node_list, list) {
- if (node_cmp(&node->ent, left) == 0)
- continue;
-
- addr_to_str(name, sizeof(name), node->ent.addr, 0);
- fd = connect_to(name, node->ent.port);
+ for (i = 0; i < nr_nodes; i++) {
+ addr_to_str(name, sizeof(name), nodes[i].addr, 0);
+ fd = connect_to(name, nodes[i].port);
if (fd < 0)
continue;
@@ -1029,7 +881,7 @@ static void __sd_leave(struct cpg_event *cevent)
{
struct work_leave *w = container_of(cevent, struct work_leave, cev);
- if (!check_majority(&w->left)) {
+ if (!check_majority(w->member_list, w->member_list_entries)) {
eprintf("perhaps network partition failure has occurred\n");
abort();
}
@@ -1082,7 +934,6 @@ static enum cluster_join_result sd_check_join_cb(
jm->leave_nodes[jm->nr_leave_nodes] = node->ent;
jm->nr_leave_nodes++;
}
- print_node_list(&sys->leave_list);
} else if (jm->result != SD_RES_SUCCESS &&
jm->epoch > sys->epoch &&
jm->cluster_status == SD_STATUS_WAIT_FOR_JOIN) {
@@ -1130,7 +981,6 @@ static void __sd_join_done(struct cpg_event *cevent)
struct work_join *w = container_of(cevent, struct work_join, cev);
struct join_message *jm = &w->jm;
struct node *node, *t;
- int i;
if (w->member_list_entries == 1 &&
node_cmp(&w->joined, &sys->this_node) == 0) {
@@ -1138,13 +988,7 @@ static void __sd_join_done(struct cpg_event *cevent)
get_global_nr_copies(&sys->nr_sobjs);
}
- if (list_empty(&sys->cpg_node_list)) {
- for (i = 0; i < w->member_list_entries; i++)
- add_node(w->member_list + i);
- } else
- add_node(&w->joined);
-
- print_node_list(&sys->sd_node_list);
+ print_node_list(sys->nodes, sys->nr_nodes);
update_cluster_info(jm, w->member_list, w->member_list_entries);
@@ -1156,7 +1000,7 @@ static void __sd_join_done(struct cpg_event *cevent)
}
if (sys->status == SD_STATUS_HALT) {
- int nr_zones = get_zones_nr_from(&sys->sd_node_list);
+ int nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes);
if (nr_zones >= sys->nr_sobjs)
sys->status = SD_STATUS_OK;
@@ -1171,18 +1015,30 @@ int sys_flag_nohalt()
static void __sd_leave_done(struct cpg_event *cevent)
{
struct work_leave *w = container_of(cevent, struct work_leave, cev);
- int node_left;
- node_left = del_node(&w->left);
+ sys->nr_nodes = w->member_list_entries;
+ memcpy(sys->nodes, w->member_list, sizeof(*sys->nodes) * sys->nr_nodes);
+ qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp);
+ sys->nr_vnodes = nodes_to_vnodes(sys->nodes, sys->nr_nodes,
+ sys->vnodes);
+ if (sys->status == SD_STATUS_OK ||
+ sys->status == SD_STATUS_HALT) {
+ dprintf("update epoch, %d, %d\n", sys->epoch + 1, sys->nr_nodes);
+ epoch_log_write(sys->epoch + 1, (char *)sys->nodes,
+ sizeof(*sys->nodes) * sys->nr_nodes);
+
+ sys->epoch++;
+
+ update_epoch_store(sys->epoch);
+ }
- print_node_list(&sys->sd_node_list);
+ print_node_list(sys->nodes, sys->nr_nodes);
- if (node_left &&
- (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT))
+ if (sys->status == SD_STATUS_OK || sys->status == SD_STATUS_HALT)
start_recovery(sys->epoch);
if (sys->status == SD_STATUS_OK && !sys_flag_nohalt()) {
- int nr_zones = get_zones_nr_from(&sys->sd_node_list);
+ int nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes);
if (nr_zones < sys->nr_sobjs)
sys->status = SD_STATUS_HALT;
@@ -1583,7 +1439,10 @@ static void sd_join_handler(struct sheepdog_node_list_entry *joined,
*/
if (!sys->join_finished) {
sys->join_finished = 1;
- move_node_to_sd_list(sys->this_node);
+ sys->nodes[sys->nr_nodes++] = sys->this_node;
+ qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp);
+ sys->nr_vnodes = nodes_to_vnodes(sys->nodes, sys->nr_nodes,
+ sys->vnodes);
sys->epoch = get_latest_epoch();
}
@@ -1686,8 +1545,6 @@ int create_cluster(int port, int64_t zone)
sys->status = SD_STATUS_WAIT_FOR_FORMAT;
else
sys->status = SD_STATUS_WAIT_FOR_JOIN;
- INIT_LIST_HEAD(&sys->sd_node_list);
- INIT_LIST_HEAD(&sys->cpg_node_list);
INIT_LIST_HEAD(&sys->pending_list);
INIT_LIST_HEAD(&sys->leave_list);
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 0ada007..130515a 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -110,18 +110,14 @@ struct cluster_info {
uint32_t status;
uint16_t flags;
- /*
- * we add a node to cpg_node_list in confchg then move it to
- * sd_node_list when the node joins sheepdog.
- */
- struct list_head cpg_node_list;
- struct list_head sd_node_list;
-
/* leave list is only used to account for bad nodes when we start
* up the cluster nodes after we shutdown the cluster through collie.
*/
struct list_head leave_list;
+ struct sheepdog_node_list_entry nodes[SD_MAX_NODES];
+ int nr_nodes;
+
/* this array contains a list of ordered virtual nodes */
struct sheepdog_vnode_list_entry vnodes[SD_MAX_VNODES];
int nr_vnodes;
@@ -176,7 +172,6 @@ int get_vdi_attr(uint32_t epoch, struct sheepdog_vdi_attr *vattr, int data_len,
uint32_t vid, uint32_t *attrid, int copies, uint64_t ctime,
int write, int excl, int delete);
-int get_ordered_sd_node_list(struct sheepdog_node_list_entry *entries);
void setup_ordered_sd_vnode_list(struct request *req);
void get_ordered_sd_vnode_list(struct sheepdog_vnode_list_entry *entries,
int *nr_vnodes, int *nr_zones);
--
1.7.2.5
More information about the sheepdog
mailing list