[Sheepdog] [PATCH 02/18] fix node list bug
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Thu Mar 11 07:48:01 CET 2010
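The node list needs to be split into a sheepdog node list and a corosync node
list, because corosync leave messages may arrive while sheepdog is still
processing join messages, which would corrupt the node list information.
With this change, cpg_node_list follows the corosync membership changes, while
sd_node_list holds only the nodes whose join messages have been processed.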
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
collie/collie.h | 3 +-
collie/group.c | 114 ++++++++++++++++++++++++++++++++++++++++---------------
collie/store.c | 12 +++---
collie/vdi.c | 6 +-
4 files changed, 94 insertions(+), 41 deletions(-)
diff --git a/collie/collie.h b/collie/collie.h
index 098d4c9..d24a6c4 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -63,7 +63,8 @@ struct cluster_info {
uint32_t epoch;
- struct list_head node_list;
+ struct list_head cpg_node_list;
+ struct list_head sd_node_list;
int node_list_idx;
struct list_head vm_list;
struct list_head pending_list;
diff --git a/collie/group.c b/collie/group.c
index c3508e8..9b0fde3 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -135,16 +135,16 @@ static void get_node_list(struct sd_node_req *req,
int nr_nodes;
struct node *node;
- nr_nodes = build_node_list(&sys->node_list, data);
+ nr_nodes = build_node_list(&sys->sd_node_list, data);
rsp->data_length = nr_nodes * sizeof(struct sheepdog_node_list_entry);
rsp->nr_nodes = nr_nodes;
rsp->local_idx = get_node_idx(&sys->this_node, data, nr_nodes);
- if (list_empty(&sys->node_list)) {
+ if (list_empty(&sys->sd_node_list)) {
rsp->master_idx = -1;
return;
}
- node = list_first_entry(&sys->node_list, struct node, list);
+ node = list_first_entry(&sys->sd_node_list, struct node, list);
rsp->master_idx = get_node_idx(&node->ent, data, nr_nodes);
}
@@ -228,11 +228,11 @@ static void group_handler(int listen_fd, int events, void *data)
cpg_dispatch(sys->handle, CPG_DISPATCH_ALL);
}
-static void print_node_list(void)
+static void print_node_list(struct list_head *node_list)
{
struct node *node;
char name[128];
- list_for_each_entry(node, &sys->node_list, list) {
+ list_for_each_entry(node, node_list, list) {
dprintf("%c nodeid: %x, pid: %d, ip: %s\n",
node_cmp(&node->ent, &sys->this_node) ? ' ' : 'l',
node->nodeid, node->pid,
@@ -240,7 +240,7 @@ static void print_node_list(void)
}
}
-static void add_node(uint32_t nodeid, uint32_t pid,
+static void add_node(struct list_head *node_list, uint32_t nodeid, uint32_t pid,
struct sheepdog_node_list_entry *sd_ent)
{
struct node *node;
@@ -252,8 +252,21 @@ static void add_node(uint32_t nodeid, uint32_t pid,
}
node->nodeid = nodeid;
node->pid = pid;
- node->ent = *sd_ent;
- list_add_tail(&node->list, &sys->node_list);
+ if (sd_ent)
+ node->ent = *sd_ent;
+ list_add_tail(&node->list, node_list);
+}
+
+static struct node *find_node(struct list_head *node_list, uint32_t nodeid, uint32_t pid)
+{
+ struct node *node;
+
+ list_for_each_entry(node, node_list, list) {
+ if (node->nodeid == nodeid && node->pid == pid)
+ return node;
+ }
+
+ return NULL;
}
static int is_master(void)
@@ -263,10 +276,10 @@ static int is_master(void)
if (!sys->synchronized)
return 0;
- if (list_empty(&sys->node_list))
+ if (list_empty(&sys->sd_node_list))
return 1;
- node = list_first_entry(&sys->node_list, struct node, list);
+ node = list_first_entry(&sys->sd_node_list, struct node, list);
if (node_cmp(&node->ent, &sys->this_node) == 0)
return 1;
@@ -288,7 +301,12 @@ static void join(struct join_message *msg)
msg->epoch = sys->epoch;
msg->nr_sobjs = sys->nr_sobjs;
- list_for_each_entry(node, &sys->node_list, list) {
+ list_for_each_entry(node, &sys->cpg_node_list, list) {
+ if (node->nodeid == msg->nodeid && node->pid == msg->pid)
+ continue;
+ if (node->ent.id == 0)
+ continue;
+
msg->nodes[msg->nr_nodes].nodeid = node->nodeid;
msg->nodes[msg->nr_nodes].pid = node->pid;
msg->nodes[msg->nr_nodes].ent = node->ent;
@@ -309,20 +327,29 @@ static void update_cluster_info(struct join_message *msg)
if (sys->synchronized)
goto out;
- list_for_each_entry_safe(node, e, &sys->node_list, list) {
+ list_for_each_entry_safe(node, e, &sys->sd_node_list, list) {
list_del(&node->list);
free(node);
}
- INIT_LIST_HEAD(&sys->node_list);
- for (i = 0; i < nr_nodes; i++)
- add_node(msg->nodes[i].nodeid, msg->nodes[i].pid,
+ INIT_LIST_HEAD(&sys->sd_node_list);
+ for (i = 0; i < nr_nodes; i++) {
+ node = find_node(&sys->cpg_node_list, msg->nodes[i].nodeid,
+ msg->nodes[i].pid);
+ if (!node)
+ continue;
+
+ if (!node->ent.id)
+ node->ent = msg->nodes[i].ent;
+
+ add_node(&sys->sd_node_list, msg->nodes[i].nodeid, msg->nodes[i].pid,
&msg->nodes[i].ent);
+ }
sys->epoch = msg->epoch;
sys->synchronized = 1;
- nr_nodes = build_node_list(&sys->node_list, entry);
+ nr_nodes = build_node_list(&sys->sd_node_list, entry);
ret = epoch_log_write(sys->epoch, (char *)entry,
nr_nodes * sizeof(struct sheepdog_node_list_entry));
@@ -332,9 +359,9 @@ static void update_cluster_info(struct join_message *msg)
/* we are ready for object operations */
update_epoch_store(sys->epoch);
out:
- add_node(msg->nodeid, msg->pid, &msg->header.from);
+ add_node(&sys->sd_node_list, msg->nodeid, msg->pid, &msg->header.from);
- nr_nodes = build_node_list(&sys->node_list, entry);
+ nr_nodes = build_node_list(&sys->sd_node_list, entry);
ret = epoch_log_write(sys->epoch + 1, (char *)entry,
nr_nodes * sizeof(struct sheepdog_node_list_entry));
@@ -345,7 +372,7 @@ out:
update_epoch_store(sys->epoch);
- print_node_list();
+ print_node_list(&sys->sd_node_list);
}
static void vdi_op(struct vdi_op_message *msg)
@@ -455,11 +482,26 @@ static void __sd_deliver(struct work *work, int idx)
struct work_deliver *w = container_of(work, struct work_deliver, work);
struct message_header *m = w->msg;
char name[128];
+ struct node *node;
dprintf("op: %d, done: %d, size: %d, from: %s\n",
m->op, m->done, m->msg_length,
addr_to_str(name, sizeof(name), m->from.addr, m->from.port));
+ if (m->op == SD_MSG_JOIN) {
+ uint32_t nodeid = ((struct join_message *)m)->nodeid;
+ uint32_t pid = ((struct join_message *)m)->pid;
+
+ node = find_node(&sys->cpg_node_list, nodeid, pid);
+ if (!node) {
+ dprintf("the node was left before join operation is finished\n");
+ return;
+ }
+
+ if (!node->ent.id)
+ node->ent = m->from;
+ }
+
if (!m->done) {
if (!is_master())
return;
@@ -544,7 +586,7 @@ static void sd_deliver(cpg_handle_t handle, const struct cpg_name *group_name,
static void __sd_confch(struct work *work, int idx)
{
struct work_confch *w = container_of(work, struct work_confch, work);
- struct node *node, *e;
+ struct node *node;
int i;
const struct cpg_address *member_list = w->member_list;
@@ -559,22 +601,31 @@ static void __sd_confch(struct work *work, int idx)
sys->this_pid == member_list[0].pid)
sys->synchronized = 1;
+ if (list_empty(&sys->cpg_node_list)) {
+ for (i = 0; i < member_list_entries; i++)
+ add_node(&sys->cpg_node_list, member_list[i].nodeid, member_list[i].pid, NULL);
+ } else {
+ for (i = 0; i < joined_list_entries; i++)
+ add_node(&sys->cpg_node_list, joined_list[i].nodeid, joined_list[i].pid, NULL);
+ }
+
for (i = 0; i < left_list_entries; i++) {
- list_for_each_entry_safe(node, e, &sys->node_list, list) {
+ node = find_node(&sys->cpg_node_list, left_list[i].nodeid, left_list[i].pid);
+ if (node) {
+ list_del(&node->list);
+ free(node);
+ } else
+ eprintf("System error\n");
+
+ node = find_node(&sys->sd_node_list, left_list[i].nodeid, left_list[i].pid);
+ if (node) {
int nr;
- unsigned pid;
struct sheepdog_node_list_entry e[SD_MAX_NODES];
- if (node->nodeid != left_list[i].nodeid ||
- node->pid != left_list[i].pid)
- continue;
-
- pid = node->pid;
-
list_del(&node->list);
free(node);
- nr = build_node_list(&sys->node_list, e);
+ nr = build_node_list(&sys->sd_node_list, e);
epoch_log_write(sys->epoch + 1, (char *)e,
nr * sizeof(struct sheepdog_node_list_entry));
@@ -608,7 +659,7 @@ static void __sd_confch(struct work *work, int idx)
if (left_list_entries == 0)
return;
- print_node_list();
+ print_node_list(&sys->sd_node_list);
}
static void __sd_confch_done(struct work *work, int idx)
@@ -808,7 +859,8 @@ join_retry:
sys->this_node.id = hval;
sys->synchronized = 0;
- INIT_LIST_HEAD(&sys->node_list);
+ INIT_LIST_HEAD(&sys->sd_node_list);
+ INIT_LIST_HEAD(&sys->cpg_node_list);
INIT_LIST_HEAD(&sys->vm_list);
INIT_LIST_HEAD(&sys->pending_list);
cpg_context_set(cpg_handle, sys);
diff --git a/collie/store.c b/collie/store.c
index 05b19c6..1c416d2 100644
--- a/collie/store.c
+++ b/collie/store.c
@@ -148,7 +148,7 @@ static int read_from_one(uint64_t oid,
e = zalloc(SD_MAX_NODES * sizeof(struct sheepdog_node_list_entry));
again:
- nr = build_node_list(&sys->node_list, e);
+ nr = build_node_list(&sys->sd_node_list, e);
for (i = 0; i < nr; i++) {
n = obj_to_sheep(e, nr, oid, i);
@@ -229,7 +229,7 @@ static int forward_obj_req(struct request *req, char *buf)
e = zalloc(SD_MAX_NODES * sizeof(struct sheepdog_node_list_entry));
again:
- nr = build_node_list(&sys->node_list, e);
+ nr = build_node_list(&sys->sd_node_list, e);
copies = hdr->copies;
@@ -340,7 +340,7 @@ static int is_my_obj(uint64_t oid, int copies)
int i, n, nr;
struct sheepdog_node_list_entry e[SD_MAX_NODES];
- nr = build_node_list(&sys->node_list, e);
+ nr = build_node_list(&sys->sd_node_list, e);
for (i = 0; i < copies; i++) {
n = obj_to_sheep(e, nr, oid, i);
@@ -538,7 +538,7 @@ void store_queue_request(struct work *work, int idx)
dprintf("%d, %x, %" PRIx64" , %u, %u\n", idx, opcode, oid, epoch, req_epoch);
- if (list_empty(&sys->node_list)) {
+ if (list_empty(&sys->sd_node_list)) {
/* we haven't got SD_OP_GET_NODE_LIST response yet. */
ret = SD_RES_SYSTEM_ERROR;
goto out;
@@ -732,7 +732,7 @@ void so_queue_request(struct work *work, int idx)
char oldname[1024];
uint16_t id = 0;
- if (list_empty(&sys->node_list)) {
+ if (list_empty(&sys->sd_node_list)) {
/* we haven't got SD_OP_GET_NODE_LIST response yet. */
result = SD_RES_SYSTEM_ERROR;
goto out;
@@ -748,7 +748,7 @@ void so_queue_request(struct work *work, int idx)
int local = 0;
e = zalloc(SD_MAX_NODES * sizeof(struct sheepdog_node_list_entry));
- nr = build_node_list(&sys->node_list, e);
+ nr = build_node_list(&sys->sd_node_list, e);
for (i = 0; i < sys->nr_sobjs; i++) {
n = obj_to_sheep(e, nr, SD_DIR_OID, i);
diff --git a/collie/vdi.c b/collie/vdi.c
index f30a14a..5904488 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -93,7 +93,7 @@ int add_vdi(char *name, int len, uint64_t size,
memset(&req, 0, sizeof(req));
- nr_nodes = build_node_list(&sys->node_list, entries);
+ nr_nodes = build_node_list(&sys->sd_node_list, entries);
dprintf("%s (%d) %" PRIu64 ", base: %" PRIu64 "\n", name, len, size,
base_oid);
@@ -149,7 +149,7 @@ int lookup_vdi(char *filename, uint64_t * oid, uint32_t tag, int do_lock,
memset(&req, 0, sizeof(req));
- nr_nodes = build_node_list(&sys->node_list, entries);
+ nr_nodes = build_node_list(&sys->sd_node_list, entries);
*current = 0;
@@ -196,7 +196,7 @@ int make_super_object(struct sd_vdi_req *hdr)
req.ctime = (uint64_t)tv.tv_sec << 32 | tv.tv_usec * 1000;
req.copies = ((struct sd_obj_req *)hdr)->copies;
- nr_nodes = build_node_list(&sys->node_list, entries);
+ nr_nodes = build_node_list(&sys->sd_node_list, entries);
ret = exec_reqs(entries, nr_nodes, sys->epoch,
SD_DIR_OID, (struct sd_req *)&req, NULL, 0, 0, req.copies,
--
1.5.6.5
More information about the sheepdog
mailing list