[Sheepdog] [PATCH] fix master selection race
FUJITA Tomonori
fujita.tomonori at lab.ntt.co.jp
Thu Apr 8 13:03:08 CEST 2010
We wrongly assume that the node that joins corosync first also sends
Sheepdog's JOIN message before the other nodes do. As a result, we hit
a bug where two nodes are temporarily both the master node.
This patch makes sure that the node that joins corosync first will
always be the master.
Signed-off-by: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>
---
collie/collie.h | 2 +
collie/group.c | 94 +++++++++++++++++++++++++++++++++++++++++++++---------
2 files changed, 80 insertions(+), 16 deletions(-)
diff --git a/collie/collie.h b/collie/collie.h
index fac6809..048cc7b 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -74,6 +74,8 @@ struct cluster_info {
DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
int nr_sobjs;
+
+ struct list_head work_deliver_siblings;
};
struct cluster_info *sys;
diff --git a/collie/group.c b/collie/group.c
index d659485..56e0fe9 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -71,6 +71,7 @@ struct work_deliver {
struct message_header *msg;
struct work work;
+ struct list_head work_deliver_list;
};
struct work_confch {
@@ -402,9 +403,6 @@ static void join(struct join_message *msg)
if (!sys->synchronized)
return;
- if (!is_master())
- return;
-
if (msg->nr_sobjs)
sys->nr_sobjs = msg->nr_sobjs;
@@ -414,12 +412,8 @@ static void join(struct join_message *msg)
msg->epoch = sys->epoch;
else
msg->epoch = 0;
- list_for_each_entry(node, &sys->cpg_node_list, list) {
- if (node->nodeid == msg->nodeid && node->pid == msg->pid)
- continue;
- if (node->ent.id == 0)
- continue;
+ list_for_each_entry(node, &sys->sd_node_list, list) {
msg->nodes[msg->nr_nodes].nodeid = node->nodeid;
msg->nodes[msg->nr_nodes].pid = node->pid;
msg->nodes[msg->nr_nodes].ent = node->ent;
@@ -747,6 +741,8 @@ static void __sd_deliver(struct work *work, int idx)
break;
}
+ vprintf(SDOG_DEBUG "will send\n");
+
m->done = 1;
send_message(sys->handle, m);
} else {
@@ -766,8 +762,23 @@ static void __sd_deliver(struct work *work, int idx)
static void __sd_deliver_done(struct work *work, int idx)
{
- struct work_deliver *w = container_of(work, struct work_deliver, work);
- struct message_header *m = w->msg;
+ struct work_deliver *w, *n = NULL;
+ struct message_header *m;
+
+ w = container_of(work, struct work_deliver, work);
+ m = w->msg;
+
+ list_del(&w->work_deliver_list);
+
+ /*
+ * When I finished one message, if I have pending messages, I
+ * need to perform the first of them now.
+ */
+ if (m->done && !list_empty(&sys->work_deliver_siblings)) {
+
+ n = list_first_entry(&sys->work_deliver_siblings,
+ struct work_deliver, work_deliver_list);
+ }
/*
* FIXME: we want to recover only after all nodes are fully
@@ -779,6 +790,9 @@ static void __sd_deliver_done(struct work *work, int idx)
free(w->msg);
free(w);
+
+ if (n)
+ queue_work(dobj_queue, &n->work);
}
static void sd_deliver(cpg_handle_t handle, const struct cpg_name *group_name,
@@ -800,12 +814,34 @@ static void sd_deliver(cpg_handle_t handle, const struct cpg_name *group_name,
if (!w->msg)
return;
memcpy(w->msg, msg, msg_len);
+ INIT_LIST_HEAD(&w->work_deliver_list);
w->work.fn = __sd_deliver;
w->work.done = __sd_deliver_done;
- if (m->op == SD_MSG_JOIN)
- w->work.attr = WORK_ORDERED;
+ if (is_master()) {
+ if (!m->done) {
+ int run = 0;
+
+ /*
+ * I can broadcast this message if there is no
+ * outstanding messages.
+ */
+ if (list_empty(&sys->work_deliver_siblings))
+ run = 1;
+
+ list_add_tail(&w->work_deliver_list,
+ &sys->work_deliver_siblings);
+ if (run) {
+ vprintf(SDOG_DEBUG "%u\n", pid);
+ queue_work(dobj_queue, &w->work);
+ } else
+ vprintf(SDOG_DEBUG "%u\n", pid);
+
+ return;
+ }
+ } else if (m->op == SD_MSG_JOIN)
+ w->work.attr = WORK_ORDERED;
queue_work(dobj_queue, &w->work);
}
@@ -815,6 +851,7 @@ static void __sd_confch(struct work *work, int idx)
struct work_confch *w = container_of(work, struct work_confch, work);
struct node *node;
int i;
+ int init = 0;
const struct cpg_address *member_list = w->member_list;
size_t member_list_entries = w->member_list_entries;
@@ -825,8 +862,10 @@ static void __sd_confch(struct work *work, int idx)
if (member_list_entries == joined_list_entries - left_list_entries &&
sys->this_nodeid == member_list[0].nodeid &&
- sys->this_pid == member_list[0].pid)
+ sys->this_pid == member_list[0].pid){
sys->synchronized = 1;
+ init = 1;
+ }
if (list_empty(&sys->cpg_node_list)) {
for (i = 0; i < member_list_entries; i++)
@@ -865,6 +904,28 @@ static void __sd_confch(struct work *work, int idx)
}
}
+ if (init) {
+ struct join_message msg;
+
+ /*
+ * If I'm the first collie to join corosync, I
+ * become the master without sending JOIN.
+ */
+
+ vprintf(SDOG_DEBUG "%d %x\n", sys->this_pid, sys->this_nodeid);
+
+ memset(&msg, 0, sizeof(msg));
+
+ msg.header.from = sys->this_node;
+ msg.nodeid = sys->this_nodeid;
+ msg.pid = sys->this_pid;
+ msg.cluster_status = get_cluster_status(&msg.header.from);
+
+ update_cluster_info(&msg);
+
+ return;
+ }
+
for (i = 0; i < joined_list_entries; i++) {
if (sys->this_nodeid == joined_list[i].nodeid &&
sys->this_pid == joined_list[i].pid) {
@@ -993,7 +1054,7 @@ int build_node_list(struct list_head *node_list,
return nr;
}
-static void set_addr(unsigned int nodeid)
+static void set_addr(unsigned int nodeid, int port)
{
int ret, nr;
corosync_cfg_handle_t handle;
@@ -1036,7 +1097,7 @@ static void set_addr(unsigned int nodeid)
inet_ntop(ss->ss_family, saddr, tmp, sizeof(tmp));
- vprintf(SDOG_INFO "addr = %s\n", tmp);
+ vprintf(SDOG_INFO "addr = %s, port = %d\n", tmp, port);
}
int create_cluster(int port)
@@ -1082,7 +1143,7 @@ join_retry:
sys->this_nodeid = nodeid;
sys->this_pid = getpid();
- set_addr(nodeid);
+ set_addr(nodeid, port);
sys->this_node.port = port;
ret = get_nodeid(&sys->this_node.id);
@@ -1104,6 +1165,7 @@ join_retry:
INIT_LIST_HEAD(&sys->cpg_node_list);
INIT_LIST_HEAD(&sys->vm_list);
INIT_LIST_HEAD(&sys->pending_list);
+ INIT_LIST_HEAD(&sys->work_deliver_siblings);
cpg_context_set(cpg_handle, sys);
cpg_fd_get(cpg_handle, &fd);
--
1.7.0
More information about the sheepdog
mailing list