[sheepdog] [PATCH V4] sheep: remove unregister_event from process_event_queue()
Yunkai Zhang
yunkai.me at gmail.com
Thu May 17 13:47:15 CEST 2012
From: Yunkai Zhang <qiushu.zyk at taobao.com>
Changes in V4:
- move the updating of sys->status from update_cluster_info() to
__sd_join_done(), otherwise sys_stat_ok() will always return true in
__sd_join().
- update __sd_join() so that sheep can get at least one copy of the bitmap
from one of the *other* members when sys_stat_wait_format() returns true.
- rebased on upstream
-------------------------------------------------------------------- >8
In the old code, we call unregister_event(cdrv_fd, ...) in process_event_queue()
when sheep receives a cluster EVENT, and we register cdrv_fd into epoll again
in event_done() after __sd_xxx() has finished.
This is dangerous! In our testing, __sd_xxx() may for some reason be blocked
by a network issue; as a result event_done() would not be executed and cdrv_fd
would stay unregistered from epoll, so no newly arriving EVENT could be
processed immediately. This makes it hard for sheep to complete recovery.
Now we call update_cluster_info() in sd_xxx_handler() directly, so that we can
process each new EVENT immediately, one by one, without waiting for the previous
EVENT's __sd_xxx() to finish. So we can safely remove unregister_event() from
process_event_queue().
Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
---
sheep/group.c | 43 ++++++++++++++++++++-----------------------
1 files changed, 20 insertions(+), 23 deletions(-)
diff --git a/sheep/group.c b/sheep/group.c
index 8903034..8f13540 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -673,7 +673,6 @@ static void update_cluster_info(struct join_message *msg,
set_cluster_ctime(msg->ctime);
}
}
- sys_stat_set(msg->cluster_status);
}
static void __sd_notify(struct event_struct *cevent)
@@ -791,12 +790,15 @@ static void __sd_join(struct event_struct *cevent)
return;
for (i = 0; i < w->member_list_entries; i++) {
+ /* We should not fetch vdi_bitmap from myself */
+ if (node_eq(w->member_list + i, &sys->this_node))
+ continue;
get_vdi_bitmap_from(w->member_list + i);
/*
* If a new comer try to join the running cluster, it only
- * need read one copy of bitmap from the first member.
+ * need read one copy of bitmap from one of other members.
*/
if (sys_stat_wait_format())
break;
@@ -925,10 +927,7 @@ static void __sd_join_done(struct event_struct *cevent)
print_node_list(sys->nodes, sys->nr_nodes);
- if (!sys_stat_join_failed()) {
- update_cluster_info(jm, &w->joined, w->member_list,
- w->member_list_entries);
- }
+ sys_stat_set(jm->cluster_status);
if (sys_can_recover() && jm->inc_epoch) {
list_for_each_entry_safe(node, t, &sys->leave_list, list) {
@@ -949,17 +948,8 @@ static void __sd_join_done(struct event_struct *cevent)
static void __sd_leave_done(struct event_struct *cevent)
{
- struct work_leave *w = container_of(cevent, struct work_leave, cev);
-
- update_node_info(w->member_list, w->member_list_entries);
-
- if (sys_can_recover()) {
- sys->epoch++;
- update_epoch_store(sys->epoch);
- update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes);
-
+ if (sys_can_recover())
start_recovery(sys->epoch);
- }
if (sys_can_halt()) {
if (current_vnode_info->nr_zones < sys->nr_copies)
@@ -1023,7 +1013,6 @@ static void event_fn(struct work *work)
static void event_done(struct work *work)
{
struct event_struct *cevent;
- int ret;
if (!sys->cur_cevent)
vprintf(SDOG_ERR, "bug\n");
@@ -1050,9 +1039,6 @@ static void event_done(struct work *work)
vprintf(SDOG_DEBUG, "free %p\n", cevent);
event_free(cevent);
event_running = 0;
- ret = register_event(cdrv_fd, group_handler, NULL);
- if (ret)
- panic("failed to register event fd");
process_request_event_queues();
}
@@ -1164,7 +1150,6 @@ static inline void process_event_queue(void)
event_work.fn = event_fn;
event_work.done = event_done;
- unregister_event(cdrv_fd);
queue_work(sys->event_wqueue, &event_work);
}
@@ -1186,7 +1171,7 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
int i, size;
int nr, nr_local, nr_leave;
struct node *n;
- struct join_message *jm;
+ struct join_message *jm = opaque;
uint32_t le = get_latest_epoch();
if (node_eq(joined, &sys->this_node)) {
@@ -1210,6 +1195,8 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
if (sys_stat_shutdown())
break;
+ update_cluster_info(jm, joined, members, nr_members);
+
w = zalloc(sizeof(*w));
if (!w)
panic("failed to allocate memory");
@@ -1268,7 +1255,6 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
}
break;
case CJ_RES_MASTER_TRANSFER:
- jm = (struct join_message *)opaque;
nr = jm->nr_leave_nodes;
for (i = 0; i < nr; i++) {
if (find_entry_list(&jm->leave_nodes[i], &sys->leave_list)
@@ -1326,6 +1312,17 @@ void sd_leave_handler(struct sd_node *left, struct sd_node *members,
if (sys_stat_shutdown())
return;
+ sys->nr_nodes = nr_members;
+ memcpy(sys->nodes, members, sizeof(*sys->nodes) * sys->nr_nodes);
+ qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp);
+
+ if (sys_can_recover()) {
+ sys->epoch++;
+ update_epoch_store(sys->epoch);
+ update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes);
+ }
+ update_vnode_info();
+
w = zalloc(sizeof(*w));
if (!w)
goto oom;
--
1.7.7.6
More information about the sheepdog
mailing list