From: Yunkai Zhang <qiushu.zyk at taobao.com> This patch has been tested with the zookeeper driver; if it causes any problems with the corosync driver, please comment here. ---------------------------------------------------------------------- >8 In the old code, we call unregister_event(cdrv_fd, ...) in process_event_queue() when sheep receives a cluster EVENT, and we register cdrv_fd with epoll again in event_done() after __sd_xxx() has finished. This is dangerous! In our testing, __sd_xxx() may, for some reason, be blocked by a network issue. As a result, event_done() is not executed and cdrv_fd stays unregistered from epoll, so newly arriving EVENTs cannot be processed immediately. This makes it hard for sheep to complete recovery. Now we call update_cluster_info() directly in sd_xxx_handler(), so that new EVENTs can be processed one by one immediately, without waiting for the previous EVENT's __sd_xxx() to finish. Therefore we can safely remove unregister_event() from process_event_queue(). Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com> --- sheep/group.c | 63 +++++++++++++++++++++++++++++---------------------------- 1 files changed, 32 insertions(+), 31 deletions(-) diff --git a/sheep/group.c b/sheep/group.c index 54bd8f3..63f7de3 100644 --- a/sheep/group.c +++ b/sheep/group.c @@ -515,13 +515,13 @@ out: return ret; } -static void get_vdi_bitmap_from_sd_list(void) +static void get_vdi_bitmap_from_sd_list(struct sd_node *nodes, size_t nr_nodes) { int i; /* fixme: we need this until starting up. 
*/ - for (i = 0; i < sys->nr_nodes; i++) - get_vdi_bitmap_from(sys->nodes + i); + for (i = 0; i < nr_nodes; i++) + get_vdi_bitmap_from(nodes + i); } static void finish_join(struct join_message *msg, struct sd_node *joined, @@ -714,7 +714,8 @@ static void __sd_join(struct event_struct *cevent) { struct work_join *w = container_of(cevent, struct work_join, cev); struct join_message *msg = w->jm; - int i; + struct sd_node nodes[SD_MAX_NODES]; + int i, nr_nodes; if (msg->cluster_status != SD_STATUS_OK && msg->cluster_status != SD_STATUS_HALT) @@ -724,13 +725,23 @@ static void __sd_join(struct event_struct *cevent) return; /* + * w->member_list contains joining node, we should + * exclude it in following operation. + */ + for (i = 0, nr_nodes = 0; i < w->member_list_entries; i++) { + if (node_eq(w->member_list + i, &w->joined)) + continue; + nodes[nr_nodes++] = w->member_list[i]; + } + + /* * If a new comer try to join the running cluster, it only need read * one copy of bitmap from the first member. 
*/ if (sys_stat_wait_format()) - get_vdi_bitmap_from(w->member_list); + get_vdi_bitmap_from(nodes); else { - get_vdi_bitmap_from_sd_list(); + get_vdi_bitmap_from_sd_list(nodes, nr_nodes); for (i = 0; i < w->member_list_entries; i++) get_vdi_bitmap_from(w->member_list + i); } @@ -839,11 +850,6 @@ static void __sd_join_done(struct event_struct *cevent) print_node_list(sys->nodes, sys->nr_nodes); - if (!sys_stat_join_failed()) { - update_cluster_info(jm, &w->joined, w->member_list, - w->member_list_entries); - } - if (sys_can_recover() && jm->inc_epoch) { list_for_each_entry_safe(node, t, &sys->leave_list, list) { list_del(&node->list); @@ -863,19 +869,6 @@ static void __sd_join_done(struct event_struct *cevent) static void __sd_leave_done(struct event_struct *cevent) { - struct work_leave *w = container_of(cevent, struct work_leave, cev); - - sys->nr_nodes = w->member_list_entries; - memcpy(sys->nodes, w->member_list, sizeof(*sys->nodes) * sys->nr_nodes); - qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp); - - if (sys_can_recover()) { - sys->epoch++; - update_epoch_store(sys->epoch); - update_epoch_log(sys->epoch); - } - update_vnode_info(); - print_node_list(sys->nodes, sys->nr_nodes); if (sys_can_recover()) @@ -943,7 +936,6 @@ static void event_fn(struct work *work) static void event_done(struct work *work) { struct event_struct *cevent; - int ret; if (!sys->cur_cevent) vprintf(SDOG_ERR, "bug\n"); @@ -970,9 +962,6 @@ static void event_done(struct work *work) vprintf(SDOG_DEBUG, "free %p\n", cevent); event_free(cevent); event_running = 0; - ret = register_event(cdrv_fd, group_handler, NULL); - if (ret) - panic("failed to register event fd"); process_request_event_queues(); } @@ -1076,7 +1065,6 @@ static inline void process_event_queue(void) event_work.fn = event_fn; event_work.done = event_done; - unregister_event(cdrv_fd); queue_work(sys->event_wqueue, &event_work); } @@ -1098,7 +1086,7 @@ void sd_join_handler(struct sd_node *joined, struct sd_node 
*members, int i, size; int nr, nr_local, nr_leave; struct node *n; - struct join_message *jm; + struct join_message *jm = opaque; uint32_t le = get_latest_epoch(); if (node_eq(joined, &sys->this_node)) { @@ -1113,6 +1101,9 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members, } } + if (result == CJ_RES_SUCCESS || result == CJ_RES_MASTER_TRANSFER) + update_cluster_info(jm, joined, members, nr_members); + switch (result) { case CJ_RES_SUCCESS: dprintf("join %s\n", node_to_str(joined)); @@ -1180,7 +1171,6 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members, } break; case CJ_RES_MASTER_TRANSFER: - jm = (struct join_message *)opaque; nr = jm->nr_leave_nodes; for (i = 0; i < nr; i++) { if (find_entry_list(&jm->leave_nodes[i], &sys->leave_list) @@ -1241,6 +1231,17 @@ void sd_leave_handler(struct sd_node *left, struct sd_node *members, if (sys_stat_shutdown()) return; + sys->nr_nodes = nr_members; + memcpy(sys->nodes, members, sizeof(*sys->nodes) * sys->nr_nodes); + qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp); + + if (sys_can_recover()) { + sys->epoch++; + update_epoch_store(sys->epoch); + update_epoch_log(sys->epoch); + } + update_vnode_info(); + w = zalloc(sizeof(*w)); if (!w) goto oom; -- 1.7.7.6 |