On Wed, May 16, 2012 at 01:20:23PM -0400, Christoph Hellwig wrote: > On Wed, May 16, 2012 at 10:49:44AM -0400, Christoph Hellwig wrote: > > On Wed, May 16, 2012 at 10:44:14PM +0800, Yunkai Zhang wrote: > > > Hi Hellwig, Do you have any comments on my latest unregister patch > > > which have been updated to reflect your previous comments? > > > > I like it, but I ran into an issue where a VDI created on one > > nodes doesn't seem to to be found on another node created later. > > The issues was that sys_stat_ok is now always true in __sd_join. The > patch below records the state before we joined and stores it in the > join work to restore the old behavior. It also filters the joining node > out to match the old behaviour, just as your earlier versions of the > patches did. The patch passes all my testing, but it still introduces > the problem with I/O concurrent to epoch/node list changes vs recovery > that Kazutaka mentioned in his recent mail, so it will need more work. > > Also note that it is based on top of my three patches as my working tree > has them included. This one still had a stupid brown paperbag bug in the leave handler, updated version below: diff --git a/sheep/group.c b/sheep/group.c index 6921f5e..beb10d8 100644 --- a/sheep/group.c +++ b/sheep/group.c @@ -9,6 +9,7 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <assert.h> +#include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> @@ -78,6 +79,7 @@ struct work_join { struct sd_node *member_list; size_t member_list_entries; struct sd_node joined; + bool get_vdi_bitmap; struct join_message *jm; }; @@ -722,19 +724,15 @@ static int check_majority(struct sd_node *nodes, int nr_nodes) static void __sd_join(struct event_struct *cevent) { struct work_join *w = container_of(cevent, struct work_join, cev); - struct join_message *msg = w->jm; int i; - if (msg->cluster_status != SD_STATUS_OK && - msg->cluster_status != SD_STATUS_HALT) - return; - - if (sys_stat_ok()) + if (!w->get_vdi_bitmap) return; for (i = 0; i < w->member_list_entries; i++) { - - get_vdi_bitmap_from(w->member_list + i); + if (node_eq(&w->member_list[i], &w->joined)) + continue; + get_vdi_bitmap_from(&w->member_list[i]); /* * If a new comer try to join the running cluster, it only @@ -867,11 +865,6 @@ static void __sd_join_done(struct event_struct *cevent) print_node_list(sys->nodes, sys->nr_nodes); - if (!sys_stat_join_failed()) { - update_cluster_info(jm, &w->joined, w->member_list, - w->member_list_entries); - } - if (sys_can_recover() && jm->inc_epoch) { list_for_each_entry_safe(node, t, &sys->leave_list, list) { list_del(&node->list); @@ -891,17 +884,8 @@ static void __sd_join_done(struct event_struct *cevent) static void __sd_leave_done(struct event_struct *cevent) { - struct work_leave *w = container_of(cevent, struct work_leave, cev); - - update_node_info(w->member_list, w->member_list_entries); - - if (sys_can_recover()) { - sys->epoch++; - update_epoch_store(sys->epoch); - update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes); - + if (sys_can_recover()) start_recovery(sys->epoch); - } if (sys_can_halt()) { if (current_vnode_info->nr_zones < sys->nr_copies) @@ -965,7 +949,6 @@ static void event_fn(struct work *work) static void event_done(struct work *work) { struct event_struct *cevent; - int ret; if (!sys->cur_cevent) vprintf(SDOG_ERR, "bug\n"); @@ -992,9 +975,6 @@ static void event_done(struct work *work) vprintf(SDOG_DEBUG, "free %p\n", cevent); event_free(cevent); event_running = 0; - ret = register_event(cdrv_fd, group_handler, NULL); - if (ret) - panic("failed to register event fd"); process_request_event_queues(); } @@ -1106,7 +1086,6 @@ static inline void process_event_queue(void) event_work.fn = event_fn; event_work.done = event_done; - unregister_event(cdrv_fd); queue_work(sys->event_wqueue, &event_work); } @@ -1128,7 +1107,7 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members, int i, size; int nr, nr_local, nr_leave; struct node *n; - struct join_message *jm; + struct join_message *jm = opaque; uint32_t le = get_latest_epoch(); if (node_eq(joined, &sys->this_node)) { @@ -1171,11 +1150,24 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members, w->joined = *joined; + /* + * Only get the VDI list if we were waiting for a format or + * join before, and the join was successful. + */ + if (sys_stat_ok() || + (jm->cluster_status != SD_STATUS_OK && + jm->cluster_status != SD_STATUS_HALT)) + w->get_vdi_bitmap = false; + else + w->get_vdi_bitmap = true; + size = get_join_message_size(opaque); w->jm = zalloc(size); if (!w->jm) panic("failed to allocate memory\n"); memcpy(w->jm, opaque, size); + + update_cluster_info(jm, joined, members, nr_members); list_add_tail(&cevent->event_list, &sys->event_queue); process_request_event_queues(); @@ -1210,7 +1202,6 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members, } break; case CJ_RES_MASTER_TRANSFER: - jm = (struct join_message *)opaque; nr = jm->nr_leave_nodes; for (i = 0; i < nr; i++) { if (find_entry_list(&jm->leave_nodes[i], &sys->leave_list) @@ -1268,6 +1259,14 @@ void sd_leave_handler(struct sd_node *left, struct sd_node *members, if (sys_stat_shutdown()) return; + update_node_info(members, nr_members); + + if (sys_can_recover()) { + sys->epoch++; + update_epoch_store(sys->epoch); + update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes); + } + w = zalloc(sizeof(*w)); if (!w) goto oom; |