On Thu, May 17, 2012 at 1:20 AM, Christoph Hellwig <hch at infradead.org> wrote: > On Wed, May 16, 2012 at 10:49:44AM -0400, Christoph Hellwig wrote: >> On Wed, May 16, 2012 at 10:44:14PM +0800, Yunkai Zhang wrote: >> > Hi Hellwig, Do you have any comments on my latest unregister patch >> > which have been updated to reflect your previous comments? >> >> I like it, but I ran into an issue where a VDI created on one >> nodes doesn't seem to to be found on another node created later. > > The issues was that sys_stat_ok is now always true in __sd_join. The > patch below records the state before we joined and stores it in the > join work to restore the old behavior. It also filters the joining node > out to match the old behaviour, just as your earlier versions of the > patches did. The patch passes all my testing, but it still introduces > the problem with I/O concurrent to epoch/node list changes vs recovery > that Kazutaka mentioned in his recent mail, so it will need more work. > > Also note that it is based on top of my three patches as my working tree > has them included. > > > diff --git a/sheep/group.c b/sheep/group.c > index 6921f5e..8c684f7 100644 > --- a/sheep/group.c > +++ b/sheep/group.c > @@ -9,6 +9,7 @@ > * along with this program. If not, see <http://www.gnu.org/licenses/>. > */ > #include <assert.h> > +#include <stdbool.h> > #include <stdio.h> > #include <stdlib.h> > #include <unistd.h> > @@ -78,6 +79,7 @@ struct work_join { > struct sd_node *member_list; > size_t member_list_entries; > struct sd_node joined; > + bool get_vdi_bitmap; > > struct join_message *jm; > }; > @@ -722,19 +724,15 @@ static int check_majority(struct sd_node *nodes, int nr_nodes) > static void __sd_join(struct event_struct *cevent) > { > struct work_join *w = container_of(cevent, struct work_join, cev); > - struct join_message *msg = w->jm; > int i; > > - if (msg->cluster_status != SD_STATUS_OK && > - msg->cluster_status != SD_STATUS_HALT) > - return; > - > - if (sys_stat_ok()) > + if (!w->get_vdi_bitmap) > return; > > for (i = 0; i < w->member_list_entries; i++) { > - > - get_vdi_bitmap_from(w->member_list + i); > + if (node_eq(&w->member_list[i], &w->joined)) > + continue; Hi Hellwig, it seems that this is not my latest unregister patch. I have removed this node_eq() condition in it. > + get_vdi_bitmap_from(&w->member_list[i]); > > /* > * If a new comer try to join the running cluster, it only > @@ -867,11 +865,6 @@ static void __sd_join_done(struct event_struct *cevent) > > print_node_list(sys->nodes, sys->nr_nodes); > > - if (!sys_stat_join_failed()) { > - update_cluster_info(jm, &w->joined, w->member_list, > - w->member_list_entries); > - } > - > if (sys_can_recover() && jm->inc_epoch) { > list_for_each_entry_safe(node, t, &sys->leave_list, list) { > list_del(&node->list); > @@ -891,17 +884,8 @@ static void __sd_join_done(struct event_struct *cevent) > > static void __sd_leave_done(struct event_struct *cevent) > { > - struct work_leave *w = container_of(cevent, struct work_leave, cev); > - > - update_node_info(w->member_list, w->member_list_entries); > - > - if (sys_can_recover()) { > - sys->epoch++; > - update_epoch_store(sys->epoch); > - update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes); > - > + if (sys_can_recover()) > start_recovery(sys->epoch); > - } > > if (sys_can_halt()) { > if (current_vnode_info->nr_zones < sys->nr_copies) > @@ -965,7 +949,6 @@ static void event_fn(struct work *work) > static void event_done(struct work *work) > { > struct event_struct *cevent; > - int ret; > > if (!sys->cur_cevent) > vprintf(SDOG_ERR, "bug\n"); > @@ -992,9 +975,6 @@ static void event_done(struct work *work) > vprintf(SDOG_DEBUG, "free %p\n", cevent); > event_free(cevent); > event_running = 0; > - ret = register_event(cdrv_fd, group_handler, NULL); > - if (ret) > - panic("failed to register event fd"); > > process_request_event_queues(); > } > @@ -1106,7 +1086,6 @@ static inline void process_event_queue(void) > event_work.fn = event_fn; > event_work.done = event_done; > > - unregister_event(cdrv_fd); > queue_work(sys->event_wqueue, &event_work); > } > > @@ -1128,7 +1107,7 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members, > int i, size; > int nr, nr_local, nr_leave; > struct node *n; > - struct join_message *jm; > + struct join_message *jm = opaque; > uint32_t le = get_latest_epoch(); > > if (node_eq(joined, &sys->this_node)) { > @@ -1171,11 +1150,24 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members, > > w->joined = *joined; > > + /* > + * Only get the VDI list if we were waiting for a format or > + * join before, and the join was successful. > + */ > + if (sys_stat_ok() || > + (jm->cluster_status != SD_STATUS_OK && > + jm->cluster_status != SD_STATUS_HALT)) > + w->get_vdi_bitmap = false; > + else > + w->get_vdi_bitmap = true; > + > size = get_join_message_size(opaque); > w->jm = zalloc(size); > if (!w->jm) > panic("failed to allocate memory\n"); > memcpy(w->jm, opaque, size); > + > + update_cluster_info(jm, joined, members, nr_members); > > list_add_tail(&cevent->event_list, &sys->event_queue); > process_request_event_queues(); > @@ -1210,7 +1202,6 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members, > } > break; > case CJ_RES_MASTER_TRANSFER: > - jm = (struct join_message *)opaque; > nr = jm->nr_leave_nodes; > for (i = 0; i < nr; i++) { > if (find_entry_list(&jm->leave_nodes[i], &sys->leave_list) > @@ -1268,6 +1259,14 @@ void sd_leave_handler(struct sd_node *left, struct sd_node *members, > if (sys_stat_shutdown()) > return; > > + update_node_info(w->member_list, w->member_list_entries); > + > + if (sys_can_recover()) { > + sys->epoch++; > + update_epoch_store(sys->epoch); > + update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes); > + } > + > w = zalloc(sizeof(*w)); > if (!w) > goto oom; -- Yunkai Zhang Work at Taobao |