On Thu, May 17, 2012 at 10:43 AM, Yunkai Zhang <yunkai.me at gmail.com> wrote: > On Thu, May 17, 2012 at 1:20 AM, Christoph Hellwig <hch at infradead.org> wrote: >> On Wed, May 16, 2012 at 10:49:44AM -0400, Christoph Hellwig wrote: >>> On Wed, May 16, 2012 at 10:44:14PM +0800, Yunkai Zhang wrote: >>> > Hi Hellwig, Do you have any comments on my latest unregister patch >>> > which have been updated to reflect your previous comments? >>> >>> I like it, but I ran into an issue where a VDI created on one >>> nodes doesn't seem to to be found on another node created later. >> >> The issues was that sys_stat_ok is now always true in __sd_join. The I'll try to fix it. >> patch below records the state before we joined and stores it in the >> join work to restore the old behavior. It also filters the joining node >> out to match the old behaviour, just as your earlier versions of the >> patches did. The patch passes all my testing, but it still introduces >> the problem with I/O concurrent to epoch/node list changes vs recovery >> that Kazutaka mentioned in his recent mail, so it will need more work. >> >> Also note that it is based on top of my three patches as my working tree >> has them included. >> >> >> diff --git a/sheep/group.c b/sheep/group.c >> index 6921f5e..8c684f7 100644 >> --- a/sheep/group.c >> +++ b/sheep/group.c >> @@ -9,6 +9,7 @@ >> * along with this program. If not, see <http://www.gnu.org/licenses/>. >> */ >> #include <assert.h> >> +#include <stdbool.h> >> #include <stdio.h> >> #include <stdlib.h> >> #include <unistd.h> >> @@ -78,6 +79,7 @@ struct work_join { >> struct sd_node *member_list; >> size_t member_list_entries; >> struct sd_node joined; >> + bool get_vdi_bitmap; >> >> struct join_message *jm; >> }; >> @@ -722,19 +724,15 @@ static int check_majority(struct sd_node *nodes, int nr_nodes) >> static void __sd_join(struct event_struct *cevent) >> { >> struct work_join *w = container_of(cevent, struct work_join, cev); >> - struct join_message *msg = w->jm; >> int i; >> >> - if (msg->cluster_status != SD_STATUS_OK && >> - msg->cluster_status != SD_STATUS_HALT) >> - return; >> - >> - if (sys_stat_ok()) >> + if (!w->get_vdi_bitmap) >> return; >> >> for (i = 0; i < w->member_list_entries; i++) { >> - >> - get_vdi_bitmap_from(w->member_list + i); >> + if (node_eq(&w->member_list[i], &w->joined)) >> + continue; > > Hi Hellwig, it seems that this is not my latest unregister patch. I > have removed this node_eq() condition in it. Sorry, I have tested your case with latest unregister patch, it also has the some bug. > >> + get_vdi_bitmap_from(&w->member_list[i]); >> >> /* >> * If a new comer try to join the running cluster, it only >> @@ -867,11 +865,6 @@ static void __sd_join_done(struct event_struct *cevent) >> >> print_node_list(sys->nodes, sys->nr_nodes); >> >> - if (!sys_stat_join_failed()) { >> - update_cluster_info(jm, &w->joined, w->member_list, >> - w->member_list_entries); >> - } >> - >> if (sys_can_recover() && jm->inc_epoch) { >> list_for_each_entry_safe(node, t, &sys->leave_list, list) { >> list_del(&node->list); >> @@ -891,17 +884,8 @@ static void __sd_join_done(struct event_struct *cevent) >> >> static void __sd_leave_done(struct event_struct *cevent) >> { >> - struct work_leave *w = container_of(cevent, struct work_leave, cev); >> - >> - update_node_info(w->member_list, w->member_list_entries); >> - >> - if (sys_can_recover()) { >> - sys->epoch++; >> - update_epoch_store(sys->epoch); >> - update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes); >> - >> + if (sys_can_recover()) >> start_recovery(sys->epoch); >> - } >> >> if (sys_can_halt()) { >> if (current_vnode_info->nr_zones < sys->nr_copies) >> @@ -965,7 +949,6 @@ static void event_fn(struct work *work) >> static void event_done(struct work *work) >> { >> struct event_struct *cevent; >> - int ret; >> >> if (!sys->cur_cevent) >> vprintf(SDOG_ERR, "bug\n"); >> @@ -992,9 +975,6 @@ static void event_done(struct work *work) >> vprintf(SDOG_DEBUG, "free %p\n", cevent); >> event_free(cevent); >> event_running = 0; >> - ret = register_event(cdrv_fd, group_handler, NULL); >> - if (ret) >> - panic("failed to register event fd"); >> >> process_request_event_queues(); >> } >> @@ -1106,7 +1086,6 @@ static inline void process_event_queue(void) >> event_work.fn = event_fn; >> event_work.done = event_done; >> >> - unregister_event(cdrv_fd); >> queue_work(sys->event_wqueue, &event_work); >> } >> >> @@ -1128,7 +1107,7 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members, >> int i, size; >> int nr, nr_local, nr_leave; >> struct node *n; >> - struct join_message *jm; >> + struct join_message *jm = opaque; >> uint32_t le = get_latest_epoch(); >> >> if (node_eq(joined, &sys->this_node)) { >> @@ -1171,11 +1150,24 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members, >> >> w->joined = *joined; >> >> + /* >> + * Only get the VDI list if we were waiting for a format or >> + * join before, and the join was successful. >> + */ >> + if (sys_stat_ok() || >> + (jm->cluster_status != SD_STATUS_OK && >> + jm->cluster_status != SD_STATUS_HALT)) >> + w->get_vdi_bitmap = false; >> + else >> + w->get_vdi_bitmap = true; >> + >> size = get_join_message_size(opaque); >> w->jm = zalloc(size); >> if (!w->jm) >> panic("failed to allocate memory\n"); >> memcpy(w->jm, opaque, size); >> + >> + update_cluster_info(jm, joined, members, nr_members); >> >> list_add_tail(&cevent->event_list, &sys->event_queue); >> process_request_event_queues(); >> @@ -1210,7 +1202,6 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members, >> } >> break; >> case CJ_RES_MASTER_TRANSFER: >> - jm = (struct join_message *)opaque; >> nr = jm->nr_leave_nodes; >> for (i = 0; i < nr; i++) { >> if (find_entry_list(&jm->leave_nodes[i], &sys->leave_list) >> @@ -1268,6 +1259,14 @@ void sd_leave_handler(struct sd_node *left, struct sd_node *members, >> if (sys_stat_shutdown()) >> return; >> >> + update_node_info(w->member_list, w->member_list_entries); >> + >> + if (sys_can_recover()) { >> + sys->epoch++; >> + update_epoch_store(sys->epoch); >> + update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes); >> + } >> + >> w = zalloc(sizeof(*w)); >> if (!w) >> goto oom; > > > > -- > Yunkai Zhang > Work at Taobao -- Yunkai Zhang Work at Taobao |