Add a new helper that updates sys->nodes and sys->nr_nodes as well as the vnode list, and use it in all places that update cluster membership. In __sd_leave_done it can be used as is and gives a nice cleanup, in the master transfer case sd_join_handler it can also be used as is, but I've added an assert for the previously implicit assumption that no other nodes can exist. The tricky case is update_cluster_info/finish_join, where we first need to write an entry into the epoch log for the epoch before the joining code so that the recovery code can do the right thing. Signed-off-by: Christoph Hellwig <hch at lst.de> --- sheep/group.c | 73 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 32 deletions(-) Index: sheepdog/sheep/group.c =================================================================== --- sheepdog.orig/sheep/group.c 2012-05-17 09:59:20.719984516 +0200 +++ sheepdog/sheep/group.c 2012-05-17 09:59:58.763984155 +0200 @@ -585,22 +585,50 @@ out: return ret; } +static void update_node_info(struct sd_node *nodes, size_t nr_nodes) +{ + print_node_list(nodes, nr_nodes); + + sys->nr_nodes = nr_nodes; + memcpy(sys->nodes, nodes, sizeof(*sys->nodes) * sys->nr_nodes); + qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp); + + update_vnode_info(); +} + +static void log_last_epoch(struct join_message *msg, struct sd_node *joined, + struct sd_node *nodes, size_t nr_nodes) +{ + if ((msg->cluster_status == SD_STATUS_OK || + msg->cluster_status == SD_STATUS_HALT) && msg->inc_epoch) { + struct sd_node old_nodes[SD_MAX_NODES]; + size_t count = 0, i; + + /* exclude the newly added one */ + for (i = 0; i < nr_nodes; i++) { + if (node_eq(nodes + i, joined)) + old_nodes[count++] = nodes[i]; + } + qsort(old_nodes, count, sizeof(struct sd_node), node_cmp); + + update_epoch_log(sys->epoch, old_nodes, count); + } +} + static void finish_join(struct join_message *msg, struct sd_node *joined, struct sd_node *nodes, size_t nr_nodes) { int i; + sys->join_finished = 1; sys->nr_copies = msg->nr_copies; sys->epoch = msg->epoch; - /* add nodes execept for newly joined one */ - for (i = 0; i < nr_nodes; i++) { - if (node_eq(nodes + i, joined)) - continue; - - sys->nodes[sys->nr_nodes++] = nodes[i]; - } - qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp); + /* + * Make sure we have an epoch log record for the epoch before + * this node joins, as recovery expects this record to exist. + */ + log_last_epoch(msg, joined, nodes, nr_nodes); if (msg->cluster_status != SD_STATUS_OK) { int nr_leave_nodes; @@ -624,12 +652,6 @@ static void finish_join(struct join_mess } } - sys->join_finished = 1; - - if ((msg->cluster_status == SD_STATUS_OK || - msg->cluster_status == SD_STATUS_HALT) && msg->inc_epoch) - update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes); - if (!sd_store && strlen((char *)msg->store)) { sd_store = find_store_driver((char *)msg->store); if (sd_store) { @@ -653,8 +675,7 @@ static void update_cluster_info(struct j if (!sys->join_finished) finish_join(msg, joined, nodes, nr_nodes); - sys->nodes[sys->nr_nodes++] = *joined; - qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp); + update_node_info(nodes, nr_nodes); if (msg->cluster_status == SD_STATUS_OK || msg->cluster_status == SD_STATUS_HALT) { @@ -670,10 +691,7 @@ static void update_cluster_info(struct j set_cluster_ctime(msg->ctime); } } - update_vnode_info(); sys_stat_set(msg->cluster_status); - - print_node_list(sys->nodes, sys->nr_nodes); } static void __sd_notify(struct event_struct *cevent) @@ -932,21 +950,15 @@ static void __sd_leave_done(struct event { struct work_leave *w = container_of(cevent, struct work_leave, cev); - sys->nr_nodes = w->member_list_entries; - memcpy(sys->nodes, w->member_list, sizeof(*sys->nodes) * sys->nr_nodes); - qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp); + update_node_info(w->member_list, w->member_list_entries); if (sys_can_recover()) { sys->epoch++; update_epoch_store(sys->epoch); update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes); - } - update_vnode_info(); - print_node_list(sys->nodes, sys->nr_nodes); - - if (sys_can_recover()) start_recovery(sys->epoch); + } if (sys_can_halt()) { if (current_vnode_info->nr_zones < sys->nr_copies) @@ -1277,11 +1289,8 @@ void sd_join_handler(struct sd_node *joi */ if (!sys->join_finished) { sys->join_finished = 1; - sys->nodes[sys->nr_nodes++] = sys->this_node; - qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp); - sys->epoch = get_latest_epoch(); - - update_vnode_info(); + assert(sys->nr_nodes == 0); + update_node_info(&sys->this_node, 1); } nr_local = get_nodes_nr_epoch(sys->epoch); |