[sheepdog] [Sheepdog] [PATCH 0/4] fix a race when multiple sheep join a cluster very quickly

Thu May 17 04:43:15 CEST 2012

On Thu, May 17, 2012 at 1:20 AM, Christoph Hellwig <hch at infradead.org> wrote:
> On Wed, May 16, 2012 at 10:49:44AM -0400, Christoph Hellwig wrote:
>> On Wed, May 16, 2012 at 10:44:14PM +0800, Yunkai Zhang wrote:
>> > Hi Hellwig, Do you have any comments on my latest unregister patch
>> > which have been updated to reflect your previous comments?
>>
>> I like it, but I ran into an issue where a VDI created on one
>> nodes doesn't seem to to be found on another node created later.
>
> The issues was that sys_stat_ok is now always true in __sd_join.  The
> patch below records the state before we joined and stores it in the
> join work to restore the old behavior.  It also filters the joining node
> out to match the old behaviour, just as your earlier versions of the
> patches did.  The patch passes all my testing, but it still introduces
> the problem with I/O concurrent to epoch/node list changes vs recovery
> that Kazutaka mentioned in his recent mail, so it will need more work.
>
> Also note that it is based on top of my three patches as my working tree
> has them included.
>
>
> diff --git a/sheep/group.c b/sheep/group.c
> index 6921f5e..8c684f7 100644
> --- a/sheep/group.c
> +++ b/sheep/group.c
> @@ -9,6 +9,7 @@
>  * along with this program. If not, see <http://www.gnu.org/licenses/>.
>  */
>  #include <assert.h>
> +#include <stdbool.h>
>  #include <stdio.h>
>  #include <stdlib.h>
>  #include <unistd.h>
> @@ -78,6 +79,7 @@ struct work_join {
>        struct sd_node *member_list;
>        size_t member_list_entries;
>        struct sd_node joined;
> +       bool get_vdi_bitmap;
>
>        struct join_message *jm;
>  };
> @@ -722,19 +724,15 @@ static int check_majority(struct sd_node *nodes, int nr_nodes)
>  static void __sd_join(struct event_struct *cevent)
>  {
>        struct work_join *w = container_of(cevent, struct work_join, cev);
> -       struct join_message *msg = w->jm;
>        int i;
>
> -       if (msg->cluster_status != SD_STATUS_OK &&
> -           msg->cluster_status != SD_STATUS_HALT)
> -               return;
> -
> -       if (sys_stat_ok())
> +       if (!w->get_vdi_bitmap)
>                return;
>
>        for (i = 0; i < w->member_list_entries; i++) {
> -
> -               get_vdi_bitmap_from(w->member_list + i);
> +               if (node_eq(&w->member_list[i], &w->joined))
> +                       continue;

Hi Hellwig, it seems that this is not my latest unregister patch. I
have removed this node_eq() condition in it.

> +               get_vdi_bitmap_from(&w->member_list[i]);
>
>                /*
>                 * If a new comer try to join the running cluster, it only
> @@ -867,11 +865,6 @@ static void __sd_join_done(struct event_struct *cevent)
>
>        print_node_list(sys->nodes, sys->nr_nodes);
>
> -       if (!sys_stat_join_failed()) {
> -               update_cluster_info(jm, &w->joined, w->member_list,
> -                                   w->member_list_entries);
> -       }
> -
>        if (sys_can_recover() && jm->inc_epoch) {
>                list_for_each_entry_safe(node, t, &sys->leave_list, list) {
>                        list_del(&node->list);
> @@ -891,17 +884,8 @@ static void __sd_join_done(struct event_struct *cevent)
>
>  static void __sd_leave_done(struct event_struct *cevent)
>  {
> -       struct work_leave *w = container_of(cevent, struct work_leave, cev);
> -
> -       update_node_info(w->member_list, w->member_list_entries);
> -
> -       if (sys_can_recover()) {
> -               sys->epoch++;
> -               update_epoch_store(sys->epoch);
> -               update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes);
> -
> +       if (sys_can_recover())
>                start_recovery(sys->epoch);
> -       }
>
>        if (sys_can_halt()) {
>                if (current_vnode_info->nr_zones < sys->nr_copies)
> @@ -965,7 +949,6 @@ static void event_fn(struct work *work)
>  static void event_done(struct work *work)
>  {
>        struct event_struct *cevent;
> -       int ret;
>
>        if (!sys->cur_cevent)
>                vprintf(SDOG_ERR, "bug\n");
> @@ -992,9 +975,6 @@ static void event_done(struct work *work)
>        vprintf(SDOG_DEBUG, "free %p\n", cevent);
>        event_free(cevent);
>        event_running = 0;
> -       ret = register_event(cdrv_fd, group_handler, NULL);
> -       if (ret)
> -               panic("failed to register event fd");
>
>        process_request_event_queues();
>  }
> @@ -1106,7 +1086,6 @@ static inline void process_event_queue(void)
>        event_work.fn = event_fn;
>        event_work.done = event_done;
>
> -       unregister_event(cdrv_fd);
>        queue_work(sys->event_wqueue, &event_work);
>  }
>
> @@ -1128,7 +1107,7 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
>        int i, size;
>        int nr, nr_local, nr_leave;
>        struct node *n;
> -       struct join_message *jm;
> +       struct join_message *jm = opaque;
>        uint32_t le = get_latest_epoch();
>
>        if (node_eq(joined, &sys->this_node)) {
> @@ -1171,11 +1150,24 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
>
>                w->joined = *joined;
>
> +               /*
> +                * Only get the VDI list if we were waiting for a format or
> +                * join before, and the join was successful.
> +                */
> +               if (sys_stat_ok() ||
> +                   (jm->cluster_status != SD_STATUS_OK &&
> +                    jm->cluster_status != SD_STATUS_HALT))
> +                       w->get_vdi_bitmap = false;
> +               else
> +                       w->get_vdi_bitmap = true;
> +
>                size = get_join_message_size(opaque);
>                w->jm = zalloc(size);
>                if (!w->jm)
>                        panic("failed to allocate memory\n");
>                memcpy(w->jm, opaque, size);
> +
> +               update_cluster_info(jm, joined, members, nr_members);
>
>                list_add_tail(&cevent->event_list, &sys->event_queue);
>                process_request_event_queues();
> @@ -1210,7 +1202,6 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
>                }
>                break;
>        case CJ_RES_MASTER_TRANSFER:
> -               jm = (struct join_message *)opaque;
>                nr = jm->nr_leave_nodes;
>                for (i = 0; i < nr; i++) {
>                        if (find_entry_list(&jm->leave_nodes[i], &sys->leave_list)
> @@ -1268,6 +1259,14 @@ void sd_leave_handler(struct sd_node *left, struct sd_node *members,
>        if (sys_stat_shutdown())
>                return;
>
> +       update_node_info(w->member_list, w->member_list_entries);
> +
> +       if (sys_can_recover()) {
> +               sys->epoch++;
> +               update_epoch_store(sys->epoch);
> +               update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes);
> +       }
> +
>        w = zalloc(sizeof(*w));
>        if (!w)
>                goto oom;

-- 
Yunkai Zhang
Work at Taobao