[sheepdog] [Sheepdog] [PATCH 0/4] fix a race when multiple sheep join a cluster very quickly

Thu May 17 05:05:49 CEST 2012

On Thu, May 17, 2012 at 10:43 AM, Yunkai Zhang <yunkai.me at gmail.com> wrote:
> On Thu, May 17, 2012 at 1:20 AM, Christoph Hellwig <hch at infradead.org> wrote:
>> On Wed, May 16, 2012 at 10:49:44AM -0400, Christoph Hellwig wrote:
>>> On Wed, May 16, 2012 at 10:44:14PM +0800, Yunkai Zhang wrote:
>>> > Hi Hellwig, Do you have any comments on my latest unregister patch
>>> > which have been updated to reflect your previous comments?
>>>
>>> I like it, but I ran into an issue where a VDI created on one
>>> nodes doesn't seem to to be found on another node created later.
>>
>> The issues was that sys_stat_ok is now always true in __sd_join.  The

I'll try to fix it.

>> patch below records the state before we joined and stores it in the
>> join work to restore the old behavior.  It also filters the joining node
>> out to match the old behaviour, just as your earlier versions of the
>> patches did.  The patch passes all my testing, but it still introduces
>> the problem with I/O concurrent to epoch/node list changes vs recovery
>> that Kazutaka mentioned in his recent mail, so it will need more work.
>>
>> Also note that it is based on top of my three patches as my working tree
>> has them included.
>>
>>
>> diff --git a/sheep/group.c b/sheep/group.c
>> index 6921f5e..8c684f7 100644
>> --- a/sheep/group.c
>> +++ b/sheep/group.c
>> @@ -9,6 +9,7 @@
>>  * along with this program. If not, see <http://www.gnu.org/licenses/>.
>>  */
>>  #include <assert.h>
>> +#include <stdbool.h>
>>  #include <stdio.h>
>>  #include <stdlib.h>
>>  #include <unistd.h>
>> @@ -78,6 +79,7 @@ struct work_join {
>>        struct sd_node *member_list;
>>        size_t member_list_entries;
>>        struct sd_node joined;
>> +       bool get_vdi_bitmap;
>>
>>        struct join_message *jm;
>>  };
>> @@ -722,19 +724,15 @@ static int check_majority(struct sd_node *nodes, int nr_nodes)
>>  static void __sd_join(struct event_struct *cevent)
>>  {
>>        struct work_join *w = container_of(cevent, struct work_join, cev);
>> -       struct join_message *msg = w->jm;
>>        int i;
>>
>> -       if (msg->cluster_status != SD_STATUS_OK &&
>> -           msg->cluster_status != SD_STATUS_HALT)
>> -               return;
>> -
>> -       if (sys_stat_ok())
>> +       if (!w->get_vdi_bitmap)
>>                return;
>>
>>        for (i = 0; i < w->member_list_entries; i++) {
>> -
>> -               get_vdi_bitmap_from(w->member_list + i);
>> +               if (node_eq(&w->member_list[i], &w->joined))
>> +                       continue;
>
> Hi Hellwig, it seems that this is not my latest unregister patch. I
> have removed this node_eq() condition in it.

Sorry, I have tested your case with latest unregister patch, it also
has the some bug.

>
>> +               get_vdi_bitmap_from(&w->member_list[i]);
>>
>>                /*
>>                 * If a new comer try to join the running cluster, it only
>> @@ -867,11 +865,6 @@ static void __sd_join_done(struct event_struct *cevent)
>>
>>        print_node_list(sys->nodes, sys->nr_nodes);
>>
>> -       if (!sys_stat_join_failed()) {
>> -               update_cluster_info(jm, &w->joined, w->member_list,
>> -                                   w->member_list_entries);
>> -       }
>> -
>>        if (sys_can_recover() && jm->inc_epoch) {
>>                list_for_each_entry_safe(node, t, &sys->leave_list, list) {
>>                        list_del(&node->list);
>> @@ -891,17 +884,8 @@ static void __sd_join_done(struct event_struct *cevent)
>>
>>  static void __sd_leave_done(struct event_struct *cevent)
>>  {
>> -       struct work_leave *w = container_of(cevent, struct work_leave, cev);
>> -
>> -       update_node_info(w->member_list, w->member_list_entries);
>> -
>> -       if (sys_can_recover()) {
>> -               sys->epoch++;
>> -               update_epoch_store(sys->epoch);
>> -               update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes);
>> -
>> +       if (sys_can_recover())
>>                start_recovery(sys->epoch);
>> -       }
>>
>>        if (sys_can_halt()) {
>>                if (current_vnode_info->nr_zones < sys->nr_copies)
>> @@ -965,7 +949,6 @@ static void event_fn(struct work *work)
>>  static void event_done(struct work *work)
>>  {
>>        struct event_struct *cevent;
>> -       int ret;
>>
>>        if (!sys->cur_cevent)
>>                vprintf(SDOG_ERR, "bug\n");
>> @@ -992,9 +975,6 @@ static void event_done(struct work *work)
>>        vprintf(SDOG_DEBUG, "free %p\n", cevent);
>>        event_free(cevent);
>>        event_running = 0;
>> -       ret = register_event(cdrv_fd, group_handler, NULL);
>> -       if (ret)
>> -               panic("failed to register event fd");
>>
>>        process_request_event_queues();
>>  }
>> @@ -1106,7 +1086,6 @@ static inline void process_event_queue(void)
>>        event_work.fn = event_fn;
>>        event_work.done = event_done;
>>
>> -       unregister_event(cdrv_fd);
>>        queue_work(sys->event_wqueue, &event_work);
>>  }
>>
>> @@ -1128,7 +1107,7 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
>>        int i, size;
>>        int nr, nr_local, nr_leave;
>>        struct node *n;
>> -       struct join_message *jm;
>> +       struct join_message *jm = opaque;
>>        uint32_t le = get_latest_epoch();
>>
>>        if (node_eq(joined, &sys->this_node)) {
>> @@ -1171,11 +1150,24 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
>>
>>                w->joined = *joined;
>>
>> +               /*
>> +                * Only get the VDI list if we were waiting for a format or
>> +                * join before, and the join was successful.
>> +                */
>> +               if (sys_stat_ok() ||
>> +                   (jm->cluster_status != SD_STATUS_OK &&
>> +                    jm->cluster_status != SD_STATUS_HALT))
>> +                       w->get_vdi_bitmap = false;
>> +               else
>> +                       w->get_vdi_bitmap = true;
>> +
>>                size = get_join_message_size(opaque);
>>                w->jm = zalloc(size);
>>                if (!w->jm)
>>                        panic("failed to allocate memory\n");
>>                memcpy(w->jm, opaque, size);
>> +
>> +               update_cluster_info(jm, joined, members, nr_members);
>>
>>                list_add_tail(&cevent->event_list, &sys->event_queue);
>>                process_request_event_queues();
>> @@ -1210,7 +1202,6 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
>>                }
>>                break;
>>        case CJ_RES_MASTER_TRANSFER:
>> -               jm = (struct join_message *)opaque;
>>                nr = jm->nr_leave_nodes;
>>                for (i = 0; i < nr; i++) {
>>                        if (find_entry_list(&jm->leave_nodes[i], &sys->leave_list)
>> @@ -1268,6 +1259,14 @@ void sd_leave_handler(struct sd_node *left, struct sd_node *members,
>>        if (sys_stat_shutdown())
>>                return;
>>
>> +       update_node_info(w->member_list, w->member_list_entries);
>> +
>> +       if (sys_can_recover()) {
>> +               sys->epoch++;
>> +               update_epoch_store(sys->epoch);
>> +               update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes);
>> +       }
>> +
>>        w = zalloc(sizeof(*w));
>>        if (!w)
>>                goto oom;
>
>
>
> --
> Yunkai Zhang
> Work at Taobao

--
Yunkai Zhang
Work at Taobao