[sheepdog] [PATCH] dog: repair objects based on majority vote

Hitoshi Mitake mitake.hitoshi at gmail.com
Sun Nov 10 16:17:08 CET 2013


On Fri, Nov 8, 2013 at 10:06 AM, Hitoshi Mitake
<mitake.hitoshi at gmail.com> wrote:
> At Wed, 06 Nov 2013 11:42:33 +0900,
> Hitoshi Mitake wrote:
>>
>> At Tue,  5 Nov 2013 10:54:25 +0900,
>> Hitoshi Mitake wrote:
>> >
>> > The current object repair strategy of "dog vdi check" doesn't work well if
>> > objects are corrupted. This patch adds a majority-voting mechanism to the
>> > command.
>> >
>> > Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
>> > ---
>> >  dog/vdi.c                |   83 +++++++++++++++++++++++++++++++++++++-------
>> >  tests/functional/077     |   85 ++++++++++++++++++++++++++++++++++++++++++++++
>> >  tests/functional/077.out |   16 +++++++++
>> >  tests/functional/group   |    1 +
>> >  4 files changed, 173 insertions(+), 12 deletions(-)
>> >  create mode 100755 tests/functional/077
>> >  create mode 100644 tests/functional/077.out
>>
>> ping?
>
> ping?

ping

>
>>
>> >
>> > diff --git a/dog/vdi.c b/dog/vdi.c
>> > index d9a9a0f..27d03a2 100644
>> > --- a/dog/vdi.c
>> > +++ b/dog/vdi.c
>> > @@ -1434,6 +1434,12 @@ struct vdi_check_work {
>> >     struct work work;
>> >  };
>> >
>> > +enum vdi_check_result {
>> > +   VDI_CHECK_NO_OBJ_FOUND,
>> > +   VDI_CHECK_NO_MAJORITY_FOUND,
>> > +   VDI_CHECK_SUCCESS,
>> > +};
>> > +
>> >  struct vdi_check_info {
>> >     uint64_t oid;
>> >     uint8_t nr_copies;
>> > @@ -1442,7 +1448,8 @@ struct vdi_check_info {
>> >     uint64_t *done;
>> >     int refcnt;
>> >     struct work_queue *wq;
>> > -   struct vdi_check_work *base;
>> > +   enum vdi_check_result result;
>> > +   struct vdi_check_work *majority;
>> >     struct vdi_check_work vcw[0];
>> >  };
>> >
>> > @@ -1462,7 +1469,7 @@ static void vdi_repair_work(struct work *work)
>> >     struct vdi_check_info *info = vcw->info;
>> >     void *buf;
>> >
>> > -   buf = read_object_from(info->base->vnode, info->oid);
>> > +   buf = read_object_from(info->majority->vnode, info->oid);
>> >     write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0);
>> >     free(buf);
>> >  }
>> > @@ -1511,10 +1518,8 @@ static void vdi_check_object_work(struct work *work)
>> >     switch (rsp->result) {
>> >     case SD_RES_SUCCESS:
>> >             vcw->object_found = true;
>> > -           if (!is_erasure_oid(info->oid, info->copy_policy)) {
>> > +           if (!is_erasure_oid(info->oid, info->copy_policy))
>> >                     memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash));
>> > -                   uatomic_set(&info->base, vcw);
>> > -           }
>> >             break;
>> >     case SD_RES_NO_OBJ:
>> >             vcw->object_found = false;
>> > @@ -1530,18 +1535,31 @@ static void vdi_check_object_work(struct work *work)
>> >
>> >  static void check_replicatoin_object(struct vdi_check_info *info)
>> >  {
>> > -   if (info->base == NULL) {
>> > -           sd_err("no node has %" PRIx64, info->oid);
>> > -           exit(EXIT_FAILURE);
>> > +   if (info->majority == NULL) {
>> > +           switch (info->result) {
>> > +           case VDI_CHECK_NO_OBJ_FOUND:
>> > +                   sd_err("no node has %" PRIx64, info->oid);
>> > +                   break;
>> > +           case VDI_CHECK_NO_MAJORITY_FOUND:
>> > +                   sd_err("no majority of %" PRIx64, info->oid);
>> > +                   break;
>> > +           default:
>> > +                   sd_err("unknown result of vdi check: %d", info->result);
>> > +                   exit(EXIT_FAILURE);
>> > +                   break;
>> > +           }
>> > +
>> > +           /* do nothing */
>> > +           return;
>> >     }
>> >
>> >     for (int i = 0; i < info->nr_copies; i++) {
>> > -           if (&info->vcw[i] == info->base)
>> > +           if (&info->vcw[i] == info->majority)
>> >                     continue;
>> >             /* need repair when object not found or consistency broken */
>> >             if (!info->vcw[i].object_found ||
>> > -               memcmp(info->base->hash, info->vcw[i].hash,
>> > -                      sizeof(info->base->hash)) != 0) {
>> > +               memcmp(info->majority->hash, info->vcw[i].hash,
>> > +                      sizeof(info->majority->hash)) != 0) {
>> >                     info->vcw[i].work.fn = vdi_repair_work;
>> >                     info->vcw[i].work.done = vdi_repair_main;
>> >                     info->refcnt++;
>> > @@ -1615,6 +1633,45 @@ out:
>> >     ec_destroy(ctx);
>> >  }
>> >
>> > +static void vote_majority_object(struct vdi_check_info *info)
>> > +{
>> > +   /*
>> > +    * Vote for the majority object among the existing ones.
>> > +    *
>> > +    * The linear majority vote algorithm by Boyer and Moore is used:
>> > +    * http://www.cs.utexas.edu/~moore/best-ideas/mjrty/
>> > +    */
>> > +
>> > +   int count = 0;
>> > +   struct vdi_check_work *majority = NULL;
>> > +
>> > +   for (int i = 0; i < info->nr_copies; i++) {
>> > +           struct vdi_check_work *vcw = &info->vcw[i];
>> > +
>> > +           if (!vcw->object_found)
>> > +                   continue;
>> > +
>> > +           if (!count)
>> > +                   majority = vcw;
>> > +
>> > +           if (!memcmp(majority->hash, vcw->hash, sizeof(vcw->hash)))
>> > +                   count++;
>> > +           else
>> > +                   count--;
>> > +   }
>> > +
>> > +   if (!majority)
>> > +           info->result = VDI_CHECK_NO_OBJ_FOUND;
>> > +   else if (count < info->nr_copies / 2) {
>> > +           /* no majority found */
>> > +           majority = NULL;
>> > +           info->result = VDI_CHECK_NO_MAJORITY_FOUND;
>> > +   } else
>> > +           info->result = VDI_CHECK_SUCCESS;
>> > +
>> > +   info->majority = majority;
>> > +}
>> > +
>> >  static void vdi_check_object_main(struct work *work)
>> >  {
>> >     struct vdi_check_work *vcw = container_of(work, struct vdi_check_work,
>> > @@ -1627,8 +1684,10 @@ static void vdi_check_object_main(struct work *work)
>> >
>> >     if (is_erasure_oid(info->oid, info->copy_policy))
>> >             check_erasure_object(info);
>> > -   else
>> > +   else {
>> > +           vote_majority_object(info);
>> >             check_replicatoin_object(info);
>> > +   }
>> >
>> >     if (info->refcnt == 0)
>> >             free_vdi_check_info(info);
>> > diff --git a/tests/functional/077 b/tests/functional/077
>> > new file mode 100755
>> > index 0000000..dd743bd
>> > --- /dev/null
>> > +++ b/tests/functional/077
>> > @@ -0,0 +1,85 @@
>> > +#!/bin/bash
>> > +
>> > +# Test vdi repair functionality
>> > +
>> > +. ./common
>> > +
>> > +for i in `seq 0 2`; do
>> > +    _start_sheep $i
>> > +done
>> > +
>> > +_wait_for_sheep 3
>> > +
>> > +_cluster_format
>> > +
>> > +$DOG vdi create test 12M
>> > +_vdi_list
>> > +
>> > +echo "original data" | $DOG vdi write test
>> > +
>> > +$DOG cluster shutdown
>> > +
>> > +# single object lost
>> > +
>> > +rm $STORE/0/obj/007c2b2500000000
>> > +
>> > +for i in `seq 0 2`; do
>> > +    _start_sheep $i
>> > +done
>> > +
>> > +_wait_for_sheep 3
>> > +
>> > +$DOG vdi check test
>> > +
>> > +$DOG vdi read test 0 14
>> > +
>> > +$DOG cluster shutdown
>> > +
>> > +# single broken object
>> > +
>> > +dd if=/dev/urandom of=$STORE/1/obj/007c2b2500000000 bs=4096 count=1024 &> /dev/null
>> > +
>> > +for i in `seq 0 2`; do
>> > +    _start_sheep $i
>> > +done
>> > +
>> > +_wait_for_sheep 3
>> > +
>> > +$DOG vdi check test
>> > +
>> > +$DOG vdi read test 0 14
>> > +
>> > +$DOG cluster shutdown
>> > +
>> > +# two objects lost
>> > +
>> > +rm $STORE/0/obj/007c2b2500000000
>> > +rm $STORE/1/obj/007c2b2500000000
>> > +
>> > +for i in `seq 0 2`; do
>> > +    _start_sheep $i
>> > +done
>> > +
>> > +_wait_for_sheep 3
>> > +
>> > +$DOG vdi check test
>> > +
>> > +$DOG vdi read test 0 14
>> > +
>> > +$DOG cluster shutdown
>> > +
>> > +# single object lost, single broken object. no majority
>> > +
>> > +rm $STORE/0/obj/007c2b2500000000
>> > +dd if=/dev/urandom of=$STORE/1/obj/007c2b2500000000 bs=4096 count=1024 &> /dev/null
>> > +
>> > +for i in `seq 0 2`; do
>> > +    _start_sheep $i
>> > +done
>> > +
>> > +_wait_for_sheep 3
>> > +
>> > +$DOG vdi check test
>> > +
>> > +$DOG cluster shutdown
>> > +
>> > diff --git a/tests/functional/077.out b/tests/functional/077.out
>> > new file mode 100644
>> > index 0000000..5bddb9b
>> > --- /dev/null
>> > +++ b/tests/functional/077.out
>> > @@ -0,0 +1,16 @@
>> > +QA output created by 077
>> > +using backend plain store
>> > +  Name        Id    Size    Used  Shared    Creation time   VDI id  Copies  Tag
>> > +  test         0   12 MB  0.0 MB  0.0 MB DATE   7c2b25     3
>> > +fixed missing 7c2b2500000000
>> > +finish check&repair test
>> > +original data
>> > +fixed replica 7c2b2500000000
>> > +finish check&repair test
>> > +original data
>> > +fixed missing 7c2b2500000000
>> > +fixed missing 7c2b2500000000
>> > +finish check&repair test
>> > +original data
>> > +no majority of 7c2b2500000000
>> > +finish check&repair test
>> > diff --git a/tests/functional/group b/tests/functional/group
>> > index 8d15ffe..22ec578 100644
>> > --- a/tests/functional/group
>> > +++ b/tests/functional/group
>> > @@ -91,3 +91,4 @@
>> >  074 auto quick clster md
>> >  075 auto quick vdi md
>> >  076 auto quick vdi md
>> > +077 auto quick vdi
>> > --
>> > 1.7.10.4
>> >



More information about the sheepdog mailing list