[sheepdog] [PATCH] dog: repair objects based on majority vote
Hitoshi Mitake
mitake.hitoshi at gmail.com
Fri Nov 8 02:06:02 CET 2013
At Wed, 06 Nov 2013 11:42:33 +0900,
Hitoshi Mitake wrote:
>
> At Tue, 5 Nov 2013 10:54:25 +0900,
> Hitoshi Mitake wrote:
> >
> > The current object repair strategy of "dog vdi check" doesn't work well
> > if objects are corrupted. This patch adds a majority-voting mechanism
> > to the command.
> >
> > Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
> > ---
> > dog/vdi.c | 83 +++++++++++++++++++++++++++++++++++++-------
> > tests/functional/077 | 85 ++++++++++++++++++++++++++++++++++++++++++++++
> > tests/functional/077.out | 16 +++++++++
> > tests/functional/group | 1 +
> > 4 files changed, 173 insertions(+), 12 deletions(-)
> > create mode 100755 tests/functional/077
> > create mode 100644 tests/functional/077.out
>
> ping?
ping?
>
> >
> > diff --git a/dog/vdi.c b/dog/vdi.c
> > index d9a9a0f..27d03a2 100644
> > --- a/dog/vdi.c
> > +++ b/dog/vdi.c
> > @@ -1434,6 +1434,12 @@ struct vdi_check_work {
> > struct work work;
> > };
> >
> > +enum vdi_check_result {
> > + VDI_CHECK_NO_OBJ_FOUND,
> > + VDI_CHECK_NO_MAJORITY_FOUND,
> > + VDI_CHECK_SUCCESS,
> > +};
> > +
> > struct vdi_check_info {
> > uint64_t oid;
> > uint8_t nr_copies;
> > @@ -1442,7 +1448,8 @@ struct vdi_check_info {
> > uint64_t *done;
> > int refcnt;
> > struct work_queue *wq;
> > - struct vdi_check_work *base;
> > + enum vdi_check_result result;
> > + struct vdi_check_work *majority;
> > struct vdi_check_work vcw[0];
> > };
> >
> > @@ -1462,7 +1469,7 @@ static void vdi_repair_work(struct work *work)
> > struct vdi_check_info *info = vcw->info;
> > void *buf;
> >
> > - buf = read_object_from(info->base->vnode, info->oid);
> > + buf = read_object_from(info->majority->vnode, info->oid);
> > write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0);
> > free(buf);
> > }
> > @@ -1511,10 +1518,8 @@ static void vdi_check_object_work(struct work *work)
> > switch (rsp->result) {
> > case SD_RES_SUCCESS:
> > vcw->object_found = true;
> > - if (!is_erasure_oid(info->oid, info->copy_policy)) {
> > + if (!is_erasure_oid(info->oid, info->copy_policy))
> > memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash));
> > - uatomic_set(&info->base, vcw);
> > - }
> > break;
> > case SD_RES_NO_OBJ:
> > vcw->object_found = false;
> > @@ -1530,18 +1535,31 @@ static void vdi_check_object_work(struct work *work)
> >
> > static void check_replicatoin_object(struct vdi_check_info *info)
> > {
> > - if (info->base == NULL) {
> > - sd_err("no node has %" PRIx64, info->oid);
> > - exit(EXIT_FAILURE);
> > + if (info->majority == NULL) {
> > + switch (info->result) {
> > + case VDI_CHECK_NO_OBJ_FOUND:
> > + sd_err("no node has %" PRIx64, info->oid);
> > + break;
> > + case VDI_CHECK_NO_MAJORITY_FOUND:
> > + sd_err("no majority of %" PRIx64, info->oid);
> > + break;
> > + default:
> > + sd_err("unknown result of vdi check: %d", info->result);
> > + exit(EXIT_FAILURE);
> > + break;
> > + }
> > +
> > + /* do nothing */
> > + return;
> > }
> >
> > for (int i = 0; i < info->nr_copies; i++) {
> > - if (&info->vcw[i] == info->base)
> > + if (&info->vcw[i] == info->majority)
> > continue;
> > /* need repair when object not found or consistency broken */
> > if (!info->vcw[i].object_found ||
> > - memcmp(info->base->hash, info->vcw[i].hash,
> > - sizeof(info->base->hash)) != 0) {
> > + memcmp(info->majority->hash, info->vcw[i].hash,
> > + sizeof(info->majority->hash)) != 0) {
> > info->vcw[i].work.fn = vdi_repair_work;
> > info->vcw[i].work.done = vdi_repair_main;
> > info->refcnt++;
> > @@ -1615,6 +1633,45 @@ out:
> > ec_destroy(ctx);
> > }
> >
> > +static void vote_majority_object(struct vdi_check_info *info)
> > +{
> > + /*
> > + * Vote for the majority object among the existing ones.
> > + *
> > + * The linear majority vote algorithm by Boyer and Moore is used:
> > + * http://www.cs.utexas.edu/~moore/best-ideas/mjrty/
> > + */
> > +
> > + int count = 0;
> > + struct vdi_check_work *majority = NULL;
> > +
> > + for (int i = 0; i < info->nr_copies; i++) {
> > + struct vdi_check_work *vcw = &info->vcw[i];
> > +
> > + if (!vcw->object_found)
> > + continue;
> > +
> > + if (!count)
> > + majority = vcw;
> > +
> > + if (!memcmp(majority->hash, vcw->hash, sizeof(vcw->hash)))
> > + count++;
> > + else
> > + count--;
> > + }
> > +
> > + if (!majority)
> > + info->result = VDI_CHECK_NO_OBJ_FOUND;
> > + else if (count < info->nr_copies / 2) {
> > + /* no majority found */
> > + majority = NULL;
> > + info->result = VDI_CHECK_NO_MAJORITY_FOUND;
> > + } else
> > + info->result = VDI_CHECK_SUCCESS;
> > +
> > + info->majority = majority;
> > +}
> > +
> > static void vdi_check_object_main(struct work *work)
> > {
> > struct vdi_check_work *vcw = container_of(work, struct vdi_check_work,
> > @@ -1627,8 +1684,10 @@ static void vdi_check_object_main(struct work *work)
> >
> > if (is_erasure_oid(info->oid, info->copy_policy))
> > check_erasure_object(info);
> > - else
> > + else {
> > + vote_majority_object(info);
> > check_replicatoin_object(info);
> > + }
> >
> > if (info->refcnt == 0)
> > free_vdi_check_info(info);
> > diff --git a/tests/functional/077 b/tests/functional/077
> > new file mode 100755
> > index 0000000..dd743bd
> > --- /dev/null
> > +++ b/tests/functional/077
> > @@ -0,0 +1,85 @@
> > +#!/bin/bash
> > +
> > +# Test vdi repair functionality
> > +
> > +. ./common
> > +
> > +for i in `seq 0 2`; do
> > + _start_sheep $i
> > +done
> > +
> > +_wait_for_sheep 3
> > +
> > +_cluster_format
> > +
> > +$DOG vdi create test 12M
> > +_vdi_list
> > +
> > +echo "original data" | $DOG vdi write test
> > +
> > +$DOG cluster shutdown
> > +
> > +# single object lost
> > +
> > +rm $STORE/0/obj/007c2b2500000000
> > +
> > +for i in `seq 0 2`; do
> > + _start_sheep $i
> > +done
> > +
> > +_wait_for_sheep 3
> > +
> > +$DOG vdi check test
> > +
> > +$DOG vdi read test 0 14
> > +
> > +$DOG cluster shutdown
> > +
> > +# single broken object
> > +
> > +dd if=/dev/urandom of=$STORE/1/obj/007c2b2500000000 bs=4096 count=1024 &> /dev/null
> > +
> > +for i in `seq 0 2`; do
> > + _start_sheep $i
> > +done
> > +
> > +_wait_for_sheep 3
> > +
> > +$DOG vdi check test
> > +
> > +$DOG vdi read test 0 14
> > +
> > +$DOG cluster shutdown
> > +
> > +# two objects lost
> > +
> > +rm $STORE/0/obj/007c2b2500000000
> > +rm $STORE/1/obj/007c2b2500000000
> > +
> > +for i in `seq 0 2`; do
> > + _start_sheep $i
> > +done
> > +
> > +_wait_for_sheep 3
> > +
> > +$DOG vdi check test
> > +
> > +$DOG vdi read test 0 14
> > +
> > +$DOG cluster shutdown
> > +
> > +# single object lost, single broken object. no majority
> > +
> > +rm $STORE/0/obj/007c2b2500000000
> > +dd if=/dev/urandom of=$STORE/1/obj/007c2b2500000000 bs=4096 count=1024 &> /dev/null
> > +
> > +for i in `seq 0 2`; do
> > + _start_sheep $i
> > +done
> > +
> > +_wait_for_sheep 3
> > +
> > +$DOG vdi check test
> > +
> > +$DOG cluster shutdown
> > +
> > diff --git a/tests/functional/077.out b/tests/functional/077.out
> > new file mode 100644
> > index 0000000..5bddb9b
> > --- /dev/null
> > +++ b/tests/functional/077.out
> > @@ -0,0 +1,16 @@
> > +QA output created by 077
> > +using backend plain store
> > + Name Id Size Used Shared Creation time VDI id Copies Tag
> > + test 0 12 MB 0.0 MB 0.0 MB DATE 7c2b25 3
> > +fixed missing 7c2b2500000000
> > +finish check&repair test
> > +original data
> > +fixed replica 7c2b2500000000
> > +finish check&repair test
> > +original data
> > +fixed missing 7c2b2500000000
> > +fixed missing 7c2b2500000000
> > +finish check&repair test
> > +original data
> > +no majority of 7c2b2500000000
> > +finish check&repair test
> > diff --git a/tests/functional/group b/tests/functional/group
> > index 8d15ffe..22ec578 100644
> > --- a/tests/functional/group
> > +++ b/tests/functional/group
> > @@ -91,3 +91,4 @@
> > 074 auto quick clster md
> > 075 auto quick vdi md
> > 076 auto quick vdi md
> > +077 auto quick vdi
> > --
> > 1.7.10.4
> >
More information about the sheepdog
mailing list