[sheepdog] [PATCH v3] dog: repair objects based on majority vote
Hitoshi Mitake
mitake.hitoshi at gmail.com
Tue Nov 12 03:36:26 CET 2013
At Mon, 11 Nov 2013 22:11:10 +0900,
Hitoshi Mitake wrote:
>
> From: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
>
> The current object repair strategy of "dog vdi check" doesn't work well
> if objects are corrupted. This patch adds a majority voting mechanism
> to the command.
>
> Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
> ---
>
> v3: use _random() for test 077
>
> v2: determine majority based on the number of live copies
>
> dog/vdi.c | 84 ++++++++++++++++++++++++++++++++++++++++-------
> tests/functional/077 | 85 ++++++++++++++++++++++++++++++++++++++++++++++++
> tests/functional/077.out | 16 +++++++++
> tests/functional/group | 1 +
> 4 files changed, 174 insertions(+), 12 deletions(-)
> create mode 100755 tests/functional/077
> create mode 100644 tests/functional/077.out
Sorry, please ignore this one. I'll send v2, which separates the test
from the code change to allow smooth backporting to stable branches.
Thanks,
Hitoshi
>
> diff --git a/dog/vdi.c b/dog/vdi.c
> index d9a9a0f..124fef4 100644
> --- a/dog/vdi.c
> +++ b/dog/vdi.c
> @@ -1434,6 +1434,12 @@ struct vdi_check_work {
> struct work work;
> };
>
> +enum vdi_check_result {
> + VDI_CHECK_NO_OBJ_FOUND, /* no copy reported the object as found */
> + VDI_CHECK_NO_MAJORITY_FOUND, /* copies found, but the vote had no winner */
> + VDI_CHECK_SUCCESS, /* a majority copy was selected */
> +};
> +
> struct vdi_check_info {
> uint64_t oid;
> uint8_t nr_copies;
> @@ -1442,7 +1448,8 @@ struct vdi_check_info {
> uint64_t *done;
> int refcnt;
> struct work_queue *wq;
> - struct vdi_check_work *base;
> + enum vdi_check_result result;
> + struct vdi_check_work *majority;
> struct vdi_check_work vcw[0];
> };
>
> @@ -1462,7 +1469,7 @@ static void vdi_repair_work(struct work *work)
> struct vdi_check_info *info = vcw->info;
> void *buf;
>
> - buf = read_object_from(info->base->vnode, info->oid);
> + buf = read_object_from(info->majority->vnode, info->oid);
> write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0);
> free(buf);
> }
> @@ -1511,10 +1518,8 @@ static void vdi_check_object_work(struct work *work)
> switch (rsp->result) {
> case SD_RES_SUCCESS:
> vcw->object_found = true;
> - if (!is_erasure_oid(info->oid, info->copy_policy)) {
> + if (!is_erasure_oid(info->oid, info->copy_policy))
> memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash));
> - uatomic_set(&info->base, vcw);
> - }
> break;
> case SD_RES_NO_OBJ:
> vcw->object_found = false;
> @@ -1530,18 +1535,31 @@ static void vdi_check_object_work(struct work *work)
>
> static void check_replicatoin_object(struct vdi_check_info *info)
> {
> - if (info->base == NULL) {
> - sd_err("no node has %" PRIx64, info->oid);
> - exit(EXIT_FAILURE);
> + if (info->majority == NULL) {
> + switch (info->result) {
> + case VDI_CHECK_NO_OBJ_FOUND:
> + sd_err("no node has %" PRIx64, info->oid);
> + break;
> + case VDI_CHECK_NO_MAJORITY_FOUND:
> + sd_err("no majority of %" PRIx64, info->oid);
> + break;
> + default:
> + sd_err("unknown result of vdi check: %d", info->result);
> + exit(EXIT_FAILURE);
> + break;
> + }
> +
> + /* do nothing */
> + return;
> }
>
> for (int i = 0; i < info->nr_copies; i++) {
> - if (&info->vcw[i] == info->base)
> + if (&info->vcw[i] == info->majority)
> continue;
> /* need repair when object not found or consistency broken */
> if (!info->vcw[i].object_found ||
> - memcmp(info->base->hash, info->vcw[i].hash,
> - sizeof(info->base->hash)) != 0) {
> + memcmp(info->majority->hash, info->vcw[i].hash,
> + sizeof(info->majority->hash)) != 0) {
> info->vcw[i].work.fn = vdi_repair_work;
> info->vcw[i].work.done = vdi_repair_main;
> info->refcnt++;
> @@ -1615,6 +1633,46 @@ out:
> ec_destroy(ctx);
> }
>
> +static void vote_majority_object(struct vdi_check_info *info)
> +{
> +	/*
> +	 * Boyer-Moore linear majority vote (MJRTY) over the live copies:
> +	 * http://www.cs.utexas.edu/~moore/best-ideas/mjrty/
> +	 * Pass 1 only nominates a candidate; pass 2 must recount it, or e.g.
> +	 * three mutually different copies would "elect" the last one.
> +	 */
> +	int count = 0, nr_live_copies = 0;
> +	struct vdi_check_work *majority = NULL;
> +
> +	for (int i = 0; i < info->nr_copies; i++) {
> +		struct vdi_check_work *vcw = &info->vcw[i];
> +
> +		if (!vcw->object_found)
> +			continue;
> +		nr_live_copies++;
> +		if (!count)
> +			majority = vcw;
> +		if (!memcmp(majority->hash, vcw->hash, sizeof(vcw->hash)))
> +			count++;
> +		else
> +			count--;
> +	}
> +
> +	count = 0;	/* pass 2: count live copies matching the candidate */
> +	for (int i = 0; majority && i < info->nr_copies; i++)
> +		count += info->vcw[i].object_found && !memcmp(majority->hash,
> +				info->vcw[i].hash, sizeof(majority->hash));
> +
> +	if (!majority)
> +		info->result = VDI_CHECK_NO_OBJ_FOUND;
> +	else if (count * 2 > nr_live_copies)	/* strict majority only */
> +		info->result = VDI_CHECK_SUCCESS;
> +	else
> +		info->result = VDI_CHECK_NO_MAJORITY_FOUND;
> +
> +	info->majority = info->result == VDI_CHECK_SUCCESS ? majority : NULL;
> +}
> +
> static void vdi_check_object_main(struct work *work)
> {
> struct vdi_check_work *vcw = container_of(work, struct vdi_check_work,
> @@ -1627,8 +1685,10 @@ static void vdi_check_object_main(struct work *work)
>
> if (is_erasure_oid(info->oid, info->copy_policy))
> check_erasure_object(info);
> - else
> + else {
> + vote_majority_object(info);
> check_replicatoin_object(info);
> + }
>
> if (info->refcnt == 0)
> free_vdi_check_info(info);
> diff --git a/tests/functional/077 b/tests/functional/077
> new file mode 100755
> index 0000000..71db820
> --- /dev/null
> +++ b/tests/functional/077
> @@ -0,0 +1,85 @@
> +#!/bin/bash
> +
> +# Test vdi repair functionality
> +
> +. ./common
> +
> +for i in `seq 0 2`; do
> + _start_sheep $i
> +done
> +
> +_wait_for_sheep 3
> +
> +_cluster_format
> +
> +$DOG vdi create test 12M
> +_vdi_list
> +
> +echo "original data" | $DOG vdi write test
> +
> +$DOG cluster shutdown
> +
> +# single object lost
> +
> +rm $STORE/0/obj/007c2b2500000000
> +
> +for i in `seq 0 2`; do
> + _start_sheep $i
> +done
> +
> +_wait_for_sheep 3
> +
> +$DOG vdi check test
> +
> +$DOG vdi read test 0 14
> +
> +$DOG cluster shutdown
> +
> +# single broken object
> +
> +_random | dd of=$STORE/1/obj/007c2b2500000000 bs=4096 count=1024 &> /dev/null
> +
> +for i in `seq 0 2`; do
> + _start_sheep $i
> +done
> +
> +_wait_for_sheep 3
> +
> +$DOG vdi check test
> +
> +$DOG vdi read test 0 14
> +
> +$DOG cluster shutdown
> +
> +# two objects lost
> +
> +rm $STORE/0/obj/007c2b2500000000
> +rm $STORE/1/obj/007c2b2500000000
> +
> +for i in `seq 0 2`; do
> + _start_sheep $i
> +done
> +
> +_wait_for_sheep 3
> +
> +$DOG vdi check test
> +
> +$DOG vdi read test 0 14
> +
> +$DOG cluster shutdown
> +
> +# single object lost, single broken object. no majority
> +
> +rm $STORE/0/obj/007c2b2500000000
> +_random | dd of=$STORE/1/obj/007c2b2500000000 bs=4096 count=1024 &> /dev/null
> +
> +for i in `seq 0 2`; do
> + _start_sheep $i
> +done
> +
> +_wait_for_sheep 3
> +
> +$DOG vdi check test
> +
> +$DOG cluster shutdown
> +
> diff --git a/tests/functional/077.out b/tests/functional/077.out
> new file mode 100644
> index 0000000..5bddb9b
> --- /dev/null
> +++ b/tests/functional/077.out
> @@ -0,0 +1,16 @@
> +QA output created by 077
> +using backend plain store
> + Name Id Size Used Shared Creation time VDI id Copies Tag
> + test 0 12 MB 0.0 MB 0.0 MB DATE 7c2b25 3
> +fixed missing 7c2b2500000000
> +finish check&repair test
> +original data
> +fixed replica 7c2b2500000000
> +finish check&repair test
> +original data
> +fixed missing 7c2b2500000000
> +fixed missing 7c2b2500000000
> +finish check&repair test
> +original data
> +no majority of 7c2b2500000000
> +finish check&repair test
> diff --git a/tests/functional/group b/tests/functional/group
> index 8d15ffe..22ec578 100644
> --- a/tests/functional/group
> +++ b/tests/functional/group
> @@ -91,3 +91,4 @@
> 074 auto quick clster md
> 075 auto quick vdi md
> 076 auto quick vdi md
> +077 auto quick vdi
> --
> 1.8.1.2
>
More information about the sheepdog
mailing list