[sheepdog] [PATCH v3] dog: repair objects based on majority vote
Hitoshi Mitake
mitake.hitoshi at gmail.com
Mon Nov 11 14:11:10 CET 2013
From: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
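The current object repair strategy of "dog vdi check" doesn't work well if
objects are corrupted: it takes an arbitrary replica that was read
successfully as the reference copy, so a corrupted replica can be propagated
to the healthy ones. This patch adds majority voting among the replicas to
the command.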
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
v3: use _random() for test 077
v2: determine majority based on the number of live copies
dog/vdi.c | 84 ++++++++++++++++++++++++++++++++++++++++-------
tests/functional/077 | 85 ++++++++++++++++++++++++++++++++++++++++++++++++
tests/functional/077.out | 16 +++++++++
tests/functional/group | 1 +
4 files changed, 174 insertions(+), 12 deletions(-)
create mode 100755 tests/functional/077
create mode 100644 tests/functional/077.out
diff --git a/dog/vdi.c b/dog/vdi.c
index d9a9a0f..124fef4 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -1434,6 +1434,12 @@ struct vdi_check_work {
struct work work;
};
+enum vdi_check_result { /* outcome of the replica vote, stored in vdi_check_info.result */
+ VDI_CHECK_NO_OBJ_FOUND, /* no replica had object_found set */
+ VDI_CHECK_NO_MAJORITY_FOUND, /* live replicas exist but the vote accepted none */
+ VDI_CHECK_SUCCESS, /* vdi_check_info.majority points at the chosen replica */
+};
+
struct vdi_check_info {
uint64_t oid;
uint8_t nr_copies;
@@ -1442,7 +1448,8 @@ struct vdi_check_info {
uint64_t *done;
int refcnt;
struct work_queue *wq;
- struct vdi_check_work *base;
+ enum vdi_check_result result;
+ struct vdi_check_work *majority;
struct vdi_check_work vcw[0];
};
@@ -1462,7 +1469,7 @@ static void vdi_repair_work(struct work *work)
struct vdi_check_info *info = vcw->info;
void *buf;
- buf = read_object_from(info->base->vnode, info->oid);
+ buf = read_object_from(info->majority->vnode, info->oid);
write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0);
free(buf);
}
@@ -1511,10 +1518,8 @@ static void vdi_check_object_work(struct work *work)
switch (rsp->result) {
case SD_RES_SUCCESS:
vcw->object_found = true;
- if (!is_erasure_oid(info->oid, info->copy_policy)) {
+ if (!is_erasure_oid(info->oid, info->copy_policy))
memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash));
- uatomic_set(&info->base, vcw);
- }
break;
case SD_RES_NO_OBJ:
vcw->object_found = false;
@@ -1530,18 +1535,31 @@ static void vdi_check_object_work(struct work *work)
static void check_replicatoin_object(struct vdi_check_info *info)
{
- if (info->base == NULL) {
- sd_err("no node has %" PRIx64, info->oid);
- exit(EXIT_FAILURE);
+ if (info->majority == NULL) {
+ switch (info->result) {
+ case VDI_CHECK_NO_OBJ_FOUND:
+ sd_err("no node has %" PRIx64, info->oid);
+ break;
+ case VDI_CHECK_NO_MAJORITY_FOUND:
+ sd_err("no majority of %" PRIx64, info->oid);
+ break;
+ default:
+ sd_err("unknown result of vdi check: %d", info->result);
+ exit(EXIT_FAILURE);
+ break;
+ }
+
+ /* do nothing */
+ return;
}
for (int i = 0; i < info->nr_copies; i++) {
- if (&info->vcw[i] == info->base)
+ if (&info->vcw[i] == info->majority)
continue;
/* need repair when object not found or consistency broken */
if (!info->vcw[i].object_found ||
- memcmp(info->base->hash, info->vcw[i].hash,
- sizeof(info->base->hash)) != 0) {
+ memcmp(info->majority->hash, info->vcw[i].hash,
+ sizeof(info->majority->hash)) != 0) {
info->vcw[i].work.fn = vdi_repair_work;
info->vcw[i].work.done = vdi_repair_main;
info->refcnt++;
@@ -1615,6 +1633,46 @@ out:
ec_destroy(ctx);
}
+/*
+ * Choose the replica that the other copies of info->oid are repaired from.
+ *
+ * Only replicas with object_found set take part in the vote.  On return
+ * info->majority points at a replica whose hash is shared by more than half
+ * of those live copies, or is NULL, and info->result says why:
+ * VDI_CHECK_NO_OBJ_FOUND (no live copy), VDI_CHECK_NO_MAJORITY_FOUND
+ * (no hash is held by more than half of the live copies) or
+ * VDI_CHECK_SUCCESS.
+ */
+static void vote_majority_object(struct vdi_check_info *info)
+{
+	/*
+	 * Voting majority object from existing ones.
+	 *
+	 * The linear majority vote algorithm by Boyer and Moore is used:
+	 * http://www.cs.utexas.edu/~moore/best-ideas/mjrty/
+	 */
+
+	int count = 0, nr_live_copies = 0, nr_votes = 0;
+	struct vdi_check_work *majority = NULL;
+
+	/* Pass 1: pairing phase, leaves the only possible majority candidate */
+	for (int i = 0; i < info->nr_copies; i++) {
+		struct vdi_check_work *vcw = &info->vcw[i];
+
+		if (!vcw->object_found)
+			continue;
+		nr_live_copies++;
+
+		if (!count)
+			majority = vcw;
+
+		if (!memcmp(majority->hash, vcw->hash, sizeof(vcw->hash)))
+			count++;
+		else
+			count--;
+	}
+
+	/*
+	 * Pass 2: the counter left by pass 1 is not the number of votes (e.g.
+	 * three different hashes end with count == 1, and A,A,A,B,C also ends
+	 * with count == 1), so count the candidate's real supporters.
+	 */
+	for (int i = 0; majority && i < info->nr_copies; i++) {
+		struct vdi_check_work *vcw = &info->vcw[i];
+
+		if (vcw->object_found &&
+		    !memcmp(majority->hash, vcw->hash, sizeof(vcw->hash)))
+			nr_votes++;
+	}
+
+	if (!majority)
+		info->result = VDI_CHECK_NO_OBJ_FOUND;
+	else if (nr_votes * 2 <= nr_live_copies) {
+		/* no majority found: candidate is not held by more than half */
+		majority = NULL;
+		info->result = VDI_CHECK_NO_MAJORITY_FOUND;
+	} else
+		info->result = VDI_CHECK_SUCCESS;
+
+	info->majority = majority;
+}
+
static void vdi_check_object_main(struct work *work)
{
struct vdi_check_work *vcw = container_of(work, struct vdi_check_work,
@@ -1627,8 +1685,10 @@ static void vdi_check_object_main(struct work *work)
if (is_erasure_oid(info->oid, info->copy_policy))
check_erasure_object(info);
- else
+ else {
+ vote_majority_object(info);
check_replicatoin_object(info);
+ }
if (info->refcnt == 0)
free_vdi_check_info(info);
diff --git a/tests/functional/077 b/tests/functional/077
new file mode 100755
index 0000000..71db820
--- /dev/null
+++ b/tests/functional/077
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+# Test that "dog vdi check" repairs replicated objects after loss or corruption
+
+. ./common
+
+for i in `seq 0 2`; do
+ _start_sheep $i
+done
+
+_wait_for_sheep 3
+
+_cluster_format
+
+$DOG vdi create test 12M
+_vdi_list
+
+echo "original data" | $DOG vdi write test
+
+$DOG cluster shutdown
+
+# Case 1: the object is removed from one store; the check should restore it
+
+rm $STORE/0/obj/007c2b2500000000
+
+for i in `seq 0 2`; do
+ _start_sheep $i
+done
+
+_wait_for_sheep 3
+
+$DOG vdi check test
+
+$DOG vdi read test 0 14
+
+$DOG cluster shutdown
+
+# Case 2: the object in one store is overwritten with random data
+
+_random | dd of=$STORE/1/obj/007c2b2500000000 bs=4096 count=1024 &> /dev/null
+
+for i in `seq 0 2`; do
+ _start_sheep $i
+done
+
+_wait_for_sheep 3
+
+$DOG vdi check test
+
+$DOG vdi read test 0 14
+
+$DOG cluster shutdown
+
+# Case 3: the object is removed from two stores; only one copy survives
+
+rm $STORE/0/obj/007c2b2500000000
+rm $STORE/1/obj/007c2b2500000000
+
+for i in `seq 0 2`; do
+ _start_sheep $i
+done
+
+_wait_for_sheep 3
+
+$DOG vdi check test
+
+$DOG vdi read test 0 14
+
+$DOG cluster shutdown
+
+# Case 4: one copy removed and one corrupted, so the two live copies differ: no majority
+
+rm $STORE/0/obj/007c2b2500000000
+_random | dd of=$STORE/1/obj/007c2b2500000000 bs=4096 count=1024 &> /dev/null
+
+for i in `seq 0 2`; do
+ _start_sheep $i
+done
+
+_wait_for_sheep 3
+
+$DOG vdi check test
+
+$DOG cluster shutdown
+
diff --git a/tests/functional/077.out b/tests/functional/077.out
new file mode 100644
index 0000000..5bddb9b
--- /dev/null
+++ b/tests/functional/077.out
@@ -0,0 +1,16 @@
+QA output created by 077
+using backend plain store
+ Name Id Size Used Shared Creation time VDI id Copies Tag
+ test 0 12 MB 0.0 MB 0.0 MB DATE 7c2b25 3
+fixed missing 7c2b2500000000
+finish check&repair test
+original data
+fixed replica 7c2b2500000000
+finish check&repair test
+original data
+fixed missing 7c2b2500000000
+fixed missing 7c2b2500000000
+finish check&repair test
+original data
+no majority of 7c2b2500000000
+finish check&repair test
diff --git a/tests/functional/group b/tests/functional/group
index 8d15ffe..22ec578 100644
--- a/tests/functional/group
+++ b/tests/functional/group
@@ -91,3 +91,4 @@
074 auto quick clster md
075 auto quick vdi md
076 auto quick vdi md
+077 auto quick vdi
--
1.8.1.2
More information about the sheepdog
mailing list