[sheepdog] [PATCH v3] dog: repair objects based on majority vote

Hitoshi Mitake mitake.hitoshi at gmail.com
Mon Nov 11 14:11:10 CET 2013


From: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>

The current object repair strategy of "dog vdi check" doesn't work well
if objects are corrupted. This patch adds a majority voting mechanism
to the command, so that replicas are repaired from the majority copy.

Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---

v3: use _random() for test 077

v2: determine majority based on the number of live copies

 dog/vdi.c                | 84 ++++++++++++++++++++++++++++++++++++++++-------
 tests/functional/077     | 85 ++++++++++++++++++++++++++++++++++++++++++++++++
 tests/functional/077.out | 16 +++++++++
 tests/functional/group   |  1 +
 4 files changed, 174 insertions(+), 12 deletions(-)
 create mode 100755 tests/functional/077
 create mode 100644 tests/functional/077.out

diff --git a/dog/vdi.c b/dog/vdi.c
index d9a9a0f..124fef4 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -1434,6 +1434,12 @@ struct vdi_check_work {
 	struct work work;
 };
 
+enum vdi_check_result {	/* outcome of the majority vote for one object */
+	VDI_CHECK_NO_OBJ_FOUND,		/* no copy of the object was found */
+	VDI_CHECK_NO_MAJORITY_FOUND,	/* copies exist, but no majority */
+	VDI_CHECK_SUCCESS,		/* a majority copy was selected */
+};
+
 struct vdi_check_info {
 	uint64_t oid;
 	uint8_t nr_copies;
@@ -1442,7 +1448,8 @@ struct vdi_check_info {
 	uint64_t *done;
 	int refcnt;
 	struct work_queue *wq;
-	struct vdi_check_work *base;
+	enum vdi_check_result result;
+	struct vdi_check_work *majority;
 	struct vdi_check_work vcw[0];
 };
 
@@ -1462,7 +1469,7 @@ static void vdi_repair_work(struct work *work)
 	struct vdi_check_info *info = vcw->info;
 	void *buf;
 
-	buf = read_object_from(info->base->vnode, info->oid);
+	buf = read_object_from(info->majority->vnode, info->oid);
 	write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0);
 	free(buf);
 }
@@ -1511,10 +1518,8 @@ static void vdi_check_object_work(struct work *work)
 	switch (rsp->result) {
 	case SD_RES_SUCCESS:
 		vcw->object_found = true;
-		if (!is_erasure_oid(info->oid, info->copy_policy)) {
+		if (!is_erasure_oid(info->oid, info->copy_policy))
 			memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash));
-			uatomic_set(&info->base, vcw);
-		}
 		break;
 	case SD_RES_NO_OBJ:
 		vcw->object_found = false;
@@ -1530,18 +1535,31 @@ static void vdi_check_object_work(struct work *work)
 
 static void check_replicatoin_object(struct vdi_check_info *info)
 {
-	if (info->base == NULL) {
-		sd_err("no node has %" PRIx64, info->oid);
-		exit(EXIT_FAILURE);
+	if (info->majority == NULL) {
+		switch (info->result) {
+		case VDI_CHECK_NO_OBJ_FOUND:
+			sd_err("no node has %" PRIx64, info->oid);
+			break;
+		case VDI_CHECK_NO_MAJORITY_FOUND:
+			sd_err("no majority of %" PRIx64, info->oid);
+			break;
+		default:
+			sd_err("unknown result of vdi check: %d", info->result);
+			exit(EXIT_FAILURE);
+			break;
+		}
+
+		/* do nothing */
+		return;
 	}
 
 	for (int i = 0; i < info->nr_copies; i++) {
-		if (&info->vcw[i] == info->base)
+		if (&info->vcw[i] == info->majority)
 			continue;
 		/* need repair when object not found or consistency broken */
 		if (!info->vcw[i].object_found ||
-		    memcmp(info->base->hash, info->vcw[i].hash,
-			   sizeof(info->base->hash)) != 0) {
+		    memcmp(info->majority->hash, info->vcw[i].hash,
+			   sizeof(info->majority->hash)) != 0) {
 			info->vcw[i].work.fn = vdi_repair_work;
 			info->vcw[i].work.done = vdi_repair_main;
 			info->refcnt++;
@@ -1615,6 +1633,46 @@ out:
 	ec_destroy(ctx);
 }
 
+static void vote_majority_object(struct vdi_check_info *info)
+{
+	/*
+	 * The Boyer-Moore linear majority vote only nominates a candidate:
+	 * http://www.cs.utexas.edu/~moore/best-ideas/mjrty/
+	 * A second pass checks that over half of the live copies match it.
+	 */
+	int count = 0, nr_live_copies = 0, votes = 0;
+	struct vdi_check_work *majority = NULL;
+
+	for (int i = 0; i < info->nr_copies; i++) {
+		struct vdi_check_work *vcw = &info->vcw[i];
+
+		if (!vcw->object_found)
+			continue;
+		nr_live_copies++;
+
+		if (!count)
+			majority = vcw;
+		if (!memcmp(majority->hash, vcw->hash, sizeof(vcw->hash)))
+			count++;
+		else
+			count--;
+	}
+
+	for (int i = 0; majority && i < info->nr_copies; i++)
+		votes += info->vcw[i].object_found && !memcmp(majority->hash,
+				info->vcw[i].hash, sizeof(majority->hash));
+
+	if (!majority)
+		info->result = VDI_CHECK_NO_OBJ_FOUND;
+	else if (2 * votes <= nr_live_copies) {
+		majority = NULL;
+		info->result = VDI_CHECK_NO_MAJORITY_FOUND;
+	} else
+		info->result = VDI_CHECK_SUCCESS;
+
+	info->majority = majority;
+}
+
 static void vdi_check_object_main(struct work *work)
 {
 	struct vdi_check_work *vcw = container_of(work, struct vdi_check_work,
@@ -1627,8 +1685,10 @@ static void vdi_check_object_main(struct work *work)
 
 	if (is_erasure_oid(info->oid, info->copy_policy))
 		check_erasure_object(info);
-	else
+	else {
+		vote_majority_object(info);
 		check_replicatoin_object(info);
+	}
 
 	if (info->refcnt == 0)
 		free_vdi_check_info(info);
diff --git a/tests/functional/077 b/tests/functional/077
new file mode 100755
index 0000000..71db820
--- /dev/null
+++ b/tests/functional/077
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+# Test that "$DOG vdi check" repairs lost and broken copies of an object
+
+. ./common
+
+for i in `seq 0 2`; do
+    _start_sheep $i
+done
+
+_wait_for_sheep 3
+
+_cluster_format
+
+$DOG vdi create test 12M
+_vdi_list
+
+echo "original data" | $DOG vdi write test
+
+$DOG cluster shutdown
+
+# case 1: a single copy is lost (removed from $STORE/0)
+
+rm $STORE/0/obj/007c2b2500000000
+
+for i in `seq 0 2`; do
+    _start_sheep $i
+done
+
+_wait_for_sheep 3
+
+$DOG vdi check test
+
+$DOG vdi read test 0 14
+
+$DOG cluster shutdown
+
+# case 2: a single copy is broken (overwritten with _random output)
+
+_random | dd of=$STORE/1/obj/007c2b2500000000 bs=4096 count=1024 &> /dev/null
+
+for i in `seq 0 2`; do
+    _start_sheep $i
+done
+
+_wait_for_sheep 3
+
+$DOG vdi check test
+
+$DOG vdi read test 0 14
+
+$DOG cluster shutdown
+
+# case 3: two copies are lost (removed from $STORE/0 and $STORE/1)
+
+rm $STORE/0/obj/007c2b2500000000
+rm $STORE/1/obj/007c2b2500000000
+
+for i in `seq 0 2`; do
+    _start_sheep $i
+done
+
+_wait_for_sheep 3
+
+$DOG vdi check test
+
+$DOG vdi read test 0 14
+
+$DOG cluster shutdown
+
+# case 4: one copy lost and one copy broken, so no majority is expected
+
+rm $STORE/0/obj/007c2b2500000000
+_random | dd of=$STORE/1/obj/007c2b2500000000 bs=4096 count=1024 &> /dev/null
+
+for i in `seq 0 2`; do
+    _start_sheep $i
+done
+
+_wait_for_sheep 3
+
+$DOG vdi check test
+
+$DOG cluster shutdown
+
diff --git a/tests/functional/077.out b/tests/functional/077.out
new file mode 100644
index 0000000..5bddb9b
--- /dev/null
+++ b/tests/functional/077.out
@@ -0,0 +1,16 @@
+QA output created by 077
+using backend plain store
+  Name        Id    Size    Used  Shared    Creation time   VDI id  Copies  Tag
+  test         0   12 MB  0.0 MB  0.0 MB DATE   7c2b25     3              
+fixed missing 7c2b2500000000
+finish check&repair test
+original data
+fixed replica 7c2b2500000000
+finish check&repair test
+original data
+fixed missing 7c2b2500000000
+fixed missing 7c2b2500000000
+finish check&repair test
+original data
+no majority of 7c2b2500000000
+finish check&repair test
diff --git a/tests/functional/group b/tests/functional/group
index 8d15ffe..22ec578 100644
--- a/tests/functional/group
+++ b/tests/functional/group
@@ -91,3 +91,4 @@
 074 auto quick clster md
 075 auto quick vdi md
 076 auto quick vdi md
+077 auto quick vdi
-- 
1.8.1.2




More information about the sheepdog mailing list