[sheepdog-users] [PATCH stable-0.7 1/2] dog: repair objects based on majority vote

Hitoshi Mitake mitake.hitoshi at lab.ntt.co.jp
Mon Dec 16 03:18:47 CET 2013


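The current object repair strategy of "dog vdi check" does not work well
when objects are corrupted: it repairs from whichever copy was found,
even if that copy is the corrupted one. This patch adds majority voting
to the command, so that repair uses the copy whose hash is shared by the
majority of the existing copies.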

Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
Signed-off-by: Liu Yuan <namei.unix at gmail.com>

Conflicts:
	dog/vdi.c
Conflicts were resolved by Hitoshi Mitake.

Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 dog/vdi.c |   82 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 70 insertions(+), 12 deletions(-)

diff --git a/dog/vdi.c b/dog/vdi.c
index 5cbc85a..bc18fc4 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -1396,6 +1396,12 @@ struct vdi_check_work {
 	struct work work;
 };
 
+enum vdi_check_result {
+	VDI_CHECK_NO_OBJ_FOUND,		/* no node holds the object */
+	VDI_CHECK_NO_MAJORITY_FOUND,	/* copies exist, none is a majority */
+	VDI_CHECK_SUCCESS,		/* a majority copy was chosen */
+};
+
 struct vdi_check_info {
 	uint64_t oid;
 	int nr_copies;
@@ -1403,7 +1409,8 @@ struct vdi_check_info {
 	uint64_t *done;
 	int refcnt;
 	struct work_queue *wq;
-	struct vdi_check_work *base;
+	enum vdi_check_result result;
+	struct vdi_check_work *majority;
 	struct vdi_check_work vcw[0];
 };
 
@@ -1423,7 +1430,7 @@ static void vdi_repair_work(struct work *work)
 	struct vdi_check_info *info = vcw->info;
 	void *buf;
 
-	buf = read_object_from(info->base->vnode, info->oid);
+	buf = read_object_from(info->majority->vnode, info->oid);
 	write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found);
 	free(buf);
 }
@@ -1466,7 +1473,6 @@ static void vdi_hash_check_work(struct work *work)
 	case SD_RES_SUCCESS:
 		vcw->object_found = true;
 		memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash));
-		uatomic_set(&info->base, vcw);
 		break;
 	case SD_RES_NO_OBJ:
 		vcw->object_found = false;
@@ -1479,6 +1485,46 @@ static void vdi_hash_check_work(struct work *work)
 	}
 }
 
+static void vote_majority_object(struct vdi_check_info *info)
+{
+	/*
+	 * Boyer-Moore linear majority vote over the existing copies:
+	 * http://www.cs.utexas.edu/~moore/best-ideas/mjrty/
+	 * Pass 1 only nominates a candidate; pass 2 counts its real votes,
+	 * since the leftover counter of pass 1 cannot prove a majority.
+	 */
+	int count = 0, nr_live_copies = 0, nr_votes = 0;
+	struct vdi_check_work *majority = NULL;
+
+	for (int i = 0; i < info->nr_copies; i++) {
+		struct vdi_check_work *vcw = &info->vcw[i];
+
+		if (!vcw->object_found)
+			continue;
+		nr_live_copies++;
+		if (!count)
+			majority = vcw;
+		if (!memcmp(majority->hash, vcw->hash, sizeof(vcw->hash)))
+			count++;
+		else
+			count--;
+	}
+	for (int i = 0; majority && i < info->nr_copies; i++)
+		if (info->vcw[i].object_found && !memcmp(majority->hash,
+		    info->vcw[i].hash, sizeof(majority->hash)))
+			nr_votes++;
+
+	if (!majority)
+		info->result = VDI_CHECK_NO_OBJ_FOUND;
+	else if (nr_votes * 2 <= nr_live_copies) {
+		majority = NULL;	/* no strict majority */
+		info->result = VDI_CHECK_NO_MAJORITY_FOUND;
+	} else
+		info->result = VDI_CHECK_SUCCESS;
+
+	info->majority = majority;
+}
+
 static void vdi_hash_check_main(struct work *work)
 {
 	struct vdi_check_work *vcw = container_of(work, struct vdi_check_work,
@@ -1489,27 +1535,39 @@ static void vdi_hash_check_main(struct work *work)
 	if (info->refcnt > 0)
 		return;
 
-	if (info->base  == NULL) {
-		sd_err("no node has %" PRIx64, info->oid);
-		exit(EXIT_FAILURE);
+	vote_majority_object(info);
+
+	if (info->majority == NULL) {
+		switch (info->result) {
+		case VDI_CHECK_NO_OBJ_FOUND:
+			sd_err("no node has %" PRIx64, info->oid);
+			break;
+		case VDI_CHECK_NO_MAJORITY_FOUND:
+			sd_err("no majority of %" PRIx64, info->oid);
+			break;
+		default:
+			sd_err("unknown result of vdi check: %d", info->result);
+			exit(EXIT_FAILURE);
+			break;
+		}
+
+		/* do nothing */
+		return;
 	}
 
 	for (int i = 0; i < info->nr_copies; i++) {
-		if (&info->vcw[i] == info->base)
+		if (&info->vcw[i] == info->majority)
 			continue;
 		/* need repair when object not found or consistency broken */
 		if (!info->vcw[i].object_found ||
-		    memcmp(info->base->hash, info->vcw[i].hash,
-			   sizeof(info->base->hash)) != 0) {
+		    memcmp(info->majority->hash, info->vcw[i].hash,
+			   sizeof(info->majority->hash)) != 0) {
 			info->vcw[i].work.fn = vdi_repair_work;
 			info->vcw[i].work.done = vdi_repair_main;
 			info->refcnt++;
 			queue_work(info->wq, &info->vcw[i].work);
 		}
 	}
-
-	if (info->refcnt == 0)
-		free_vdi_check_info(info);
 }
 
 static void queue_vdi_check_work(struct sd_inode *inode, uint64_t oid,
-- 
1.7.10.4




More information about the sheepdog-users mailing list