[sheepdog] [PATCH v5 2/2] dog: repair objects based on majority vote

Hitoshi Mitake mitake.hitoshi at lab.ntt.co.jp
Tue Nov 12 03:45:25 CET 2013


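The current object repair strategy of "dog vdi check" doesn't work well
if objects are corrupted: it uses whichever replica was read
successfully as the repair base, so a corrupted replica can overwrite
healthy ones. This patch adds majority voting to the command: the
hashes of the replicas are compared and objects are repaired from a
copy that agrees with the majority. If no majority exists, the object
is reported and left untouched.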

Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 dog/vdi.c |   84 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 72 insertions(+), 12 deletions(-)

diff --git a/dog/vdi.c b/dog/vdi.c
index d9a9a0f..124fef4 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -1434,6 +1434,12 @@ struct vdi_check_work {
 	struct work work;
 };
 
+enum vdi_check_result {
+	VDI_CHECK_NO_OBJ_FOUND,		/* no replica reported the object */
+	VDI_CHECK_NO_MAJORITY_FOUND,	/* found replicas disagree, no winner */
+	VDI_CHECK_SUCCESS,		/* info->majority points at the winner */
+};
+
 struct vdi_check_info {
 	uint64_t oid;
 	uint8_t nr_copies;
@@ -1442,7 +1448,8 @@ struct vdi_check_info {
 	uint64_t *done;
 	int refcnt;
 	struct work_queue *wq;
-	struct vdi_check_work *base;
+	enum vdi_check_result result;
+	struct vdi_check_work *majority;
 	struct vdi_check_work vcw[0];
 };
 
@@ -1462,7 +1469,7 @@ static void vdi_repair_work(struct work *work)
 	struct vdi_check_info *info = vcw->info;
 	void *buf;
 
-	buf = read_object_from(info->base->vnode, info->oid);
+	buf = read_object_from(info->majority->vnode, info->oid);
 	write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0);
 	free(buf);
 }
@@ -1511,10 +1518,8 @@ static void vdi_check_object_work(struct work *work)
 	switch (rsp->result) {
 	case SD_RES_SUCCESS:
 		vcw->object_found = true;
-		if (!is_erasure_oid(info->oid, info->copy_policy)) {
+		if (!is_erasure_oid(info->oid, info->copy_policy))
 			memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash));
-			uatomic_set(&info->base, vcw);
-		}
 		break;
 	case SD_RES_NO_OBJ:
 		vcw->object_found = false;
@@ -1530,18 +1535,31 @@ static void vdi_check_object_work(struct work *work)
 
 static void check_replicatoin_object(struct vdi_check_info *info)
 {
-	if (info->base == NULL) {
-		sd_err("no node has %" PRIx64, info->oid);
-		exit(EXIT_FAILURE);
+	if (info->majority == NULL) {
+		switch (info->result) {
+		case VDI_CHECK_NO_OBJ_FOUND:
+			sd_err("no node has %" PRIx64, info->oid);
+			break;
+		case VDI_CHECK_NO_MAJORITY_FOUND:
+			sd_err("no majority of %" PRIx64, info->oid);
+			break;
+		default:
+			sd_err("unknown result of vdi check: %d", info->result);
+			exit(EXIT_FAILURE);
+			break;
+		}
+
+		/* do nothing */
+		return;
 	}
 
 	for (int i = 0; i < info->nr_copies; i++) {
-		if (&info->vcw[i] == info->base)
+		if (&info->vcw[i] == info->majority)
 			continue;
 		/* need repair when object not found or consistency broken */
 		if (!info->vcw[i].object_found ||
-		    memcmp(info->base->hash, info->vcw[i].hash,
-			   sizeof(info->base->hash)) != 0) {
+		    memcmp(info->majority->hash, info->vcw[i].hash,
+			   sizeof(info->majority->hash)) != 0) {
 			info->vcw[i].work.fn = vdi_repair_work;
 			info->vcw[i].work.done = vdi_repair_main;
 			info->refcnt++;
@@ -1615,6 +1633,46 @@ out:
 	ec_destroy(ctx);
 }
 
+static void vote_majority_object(struct vdi_check_info *info)
+{
+	/*
+	 * Boyer-Moore majority vote nominates a candidate in one pass:
+	 * http://www.cs.utexas.edu/~moore/best-ideas/mjrty/
+	 * The second pass verifies it is held by over half the live copies.
+	 */
+	int count = 0, nr_live_copies = 0, nr_votes = 0;
+	struct vdi_check_work *majority = NULL;
+
+	for (int i = 0; i < info->nr_copies; i++) {
+		struct vdi_check_work *vcw = &info->vcw[i];
+
+		if (!vcw->object_found)
+			continue;
+		nr_live_copies++;
+
+		if (!count)
+			majority = vcw;
+
+		if (!memcmp(majority->hash, vcw->hash, sizeof(vcw->hash)))
+			count++;
+		else
+			count--;
+	}
+
+	for (int i = 0; majority && i < info->nr_copies; i++) {
+		struct vdi_check_work *vcw = &info->vcw[i];
+
+		if (vcw->object_found &&
+		    !memcmp(majority->hash, vcw->hash, sizeof(vcw->hash)))
+			nr_votes++;
+	}
+
+	info->result = !majority ? VDI_CHECK_NO_OBJ_FOUND :
+		nr_votes * 2 > nr_live_copies ? VDI_CHECK_SUCCESS :
+		VDI_CHECK_NO_MAJORITY_FOUND;
+	info->majority = info->result == VDI_CHECK_SUCCESS ? majority : NULL;
+}
+
 static void vdi_check_object_main(struct work *work)
 {
 	struct vdi_check_work *vcw = container_of(work, struct vdi_check_work,
@@ -1627,8 +1685,10 @@ static void vdi_check_object_main(struct work *work)
 
 	if (is_erasure_oid(info->oid, info->copy_policy))
 		check_erasure_object(info);
-	else
+	else {
+		vote_majority_object(info);
 		check_replicatoin_object(info);
+	}
 
 	if (info->refcnt == 0)
 		free_vdi_check_info(info);
-- 
1.7.10.4




More information about the sheepdog mailing list