[sheepdog] [PATCH v5 2/2] dog: repair objects based on majority vote
Hitoshi Mitake
mitake.hitoshi at lab.ntt.co.jp
Tue Nov 12 03:45:25 CET 2013
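The current object repair strategy of "dog vdi check" doesn't work well
when objects are corrupted, because it uses whichever existing replica it
happens to record as the reference copy, even if that replica is the
corrupted one. This patch adds a majority-voting mechanism to the command.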
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
dog/vdi.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 72 insertions(+), 12 deletions(-)
diff --git a/dog/vdi.c b/dog/vdi.c
index d9a9a0f..124fef4 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -1434,6 +1434,12 @@ struct vdi_check_work {
struct work work;
};
+enum vdi_check_result {	/* outcome of the majority vote for one object */
+ VDI_CHECK_NO_OBJ_FOUND,	/* no node holds a copy of the object */
+ VDI_CHECK_NO_MAJORITY_FOUND,	/* copies exist but none won the vote */
+ VDI_CHECK_SUCCESS,	/* a majority copy was selected */
+};
+
struct vdi_check_info {
uint64_t oid;
uint8_t nr_copies;
@@ -1442,7 +1448,8 @@ struct vdi_check_info {
uint64_t *done;
int refcnt;
struct work_queue *wq;
- struct vdi_check_work *base;
+ enum vdi_check_result result;
+ struct vdi_check_work *majority;
struct vdi_check_work vcw[0];
};
@@ -1462,7 +1469,7 @@ static void vdi_repair_work(struct work *work)
struct vdi_check_info *info = vcw->info;
void *buf;
- buf = read_object_from(info->base->vnode, info->oid);
+ buf = read_object_from(info->majority->vnode, info->oid);
write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0);
free(buf);
}
@@ -1511,10 +1518,8 @@ static void vdi_check_object_work(struct work *work)
switch (rsp->result) {
case SD_RES_SUCCESS:
vcw->object_found = true;
- if (!is_erasure_oid(info->oid, info->copy_policy)) {
+ if (!is_erasure_oid(info->oid, info->copy_policy))
memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash));
- uatomic_set(&info->base, vcw);
- }
break;
case SD_RES_NO_OBJ:
vcw->object_found = false;
@@ -1530,18 +1535,31 @@ static void vdi_check_object_work(struct work *work)
static void check_replicatoin_object(struct vdi_check_info *info)
{
- if (info->base == NULL) {
- sd_err("no node has %" PRIx64, info->oid);
- exit(EXIT_FAILURE);
+ if (info->majority == NULL) {
+ switch (info->result) {
+ case VDI_CHECK_NO_OBJ_FOUND:
+ sd_err("no node has %" PRIx64, info->oid);
+ break;
+ case VDI_CHECK_NO_MAJORITY_FOUND:
+ sd_err("no majority of %" PRIx64, info->oid);
+ break;
+ default:
+ sd_err("unknown result of vdi check: %d", info->result);
+ exit(EXIT_FAILURE);
+ break;
+ }
+
+ /* do nothing */
+ return;
}
for (int i = 0; i < info->nr_copies; i++) {
- if (&info->vcw[i] == info->base)
+ if (&info->vcw[i] == info->majority)
continue;
/* need repair when object not found or consistency broken */
if (!info->vcw[i].object_found ||
- memcmp(info->base->hash, info->vcw[i].hash,
- sizeof(info->base->hash)) != 0) {
+ memcmp(info->majority->hash, info->vcw[i].hash,
+ sizeof(info->majority->hash)) != 0) {
info->vcw[i].work.fn = vdi_repair_work;
info->vcw[i].work.done = vdi_repair_main;
info->refcnt++;
@@ -1615,6 +1633,46 @@ out:
ec_destroy(ctx);
}
+static void vote_majority_object(struct vdi_check_info *info)
+{
+	/*
+	 * Set info->majority to a found copy whose hash more than half of the
+	 * found copies share (else NULL) and record why in info->result.
+	 * Boyer and Moore's linear vote only nominates a candidate, so a
+	 * second pass counts the candidate's real votes before accepting it:
+	 * http://www.cs.utexas.edu/~moore/best-ideas/mjrty/
+	 */
+	int count = 0, votes = 0, nr_live_copies = 0;
+	struct vdi_check_work *majority = NULL;
+
+	for (int i = 0; i < info->nr_copies; i++) {
+		struct vdi_check_work *vcw = &info->vcw[i];
+
+		if (!vcw->object_found)
+			continue;
+		nr_live_copies++;
+		if (!count)
+			majority = vcw;
+		if (!memcmp(majority->hash, vcw->hash, sizeof(vcw->hash)))
+			count++;
+		else
+			count--;
+	}
+	for (int i = 0; majority && i < info->nr_copies; i++)
+		if (info->vcw[i].object_found && !memcmp(majority->hash,
+		    info->vcw[i].hash, sizeof(majority->hash)))
+			votes++;
+
+	if (!majority)
+		info->result = VDI_CHECK_NO_OBJ_FOUND;
+	else if (votes * 2 <= nr_live_copies) {
+		majority = NULL;	/* tie or no winner, e.g. A, B, C */
+		info->result = VDI_CHECK_NO_MAJORITY_FOUND;
+	} else
+		info->result = VDI_CHECK_SUCCESS;
+	info->majority = majority;
+}
+
static void vdi_check_object_main(struct work *work)
{
struct vdi_check_work *vcw = container_of(work, struct vdi_check_work,
@@ -1627,8 +1685,10 @@ static void vdi_check_object_main(struct work *work)
if (is_erasure_oid(info->oid, info->copy_policy))
check_erasure_object(info);
- else
+ else {
+ vote_majority_object(info);
check_replicatoin_object(info);
+ }
if (info->refcnt == 0)
free_vdi_check_info(info);
--
1.7.10.4
More information about the sheepdog
mailing list