[sheepdog] [PATCH] dog: repair objects based on majority vote

Hitoshi Mitake mitake.hitoshi at gmail.com
Wed Nov 6 03:42:33 CET 2013


At Tue,  5 Nov 2013 10:54:25 +0900,
Hitoshi Mitake wrote:
> 
> The current object repair strategy of "dog vdi check" doesn't work well
> if objects are corrupted. This patch adds a majority voting mechanism
> to the command.
> 
> Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
> ---
>  dog/vdi.c                |   83 +++++++++++++++++++++++++++++++++++++-------
>  tests/functional/077     |   85 ++++++++++++++++++++++++++++++++++++++++++++++
>  tests/functional/077.out |   16 +++++++++
>  tests/functional/group   |    1 +
>  4 files changed, 173 insertions(+), 12 deletions(-)
>  create mode 100755 tests/functional/077
>  create mode 100644 tests/functional/077.out

ping?

> 
> diff --git a/dog/vdi.c b/dog/vdi.c
> index d9a9a0f..27d03a2 100644
> --- a/dog/vdi.c
> +++ b/dog/vdi.c
> @@ -1434,6 +1434,12 @@ struct vdi_check_work {
>  	struct work work;
>  };
>  
> +enum vdi_check_result {
> +	VDI_CHECK_NO_OBJ_FOUND,
> +	VDI_CHECK_NO_MAJORITY_FOUND,
> +	VDI_CHECK_SUCCESS,
> +};
> +
>  struct vdi_check_info {
>  	uint64_t oid;
>  	uint8_t nr_copies;
> @@ -1442,7 +1448,8 @@ struct vdi_check_info {
>  	uint64_t *done;
>  	int refcnt;
>  	struct work_queue *wq;
> -	struct vdi_check_work *base;
> +	enum vdi_check_result result;
> +	struct vdi_check_work *majority;
>  	struct vdi_check_work vcw[0];
>  };
>  
> @@ -1462,7 +1469,7 @@ static void vdi_repair_work(struct work *work)
>  	struct vdi_check_info *info = vcw->info;
>  	void *buf;
>  
> -	buf = read_object_from(info->base->vnode, info->oid);
> +	buf = read_object_from(info->majority->vnode, info->oid);
>  	write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0);
>  	free(buf);
>  }
> @@ -1511,10 +1518,8 @@ static void vdi_check_object_work(struct work *work)
>  	switch (rsp->result) {
>  	case SD_RES_SUCCESS:
>  		vcw->object_found = true;
> -		if (!is_erasure_oid(info->oid, info->copy_policy)) {
> +		if (!is_erasure_oid(info->oid, info->copy_policy))
>  			memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash));
> -			uatomic_set(&info->base, vcw);
> -		}
>  		break;
>  	case SD_RES_NO_OBJ:
>  		vcw->object_found = false;
> @@ -1530,18 +1535,31 @@ static void vdi_check_object_work(struct work *work)
>  
>  static void check_replicatoin_object(struct vdi_check_info *info)
>  {
> -	if (info->base == NULL) {
> -		sd_err("no node has %" PRIx64, info->oid);
> -		exit(EXIT_FAILURE);
> +	if (info->majority == NULL) {
> +		switch (info->result) {
> +		case VDI_CHECK_NO_OBJ_FOUND:
> +			sd_err("no node has %" PRIx64, info->oid);
> +			break;
> +		case VDI_CHECK_NO_MAJORITY_FOUND:
> +			sd_err("no majority of %" PRIx64, info->oid);
> +			break;
> +		default:
> +			sd_err("unknown result of vdi check: %d", info->result);
> +			exit(EXIT_FAILURE);
> +			break;
> +		}
> +
> +		/* do nothing */
> +		return;
>  	}
>  
>  	for (int i = 0; i < info->nr_copies; i++) {
> -		if (&info->vcw[i] == info->base)
> +		if (&info->vcw[i] == info->majority)
>  			continue;
>  		/* need repair when object not found or consistency broken */
>  		if (!info->vcw[i].object_found ||
> -		    memcmp(info->base->hash, info->vcw[i].hash,
> -			   sizeof(info->base->hash)) != 0) {
> +		    memcmp(info->majority->hash, info->vcw[i].hash,
> +			   sizeof(info->majority->hash)) != 0) {
>  			info->vcw[i].work.fn = vdi_repair_work;
>  			info->vcw[i].work.done = vdi_repair_main;
>  			info->refcnt++;
> @@ -1615,6 +1633,45 @@ out:
>  	ec_destroy(ctx);
>  }
>  
> +static void vote_majority_object(struct vdi_check_info *info)
> +{
> +	/*
> +	 * Voting majority object from existing ones.
> +	 *
> +	 * The linear majority vote algorithm by Boyer and Moore is used:
> +	 * http://www.cs.utexas.edu/~moore/best-ideas/mjrty/
> +	 */
> +
> +	int count = 0;
> +	struct vdi_check_work *majority = NULL;
> +
> +	for (int i = 0; i < info->nr_copies; i++) {
> +		struct vdi_check_work *vcw = &info->vcw[i];
> +
> +		if (!vcw->object_found)
> +			continue;
> +
> +		if (!count)
> +			majority = vcw;
> +
> +		if (!memcmp(majority->hash, vcw->hash, sizeof(vcw->hash)))
> +			count++;
> +		else
> +			count--;
> +	}
> +
> +	if (!majority)
> +		info->result = VDI_CHECK_NO_OBJ_FOUND;
> +	else if (count < info->nr_copies / 2) {
> +		/* no majority found */
> +		majority = NULL;
> +		info->result = VDI_CHECK_NO_MAJORITY_FOUND;
> +	} else
> +		info->result = VDI_CHECK_SUCCESS;
> +
> +	info->majority = majority;
> +}
> +
>  static void vdi_check_object_main(struct work *work)
>  {
>  	struct vdi_check_work *vcw = container_of(work, struct vdi_check_work,
> @@ -1627,8 +1684,10 @@ static void vdi_check_object_main(struct work *work)
>  
>  	if (is_erasure_oid(info->oid, info->copy_policy))
>  		check_erasure_object(info);
> -	else
> +	else {
> +		vote_majority_object(info);
>  		check_replicatoin_object(info);
> +	}
>  
>  	if (info->refcnt == 0)
>  		free_vdi_check_info(info);
> diff --git a/tests/functional/077 b/tests/functional/077
> new file mode 100755
> index 0000000..dd743bd
> --- /dev/null
> +++ b/tests/functional/077
> @@ -0,0 +1,85 @@
> +#!/bin/bash
> +
> +# Test vdi repair functionality
> +
> +. ./common
> +
> +for i in `seq 0 2`; do
> +    _start_sheep $i
> +done
> +
> +_wait_for_sheep 3
> +
> +_cluster_format
> +
> +$DOG vdi create test 12M
> +_vdi_list
> +
> +echo "original data" | $DOG vdi write test
> +
> +$DOG cluster shutdown
> +
> +# single object lost
> +
> +rm $STORE/0/obj/007c2b2500000000
> +
> +for i in `seq 0 2`; do
> +    _start_sheep $i
> +done
> +
> +_wait_for_sheep 3
> +
> +$DOG vdi check test
> +
> +$DOG vdi read test 0 14
> +
> +$DOG cluster shutdown
> +
> +# single broken object
> +
> +dd if=/dev/urandom of=$STORE/1/obj/007c2b2500000000 bs=4096 count=1024 &> /dev/null
> +
> +for i in `seq 0 2`; do
> +    _start_sheep $i
> +done
> +
> +_wait_for_sheep 3
> +
> +$DOG vdi check test
> +
> +$DOG vdi read test 0 14
> +
> +$DOG cluster shutdown
> +
> +# two objects lost
> +
> +rm $STORE/0/obj/007c2b2500000000
> +rm $STORE/1/obj/007c2b2500000000
> +
> +for i in `seq 0 2`; do
> +    _start_sheep $i
> +done
> +
> +_wait_for_sheep 3
> +
> +$DOG vdi check test
> +
> +$DOG vdi read test 0 14
> +
> +$DOG cluster shutdown
> +
> +# single object lost, single broken object. no majority
> +
> +rm $STORE/0/obj/007c2b2500000000
> +dd if=/dev/urandom of=$STORE/1/obj/007c2b2500000000 bs=4096 count=1024 &> /dev/null
> +
> +for i in `seq 0 2`; do
> +    _start_sheep $i
> +done
> +
> +_wait_for_sheep 3
> +
> +$DOG vdi check test
> +
> +$DOG cluster shutdown
> +
> diff --git a/tests/functional/077.out b/tests/functional/077.out
> new file mode 100644
> index 0000000..5bddb9b
> --- /dev/null
> +++ b/tests/functional/077.out
> @@ -0,0 +1,16 @@
> +QA output created by 077
> +using backend plain store
> +  Name        Id    Size    Used  Shared    Creation time   VDI id  Copies  Tag
> +  test         0   12 MB  0.0 MB  0.0 MB DATE   7c2b25     3              
> +fixed missing 7c2b2500000000
> +finish check&repair test
> +original data
> +fixed replica 7c2b2500000000
> +finish check&repair test
> +original data
> +fixed missing 7c2b2500000000
> +fixed missing 7c2b2500000000
> +finish check&repair test
> +original data
> +no majority of 7c2b2500000000
> +finish check&repair test
> diff --git a/tests/functional/group b/tests/functional/group
> index 8d15ffe..22ec578 100644
> --- a/tests/functional/group
> +++ b/tests/functional/group
> @@ -91,3 +91,4 @@
>  074 auto quick clster md
>  075 auto quick vdi md
>  076 auto quick vdi md
> +077 auto quick vdi
> -- 
> 1.7.10.4
> 



More information about the sheepdog mailing list