[sheepdog] [PATCH] dog: repair objects based on majority vote

Hitoshi Mitake mitake.hitoshi at gmail.com
Mon Nov 11 11:04:10 CET 2013


At Mon, 11 Nov 2013 18:01:19 +0800,
Liu Yuan wrote:
> 
> On Tue, Nov 05, 2013 at 10:54:25AM +0900, Hitoshi Mitake wrote:
> > The current object repair strategy of "dog vdi check" doesn't work well
> > if objects are corrupted. This patch adds a majority-voting mechanism
> > to the command.
> > 
> > Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
> > ---
> >  dog/vdi.c                |   83 +++++++++++++++++++++++++++++++++++++-------
> >  tests/functional/077     |   85 ++++++++++++++++++++++++++++++++++++++++++++++
> >  tests/functional/077.out |   16 +++++++++
> >  tests/functional/group   |    1 +
> >  4 files changed, 173 insertions(+), 12 deletions(-)
> >  create mode 100755 tests/functional/077
> >  create mode 100644 tests/functional/077.out
> > 
> > diff --git a/dog/vdi.c b/dog/vdi.c
> > index d9a9a0f..27d03a2 100644
> > --- a/dog/vdi.c
> > +++ b/dog/vdi.c
> > @@ -1434,6 +1434,12 @@ struct vdi_check_work {
> >  	struct work work;
> >  };
> >  
> > +enum vdi_check_result {
> > +	VDI_CHECK_NO_OBJ_FOUND,
> > +	VDI_CHECK_NO_MAJORITY_FOUND,
> > +	VDI_CHECK_SUCCESS,
> > +};
> > +
> >  struct vdi_check_info {
> >  	uint64_t oid;
> >  	uint8_t nr_copies;
> > @@ -1442,7 +1448,8 @@ struct vdi_check_info {
> >  	uint64_t *done;
> >  	int refcnt;
> >  	struct work_queue *wq;
> > -	struct vdi_check_work *base;
> > +	enum vdi_check_result result;
> > +	struct vdi_check_work *majority;
> >  	struct vdi_check_work vcw[0];
> >  };
> >  
> > @@ -1462,7 +1469,7 @@ static void vdi_repair_work(struct work *work)
> >  	struct vdi_check_info *info = vcw->info;
> >  	void *buf;
> >  
> > -	buf = read_object_from(info->base->vnode, info->oid);
> > +	buf = read_object_from(info->majority->vnode, info->oid);
> >  	write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0);
> >  	free(buf);
> >  }
> > @@ -1511,10 +1518,8 @@ static void vdi_check_object_work(struct work *work)
> >  	switch (rsp->result) {
> >  	case SD_RES_SUCCESS:
> >  		vcw->object_found = true;
> > -		if (!is_erasure_oid(info->oid, info->copy_policy)) {
> > +		if (!is_erasure_oid(info->oid, info->copy_policy))
> >  			memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash));
> > -			uatomic_set(&info->base, vcw);
> > -		}
> >  		break;
> >  	case SD_RES_NO_OBJ:
> >  		vcw->object_found = false;
> > @@ -1530,18 +1535,31 @@ static void vdi_check_object_work(struct work *work)
> >  
> >  static void check_replicatoin_object(struct vdi_check_info *info)
> >  {
> > -	if (info->base == NULL) {
> > -		sd_err("no node has %" PRIx64, info->oid);
> > -		exit(EXIT_FAILURE);
> > +	if (info->majority == NULL) {
> > +		switch (info->result) {
> > +		case VDI_CHECK_NO_OBJ_FOUND:
> > +			sd_err("no node has %" PRIx64, info->oid);
> > +			break;
> > +		case VDI_CHECK_NO_MAJORITY_FOUND:
> > +			sd_err("no majority of %" PRIx64, info->oid);
> > +			break;
> > +		default:
> > +			sd_err("unknown result of vdi check: %d", info->result);
> > +			exit(EXIT_FAILURE);
> > +			break;
> > +		}
> > +
> > +		/* do nothing */
> > +		return;
> >  	}
> >  
> >  	for (int i = 0; i < info->nr_copies; i++) {
> > -		if (&info->vcw[i] == info->base)
> > +		if (&info->vcw[i] == info->majority)
> >  			continue;
> >  		/* need repair when object not found or consistency broken */
> >  		if (!info->vcw[i].object_found ||
> > -		    memcmp(info->base->hash, info->vcw[i].hash,
> > -			   sizeof(info->base->hash)) != 0) {
> > +		    memcmp(info->majority->hash, info->vcw[i].hash,
> > +			   sizeof(info->majority->hash)) != 0) {
> >  			info->vcw[i].work.fn = vdi_repair_work;
> >  			info->vcw[i].work.done = vdi_repair_main;
> >  			info->refcnt++;
> > @@ -1615,6 +1633,45 @@ out:
> >  	ec_destroy(ctx);
> >  }
> >  
> > +static void vote_majority_object(struct vdi_check_info *info)
> > +{
> > +	/*
> > +	 * Voting majority object from existing ones.
> > +	 *
> > +	 * The linear majority vote algorithm by Boyer and Moore is used:
> > +	 * http://www.cs.utexas.edu/~moore/best-ideas/mjrty/
> > +	 */
> > +
> > +	int count = 0;
> > +	struct vdi_check_work *majority = NULL;
> > +
> > +	for (int i = 0; i < info->nr_copies; i++) {
> > +		struct vdi_check_work *vcw = &info->vcw[i];
> > +
> > +		if (!vcw->object_found)
> > +			continue;
> > +
> > +		if (!count)
> > +			majority = vcw;
> > +
> > +		if (!memcmp(majority->hash, vcw->hash, sizeof(vcw->hash)))
> > +			count++;
> > +		else
> > +			count--;
> > +	}
> > +
> > +	if (!majority)
> > +		info->result = VDI_CHECK_NO_OBJ_FOUND;
> > +	else if (count < info->nr_copies / 2) {
> > +		/* no majority found */
> > +		majority = NULL;
> > +		info->result = VDI_CHECK_NO_MAJORITY_FOUND;
> 
> Suppose we have 5 copies and after a crash, we only have 1 copy left for some
> object. So isn't it better to fix all missing objects?
> 
> I think it would be better if the check were:
> 
> count < nr_of_live_copies / 2
> 

Thanks for pointing that out. I agree with the condition. I'll send v2 later.

Thanks,
Hitoshi



More information about the sheepdog mailing list