[sheepdog] [PATCH 4/7] recovery: handle multiple node events for erasured vdi

Wed Oct 16 05:39:59 CEST 2013

On Wed, Oct 16, 2013 at 03:06:00AM +0900, MORITA Kazutaka wrote:
> At Sun, 13 Oct 2013 19:43:09 +0800,
> Liu Yuan wrote:
> > 
> > 1. read the lost object from its track in epoch history vertically because
> >    every replica is unique
> > 2. if not found in 1, then tries to rebuild it with RS algorithm
> >    2.1 read enough other replica from their tracks in epoch history
> >    2.2 rebuild the lost object from the content of replica read at 2.1
> > 
> > The subtle case is number for available zones is less than SD_EC_DP or the
> > requested index of lost object:
> >     1 we need to make sure nr_zones >= SD_EC_DP to avoid panic of
> >       oid_to_node(s) helpers.
> >     2 we have to search all the available zones when we can't get idx. Its
> >       okay to do a mad search when number of available zones is small
> > 
> > Signed-off-by: Liu Yuan <namei.unix at gmail.com>
> > ---
> >  sheep/plain_store.c |   10 ++-
> >  sheep/recovery.c    |  191 ++++++++++++++++++++++++++++++++++-----------------
> >  2 files changed, 137 insertions(+), 64 deletions(-)
> > 
> > diff --git a/sheep/plain_store.c b/sheep/plain_store.c
> > index eb00be0..a214c46 100644
> > --- a/sheep/plain_store.c
> > +++ b/sheep/plain_store.c
> > @@ -443,8 +443,16 @@ static bool oid_stale(uint64_t oid)
> >  	const struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
> >  
> >  	vinfo = get_vnode_info();
> > -	nr_copies = get_obj_copy_number(oid, vinfo->nr_zones);
> >  
> > +	/*
> > +	 * If vinfo->nr_zones < SD_EC_DP, we might not get the idx, so we don't
> > +	 * know it is stale or not. In this case, we keep it stay in the working
> > +	 * directory in order to recover it when we get enough zones
> > +	 */
> > +	if (unlikely(vinfo->nr_zones < SD_EC_DP) && is_erasure_oid(oid))
> > +		return false;
> > +
> > +	nr_copies = get_obj_copy_number(oid, vinfo->nr_zones);
> >  	oid_to_vnodes(oid, &vinfo->vroot, nr_copies, obj_vnodes);
> >  	for (i = 0; i < nr_copies; i++) {
> >  		v = obj_vnodes[i];
> > diff --git a/sheep/recovery.c b/sheep/recovery.c
> > index 29c5d05..500f112 100644
> > --- a/sheep/recovery.c
> > +++ b/sheep/recovery.c
> > @@ -1,5 +1,6 @@
> >  /*
> >   * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation.
> > + * Copyright (C) 2012-2013 Taobao Inc.
> >   *
> >   * This program is free software; you can redistribute it and/or
> >   * modify it under the terms of the GNU General Public License version
> > @@ -93,15 +94,90 @@ static inline bool node_is_gateway_only(void)
> >  	return sys->this_node.nr_vnodes == 0;
> >  }
> >  
> > -static void *read_erasure_object(const struct sd_node *node, uint64_t oid,
> > -				 uint32_t epoch, uint32_t tgt_epoch,
> > -				 uint8_t idx)
> > +static struct vnode_info *rollback_vnode_info(uint32_t *epoch,
> > +					      struct vnode_info *cur)
> > +{
> > +	struct vnode_info *vinfo;
> > +rollback:
> > +	*epoch -= 1;
> > +	if (*epoch < last_gathered_epoch)
> > +		return NULL;
> > +
> > +	vinfo = get_vnode_info_epoch(*epoch, cur);
> > +	if (!vinfo) {
> > +		/* We rollback in case we don't get a valid epoch */
> > +		sd_alert("cannot get epoch %d", *epoch);
> > +		sd_alert("clients may see old data");
> > +		goto rollback;
> > +	}
> > +	return vinfo;
> > +}
> > +
> > +/*
> > + * A node that does not match any node in current node list means the node has
> > + * left the cluster, then it's an invalid node.
> > + */
> > +static bool invalid_node(const struct sd_node *n, struct vnode_info *info)
> > +{
> > +
> > +	if (rb_search(&info->nroot, n, rb, node_cmp))
> > +		return false;
> > +	return true;
> > +}
> > +
> > +static int search_erasure_object(uint64_t oid, uint8_t idx,
> > +				 struct rb_root *nroot,
> > +				 struct recovery_work *rw,
> > +				 uint32_t tgt_epoch,
> > +				 void *buf)
> >  {
> >  	struct sd_req hdr;
> >  	unsigned rlen = get_store_objsize(oid);
> > -	void *buf = xvalloc(rlen);
> > -	int ret;
> > +	struct sd_node *n;
> > +	uint32_t epoch = rw->epoch;
> >  
> > +	rb_for_each_entry(n, nroot, rb) {
> > +		if (invalid_node(n, rw->cur_vinfo))
> > +			continue;
> > +		sd_init_req(&hdr, SD_OP_READ_PEER);
> > +		hdr.epoch = epoch;
> > +		hdr.flags = SD_FLAG_CMD_RECOVERY;
> > +		hdr.data_length = rlen;
> > +		hdr.obj.oid = oid;
> > +		hdr.obj.tgt_epoch = tgt_epoch;
> > +		hdr.obj.ec_index = idx;
> > +
> > +		sd_debug("%"PRIx64" epoch %"PRIu32" tgt %"PRIu32" idx %d, %s",
> > +			 oid, epoch, tgt_epoch, idx, node_to_str(n));
> > +		if (sheep_exec_req(&n->nid, &hdr, buf) == SD_RES_SUCCESS)
> > +			return SD_RES_SUCCESS;
> > +	}
> > +	return SD_RES_NO_OBJ;
> > +}
> > +
> > +static void *read_erasure_object(uint64_t oid, uint8_t idx,
> > +				 struct recovery_work *rw)
> > +{
> > +	struct sd_req hdr;
> > +	unsigned rlen = get_store_objsize(oid);
> > +	void *buf = xvalloc(rlen);
> > +	struct vnode_info *old = grab_vnode_info(rw->old_vinfo), *new_old;
> > +	uint32_t epoch = rw->epoch, tgt_epoch = rw->tgt_epoch;
> > +	const struct sd_node *node;
> > +again:
> > +	if (old->nr_zones < SD_EC_DP) {
> > +		if (search_erasure_object(oid, idx, &old->nroot, rw,
> > +					  tgt_epoch, buf)
> > +		    == SD_RES_SUCCESS)
> > +			goto done;
> > +		else
> > +			goto rollback;
> > +	}
> > +	node = oid_to_node(oid, &old->vroot, idx);
> > +	sd_debug("%"PRIx64" epoch %"PRIu32" tgt %"PRIu32" idx %d, %s",
> > +		 oid, epoch, tgt_epoch, idx, node_to_str(node));
> > +	if (invalid_node(node, rw->cur_vinfo))
> > +		goto rollback;
> >  	sd_init_req(&hdr, SD_OP_READ_PEER);
> >  	hdr.epoch = epoch;
> >  	hdr.flags = SD_FLAG_CMD_RECOVERY;
> > @@ -110,13 +186,21 @@ static void *read_erasure_object(const struct sd_node *node, uint64_t oid,
> >  	hdr.obj.tgt_epoch = tgt_epoch;
> >  	hdr.obj.ec_index = idx;
> >  
> > -	sd_debug("%s, epoch %"PRIu32" tgt %"PRIu32" idx %d", node_to_str(node),
> > -		 epoch, tgt_epoch, idx);
> > -	ret = sheep_exec_req(&node->nid, &hdr, buf);
> > -	if (ret != SD_RES_SUCCESS) {
> > -		free(buf);
> > -		return NULL;
> > +	if (sheep_exec_req(&node->nid, &hdr, buf) != SD_RES_SUCCESS) {
> > +rollback:
> > +		new_old = rollback_vnode_info(&tgt_epoch, rw->cur_vinfo);
> > +		if (!new_old) {
> > +			sd_err("can not read %"PRIx64" idx %d", oid, idx);
> > +			free(buf);
> > +			buf = NULL;
> > +			goto done;
> > +		}
> > +		put_vnode_info(old);
> > +		old = new_old;
> > +		goto again;
> >  	}
> > +done:
> > +	put_vnode_info(old);
> >  	return buf;
> >  }
> >  
> > @@ -190,18 +274,6 @@ static int recover_object_from(struct recovery_obj_work *row,
> >  	return ret;
> >  }
> >  
> > -/*
> > - * A node that does not match any node in current node list means the node has
> > - * left the cluster, then it's an invalid node.
> > - */
> > -static bool invalid_node(const struct sd_node *n, struct vnode_info *info)
> > -{
> > -
> > -	if (rb_search(&info->nroot, n, rb, node_cmp))
> > -		return false;
> > -	return true;
> > -}
> > -
> >  static int recover_object_from_replica(struct recovery_obj_work *row,
> >  				       struct vnode_info *old,
> >  				       uint32_t tgt_epoch)
> > @@ -299,51 +371,36 @@ again:
> >  		/* fall through */
> >  	default:
> >  		/* No luck, roll back to an older configuration and try again */
> > -rollback:
> > -		tgt_epoch--;
> > -		if (tgt_epoch < 1) {
> > +		new_old = rollback_vnode_info(&tgt_epoch, rw->cur_vinfo);
> > +		if (!new_old) {
> >  			sd_err("can not recover oid %"PRIx64, oid);
> >  			ret = -1;
> > -			break;
> > -		}
> > -
> > -		new_old = get_vnode_info_epoch(tgt_epoch, rw->cur_vinfo);
> > -		if (!new_old) {
> > -			/* We rollback in case we don't get a valid epoch */
> > -			sd_alert("cannot get epoch %d", tgt_epoch);
> > -			sd_alert("clients may see old data");
> > -			goto rollback;
> > +			goto out;
> >  		}
> >  
> >  		put_vnode_info(old);
> >  		old = new_old;
> >  		goto again;
> >  	}
> > -
> > +out:
> >  	put_vnode_info(old);
> >  	return ret;
> >  }
> >  
> > -static void *rebuild_erasure_object(struct recovery_obj_work *row,
> > -				    uint32_t tgt_epoch, const uint8_t idx)
> > +static void *rebuild_erasure_object(uint64_t oid, uint8_t idx,
> > +				    struct recovery_work *rw)
> >  {
> > -	struct vnode_info *old = grab_vnode_info(row->base.old_vinfo);
> > -	const struct sd_node *target_nodes[SD_MAX_NODES];
> >  	uint8_t *bufs[SD_EC_D] = { 0 };
> > -	uint64_t oid = row->oid;
> > -	uint32_t epoch = row->base.epoch;
> >  	int idxs[SD_EC_D], len = get_store_objsize(oid);
> >  	struct fec *ctx = ec_init();
> >  	char *lost = xvalloc(len);
> >  	int i, j;
> >  
> >  	/* Prepare replica */
> > -	oid_to_nodes(oid, &old->vroot, SD_EC_DP, target_nodes);
> >  	for (i = 0, j = 0; i < SD_EC_DP && j < SD_EC_D; i++) {
> >  		if (i == idx)
> >  			continue;
> > -		bufs[j] = read_erasure_object(target_nodes[i], oid,
> > -					      epoch, tgt_epoch, i);
> > +		bufs[j] = read_erasure_object(oid, i, rw);
> >  		if (!bufs[j])
> >  			continue;
> >  		idxs[j++] = i;
> > @@ -366,7 +423,6 @@ static void *rebuild_erasure_object(struct recovery_obj_work *row,
> >  	}
> >  out:
> >  	ec_destroy(ctx);
> > -	put_vnode_info(old);
> >  	for (i = 0; i < SD_EC_D; i++)
> >  		free(bufs[i]);
> >  	return lost;
> > @@ -384,34 +440,45 @@ static uint8_t local_node_copy_index(struct rb_root *vroot, uint64_t oid)
> >  	panic("can't get valid index for %"PRIx64, oid);
> >  }
> >  
> > +/*
> > + * Erasure object recovery algorithm
> > + *
> > + * 1. read the lost object from its track in epoch history vertically because
> > + *    every replica is unique
> > + * 2. if not found in 1, then tries to rebuild it with RS algorithm
> > + *    2.1 read enough other replica from their tracks in epoch history
> 
> I think the word "replica" should be used only for replicated data.
> 
> > + *    2.2 rebuild the lost object from the content of replica read at 2.1
> > + *
> 
> To reduce the risk of reading old data, the algorithm should be
> breadth first search.  Is it possible to change the algorithm to like
> as follows?
> 
>   1. read the lost object from the target epoch.
>   2. rebuild the lost object using data and parity at the target
>      epoch.
>   3. if failed to recover the lost object, decrement the target epoch
>      number and go back to 1.
> 

This is actually the first version of the erasure recovery algorithm. But it
fails some tests like 032. Another reason for not breadth first is, there is
only one copy for each strip data, if it is not lost, it is always the freshes
one with the current alrotihm. I am not sure if it is completely impossible to
write breadth first algorithm, but current algorithm passes all the tests, let's
keep it until we find a better one.

Thanks
Yuan