[sheepdog] [PATCH 2/2] sheep: fix recovery logic

Wed May 23 11:36:59 CEST 2012

On 05/23/2012 05:04 PM, Liu Yuan wrote:
> From: Liu Yuan<tailai.ly at taobao.com>
>
> Current recovey will fail for following simple test case (really surprised)
>
> =====
> set -ex
>
> pkill -9 sheep
> rm store/* -rf
>
> # start three sheep daemons
> for i in 0 1 2 3; do
>      ./sheep/sheep -d /home/tailai.ly/sheepdog/store/$i -z $i -p 700$i -W
> done
> sleep 1
> ./collie/collie cluster format -c 2
> ./collie/collie vdi create test0 40M
> ./collie/collie vdi create test1 40M
> pkill -f "sheep -d /home/tailai.ly/sheepdog/store/3"
>
> =====
>
> after running the script, we can see copy isn't recoveried!
>
> tailai.ly at taobao:~/sheepdog$ find -name '80fd32fc00000000'
> ./store/3/obj/80fd32fc00000000 #<-- this node is killed
> ./store/0/obj/80fd32fc00000000
>
> With the patch, we get a expected result:
> tailai.ly at taobao:~/sheepdog$ find -name '80fd32fc00000000'
> ./store/1/obj/80fd32fc00000000 # copy migrated from node 3
> ./store/3/obj/80fd32fc00000000 # killed
> ./store/0/obj/80fd32fc00000000
>
> The failture is rooted in the original algorithm: we just did one shot search
> and didn't try a breadth-first search before diving into older configuration.
>
> The fix is rather straightforward, do a breadth-first search!. With the patch,
> we also end up with much more simplified code.
>
> Signed-off-by: Liu Yuan<tailai.ly at taobao.com>
> ---
>   sheep/recovery.c |  186 ++++++------------------------------------------------
>   1 file changed, 18 insertions(+), 168 deletions(-)
>
> diff --git a/sheep/recovery.c b/sheep/recovery.c
> index afe58ac..c5535b7 100644
> --- a/sheep/recovery.c
> +++ b/sheep/recovery.c
> @@ -62,115 +62,6 @@ static int obj_cmp(const void *oid1, const void *oid2)
>   	return 0;
>   }
>
> -/*
> - * contains_node - checks that the node id is included in the target nodes
> - *
> - * The target nodes to store replicated objects are the first N nodes
> - * from the base_idx'th on the consistent hash ring, where N is the
> - * number of copies of objects.
> - */
> -static int contains_node(struct sd_vnode *key,
> -			 struct sd_vnode *entry,
> -			 int nr, int base_idx, int copies)
> -{
> -	int i;
> -
> -	for (i = 0; i<  copies; i++) {
> -		int idx = get_nth_node(entry, nr, base_idx, i);
> -		if (memcmp(key->addr, entry[idx].addr, sizeof(key->addr)) == 0
> -		&&  key->port == entry[idx].port)
> -			return idx;
> -	}
> -	return -1;
> -}
> -
> -/*
> - * find_tgt_node - find the node from which we should recover objects
> - *
> - * This function compares two node lists, the current target nodes and
> - * the previous target nodes, and finds the node from the previous
> - * target nodes which corresponds to the copy_idx'th node of the
> - * current target nodes.  The correspondence is injective and
> - * maximizes the number of nodes which can recover objects locally.
> - *
> - * For example, consider the number of redundancy is 5, the consistent
> - * hash ring is {A, B, C, D, E, F}, and the node G is newly added.
> - * The parameters of this function are
> - *   old_entry = {A, B, C, D, E, F},    old_nr = 6, old_idx = 3
> - *   cur_entry = {A, B, C, D, E, F, G}, cur_nr = 7, cur_idx = 3
> - *
> - * In this case:
> - *   the previous target nodes: {D, E, F, A, B}
> - *     (the first 5 nodes from the 3rd node on the previous hash ring)
> - *   the current target nodes : {D, E, F, G, A}
> - *     (the first 5 nodes from the 3rd node on the current hash ring)
> - *
> - * The correspondence between copy_idx and return value are as follows:
> - * ----------------------------
> - * copy_idx       0  1  2  3  4
> - * src_node       D  E  F  G  A
> - * tgt_node       D  E  F  B  A
> - * return value   0  1  2  4  3
> - * ----------------------------
> - *
> - * The node D, E, F, and A can recover objects from local, and the
> - * node G recovers from the node B.
> - */
> -static int find_tgt_node(struct sd_vnode *old_entry,
> -			 int old_nr, int old_idx, int old_copies,
> -			 struct sd_vnode *cur_entry,
> -			 int cur_nr, int cur_idx, int cur_copies,
> -			 int copy_idx)
> -{
> -	int i, j, idx;
> -
> -	dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n",
> -		old_idx, old_nr, old_copies, cur_idx, cur_nr, cur_copies, copy_idx);
> -
> -	/* If the same node is in the previous target nodes, return its index */
> -	idx = contains_node(cur_entry + get_nth_node(cur_entry, cur_nr, cur_idx, copy_idx),
> -			    old_entry, old_nr, old_idx, old_copies);
> -	if (idx>= 0) {
> -		dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n", idx, copy_idx, cur_idx, cur_nr);
> -		return idx;
> -	}
> -
> -	for (i = 0, j = 0; ; i++, j++) {
> -		if (i<  copy_idx) {
> -			/* Skip if the node can recover from its local */
> -			idx = contains_node(cur_entry + get_nth_node(cur_entry, cur_nr, cur_idx, i),
> -					    old_entry, old_nr, old_idx, old_copies);
> -			if (idx>= 0)
> -				continue;
> -
> -			/* Find the next target which needs to recover from remote */
> -			while (j<  old_copies&&
> -			       contains_node(old_entry + get_nth_node(old_entry, old_nr, old_idx, j),
> -					     cur_entry, cur_nr, cur_idx, cur_copies)>= 0)
> -				j++;
> -		}
> -		if (j == old_copies) {
> -			/*
> -			 * Cannot find the target because the number of zones
> -			 * is smaller than the number of copies.  We can select
> -			 * any node in this case, so select the first one.
> -			 */
> -			return old_idx;
> -		}
> -
> -		if (i == copy_idx) {
> -			/* Found the target node correspoinding to copy_idx */
> -			dprintf("%"PRIu32", %"PRIu32", %"PRIu32"\n",
> -				get_nth_node(old_entry, old_nr, old_idx, j),
> -				copy_idx, (cur_idx + i) % cur_nr);
> -			return get_nth_node(old_entry, old_nr, old_idx, j);
> -		}
> -
> -	}
> -
> -	return -1;
> -}
> -
>   static void *get_vnodes_from_epoch(uint32_t epoch, int *nr, int *copies)
>   {
>   	int nodes_nr, len = sizeof(struct sd_vnode) * SD_MAX_VNODES;
> @@ -307,14 +198,14 @@ static void rollback_old_cur(struct sd_vnode *old, int *old_nr, int *old_copies,
>    * the routine will try to recovery it from the nodes it has stayed,
>    * at least, *theoretically* on consistent hash ring.
>    */
> -static int do_recover_object(struct recovery_work *rw, int copy_idx)
> +static int do_recover_object(struct recovery_work *rw)
>   {
>   	struct sd_vnode *old, *cur;
>   	uint64_t oid = rw->oids[rw->done];
>   	int old_nr = rw->old_nr_vnodes, cur_nr = rw->cur_nr_vnodes;
>   	uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch - 1;
>   	struct sd_vnode *tgt_entry;
> -	int old_idx, cur_idx, tgt_idx, old_copies, cur_copies, ret;
> +	int old_copies, cur_copies, ret, i;
>
>   	old = xmalloc(sizeof(*old) * SD_MAX_VNODES);
>   	cur = xmalloc(sizeof(*cur) * SD_MAX_VNODES);
> @@ -324,28 +215,19 @@ static int do_recover_object(struct recovery_work *rw, int copy_idx)
>   	cur_copies = get_max_nr_copies_from(rw->cur_nodes, rw->cur_nr_nodes);
>
>   again:
> -	old_idx = obj_to_sheep(old, old_nr, oid, 0);
> -	cur_idx = obj_to_sheep(cur, cur_nr, oid, 0);
> -
>   	dprintf("try recover object %"PRIx64" from epoch %"PRIu32"\n", oid, tgt_epoch);
>
> -	if (cur_copies<= copy_idx) {
> -		eprintf("epoch (%"PRIu32") has less copies (%d) than requested copy_idx: %d\n",
> -		tgt_epoch, cur_copies, copy_idx);
> -		ret = -1;
> -		goto err;
> -	}
> -
> -	tgt_idx = find_tgt_node(old, old_nr, old_idx, old_copies,
> -			cur, cur_nr, cur_idx, cur_copies, copy_idx);
> -	if (tgt_idx<  0) {
> -		eprintf("cannot find target node %"PRIx64"\n", oid);
> -		ret = -1;
> -		goto err;
> +	/* Let's do a breadth-first search */
> +	for (i = 0; i<  old_copies; i++) {
> +		int idx;
> +		idx = obj_to_sheep(old, old_nr, oid, i);

I suggest to use obj_to_sheeps() here to avoid some unnecessary list 
traversing in obj_to_sheep()

thanks,

levin

> +		tgt_entry = old + idx;
> +		ret = recover_object_from_replica(oid, tgt_entry,
> +						  epoch, tgt_epoch);
> +		if (ret>= 0)
> +			break;
>   	}
> -	tgt_entry = old + tgt_idx;
> -
> -	ret = recover_object_from_replica(oid, tgt_entry, epoch, tgt_epoch);
> +	/* If not succeed, roll back to an older configuration and try again */
>   	if (ret<  0) {
>   		struct sd_vnode *new_old;
>   		int new_old_nr, new_old_copies;
> @@ -376,57 +258,25 @@ err:
>   	return ret;
>   }
>
> -static int get_replica_idx(struct recovery_work *rw, uint64_t oid, int *copy_nr)
> -{
> -	int i, ret = -1;
> -	int idx_buf[SD_MAX_COPIES];
> -
> -	*copy_nr = get_max_nr_copies_from(rw->cur_nodes, rw->cur_nr_nodes);
> -	obj_to_sheeps(rw->cur_vnodes, rw->cur_nr_vnodes, oid,
> -				*copy_nr, idx_buf);
> -
> -	for (i = 0; i<  *copy_nr; i++) {
> -		int n = idx_buf[i];
> -		if (vnode_is_local(&rw->cur_vnodes[n])) {
> -			ret = i;
> -			break;
> -		}
> -	}
> -	return ret;
> -}
> -
>   static void recover_object(struct work *work)
>   {
> -	struct recovery_work *rw = container_of(work, struct recovery_work, work);
> +	struct recovery_work *rw = container_of(work, struct recovery_work,
> +						work);
>   	uint64_t oid = rw->oids[rw->done];
> -	int i, copy_idx, copy_nr, ret;
> +	int ret;
>
>   	if (!sys->nr_copies)
>   		return;
>
> -	eprintf("done:%"PRIu32" count:%"PRIu32", oid:%"PRIx64"\n", rw->done, rw->count, oid);
> +	eprintf("done:%"PRIu32" count:%"PRIu32", oid:%"PRIx64"\n",
> +		rw->done, rw->count, oid);
>
>   	if (sd_store->exist(oid)) {
>   		dprintf("the object is already recovered\n");
>   		return;
>   	}
>
> -	copy_idx = get_replica_idx(rw, oid,&copy_nr);
> -	if (copy_idx<  0) {
> -		ret = -1;
> -		goto err;
> -	}
> -	ret = do_recover_object(rw, copy_idx);
> -	if (ret<  0) {
> -		for (i = 0; i<  copy_nr; i++) {
> -			if (i == copy_idx)
> -				continue;
> -			ret = do_recover_object(rw, i);
> -			if (ret == 0)
> -				break;
> -		}
> -	}
> -err:
> +	ret = do_recover_object(rw);
>   	if (ret<  0)
>   		eprintf("failed to recover object %"PRIx64"\n", oid);
>   }