[sheepdog] [PATCH] sheep: fix recovery logic

Liu Yuan namei.unix at gmail.com
Wed May 23 12:01:05 CEST 2012


From: Liu Yuan <tailai.ly at taobao.com>

v2:
 - continue the recovery from peers even someone asks us to retry it

---------------------------------- >8 -----------------------------

Current recovey will fail for following simple test case (really surprised)

=====
set -ex

pkill -9 sheep
rm store/* -rf

# start three sheep daemons
for i in 0 1 2 3; do
    ./sheep/sheep -d /home/tailai.ly/sheepdog/store/$i -z $i -p 700$i -W
done
sleep 1
./collie/collie cluster format -c 2
./collie/collie vdi create test0 40M
./collie/collie vdi create test1 40M
pkill -f "sheep -d /home/tailai.ly/sheepdog/store/3"

=====

after running the script, we can see copy isn't recoveried!

tailai.ly at taobao:~/sheepdog$ find -name '80fd32fc00000000'
./store/3/obj/80fd32fc00000000 # <-- this node is killed
./store/0/obj/80fd32fc00000000

With the patch, we get a expected result:
tailai.ly at taobao:~/sheepdog$ find -name '80fd32fc00000000'
./store/1/obj/80fd32fc00000000 # copy migrated from node 3
./store/3/obj/80fd32fc00000000 # killed
./store/0/obj/80fd32fc00000000

The failture is rooted in the original algorithm: we just did one shot search
and didn't try a breadth-first search before diving into older configuration.

The fix is rather straightforward, do a breadth-first search!. With the patch,
we also end up with much more simplified code.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 sheep/recovery.c |  203 +++++++++---------------------------------------------
 1 file changed, 32 insertions(+), 171 deletions(-)

diff --git a/sheep/recovery.c b/sheep/recovery.c
index afe58ac..f457df5 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -62,115 +62,6 @@ static int obj_cmp(const void *oid1, const void *oid2)
 	return 0;
 }
 
-/*
- * contains_node - checks that the node id is included in the target nodes
- *
- * The target nodes to store replicated objects are the first N nodes
- * from the base_idx'th on the consistent hash ring, where N is the
- * number of copies of objects.
- */
-static int contains_node(struct sd_vnode *key,
-			 struct sd_vnode *entry,
-			 int nr, int base_idx, int copies)
-{
-	int i;
-
-	for (i = 0; i < copies; i++) {
-		int idx = get_nth_node(entry, nr, base_idx, i);
-		if (memcmp(key->addr, entry[idx].addr, sizeof(key->addr)) == 0
-		    && key->port == entry[idx].port)
-			return idx;
-	}
-	return -1;
-}
-
-/*
- * find_tgt_node - find the node from which we should recover objects
- *
- * This function compares two node lists, the current target nodes and
- * the previous target nodes, and finds the node from the previous
- * target nodes which corresponds to the copy_idx'th node of the
- * current target nodes.  The correspondence is injective and
- * maximizes the number of nodes which can recover objects locally.
- *
- * For example, consider the number of redundancy is 5, the consistent
- * hash ring is {A, B, C, D, E, F}, and the node G is newly added.
- * The parameters of this function are
- *   old_entry = {A, B, C, D, E, F},    old_nr = 6, old_idx = 3
- *   cur_entry = {A, B, C, D, E, F, G}, cur_nr = 7, cur_idx = 3
- *
- * In this case:
- *   the previous target nodes: {D, E, F, A, B}
- *     (the first 5 nodes from the 3rd node on the previous hash ring)
- *   the current target nodes : {D, E, F, G, A}
- *     (the first 5 nodes from the 3rd node on the current hash ring)
- *
- * The correspondence between copy_idx and return value are as follows:
- * ----------------------------
- * copy_idx       0  1  2  3  4
- * src_node       D  E  F  G  A
- * tgt_node       D  E  F  B  A
- * return value   0  1  2  4  3
- * ----------------------------
- *
- * The node D, E, F, and A can recover objects from local, and the
- * node G recovers from the node B.
- */
-static int find_tgt_node(struct sd_vnode *old_entry,
-			 int old_nr, int old_idx, int old_copies,
-			 struct sd_vnode *cur_entry,
-			 int cur_nr, int cur_idx, int cur_copies,
-			 int copy_idx)
-{
-	int i, j, idx;
-
-	dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n",
-		old_idx, old_nr, old_copies, cur_idx, cur_nr, cur_copies, copy_idx);
-
-	/* If the same node is in the previous target nodes, return its index */
-	idx = contains_node(cur_entry + get_nth_node(cur_entry, cur_nr, cur_idx, copy_idx),
-			    old_entry, old_nr, old_idx, old_copies);
-	if (idx >= 0) {
-		dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n", idx, copy_idx, cur_idx, cur_nr);
-		return idx;
-	}
-
-	for (i = 0, j = 0; ; i++, j++) {
-		if (i < copy_idx) {
-			/* Skip if the node can recover from its local */
-			idx = contains_node(cur_entry + get_nth_node(cur_entry, cur_nr, cur_idx, i),
-					    old_entry, old_nr, old_idx, old_copies);
-			if (idx >= 0)
-				continue;
-
-			/* Find the next target which needs to recover from remote */
-			while (j < old_copies &&
-			       contains_node(old_entry + get_nth_node(old_entry, old_nr, old_idx, j),
-					     cur_entry, cur_nr, cur_idx, cur_copies) >= 0)
-				j++;
-		}
-		if (j == old_copies) {
-			/*
-			 * Cannot find the target because the number of zones
-			 * is smaller than the number of copies.  We can select
-			 * any node in this case, so select the first one.
-			 */
-			return old_idx;
-		}
-
-		if (i == copy_idx) {
-			/* Found the target node correspoinding to copy_idx */
-			dprintf("%"PRIu32", %"PRIu32", %"PRIu32"\n",
-				get_nth_node(old_entry, old_nr, old_idx, j),
-				copy_idx, (cur_idx + i) % cur_nr);
-			return get_nth_node(old_entry, old_nr, old_idx, j);
-		}
-
-	}
-
-	return -1;
-}
-
 static void *get_vnodes_from_epoch(uint32_t epoch, int *nr, int *copies)
 {
 	int nodes_nr, len = sizeof(struct sd_vnode) * SD_MAX_VNODES;
@@ -307,14 +198,14 @@ static void rollback_old_cur(struct sd_vnode *old, int *old_nr, int *old_copies,
  * the routine will try to recovery it from the nodes it has stayed,
  * at least, *theoretically* on consistent hash ring.
  */
-static int do_recover_object(struct recovery_work *rw, int copy_idx)
+static int do_recover_object(struct recovery_work *rw)
 {
 	struct sd_vnode *old, *cur;
 	uint64_t oid = rw->oids[rw->done];
 	int old_nr = rw->old_nr_vnodes, cur_nr = rw->cur_nr_vnodes;
 	uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch - 1;
 	struct sd_vnode *tgt_entry;
-	int old_idx, cur_idx, tgt_idx, old_copies, cur_copies, ret;
+	int old_copies, cur_copies, ret, i, retry;
 
 	old = xmalloc(sizeof(*old) * SD_MAX_VNODES);
 	cur = xmalloc(sizeof(*cur) * SD_MAX_VNODES);
@@ -324,28 +215,33 @@ static int do_recover_object(struct recovery_work *rw, int copy_idx)
 	cur_copies = get_max_nr_copies_from(rw->cur_nodes, rw->cur_nr_nodes);
 
 again:
-	old_idx = obj_to_sheep(old, old_nr, oid, 0);
-	cur_idx = obj_to_sheep(cur, cur_nr, oid, 0);
-
-	dprintf("try recover object %"PRIx64" from epoch %"PRIu32"\n", oid, tgt_epoch);
-
-	if (cur_copies <= copy_idx) {
-		eprintf("epoch (%"PRIu32") has less copies (%d) than requested copy_idx: %d\n",
-		tgt_epoch, cur_copies, copy_idx);
-		ret = -1;
-		goto err;
+	dprintf("try recover object %"PRIx64" from epoch %"PRIu32"\n",
+		oid, tgt_epoch);
+	retry = 0;
+
+	/* Let's do a breadth-first search */
+	for (i = 0; i < old_copies; i++) {
+		int idx;
+		idx = obj_to_sheep(old, old_nr, oid, i);
+		tgt_entry = old + idx;
+		ret = recover_object_from_replica(oid, tgt_entry,
+						  epoch, tgt_epoch);
+		if (ret = 0) {
+			/* Succeed */
+			break;
+		} else (ret > 0) {
+			retry = 1;
+			/* Try our best to recover from peers */
+			continue;
+		}
 	}
-
-	tgt_idx = find_tgt_node(old, old_nr, old_idx, old_copies,
-			cur, cur_nr, cur_idx, cur_copies, copy_idx);
-	if (tgt_idx < 0) {
-		eprintf("cannot find target node %"PRIx64"\n", oid);
-		ret = -1;
+	/* If not succeed but someone orders us to retry it, serve the order */
+	if (ret != 0 && retry == 1) {
+		ret = 0;
+		rw->retry = 1;
 		goto err;
 	}
-	tgt_entry = old + tgt_idx;
-
-	ret = recover_object_from_replica(oid, tgt_entry, epoch, tgt_epoch);
+	/* No luck, roll back to an older configuration and try again */
 	if (ret < 0) {
 		struct sd_vnode *new_old;
 		int new_old_nr, new_old_copies;
@@ -366,9 +262,6 @@ again:
 				new_old, new_old_nr, new_old_copies);
 		free(new_old);
 		goto again;
-	} else if (ret > 0) {
-		ret = 0;
-		rw->retry = 1;
 	}
 err:
 	free(old);
@@ -376,57 +269,25 @@ err:
 	return ret;
 }
 
-static int get_replica_idx(struct recovery_work *rw, uint64_t oid, int *copy_nr)
-{
-	int i, ret = -1;
-	int idx_buf[SD_MAX_COPIES];
-
-	*copy_nr = get_max_nr_copies_from(rw->cur_nodes, rw->cur_nr_nodes);
-	obj_to_sheeps(rw->cur_vnodes, rw->cur_nr_vnodes, oid,
-				*copy_nr, idx_buf);
-
-	for (i = 0; i < *copy_nr; i++) {
-		int n = idx_buf[i];
-		if (vnode_is_local(&rw->cur_vnodes[n])) {
-			ret = i;
-			break;
-		}
-	}
-	return ret;
-}
-
 static void recover_object(struct work *work)
 {
-	struct recovery_work *rw = container_of(work, struct recovery_work, work);
+	struct recovery_work *rw = container_of(work, struct recovery_work,
+						work);
 	uint64_t oid = rw->oids[rw->done];
-	int i, copy_idx, copy_nr, ret;
+	int ret;
 
 	if (!sys->nr_copies)
 		return;
 
-	eprintf("done:%"PRIu32" count:%"PRIu32", oid:%"PRIx64"\n", rw->done, rw->count, oid);
+	eprintf("done:%"PRIu32" count:%"PRIu32", oid:%"PRIx64"\n",
+		rw->done, rw->count, oid);
 
 	if (sd_store->exist(oid)) {
 		dprintf("the object is already recovered\n");
 		return;
 	}
 
-	copy_idx = get_replica_idx(rw, oid, &copy_nr);
-	if (copy_idx < 0) {
-		ret = -1;
-		goto err;
-	}
-	ret = do_recover_object(rw, copy_idx);
-	if (ret < 0) {
-		for (i = 0; i < copy_nr; i++) {
-			if (i == copy_idx)
-				continue;
-			ret = do_recover_object(rw, i);
-			if (ret == 0)
-				break;
-		}
-	}
-err:
+	ret = do_recover_object(rw);
 	if (ret < 0)
 		eprintf("failed to recover object %"PRIx64"\n", oid);
 }
-- 
1.7.10.2




More information about the sheepdog mailing list