On 05/23/2012 05:04 PM, Liu Yuan wrote: > From: Liu Yuan<tailai.ly at taobao.com> > > Current recovey will fail for following simple test case (really surprised) > > ===== > set -ex > > pkill -9 sheep > rm store/* -rf > > # start three sheep daemons > for i in 0 1 2 3; do > ./sheep/sheep -d /home/tailai.ly/sheepdog/store/$i -z $i -p 700$i -W > done > sleep 1 > ./collie/collie cluster format -c 2 > ./collie/collie vdi create test0 40M > ./collie/collie vdi create test1 40M > pkill -f "sheep -d /home/tailai.ly/sheepdog/store/3" > > ===== > > after running the script, we can see copy isn't recoveried! > > tailai.ly at taobao:~/sheepdog$ find -name '80fd32fc00000000' > ./store/3/obj/80fd32fc00000000 #<-- this node is killed > ./store/0/obj/80fd32fc00000000 > > With the patch, we get a expected result: > tailai.ly at taobao:~/sheepdog$ find -name '80fd32fc00000000' > ./store/1/obj/80fd32fc00000000 # copy migrated from node 3 > ./store/3/obj/80fd32fc00000000 # killed > ./store/0/obj/80fd32fc00000000 > > The failture is rooted in the original algorithm: we just did one shot search > and didn't try a breadth-first search before diving into older configuration. > > The fix is rather straightforward, do a breadth-first search!. With the patch, > we also end up with much more simplified code. > > Signed-off-by: Liu Yuan<tailai.ly at taobao.com> > --- > sheep/recovery.c | 186 ++++++------------------------------------------------ > 1 file changed, 18 insertions(+), 168 deletions(-) > > diff --git a/sheep/recovery.c b/sheep/recovery.c > index afe58ac..c5535b7 100644 > --- a/sheep/recovery.c > +++ b/sheep/recovery.c > @@ -62,115 +62,6 @@ static int obj_cmp(const void *oid1, const void *oid2) > return 0; > } > > -/* > - * contains_node - checks that the node id is included in the target nodes > - * > - * The target nodes to store replicated objects are the first N nodes > - * from the base_idx'th on the consistent hash ring, where N is the > - * number of copies of objects. > - */ > -static int contains_node(struct sd_vnode *key, > - struct sd_vnode *entry, > - int nr, int base_idx, int copies) > -{ > - int i; > - > - for (i = 0; i< copies; i++) { > - int idx = get_nth_node(entry, nr, base_idx, i); > - if (memcmp(key->addr, entry[idx].addr, sizeof(key->addr)) == 0 > - && key->port == entry[idx].port) > - return idx; > - } > - return -1; > -} > - > -/* > - * find_tgt_node - find the node from which we should recover objects > - * > - * This function compares two node lists, the current target nodes and > - * the previous target nodes, and finds the node from the previous > - * target nodes which corresponds to the copy_idx'th node of the > - * current target nodes. The correspondence is injective and > - * maximizes the number of nodes which can recover objects locally. > - * > - * For example, consider the number of redundancy is 5, the consistent > - * hash ring is {A, B, C, D, E, F}, and the node G is newly added. > - * The parameters of this function are > - * old_entry = {A, B, C, D, E, F}, old_nr = 6, old_idx = 3 > - * cur_entry = {A, B, C, D, E, F, G}, cur_nr = 7, cur_idx = 3 > - * > - * In this case: > - * the previous target nodes: {D, E, F, A, B} > - * (the first 5 nodes from the 3rd node on the previous hash ring) > - * the current target nodes : {D, E, F, G, A} > - * (the first 5 nodes from the 3rd node on the current hash ring) > - * > - * The correspondence between copy_idx and return value are as follows: > - * ---------------------------- > - * copy_idx 0 1 2 3 4 > - * src_node D E F G A > - * tgt_node D E F B A > - * return value 0 1 2 4 3 > - * ---------------------------- > - * > - * The node D, E, F, and A can recover objects from local, and the > - * node G recovers from the node B. > - */ > -static int find_tgt_node(struct sd_vnode *old_entry, > - int old_nr, int old_idx, int old_copies, > - struct sd_vnode *cur_entry, > - int cur_nr, int cur_idx, int cur_copies, > - int copy_idx) > -{ > - int i, j, idx; > - > - dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n", > - old_idx, old_nr, old_copies, cur_idx, cur_nr, cur_copies, copy_idx); > - > - /* If the same node is in the previous target nodes, return its index */ > - idx = contains_node(cur_entry + get_nth_node(cur_entry, cur_nr, cur_idx, copy_idx), > - old_entry, old_nr, old_idx, old_copies); > - if (idx>= 0) { > - dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n", idx, copy_idx, cur_idx, cur_nr); > - return idx; > - } > - > - for (i = 0, j = 0; ; i++, j++) { > - if (i< copy_idx) { > - /* Skip if the node can recover from its local */ > - idx = contains_node(cur_entry + get_nth_node(cur_entry, cur_nr, cur_idx, i), > - old_entry, old_nr, old_idx, old_copies); > - if (idx>= 0) > - continue; > - > - /* Find the next target which needs to recover from remote */ > - while (j< old_copies&& > - contains_node(old_entry + get_nth_node(old_entry, old_nr, old_idx, j), > - cur_entry, cur_nr, cur_idx, cur_copies)>= 0) > - j++; > - } > - if (j == old_copies) { > - /* > - * Cannot find the target because the number of zones > - * is smaller than the number of copies. We can select > - * any node in this case, so select the first one. > - */ > - return old_idx; > - } > - > - if (i == copy_idx) { > - /* Found the target node correspoinding to copy_idx */ > - dprintf("%"PRIu32", %"PRIu32", %"PRIu32"\n", > - get_nth_node(old_entry, old_nr, old_idx, j), > - copy_idx, (cur_idx + i) % cur_nr); > - return get_nth_node(old_entry, old_nr, old_idx, j); > - } > - > - } > - > - return -1; > -} > - > static void *get_vnodes_from_epoch(uint32_t epoch, int *nr, int *copies) > { > int nodes_nr, len = sizeof(struct sd_vnode) * SD_MAX_VNODES; > @@ -307,14 +198,14 @@ static void rollback_old_cur(struct sd_vnode *old, int *old_nr, int *old_copies, > * the routine will try to recovery it from the nodes it has stayed, > * at least, *theoretically* on consistent hash ring. > */ > -static int do_recover_object(struct recovery_work *rw, int copy_idx) > +static int do_recover_object(struct recovery_work *rw) > { > struct sd_vnode *old, *cur; > uint64_t oid = rw->oids[rw->done]; > int old_nr = rw->old_nr_vnodes, cur_nr = rw->cur_nr_vnodes; > uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch - 1; > struct sd_vnode *tgt_entry; > - int old_idx, cur_idx, tgt_idx, old_copies, cur_copies, ret; > + int old_copies, cur_copies, ret, i; > > old = xmalloc(sizeof(*old) * SD_MAX_VNODES); > cur = xmalloc(sizeof(*cur) * SD_MAX_VNODES); > @@ -324,28 +215,19 @@ static int do_recover_object(struct recovery_work *rw, int copy_idx) > cur_copies = get_max_nr_copies_from(rw->cur_nodes, rw->cur_nr_nodes); > > again: > - old_idx = obj_to_sheep(old, old_nr, oid, 0); > - cur_idx = obj_to_sheep(cur, cur_nr, oid, 0); > - > dprintf("try recover object %"PRIx64" from epoch %"PRIu32"\n", oid, tgt_epoch); > > - if (cur_copies<= copy_idx) { > - eprintf("epoch (%"PRIu32") has less copies (%d) than requested copy_idx: %d\n", > - tgt_epoch, cur_copies, copy_idx); > - ret = -1; > - goto err; > - } > - > - tgt_idx = find_tgt_node(old, old_nr, old_idx, old_copies, > - cur, cur_nr, cur_idx, cur_copies, copy_idx); > - if (tgt_idx< 0) { > - eprintf("cannot find target node %"PRIx64"\n", oid); > - ret = -1; > - goto err; > + /* Let's do a breadth-first search */ > + for (i = 0; i< old_copies; i++) { > + int idx; > + idx = obj_to_sheep(old, old_nr, oid, i); I suggest to use obj_to_sheeps() here to avoid some unnecessary list traversing in obj_to_sheep() thanks, levin > + tgt_entry = old + idx; > + ret = recover_object_from_replica(oid, tgt_entry, > + epoch, tgt_epoch); > + if (ret>= 0) > + break; > } > - tgt_entry = old + tgt_idx; > - > - ret = recover_object_from_replica(oid, tgt_entry, epoch, tgt_epoch); > + /* If not succeed, roll back to an older configuration and try again */ > if (ret< 0) { > struct sd_vnode *new_old; > int new_old_nr, new_old_copies; > @@ -376,57 +258,25 @@ err: > return ret; > } > > -static int get_replica_idx(struct recovery_work *rw, uint64_t oid, int *copy_nr) > -{ > - int i, ret = -1; > - int idx_buf[SD_MAX_COPIES]; > - > - *copy_nr = get_max_nr_copies_from(rw->cur_nodes, rw->cur_nr_nodes); > - obj_to_sheeps(rw->cur_vnodes, rw->cur_nr_vnodes, oid, > - *copy_nr, idx_buf); > - > - for (i = 0; i< *copy_nr; i++) { > - int n = idx_buf[i]; > - if (vnode_is_local(&rw->cur_vnodes[n])) { > - ret = i; > - break; > - } > - } > - return ret; > -} > - > static void recover_object(struct work *work) > { > - struct recovery_work *rw = container_of(work, struct recovery_work, work); > + struct recovery_work *rw = container_of(work, struct recovery_work, > + work); > uint64_t oid = rw->oids[rw->done]; > - int i, copy_idx, copy_nr, ret; > + int ret; > > if (!sys->nr_copies) > return; > > - eprintf("done:%"PRIu32" count:%"PRIu32", oid:%"PRIx64"\n", rw->done, rw->count, oid); > + eprintf("done:%"PRIu32" count:%"PRIu32", oid:%"PRIx64"\n", > + rw->done, rw->count, oid); > > if (sd_store->exist(oid)) { > dprintf("the object is already recovered\n"); > return; > } > > - copy_idx = get_replica_idx(rw, oid,©_nr); > - if (copy_idx< 0) { > - ret = -1; > - goto err; > - } > - ret = do_recover_object(rw, copy_idx); > - if (ret< 0) { > - for (i = 0; i< copy_nr; i++) { > - if (i == copy_idx) > - continue; > - ret = do_recover_object(rw, i); > - if (ret == 0) > - break; > - } > - } > -err: > + ret = do_recover_object(rw); > if (ret< 0) > eprintf("failed to recover object %"PRIx64"\n", oid); > } |