[sheepdog] [PATCH 2/2] sheep: fix recovery logic
levin li
levin108 at gmail.com
Wed May 23 11:36:59 CEST 2012
On 05/23/2012 05:04 PM, Liu Yuan wrote:
> From: Liu Yuan<tailai.ly at taobao.com>
>
> Current recovey will fail for following simple test case (really surprised)
>
> =====
> set -ex
>
> pkill -9 sheep
> rm store/* -rf
>
> # start three sheep daemons
> for i in 0 1 2 3; do
> ./sheep/sheep -d /home/tailai.ly/sheepdog/store/$i -z $i -p 700$i -W
> done
> sleep 1
> ./collie/collie cluster format -c 2
> ./collie/collie vdi create test0 40M
> ./collie/collie vdi create test1 40M
> pkill -f "sheep -d /home/tailai.ly/sheepdog/store/3"
>
> =====
>
> after running the script, we can see copy isn't recoveried!
>
> tailai.ly at taobao:~/sheepdog$ find -name '80fd32fc00000000'
> ./store/3/obj/80fd32fc00000000 #<-- this node is killed
> ./store/0/obj/80fd32fc00000000
>
> With the patch, we get a expected result:
> tailai.ly at taobao:~/sheepdog$ find -name '80fd32fc00000000'
> ./store/1/obj/80fd32fc00000000 # copy migrated from node 3
> ./store/3/obj/80fd32fc00000000 # killed
> ./store/0/obj/80fd32fc00000000
>
> The failture is rooted in the original algorithm: we just did one shot search
> and didn't try a breadth-first search before diving into older configuration.
>
> The fix is rather straightforward, do a breadth-first search!. With the patch,
> we also end up with much more simplified code.
>
> Signed-off-by: Liu Yuan<tailai.ly at taobao.com>
> ---
> sheep/recovery.c | 186 ++++++------------------------------------------------
> 1 file changed, 18 insertions(+), 168 deletions(-)
>
> diff --git a/sheep/recovery.c b/sheep/recovery.c
> index afe58ac..c5535b7 100644
> --- a/sheep/recovery.c
> +++ b/sheep/recovery.c
> @@ -62,115 +62,6 @@ static int obj_cmp(const void *oid1, const void *oid2)
> return 0;
> }
>
> -/*
> - * contains_node - checks that the node id is included in the target nodes
> - *
> - * The target nodes to store replicated objects are the first N nodes
> - * from the base_idx'th on the consistent hash ring, where N is the
> - * number of copies of objects.
> - */
> -static int contains_node(struct sd_vnode *key,
> - struct sd_vnode *entry,
> - int nr, int base_idx, int copies)
> -{
> - int i;
> -
> - for (i = 0; i< copies; i++) {
> - int idx = get_nth_node(entry, nr, base_idx, i);
> - if (memcmp(key->addr, entry[idx].addr, sizeof(key->addr)) == 0
> - && key->port == entry[idx].port)
> - return idx;
> - }
> - return -1;
> -}
> -
> -/*
> - * find_tgt_node - find the node from which we should recover objects
> - *
> - * This function compares two node lists, the current target nodes and
> - * the previous target nodes, and finds the node from the previous
> - * target nodes which corresponds to the copy_idx'th node of the
> - * current target nodes. The correspondence is injective and
> - * maximizes the number of nodes which can recover objects locally.
> - *
> - * For example, consider the number of redundancy is 5, the consistent
> - * hash ring is {A, B, C, D, E, F}, and the node G is newly added.
> - * The parameters of this function are
> - * old_entry = {A, B, C, D, E, F}, old_nr = 6, old_idx = 3
> - * cur_entry = {A, B, C, D, E, F, G}, cur_nr = 7, cur_idx = 3
> - *
> - * In this case:
> - * the previous target nodes: {D, E, F, A, B}
> - * (the first 5 nodes from the 3rd node on the previous hash ring)
> - * the current target nodes : {D, E, F, G, A}
> - * (the first 5 nodes from the 3rd node on the current hash ring)
> - *
> - * The correspondence between copy_idx and return value are as follows:
> - * ----------------------------
> - * copy_idx 0 1 2 3 4
> - * src_node D E F G A
> - * tgt_node D E F B A
> - * return value 0 1 2 4 3
> - * ----------------------------
> - *
> - * The node D, E, F, and A can recover objects from local, and the
> - * node G recovers from the node B.
> - */
> -static int find_tgt_node(struct sd_vnode *old_entry,
> - int old_nr, int old_idx, int old_copies,
> - struct sd_vnode *cur_entry,
> - int cur_nr, int cur_idx, int cur_copies,
> - int copy_idx)
> -{
> - int i, j, idx;
> -
> - dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n",
> - old_idx, old_nr, old_copies, cur_idx, cur_nr, cur_copies, copy_idx);
> -
> - /* If the same node is in the previous target nodes, return its index */
> - idx = contains_node(cur_entry + get_nth_node(cur_entry, cur_nr, cur_idx, copy_idx),
> - old_entry, old_nr, old_idx, old_copies);
> - if (idx>= 0) {
> - dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n", idx, copy_idx, cur_idx, cur_nr);
> - return idx;
> - }
> -
> - for (i = 0, j = 0; ; i++, j++) {
> - if (i< copy_idx) {
> - /* Skip if the node can recover from its local */
> - idx = contains_node(cur_entry + get_nth_node(cur_entry, cur_nr, cur_idx, i),
> - old_entry, old_nr, old_idx, old_copies);
> - if (idx>= 0)
> - continue;
> -
> - /* Find the next target which needs to recover from remote */
> - while (j< old_copies&&
> - contains_node(old_entry + get_nth_node(old_entry, old_nr, old_idx, j),
> - cur_entry, cur_nr, cur_idx, cur_copies)>= 0)
> - j++;
> - }
> - if (j == old_copies) {
> - /*
> - * Cannot find the target because the number of zones
> - * is smaller than the number of copies. We can select
> - * any node in this case, so select the first one.
> - */
> - return old_idx;
> - }
> -
> - if (i == copy_idx) {
> - /* Found the target node correspoinding to copy_idx */
> - dprintf("%"PRIu32", %"PRIu32", %"PRIu32"\n",
> - get_nth_node(old_entry, old_nr, old_idx, j),
> - copy_idx, (cur_idx + i) % cur_nr);
> - return get_nth_node(old_entry, old_nr, old_idx, j);
> - }
> -
> - }
> -
> - return -1;
> -}
> -
> static void *get_vnodes_from_epoch(uint32_t epoch, int *nr, int *copies)
> {
> int nodes_nr, len = sizeof(struct sd_vnode) * SD_MAX_VNODES;
> @@ -307,14 +198,14 @@ static void rollback_old_cur(struct sd_vnode *old, int *old_nr, int *old_copies,
> * the routine will try to recovery it from the nodes it has stayed,
> * at least, *theoretically* on consistent hash ring.
> */
> -static int do_recover_object(struct recovery_work *rw, int copy_idx)
> +static int do_recover_object(struct recovery_work *rw)
> {
> struct sd_vnode *old, *cur;
> uint64_t oid = rw->oids[rw->done];
> int old_nr = rw->old_nr_vnodes, cur_nr = rw->cur_nr_vnodes;
> uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch - 1;
> struct sd_vnode *tgt_entry;
> - int old_idx, cur_idx, tgt_idx, old_copies, cur_copies, ret;
> + int old_copies, cur_copies, ret, i;
>
> old = xmalloc(sizeof(*old) * SD_MAX_VNODES);
> cur = xmalloc(sizeof(*cur) * SD_MAX_VNODES);
> @@ -324,28 +215,19 @@ static int do_recover_object(struct recovery_work *rw, int copy_idx)
> cur_copies = get_max_nr_copies_from(rw->cur_nodes, rw->cur_nr_nodes);
>
> again:
> - old_idx = obj_to_sheep(old, old_nr, oid, 0);
> - cur_idx = obj_to_sheep(cur, cur_nr, oid, 0);
> -
> dprintf("try recover object %"PRIx64" from epoch %"PRIu32"\n", oid, tgt_epoch);
>
> - if (cur_copies<= copy_idx) {
> - eprintf("epoch (%"PRIu32") has less copies (%d) than requested copy_idx: %d\n",
> - tgt_epoch, cur_copies, copy_idx);
> - ret = -1;
> - goto err;
> - }
> -
> - tgt_idx = find_tgt_node(old, old_nr, old_idx, old_copies,
> - cur, cur_nr, cur_idx, cur_copies, copy_idx);
> - if (tgt_idx< 0) {
> - eprintf("cannot find target node %"PRIx64"\n", oid);
> - ret = -1;
> - goto err;
> + /* Let's do a breadth-first search */
> + for (i = 0; i< old_copies; i++) {
> + int idx;
> + idx = obj_to_sheep(old, old_nr, oid, i);
I suggest to use obj_to_sheeps() here to avoid some unnecessary list
traversing in obj_to_sheep()
thanks,
levin
> + tgt_entry = old + idx;
> + ret = recover_object_from_replica(oid, tgt_entry,
> + epoch, tgt_epoch);
> + if (ret>= 0)
> + break;
> }
> - tgt_entry = old + tgt_idx;
> -
> - ret = recover_object_from_replica(oid, tgt_entry, epoch, tgt_epoch);
> + /* If not succeed, roll back to an older configuration and try again */
> if (ret< 0) {
> struct sd_vnode *new_old;
> int new_old_nr, new_old_copies;
> @@ -376,57 +258,25 @@ err:
> return ret;
> }
>
> -static int get_replica_idx(struct recovery_work *rw, uint64_t oid, int *copy_nr)
> -{
> - int i, ret = -1;
> - int idx_buf[SD_MAX_COPIES];
> -
> - *copy_nr = get_max_nr_copies_from(rw->cur_nodes, rw->cur_nr_nodes);
> - obj_to_sheeps(rw->cur_vnodes, rw->cur_nr_vnodes, oid,
> - *copy_nr, idx_buf);
> -
> - for (i = 0; i< *copy_nr; i++) {
> - int n = idx_buf[i];
> - if (vnode_is_local(&rw->cur_vnodes[n])) {
> - ret = i;
> - break;
> - }
> - }
> - return ret;
> -}
> -
> static void recover_object(struct work *work)
> {
> - struct recovery_work *rw = container_of(work, struct recovery_work, work);
> + struct recovery_work *rw = container_of(work, struct recovery_work,
> + work);
> uint64_t oid = rw->oids[rw->done];
> - int i, copy_idx, copy_nr, ret;
> + int ret;
>
> if (!sys->nr_copies)
> return;
>
> - eprintf("done:%"PRIu32" count:%"PRIu32", oid:%"PRIx64"\n", rw->done, rw->count, oid);
> + eprintf("done:%"PRIu32" count:%"PRIu32", oid:%"PRIx64"\n",
> + rw->done, rw->count, oid);
>
> if (sd_store->exist(oid)) {
> dprintf("the object is already recovered\n");
> return;
> }
>
> - copy_idx = get_replica_idx(rw, oid,©_nr);
> - if (copy_idx< 0) {
> - ret = -1;
> - goto err;
> - }
> - ret = do_recover_object(rw, copy_idx);
> - if (ret< 0) {
> - for (i = 0; i< copy_nr; i++) {
> - if (i == copy_idx)
> - continue;
> - ret = do_recover_object(rw, i);
> - if (ret == 0)
> - break;
> - }
> - }
> -err:
> + ret = do_recover_object(rw);
> if (ret< 0)
> eprintf("failed to recover object %"PRIx64"\n", oid);
> }
More information about the sheepdog
mailing list