[sheepdog] [PATCH RFC 09/11] sheep: do recovery with dirty object list

Yunkai Zhang yunkai.me at gmail.com
Thu Aug 9 07:47:06 CEST 2012


On Thu, Aug 9, 2012 at 5:14 AM, Yunkai Zhang <yunkai.me at gmail.com> wrote:
> From: Yunkai Zhang <qiushu.zyk at taobao.com>
>
> Now that we have prepared all the necessary data, let's do the recovery job
> with the dirty object list.
>
> 1) If a sheep rejoins the cluster, objects which were deleted after it left
>    still remain in its working directory. Once recovery starts, this sheep
>    sends its object list to the other sheep. So after fetching all object
>    lists from the cluster, each sheep should screen out these deleted
>    objects.
> 2) A sheep which has left and rejoined should drop its old-version objects
>    and recover the new ones from the other sheep.
> 3) Objects which have been updated should not be recovered from a
>    rejoined sheep.
>
> In order to search the deleted and updated object lists quickly, I store
>    them in a temporary cache named dirty_objlist_cache, implemented with an
>    rbtree; it shares some common base code with objlist_cache.
>
> Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
> ---
>  sheep/object_list_cache.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++
>  sheep/recovery.c          | 87 +++++++++++++++++++++++++++++++++++++++++++++--
>  sheep/sheep_priv.h        | 16 +++++++++
>  3 files changed, 187 insertions(+), 2 deletions(-)
>
> diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
> index 4c34783..c97fd27 100644
> --- a/sheep/object_list_cache.c
> +++ b/sheep/object_list_cache.c
> @@ -55,6 +55,16 @@ struct objlist_cache obj_list_cache = {
>         .lock           = PTHREAD_RWLOCK_INITIALIZER,
>  };
>
> +struct dirty_objlist_cache deleted_dirty_cache = {
> +       .root       = RB_ROOT,
> +       .entry_list = LIST_HEAD_INIT(deleted_dirty_cache.entry_list),
> +};
> +
> +struct dirty_objlist_cache updated_dirty_cache = {
> +       .root       = RB_ROOT,
> +       .entry_list = LIST_HEAD_INIT(updated_dirty_cache.entry_list),
> +};
> +
>  static struct objlist_cache_entry *objlist_cache_rb_find(struct rb_root *root,
>                                                          uint64_t oid)
>  {
> @@ -128,6 +138,58 @@ static int objlist_cache_rb_remove(struct rb_root *root, uint64_t oid)
>         return -1; /* fail to remove */
>  }
>
> +int dirty_objlist_cache_find(struct dirty_objlist_cache *dirty_cache,
> +                            uint64_t oid)
> +{
> +       struct objlist_cache_entry *p;
> +
> +       p = objlist_cache_rb_find(&dirty_cache->root, oid);
> +
> +       return !!p;
> +}
> +
> +int dirty_objlist_cache_insert(struct dirty_objlist_cache *dirty_cache,
> +                              uint64_t oid)
> +{
> +       struct objlist_cache_entry *entry, *p;
> +
> +       entry = zalloc(sizeof(*entry));
> +
> +       if (!entry) {
> +               eprintf("Out of memory.\n");
> +               return -1;
> +       }
> +
> +       entry->oid = oid;
> +       rb_init_node(&entry->node);
> +
> +       p = objlist_cache_rb_insert(&dirty_cache->root, entry);
> +       if (p)
> +               free(entry);
> +       else
> +               list_add(&entry->list, &dirty_cache->entry_list);
> +       return 1;
> +}
> +
> +void dirty_objlist_cache_clear(struct dirty_objlist_cache *dirty_cache)
> +{
> +       struct objlist_cache_entry *entry, *t;
> +       list_for_each_entry_safe(entry, t, &dirty_cache->entry_list, list) {
> +               dprintf("delete oid:%lx\n", entry->oid);
> +               objlist_cache_rb_remove(&dirty_cache->root, entry->oid);
> +       }
> +
> +       assert(list_empty(&dirty_cache->entry_list));
> +}
> +
> +void dirty_objlist_cache_show(struct dirty_objlist_cache *dirty_cache)
> +{
> +       struct objlist_cache_entry *entry, *t;
> +       list_for_each_entry_safe(entry, t, &dirty_cache->entry_list, list) {
> +               dprintf("oid:%lx\n", entry->oid);
> +       }
> +}
> +
>  void objlist_cache_remove(uint64_t oid, int has_left_node)
>  {
>         struct objlist_cache_entry *entry;
> @@ -181,6 +243,30 @@ int objlist_cache_insert(uint64_t oid, int dirty_flag)
>         return 0;
>  }
>
> +void objlist_cache_clear_dirty_flag(void)
> +{
> +       struct objlist_cache_entry *entry, *t;
> +
> +       pthread_rwlock_wrlock(&obj_list_cache.lock);
> +       list_for_each_entry_safe(entry, t, &obj_list_cache.entry_list, list) {
> +               if (entry->dirty_flag == 1)
> +                       entry->dirty_flag = 0;
> +               else if (entry->dirty_flag == 2)
> +                       /*
> +                        * entries which dirty_flag is 2 will not kept in
> +                        * cache buffer, so needn't to update tree_version
> +                        * when remove it.
> +                        */
> +                       objlist_cache_rb_remove(&obj_list_cache.root,
> +                                               entry->oid);
> +               if (entry->dirty_flag == 1)
> +                       dprintf("oid:%"PRIx64" dirty_flag: 1, updated\n", entry->oid);
> +               else if (entry->dirty_flag == 2)
> +                       dprintf("oid:%"PRIx64" dirty_flag: 2, deleted\n", entry->oid);
> +       }
> +       pthread_rwlock_unlock(&obj_list_cache.lock);
> +}
> +
>  int get_obj_list(const struct sd_list_req *hdr, struct sd_list_rsp *rsp, void *data)
>  {
>         int nr = 0;
> diff --git a/sheep/recovery.c b/sheep/recovery.c
> index 9d3d431..ba6e002 100644
> --- a/sheep/recovery.c
> +++ b/sheep/recovery.c
> @@ -181,6 +181,15 @@ again:
>                 if (is_invalid_vnode(tgt_vnode, rw->cur_vinfo->nodes,
>                                      rw->cur_vinfo->nr_nodes))
>                         continue;
> +               /*
> +                * objects which have been updated should not recovered
> +                * from a joined back sheep.
> +                */
> +               if (sys->disable_recovery
> +                   && vnode_has_left(old, tgt_vnode)
> +                   && dirty_objlist_cache_find(&updated_dirty_cache, oid))
> +                       continue;
> +
>                 ret = recover_object_from_replica(oid, tgt_vnode,
>                                                   epoch, tgt_epoch);
>                 if (ret == 0) {
> @@ -219,6 +228,14 @@ err:
>         return ret;
>  }
>
> +static inline bool is_leaving_node(struct sd_node *node)
> +{
> +       if (find_node(node, leaving_nodes, nr_leaving_nodes))
> +               return true;
> +
> +       return false;
> +}
> +
>  static void recover_object_work(struct work *work)
>  {
>         struct recovery_work *rw = container_of(work, struct recovery_work,
> @@ -229,11 +246,22 @@ static void recover_object_work(struct work *work)
>         eprintf("done:%"PRIu32" count:%"PRIu32", oid:%"PRIx64"\n",
>                 rw->done, rw->count, oid);
>
> +       /*
> +        * sheep which have been left and joined back should drop the
> +        * old version objects and recover the new ones from other sheeps.
> +        */
> +       if (sys->disable_recovery
> +           && is_leaving_node(&sys->this_node)
> +           && dirty_objlist_cache_find(&updated_dirty_cache, oid)) {
> +               goto do_recover;
> +       }
> +
>         if (sd_store->exist(oid)) {
>                 dprintf("the object is already recovered\n");
>                 return;
>         }
>
> +do_recover:
>         ret = do_recover_object(rw);
>         if (ret < 0)
>                 eprintf("failed to recover object %"PRIx64"\n", oid);
> @@ -345,6 +373,10 @@ static inline void finish_recovery(struct recovery_work *rw)
>         if (sd_store->end_recover)
>                 sd_store->end_recover(sys->epoch - 1, rw->old_vinfo);
>
> +       objlist_cache_clear_dirty_flag();
> +       dirty_objlist_cache_clear(&deleted_dirty_cache);
> +       dirty_objlist_cache_clear(&updated_dirty_cache);
> +
>         free_recovery_work(rw);
>
>         sys->disable_recovery = 0;
> @@ -539,6 +571,42 @@ static void screen_object_list(struct recovery_work *rw,
>         qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp);
>  }
>
> +/*
> + * If a sheep joined back to the cluster, but there are some objects which have
> + * been deleted after this sheep left, such objects stay in its working
> + * directory, after recovery start, this sheep will send its object list to
> + * other sheeps. So after fetched all object list from cluster, each sheep
> + * should screen out these deleted objects list.
> + */
> +static void screen_dirty_object_list(struct recovery_work *rw)
> +{
> +       int i, n;
> +       uint64_t *oids;
> +       struct dirty_objlist_cache *deleted;
> +
> +       oids = xmalloc(SD_DATA_OBJ_SIZE * sizeof(uint64_t));
> +       deleted = &deleted_dirty_cache;
> +
> +       dirty_objlist_cache_show(deleted);
> +
> +       for (n = i = 0; i < rw->count; i++) {
> +               if (!dirty_objlist_cache_find(deleted, rw->oids[i])) {
> +                       dprintf("oid:%lx\n", rw->oids[i]);
> +                       oids[n++] =  rw->oids[i];
> +               }
> +       }
> +
> +       dprintf("old count:%d, new count:%d\n", rw->count, n);
> +
> +       memcpy(rw->oids, oids, n * sizeof(uint64_t));
> +       rw->count = n;
> +
> +       qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp);
> +
> +       free(oids);
> +}
> +
> +
>  static int newly_joined(struct sd_node *node, struct recovery_work *rw)
>  {
>         if (bsearch(node, rw->old_vinfo->nodes, rw->old_vinfo->nr_nodes,
> @@ -586,9 +654,21 @@ again:
>                 if (sys->disable_recovery) {
>                         offset = nr_oids * sizeof(uint64_t);
>                         dois = (struct dirty_oid_info *)&buf[offset];
> +
>                         for (j = 0; j < nr_dois; j++) {
> +                               struct dirty_objlist_cache *updated, *deleted;
> +
> +                               updated = &updated_dirty_cache;
> +                               deleted = &deleted_dirty_cache;
> +
>                                 dprintf("dirty_oid:%"PRIx64", dirty_flag:%d\n",
>                                         dois[j].oid, dois[j].dirty_flag);
> +                               if (dois[i].dirty_flag == 1)

There is a typo here: it should be (dois[j].dirty_flag == 1).

I'll send v2 after doing more testing.

> +                                       dirty_objlist_cache_insert(updated,
> +                                                                  dois[j].oid);
> +                               else
> +                                       dirty_objlist_cache_insert(deleted,
> +                                                                  dois[j].oid);
>                         }
>                 }
>
> @@ -601,7 +681,10 @@ again:
>                 goto again;
>         }
>
> -       dprintf("%d\n", rw->count);
> +       if (sys->disable_recovery)
> +               screen_dirty_object_list(rw);
> +
> +       dprintf("count:%d\n", rw->count);
>  out:
>         free(buf);
>  }
> @@ -625,7 +708,7 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo)
>         }
>
>         rw->state = RW_INIT;
> -       rw->oids = xmalloc(1 << 20); /* FIXME */
> +       rw->oids = xmalloc(SD_DATA_OBJ_SIZE * sizeof(uint64_t));
>         rw->epoch = sys->epoch;
>         rw->count = 0;
>
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index 2c3e195..7196f12 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -150,6 +150,11 @@ struct store_driver {
>         int (*get_snap_file)(struct siocb *);
>  };
>
> +struct dirty_objlist_cache {
> +       struct rb_root root;
> +       struct list_head entry_list;
> +};
> +
>  extern struct list_head store_drivers;
>  #define add_store_driver(driver)                                 \
>  static void __attribute__((constructor)) add_ ## driver(void) {  \
> @@ -278,6 +283,17 @@ int prealloc(int fd, uint32_t size);
>
>  int objlist_cache_insert(uint64_t oid, int has_left_node);
>  void objlist_cache_remove(uint64_t oid, int has_left_node);
> +void objlist_cache_clear_dirty_flag(void);
> +
> +extern struct dirty_objlist_cache deleted_dirty_cache;
> +extern struct dirty_objlist_cache updated_dirty_cache;
> +
> +int dirty_objlist_cache_find(struct dirty_objlist_cache *dirty_cache,
> +                            uint64_t oid);
> +int dirty_objlist_cache_insert(struct dirty_objlist_cache *dirty_cache,
> +                              uint64_t oid);
> +void dirty_objlist_cache_clear(struct dirty_objlist_cache *dirty_cache);
> +void dirty_objlist_cache_show(struct dirty_objlist_cache *dirty_cache);
>
>  void put_request(struct request *req);
>
> --
> 1.7.11.2
>



-- 
Yunkai Zhang
Work at Taobao



More information about the sheepdog mailing list