[sheepdog] [PATCH RFC 09/11] sheep: do recovery with dirty object list
Yunkai Zhang
yunkai.me at gmail.com
Thu Aug 9 07:47:06 CEST 2012
On Thu, Aug 9, 2012 at 5:14 AM, Yunkai Zhang <yunkai.me at gmail.com> wrote:
> From: Yunkai Zhang <qiushu.zyk at taobao.com>
>
> Now that we have prepared all the necessary data, let's do the recovery job
> with the dirty object list.
>
> 1) When a sheep rejoins the cluster, objects that were deleted after it left
> still remain in its working directory. Once recovery starts, this sheep
> sends its object list to the other sheep, so after fetching all object lists
> from the cluster, each sheep should screen out these deleted objects.
> 2) A sheep that has left and rejoined should drop its old-version objects and
> recover the new ones from the other sheep.
> 3) Objects that have been updated should not be recovered from a rejoined
> sheep.
>
> To search the deleted and updated object lists quickly, I store them in
> temporary caches of type dirty_objlist_cache, implemented with an rbtree;
> they share some common base code with objlist_cache.
>
> Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
> ---
> sheep/object_list_cache.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++
> sheep/recovery.c | 87 +++++++++++++++++++++++++++++++++++++++++++++--
> sheep/sheep_priv.h | 16 +++++++++
> 3 files changed, 187 insertions(+), 2 deletions(-)
>
> diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
> index 4c34783..c97fd27 100644
> --- a/sheep/object_list_cache.c
> +++ b/sheep/object_list_cache.c
> @@ -55,6 +55,16 @@ struct objlist_cache obj_list_cache = {
> .lock = PTHREAD_RWLOCK_INITIALIZER,
> };
>
> +struct dirty_objlist_cache deleted_dirty_cache = {
> + .root = RB_ROOT,
> + .entry_list = LIST_HEAD_INIT(deleted_dirty_cache.entry_list),
> +};
> +
> +struct dirty_objlist_cache updated_dirty_cache = {
> + .root = RB_ROOT,
> + .entry_list = LIST_HEAD_INIT(updated_dirty_cache.entry_list),
> +};
> +
> static struct objlist_cache_entry *objlist_cache_rb_find(struct rb_root *root,
> uint64_t oid)
> {
> @@ -128,6 +138,58 @@ static int objlist_cache_rb_remove(struct rb_root *root, uint64_t oid)
> return -1; /* fail to remove */
> }
>
> +int dirty_objlist_cache_find(struct dirty_objlist_cache *dirty_cache,
> + uint64_t oid)
> +{
> + struct objlist_cache_entry *p;
> +
> + p = objlist_cache_rb_find(&dirty_cache->root, oid);
> +
> + return !!p;
> +}
> +
> +int dirty_objlist_cache_insert(struct dirty_objlist_cache *dirty_cache,
> + uint64_t oid)
> +{
> + struct objlist_cache_entry *entry, *p;
> +
> + entry = zalloc(sizeof(*entry));
> +
> + if (!entry) {
> + eprintf("Out of memory.\n");
> + return -1;
> + }
> +
> + entry->oid = oid;
> + rb_init_node(&entry->node);
> +
> + p = objlist_cache_rb_insert(&dirty_cache->root, entry);
> + if (p)
> + free(entry);
> + else
> + list_add(&entry->list, &dirty_cache->entry_list);
> + return 1;
> +}
> +
> +void dirty_objlist_cache_clear(struct dirty_objlist_cache *dirty_cache)
> +{
> + struct objlist_cache_entry *entry, *t;
> + list_for_each_entry_safe(entry, t, &dirty_cache->entry_list, list) {
> + dprintf("delete oid:%lx\n", entry->oid);
> + objlist_cache_rb_remove(&dirty_cache->root, entry->oid);
> + }
> +
> + assert(list_empty(&dirty_cache->entry_list));
> +}
> +
> +void dirty_objlist_cache_show(struct dirty_objlist_cache *dirty_cache)
> +{
> + struct objlist_cache_entry *entry, *t;
> + list_for_each_entry_safe(entry, t, &dirty_cache->entry_list, list) {
> + dprintf("oid:%lx\n", entry->oid);
> + }
> +}
> +
> void objlist_cache_remove(uint64_t oid, int has_left_node)
> {
> struct objlist_cache_entry *entry;
> @@ -181,6 +243,30 @@ int objlist_cache_insert(uint64_t oid, int dirty_flag)
> return 0;
> }
>
> +void objlist_cache_clear_dirty_flag(void)
> +{
> + struct objlist_cache_entry *entry, *t;
> +
> + pthread_rwlock_wrlock(&obj_list_cache.lock);
> + list_for_each_entry_safe(entry, t, &obj_list_cache.entry_list, list) {
> + if (entry->dirty_flag == 1)
> + entry->dirty_flag = 0;
> + else if (entry->dirty_flag == 2)
> + /*
> + * entries which dirty_flag is 2 will not kept in
> + * cache buffer, so needn't to update tree_version
> + * when remove it.
> + */
> + objlist_cache_rb_remove(&obj_list_cache.root,
> + entry->oid);
> + if (entry->dirty_flag == 1)
> + dprintf("oid:%"PRIx64" dirty_flag: 1, updated\n", entry->oid);
> + else if (entry->dirty_flag == 2)
> + dprintf("oid:%"PRIx64" dirty_flag: 2, deleted\n", entry->oid);
> + }
> + pthread_rwlock_unlock(&obj_list_cache.lock);
> +}
> +
> int get_obj_list(const struct sd_list_req *hdr, struct sd_list_rsp *rsp, void *data)
> {
> int nr = 0;
> diff --git a/sheep/recovery.c b/sheep/recovery.c
> index 9d3d431..ba6e002 100644
> --- a/sheep/recovery.c
> +++ b/sheep/recovery.c
> @@ -181,6 +181,15 @@ again:
> if (is_invalid_vnode(tgt_vnode, rw->cur_vinfo->nodes,
> rw->cur_vinfo->nr_nodes))
> continue;
> + /*
> + * objects which have been updated should not recovered
> + * from a joined back sheep.
> + */
> + if (sys->disable_recovery
> + && vnode_has_left(old, tgt_vnode)
> + && dirty_objlist_cache_find(&updated_dirty_cache, oid))
> + continue;
> +
> ret = recover_object_from_replica(oid, tgt_vnode,
> epoch, tgt_epoch);
> if (ret == 0) {
> @@ -219,6 +228,14 @@ err:
> return ret;
> }
>
> +static inline bool is_leaving_node(struct sd_node *node)
> +{
> + if (find_node(node, leaving_nodes, nr_leaving_nodes))
> + return true;
> +
> + return false;
> +}
> +
> static void recover_object_work(struct work *work)
> {
> struct recovery_work *rw = container_of(work, struct recovery_work,
> @@ -229,11 +246,22 @@ static void recover_object_work(struct work *work)
> eprintf("done:%"PRIu32" count:%"PRIu32", oid:%"PRIx64"\n",
> rw->done, rw->count, oid);
>
> + /*
> + * sheep which have been left and joined back should drop the
> + * old version objects and recover the new ones from other sheeps.
> + */
> + if (sys->disable_recovery
> + && is_leaving_node(&sys->this_node)
> + && dirty_objlist_cache_find(&updated_dirty_cache, oid)) {
> + goto do_recover;
> + }
> +
> if (sd_store->exist(oid)) {
> dprintf("the object is already recovered\n");
> return;
> }
>
> +do_recover:
> ret = do_recover_object(rw);
> if (ret < 0)
> eprintf("failed to recover object %"PRIx64"\n", oid);
> @@ -345,6 +373,10 @@ static inline void finish_recovery(struct recovery_work *rw)
> if (sd_store->end_recover)
> sd_store->end_recover(sys->epoch - 1, rw->old_vinfo);
>
> + objlist_cache_clear_dirty_flag();
> + dirty_objlist_cache_clear(&deleted_dirty_cache);
> + dirty_objlist_cache_clear(&updated_dirty_cache);
> +
> free_recovery_work(rw);
>
> sys->disable_recovery = 0;
> @@ -539,6 +571,42 @@ static void screen_object_list(struct recovery_work *rw,
> qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp);
> }
>
> +/*
> + * If a sheep joined back to the cluster, but there are some objects which have
> + * been deleted after this sheep left, such objects stay in its working
> + * directory, after recovery start, this sheep will send its object list to
> + * other sheeps. So after fetched all object list from cluster, each sheep
> + * should screen out these deleted objects list.
> + */
> +static void screen_dirty_object_list(struct recovery_work *rw)
> +{
> + int i, n;
> + uint64_t *oids;
> + struct dirty_objlist_cache *deleted;
> +
> + oids = xmalloc(SD_DATA_OBJ_SIZE * sizeof(uint64_t));
> + deleted = &deleted_dirty_cache;
> +
> + dirty_objlist_cache_show(deleted);
> +
> + for (n = i = 0; i < rw->count; i++) {
> + if (!dirty_objlist_cache_find(deleted, rw->oids[i])) {
> + dprintf("oid:%lx\n", rw->oids[i]);
> + oids[n++] = rw->oids[i];
> + }
> + }
> +
> + dprintf("old count:%d, new count:%d\n", rw->count, n);
> +
> + memcpy(rw->oids, oids, n * sizeof(uint64_t));
> + rw->count = n;
> +
> + qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp);
> +
> + free(oids);
> +}
> +
> +
> static int newly_joined(struct sd_node *node, struct recovery_work *rw)
> {
> if (bsearch(node, rw->old_vinfo->nodes, rw->old_vinfo->nr_nodes,
> @@ -586,9 +654,21 @@ again:
> if (sys->disable_recovery) {
> offset = nr_oids * sizeof(uint64_t);
> dois = (struct dirty_oid_info *)&buf[offset];
> +
> for (j = 0; j < nr_dois; j++) {
> + struct dirty_objlist_cache *updated, *deleted;
> +
> + updated = &updated_dirty_cache;
> + deleted = &deleted_dirty_cache;
> +
> dprintf("dirty_oid:%"PRIx64", dirty_flag:%d\n",
> dois[j].oid, dois[j].dirty_flag);
> + if (dois[i].dirty_flag == 1)
There is a typo here: the index should be j, i.e. (dois[j].dirty_flag == 1).
I'll send v2 after doing more testing.
> + dirty_objlist_cache_insert(updated,
> + dois[j].oid);
> + else
> + dirty_objlist_cache_insert(deleted,
> + dois[j].oid);
> }
> }
>
> @@ -601,7 +681,10 @@ again:
> goto again;
> }
>
> - dprintf("%d\n", rw->count);
> + if (sys->disable_recovery)
> + screen_dirty_object_list(rw);
> +
> + dprintf("count:%d\n", rw->count);
> out:
> free(buf);
> }
> @@ -625,7 +708,7 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo)
> }
>
> rw->state = RW_INIT;
> - rw->oids = xmalloc(1 << 20); /* FIXME */
> + rw->oids = xmalloc(SD_DATA_OBJ_SIZE * sizeof(uint64_t));
> rw->epoch = sys->epoch;
> rw->count = 0;
>
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index 2c3e195..7196f12 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -150,6 +150,11 @@ struct store_driver {
> int (*get_snap_file)(struct siocb *);
> };
>
> +struct dirty_objlist_cache {
> + struct rb_root root;
> + struct list_head entry_list;
> +};
> +
> extern struct list_head store_drivers;
> #define add_store_driver(driver) \
> static void __attribute__((constructor)) add_ ## driver(void) { \
> @@ -278,6 +283,17 @@ int prealloc(int fd, uint32_t size);
>
> int objlist_cache_insert(uint64_t oid, int has_left_node);
> void objlist_cache_remove(uint64_t oid, int has_left_node);
> +void objlist_cache_clear_dirty_flag(void);
> +
> +extern struct dirty_objlist_cache deleted_dirty_cache;
> +extern struct dirty_objlist_cache updated_dirty_cache;
> +
> +int dirty_objlist_cache_find(struct dirty_objlist_cache *dirty_cache,
> + uint64_t oid);
> +int dirty_objlist_cache_insert(struct dirty_objlist_cache *dirty_cache,
> + uint64_t oid);
> +void dirty_objlist_cache_clear(struct dirty_objlist_cache *dirty_cache);
> +void dirty_objlist_cache_show(struct dirty_objlist_cache *dirty_cache);
>
> void put_request(struct request *req);
>
> --
> 1.7.11.2
>
--
Yunkai Zhang
Work at Taobao
More information about the sheepdog
mailing list