[sheepdog] [PATCH V3 09/11] sheep: do recovery with dirty object list
Yunkai Zhang
yunkai.me at gmail.com
Fri Aug 10 12:31:26 CEST 2012
From: Yunkai Zhang <qiushu.zyk at taobao.com>
V3:
- a sheep that joins back should drop all outdated objects before recovery
  (in V2 we dropped outdated objects in recover_object_work(), but objects
  that do not map to the original node in the new hash ring never get a
  chance to be dropped there)
- remove some debug logs
- update commit log
------------------------------------------------------- >8
Now that we have prepared all the necessary data, let's do the recovery job
with the dirty object lists.
1) If a sheep joins back to the cluster, objects that were deleted after it
left may still remain in its working directory. After recovery starts, this
sheep will send its object list to the other sheep, so after fetching all
object lists from the cluster, each sheep should screen out these deleted
objects.
2) A sheep that has left and joined back should drop all of its outdated
objects before recovering the new ones from the other sheep.
3) Objects that have been updated should not be recovered from a sheep that
has just joined back (see the sketch after this list).
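Below is a minimal, illustrative sketch of how rules 1) and 3) combine during
recovery; the helper names and the linear scans over plain arrays are
hypothetical stand-ins for the dirty caches, not the functions added by this
patch:

/*
 * Illustrative sketch only: a simplified model of rules 1) and 3).
 * The arrays stand in for deleted_dirty_cache/updated_dirty_cache and
 * every name below is hypothetical.
 */
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool oid_in_set(const uint64_t *set, int n, uint64_t oid)
{
	for (int i = 0; i < n; i++)
		if (set[i] == oid)
			return true;
	return false;
}

/* Rule 1: screen deleted oids out of the list to be recovered. */
static int screen_deleted(uint64_t *oids, int count,
			  const uint64_t *deleted, int nr_deleted)
{
	int n = 0;

	for (int i = 0; i < count; i++)
		if (!oid_in_set(deleted, nr_deleted, oids[i]))
			oids[n++] = oids[i];
	return n;
}

/* Rule 3: do not fetch an updated object from a node that just joined back. */
static bool skip_source(bool source_joined_back, uint64_t oid,
			const uint64_t *updated, int nr_updated)
{
	return source_joined_back && oid_in_set(updated, nr_updated, oid);
}

int main(void)
{
	uint64_t oids[] = { 0x10, 0x20, 0x30 };
	const uint64_t deleted[] = { 0x20 };
	const uint64_t updated[] = { 0x30 };
	/* Rule 2 (dropping outdated local copies) only touches the local
	 * store, so it is omitted from this sketch. */
	int count = screen_deleted(oids, 3, deleted, 1);

	for (int i = 0; i < count; i++)
		printf("recover %" PRIx64 ", skip joined-back source: %d\n",
		       oids[i], skip_source(true, oids[i], updated, 1));
	return 0;
}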
In order to search the deleted and updated object lists quickly, I store them
in temporary caches (struct dirty_objlist_cache) implemented with an rbtree;
they share some common base code with objlist_cache.
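As a rough illustration of the keyed-by-oid lookup the recovery path relies
on, the stand-in below uses a plain binary search tree in place of the shared
rbtree code; all names here are hypothetical and not part of the patch:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* A plain BST keyed by oid stands in for the rbtree here. */
struct dirty_node {
	uint64_t oid;
	struct dirty_node *left, *right;
};

static struct dirty_node *dirty_insert(struct dirty_node *root, uint64_t oid)
{
	if (!root) {
		struct dirty_node *n = calloc(1, sizeof(*n));

		if (!n)
			abort(); /* keep the sketch simple on OOM */
		n->oid = oid;
		return n;
	}
	if (oid < root->oid)
		root->left = dirty_insert(root->left, oid);
	else if (oid > root->oid)
		root->right = dirty_insert(root->right, oid);
	/* a duplicate oid is silently ignored */
	return root;
}

static int dirty_find(const struct dirty_node *root, uint64_t oid)
{
	while (root) {
		if (oid == root->oid)
			return 1;
		root = oid < root->oid ? root->left : root->right;
	}
	return 0;
}

int main(void)
{
	struct dirty_node *deleted = NULL;

	deleted = dirty_insert(deleted, 0x20);
	printf("0x20 deleted? %d\n", dirty_find(deleted, 0x20));
	printf("0x30 deleted? %d\n", dirty_find(deleted, 0x30));
	return 0;
}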
Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
---
sheep/object_list_cache.c | 94 +++++++++++++++++++++++++++++++++++++++++------
sheep/recovery.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++-
sheep/sheep_priv.h | 29 +++++++++++++++
3 files changed, 203 insertions(+), 14 deletions(-)
diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
index 4c34783..15589c6 100644
--- a/sheep/object_list_cache.c
+++ b/sheep/object_list_cache.c
@@ -21,18 +21,6 @@
#include "strbuf.h"
#include "util.h"
-struct objlist_cache_entry {
- uint64_t oid;
- struct list_head list;
- struct rb_node node;
-
- /* This flag indicates whether this object
- * is dirty after disabled cluster recovery
- * 0: clean, 1: updated, 2: deleted
- */
- uint8_t dirty_flag;
-};
-
struct objlist_cache {
int tree_version;
int buf_version;
@@ -55,6 +43,16 @@ struct objlist_cache obj_list_cache = {
.lock = PTHREAD_RWLOCK_INITIALIZER,
};
+struct dirty_objlist_cache deleted_dirty_cache = {
+ .root = RB_ROOT,
+ .entry_list = LIST_HEAD_INIT(deleted_dirty_cache.entry_list),
+};
+
+struct dirty_objlist_cache updated_dirty_cache = {
+ .root = RB_ROOT,
+ .entry_list = LIST_HEAD_INIT(updated_dirty_cache.entry_list),
+};
+
static struct objlist_cache_entry *objlist_cache_rb_find(struct rb_root *root,
uint64_t oid)
{
@@ -128,6 +126,58 @@ static int objlist_cache_rb_remove(struct rb_root *root, uint64_t oid)
return -1; /* fail to remove */
}
+int dirty_objlist_cache_find(struct dirty_objlist_cache *dirty_cache,
+ uint64_t oid)
+{
+ struct objlist_cache_entry *p;
+
+ p = objlist_cache_rb_find(&dirty_cache->root, oid);
+
+ return !!p;
+}
+
+int dirty_objlist_cache_insert(struct dirty_objlist_cache *dirty_cache,
+ uint64_t oid)
+{
+ struct objlist_cache_entry *entry, *p;
+
+ entry = zalloc(sizeof(*entry));
+
+ if (!entry) {
+ eprintf("Out of memory.\n");
+ return -1;
+ }
+
+ entry->oid = oid;
+ rb_init_node(&entry->node);
+
+ p = objlist_cache_rb_insert(&dirty_cache->root, entry);
+ if (p)
+ free(entry);
+ else
+ list_add(&entry->list, &dirty_cache->entry_list);
+ return 1;
+}
+
+void dirty_objlist_cache_clear(struct dirty_objlist_cache *dirty_cache)
+{
+ struct objlist_cache_entry *entry, *t;
+ list_for_each_entry_safe(entry, t, &dirty_cache->entry_list, list) {
+ objlist_cache_rb_remove(&dirty_cache->root, entry->oid);
+ }
+
+ assert(list_empty(&dirty_cache->entry_list));
+}
+
+void dirty_objlist_cache_show(struct dirty_objlist_cache *dirty_cache,
+ const char *tag)
+{
+ struct objlist_cache_entry *entry;
+ list_for_each_entry(entry, &dirty_cache->entry_list, list) {
+ dprintf("%s oid:%lx\n", tag, entry->oid);
+ }
+}
+
void objlist_cache_remove(uint64_t oid, int has_left_node)
{
struct objlist_cache_entry *entry;
@@ -181,6 +231,26 @@ int objlist_cache_insert(uint64_t oid, int dirty_flag)
return 0;
}
+void objlist_cache_clear_dirty_flag(void)
+{
+ struct objlist_cache_entry *entry, *t;
+
+ pthread_rwlock_wrlock(&obj_list_cache.lock);
+ list_for_each_entry_safe(entry, t, &obj_list_cache.entry_list, list) {
+ if (entry->dirty_flag == 1)
+ entry->dirty_flag = 0;
+ else if (entry->dirty_flag == 2)
+ /*
+ * entries whose dirty_flag is 2 are not kept in
+ * the cache buffer, so there is no need to update
+ * tree_version when removing them.
+ */
+ objlist_cache_rb_remove(&obj_list_cache.root,
+ entry->oid);
+ }
+ pthread_rwlock_unlock(&obj_list_cache.lock);
+}
+
int get_obj_list(const struct sd_list_req *hdr, struct sd_list_rsp *rsp, void *data)
{
int nr = 0;
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 38d79d8..e08e85d 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -181,6 +181,15 @@ again:
if (is_invalid_vnode(tgt_vnode, rw->cur_vinfo->nodes,
rw->cur_vinfo->nr_nodes))
continue;
+ /*
+ * objects which have been updated should not be recovered
+ * from a sheep that has just joined back.
+ */
+ if (sys->disable_recovery
+ && vnode_has_left(old, tgt_vnode)
+ && dirty_objlist_cache_find(&updated_dirty_cache, oid))
+ continue;
+
ret = recover_object_from_replica(oid, tgt_vnode,
epoch, tgt_epoch);
if (ret == 0) {
@@ -219,6 +228,14 @@ err:
return ret;
}
+static inline bool is_leaving_node(struct sd_node *node)
+{
+ if (find_node(node, leaving_nodes, nr_leaving_nodes))
+ return true;
+
+ return false;
+}
+
static void recover_object_work(struct work *work)
{
struct recovery_work *rw = container_of(work, struct recovery_work,
@@ -345,6 +362,10 @@ static inline void finish_recovery(struct recovery_work *rw)
if (sd_store->end_recover)
sd_store->end_recover(sys->epoch - 1, rw->old_vinfo);
+ objlist_cache_clear_dirty_flag();
+ dirty_objlist_cache_clear(&deleted_dirty_cache);
+ dirty_objlist_cache_clear(&updated_dirty_cache);
+
free_recovery_work(rw);
sys->disable_recovery = 0;
@@ -542,6 +563,58 @@ static void screen_object_list(struct recovery_work *rw,
qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp);
}
+/*
+ * If a sheep joins back to the cluster, objects which have been deleted
+ * after this sheep left may still remain in its working directory. After
+ * recovery starts, this sheep will send its object list to the other sheep,
+ * so after fetching all object lists from the cluster, each sheep should
+ * screen out these deleted objects.
+ */
+static void screen_dirty_object_list(struct recovery_work *rw)
+{
+ int i, n;
+ uint64_t *oids;
+ struct dirty_objlist_cache *deleted;
+
+ oids = xmalloc(SD_DATA_OBJ_SIZE * sizeof(uint64_t));
+ deleted = &deleted_dirty_cache;
+
+ dirty_objlist_cache_show(deleted, "deleted");
+
+ for (n = i = 0; i < rw->count; i++) {
+ if (!dirty_objlist_cache_find(deleted, rw->oids[i]))
+ oids[n++] = rw->oids[i];
+ }
+
+ dprintf("old count:%d, new count:%d\n", rw->count, n);
+
+ memcpy(rw->oids, oids, n * sizeof(uint64_t));
+ rw->count = n;
+
+ qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp);
+
+ free(oids);
+}
+
+/*
+ * A sheep which has left and joined back should drop its outdated
+ * objects before recovering the new ones from the other sheep.
+ */
+static inline void clear_outdated_objects(void) {
+ struct objlist_cache_entry *entry;
+ struct dirty_objlist_cache *updated = &updated_dirty_cache;
+
+ if (is_leaving_node(&sys->this_node)) {
+ list_for_each_entry(entry, &updated->entry_list, list) {
+ if (sd_store->exist(entry->oid)) {
+ dprintf("remove outdated object:%"PRIx64"\n",
+ entry->oid);
+ sd_store->remove_object(entry->oid);
+ }
+ }
+ }
+}
+
static int newly_joined(struct sd_node *node, struct recovery_work *rw)
{
if (bsearch(node, rw->old_vinfo->nodes, rw->old_vinfo->nr_nodes,
@@ -589,9 +662,21 @@ again:
if (sys->disable_recovery) {
offset = nr_oids * sizeof(uint64_t);
dois = (struct dirty_oid_info *)&buf[offset];
+
for (j = 0; j < nr_dois; j++) {
+ struct dirty_objlist_cache *updated, *deleted;
+
+ updated = &updated_dirty_cache;
+ deleted = &deleted_dirty_cache;
+
dprintf("dirty_oid:%"PRIx64", dirty_flag:%d\n",
dois[j].oid, dois[j].dirty_flag);
+ if (dois[j].dirty_flag == 1)
+ dirty_objlist_cache_insert(updated,
+ dois[j].oid);
+ else
+ dirty_objlist_cache_insert(deleted,
+ dois[j].oid);
}
}
@@ -604,7 +689,12 @@ again:
goto again;
}
- dprintf("%d\n", rw->count);
+ if (sys->disable_recovery) {
+ clear_outdated_objects();
+ screen_dirty_object_list(rw);
+ }
+
+ dprintf("count:%d\n", rw->count);
out:
free(buf);
}
@@ -628,7 +718,7 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo)
}
rw->state = RW_INIT;
- rw->oids = xmalloc(1 << 20); /* FIXME */
+ rw->oids = xmalloc(SD_DATA_OBJ_SIZE * sizeof(uint64_t));
rw->epoch = sys->epoch;
rw->count = 0;
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 2c3e195..c5372a0 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -150,6 +150,23 @@ struct store_driver {
int (*get_snap_file)(struct siocb *);
};
+struct objlist_cache_entry {
+ uint64_t oid;
+ struct list_head list;
+ struct rb_node node;
+
+ /* This flag indicates whether this object
+ * became dirty while cluster recovery was disabled:
+ * 0: clean, 1: updated, 2: deleted
+ */
+ uint8_t dirty_flag;
+};
+
+struct dirty_objlist_cache {
+ struct rb_root root;
+ struct list_head entry_list;
+};
+
extern struct list_head store_drivers;
#define add_store_driver(driver) \
static void __attribute__((constructor)) add_ ## driver(void) { \
@@ -278,6 +295,18 @@ int prealloc(int fd, uint32_t size);
int objlist_cache_insert(uint64_t oid, int has_left_node);
void objlist_cache_remove(uint64_t oid, int has_left_node);
+void objlist_cache_clear_dirty_flag(void);
+
+extern struct dirty_objlist_cache deleted_dirty_cache;
+extern struct dirty_objlist_cache updated_dirty_cache;
+
+int dirty_objlist_cache_find(struct dirty_objlist_cache *dirty_cache,
+ uint64_t oid);
+int dirty_objlist_cache_insert(struct dirty_objlist_cache *dirty_cache,
+ uint64_t oid);
+void dirty_objlist_cache_clear(struct dirty_objlist_cache *dirty_cache);
+void dirty_objlist_cache_show(struct dirty_objlist_cache *dirty_cache,
+ const char *tag);
void put_request(struct request *req);
--
1.7.11.2