[sheepdog] [PATCH RFC 09/11] sheep: do recovery with dirty object list

Wed Aug 8 23:14:21 CEST 2012

From: Yunkai Zhang <qiushu.zyk at taobao.com>

Now that all the necessary data has been prepared, do the recovery job with
the dirty object list.

1) If a sheep rejoins the cluster, objects that were deleted after it left
   still stay in its working directory. Once recovery starts, this sheep
   sends its object list to the other sheep, so after fetching all object
   lists from the cluster, each sheep should screen out these deleted
   objects.
2) A sheep which has left and joined back should drop its old versions of
   objects and recover the new ones from other sheep.
3) Objects which have been updated should not be recovered from a sheep
   that has joined back.

In order to search the deleted and updated object lists quickly, I store them
in two temporary caches of type dirty_objlist_cache, implemented with an
rbtree; they share some common base code with objlist_cache.

Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
---
 sheep/object_list_cache.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++
 sheep/recovery.c          | 87 +++++++++++++++++++++++++++++++++++++++++++++--
 sheep/sheep_priv.h        | 16 +++++++++
 3 files changed, 187 insertions(+), 2 deletions(-)

diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
index 4c34783..c97fd27 100644
--- a/sheep/object_list_cache.c
+++ b/sheep/object_list_cache.c
@@ -55,6 +55,16 @@ struct objlist_cache obj_list_cache = {
 	.lock		= PTHREAD_RWLOCK_INITIALIZER,
 };
 
+struct dirty_objlist_cache deleted_dirty_cache = { /* oids to screen out */
+	.root	    = RB_ROOT,
+	.entry_list = LIST_HEAD_INIT(deleted_dirty_cache.entry_list),
+};
+
+struct dirty_objlist_cache updated_dirty_cache = { /* oids with dirty_flag 1 */
+	.root	    = RB_ROOT,
+	.entry_list = LIST_HEAD_INIT(updated_dirty_cache.entry_list),
+};
+
 static struct objlist_cache_entry *objlist_cache_rb_find(struct rb_root *root,
 							 uint64_t oid)
 {
@@ -128,6 +138,58 @@ static int objlist_cache_rb_remove(struct rb_root *root, uint64_t oid)
 	return -1; /* fail to remove */
 }
 
+/* Tell whether @oid is in @dirty_cache: returns 1 if found, else 0. */
+int dirty_objlist_cache_find(struct dirty_objlist_cache *dirty_cache,
+			     uint64_t oid)
+{
+	struct objlist_cache_entry *hit;
+
+	hit = objlist_cache_rb_find(&dirty_cache->root, oid);
+	return hit != NULL;
+}
+
+/*
+ * Insert @oid into @dirty_cache.  Returns 1, or -1 if allocation fails.
+ */
+int dirty_objlist_cache_insert(struct dirty_objlist_cache *dirty_cache,
+			       uint64_t oid)
+{
+	struct objlist_cache_entry *fresh, *found;
+
+	fresh = zalloc(sizeof(*fresh));
+	if (fresh == NULL) {
+		eprintf("Out of memory.\n");
+		return -1;
+	}
+	rb_init_node(&fresh->node);
+	fresh->oid = oid;
+	found = objlist_cache_rb_insert(&dirty_cache->root, fresh);
+	if (found == NULL)
+		list_add(&fresh->list, &dirty_cache->entry_list);
+	else
+		free(fresh); /* not linked in; presumably @oid was cached */
+	return 1;
+}
+
+/* Remove every entry of @dirty_cache; the list must end up empty. */
+void dirty_objlist_cache_clear(struct dirty_objlist_cache *dirty_cache)
+{
+	struct objlist_cache_entry *entry, *t;
+	list_for_each_entry_safe(entry, t, &dirty_cache->entry_list, list) {
+		dprintf("delete oid:%"PRIx64"\n", entry->oid);
+		objlist_cache_rb_remove(&dirty_cache->root, entry->oid);
+	}
+	assert(list_empty(&dirty_cache->entry_list));
+}
+
+/* Debug helper: print the oid of every entry linked in @dirty_cache. */
+void dirty_objlist_cache_show(struct dirty_objlist_cache *dirty_cache)
+{
+	struct objlist_cache_entry *entry, *t;
+	list_for_each_entry_safe(entry, t, &dirty_cache->entry_list, list)
+		dprintf("oid:%"PRIx64"\n", entry->oid);
+}
+
 void objlist_cache_remove(uint64_t oid, int has_left_node)
 {
 	struct objlist_cache_entry *entry;
@@ -181,6 +243,30 @@ int objlist_cache_insert(uint64_t oid, int dirty_flag)
 	return 0;
 }
 
+/*
+ * Settle the dirty flags in obj_list_cache under its write lock: flag 1
+ * (updated) is reset to 0, entries flagged 2 (deleted) are removed.
+ */
+void objlist_cache_clear_dirty_flag(void)
+{
+	struct objlist_cache_entry *entry, *t;
+
+	pthread_rwlock_wrlock(&obj_list_cache.lock);
+	list_for_each_entry_safe(entry, t, &obj_list_cache.entry_list, list) {
+		/* Log first: the flag is reset, or the entry removed, below. */
+		if (entry->dirty_flag == 1) {
+			dprintf("oid:%"PRIx64" dirty_flag: 1, updated\n", entry->oid);
+			entry->dirty_flag = 0;
+		} else if (entry->dirty_flag == 2) {
+			dprintf("oid:%"PRIx64" dirty_flag: 2, deleted\n", entry->oid);
+			/* not kept in cache buffer: no tree_version update */
+			objlist_cache_rb_remove(&obj_list_cache.root,
+						entry->oid);
+		}
+	}
+	pthread_rwlock_unlock(&obj_list_cache.lock);
+}
+
 int get_obj_list(const struct sd_list_req *hdr, struct sd_list_rsp *rsp, void *data)
 {
 	int nr = 0;
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 9d3d431..ba6e002 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -181,6 +181,15 @@ again:
 		if (is_invalid_vnode(tgt_vnode, rw->cur_vinfo->nodes,
 				     rw->cur_vinfo->nr_nodes))
 			continue;
+		/*
+		 * objects which have been updated should not be recovered
+		 * from a joined back sheep.
+		 */
+		if (sys->disable_recovery
+		    && vnode_has_left(old, tgt_vnode)
+		    && dirty_objlist_cache_find(&updated_dirty_cache, oid))
+			continue;
+
 		ret = recover_object_from_replica(oid, tgt_vnode,
 						  epoch, tgt_epoch);
 		if (ret == 0) {
@@ -219,6 +228,14 @@ err:
 	return ret;
 }
 
+/*
+ * True iff find_node() gives a non-zero result for @node in leaving_nodes.
+ */
+static inline bool is_leaving_node(struct sd_node *node)
+{
+	return find_node(node, leaving_nodes, nr_leaving_nodes) ? true : false;
+}
+
 static void recover_object_work(struct work *work)
 {
 	struct recovery_work *rw = container_of(work, struct recovery_work,
@@ -229,11 +246,22 @@ static void recover_object_work(struct work *work)
 	eprintf("done:%"PRIu32" count:%"PRIu32", oid:%"PRIx64"\n",
 		rw->done, rw->count, oid);
 
+	/*
+	 * a sheep which has left and joined back should drop its old
+	 * version objects and recover the new ones from other sheep.
+	 */
+	if (sys->disable_recovery
+	    && is_leaving_node(&sys->this_node)
+	    && dirty_objlist_cache_find(&updated_dirty_cache, oid)) {
+		goto do_recover;
+	}
+
 	if (sd_store->exist(oid)) {
 		dprintf("the object is already recovered\n");
 		return;
 	}
 
+do_recover:
 	ret = do_recover_object(rw);
 	if (ret < 0)
 		eprintf("failed to recover object %"PRIx64"\n", oid);
@@ -345,6 +373,10 @@ static inline void finish_recovery(struct recovery_work *rw)
 	if (sd_store->end_recover)
 		sd_store->end_recover(sys->epoch - 1, rw->old_vinfo);
 
+	objlist_cache_clear_dirty_flag();
+	dirty_objlist_cache_clear(&deleted_dirty_cache);
+	dirty_objlist_cache_clear(&updated_dirty_cache);
+
 	free_recovery_work(rw);
 
 	sys->disable_recovery = 0;
@@ -539,6 +571,42 @@ static void screen_object_list(struct recovery_work *rw,
 	qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp);
 }
 
+/*
+ * A sheep that rejoins the cluster may still hold objects which were
+ * deleted after it left, and once recovery starts it sends them to the
+ * other sheep as part of its object list.  So after all object lists have
+ * been fetched, every oid found in deleted_dirty_cache is screened out of
+ * rw->oids and rw->count is lowered accordingly.
+ *
+ * Survivors are compacted in place: the write index never passes the read
+ * index, so no scratch buffer is required.
+ */
+static void screen_dirty_object_list(struct recovery_work *rw)
+{
+	struct dirty_objlist_cache *deleted = &deleted_dirty_cache;
+	int i, n = 0;
+
+	/* debug aid: dump the oids that are about to be filtered out */
+	dirty_objlist_cache_show(deleted);
+
+	for (i = 0; i < rw->count; i++) {
+		uint64_t oid = rw->oids[i];
+
+		if (dirty_objlist_cache_find(deleted, oid))
+			continue;
+
+		dprintf("oid:%"PRIx64"\n", oid);
+		rw->oids[n++] = oid;
+	}
+
+	dprintf("old count:%d, new count:%d\n", rw->count, n);
+	rw->count = n;
+
+	/* leave the list ordered by obj_cmp, as screen_object_list() does */
+	qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp);
+}
+
+
 static int newly_joined(struct sd_node *node, struct recovery_work *rw)
 {
 	if (bsearch(node, rw->old_vinfo->nodes, rw->old_vinfo->nr_nodes,
@@ -586,9 +654,21 @@ again:
 		if (sys->disable_recovery) {
 			offset = nr_oids * sizeof(uint64_t);
 			dois = (struct dirty_oid_info *)&buf[offset];
+
 			for (j = 0; j < nr_dois; j++) {
+				struct dirty_objlist_cache *updated, *deleted;
+
+				updated = &updated_dirty_cache;
+				deleted = &deleted_dirty_cache;
+
 				dprintf("dirty_oid:%"PRIx64", dirty_flag:%d\n",
 					dois[j].oid, dois[j].dirty_flag);
+				if (dois[i].dirty_flag == 1)
+					dirty_objlist_cache_insert(updated,
+								   dois[j].oid);
+				else
+					dirty_objlist_cache_insert(deleted,
+								   dois[j].oid);
 			}
 		}
 
@@ -601,7 +681,10 @@ again:
 		goto again;
 	}
 
-	dprintf("%d\n", rw->count);
+	if (sys->disable_recovery)
+		screen_dirty_object_list(rw);
+
+	dprintf("count:%d\n", rw->count);
 out:
 	free(buf);
 }
@@ -625,7 +708,7 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo)
 	}
 
 	rw->state = RW_INIT;
-	rw->oids = xmalloc(1 << 20); /* FIXME */
+	rw->oids = xmalloc(SD_DATA_OBJ_SIZE * sizeof(uint64_t));
 	rw->epoch = sys->epoch;
 	rw->count = 0;
 
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 2c3e195..7196f12 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -150,6 +150,11 @@ struct store_driver {
 	int (*get_snap_file)(struct siocb *);
 };
 
+struct dirty_objlist_cache {		/* a set of oids */
+	struct rb_root root;		/* tree searched by oid */
+	struct list_head entry_list;	/* links every entry in the tree */
+};
+
 extern struct list_head store_drivers;
 #define add_store_driver(driver)                                 \
 static void __attribute__((constructor)) add_ ## driver(void) {  \
@@ -278,6 +283,17 @@ int prealloc(int fd, uint32_t size);
 
 int objlist_cache_insert(uint64_t oid, int has_left_node);
 void objlist_cache_remove(uint64_t oid, int has_left_node);
+void objlist_cache_clear_dirty_flag(void);
+
+extern struct dirty_objlist_cache deleted_dirty_cache;
+extern struct dirty_objlist_cache updated_dirty_cache;
+
+int dirty_objlist_cache_find(struct dirty_objlist_cache *dirty_cache,
+			     uint64_t oid);
+int dirty_objlist_cache_insert(struct dirty_objlist_cache *dirty_cache,
+			       uint64_t oid);
+void dirty_objlist_cache_clear(struct dirty_objlist_cache *dirty_cache);
+void dirty_objlist_cache_show(struct dirty_objlist_cache *dirty_cache);
 
 void put_request(struct request *req);
 
-- 
1.7.11.2