[sheepdog] [PATCH 1/2] object cache: introduce background pusher to boost flush performance

Liu Yuan namei.unix at gmail.com
Fri Jan 25 15:14:18 CET 2013


From: Liu Yuan <tailai.ly at taobao.com>

Try to push dirty objects in the background to boost VM flush requests such as
fsync(). This doesn't violate the flush semantics, because the flush request will
still be served synchronously and is guaranteed to drain all the dirty objects.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 sheep/object_cache.c |  215 +++++++++++++++++++++++++++++++++++---------------
 sheep/sheep.c        |    3 +-
 sheep/sheep_priv.h   |    1 +
 3 files changed, 155 insertions(+), 64 deletions(-)
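
For reference, here is a minimal, self-contained sketch of the scheme this patch
implements in sheep/object_cache.c (it is not sheepdog code; push_one(),
mark_dirty() and flush_all() are made-up names for illustration): the write path
bumps a dirty counter and, once a small threshold is crossed and no pusher is
running, kicks off a single background pusher, while flush still drains
everything synchronously. The real code instead uses oc->dirty_count,
uatomic_set_true(&oc->in_push) and queue_work() on sys->push_wqueue.

/*
 * Illustrative sketch only, not sheepdog code.  The write path bumps a dirty
 * counter; once PUSH_THRESHOLD is crossed and no pusher is running, a single
 * background thread starts pushing, while flush_all() still drains whatever
 * is left synchronously, so the flush guarantee is kept.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define PUSH_THRESHOLD 4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int dirty_count;         /* dirty objects not yet pushed */
static bool in_push;            /* a background pusher is running */

static void push_one(void)
{
	/* stand-in for push_cache_object(): write one object back */
}

static void *bg_pusher(void *arg)
{
	(void)arg;
	for (;;) {
		pthread_mutex_lock(&lock);
		if (dirty_count == 0) {
			in_push = false;
			pthread_mutex_unlock(&lock);
			return NULL;
		}
		dirty_count--;
		pthread_mutex_unlock(&lock);
		push_one();
	}
}

static void mark_dirty(void)    /* called from the write path */
{
	pthread_t tid;

	pthread_mutex_lock(&lock);
	dirty_count++;
	if (dirty_count >= PUSH_THRESHOLD && !in_push) {
		in_push = true;
		pthread_create(&tid, NULL, bg_pusher, NULL);
		pthread_detach(tid);
	}
	pthread_mutex_unlock(&lock);
}

static void flush_all(void)     /* what fsync() maps to: drain synchronously */
{
	pthread_mutex_lock(&lock);
	while (dirty_count > 0) {
		dirty_count--;
		pthread_mutex_unlock(&lock);
		push_one();
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	for (int i = 0; i < 10; i++)
		mark_dirty();
	flush_all();
	printf("dirty objects after flush: %d\n", dirty_count);
	return 0;
}

Build with e.g. "gcc -pthread sketch.c"; flush_all() only returns once the dirty
counter has drained, which is the guarantee the commit message refers to.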

diff --git a/sheep/object_cache.c b/sheep/object_cache.c
index f0ac231..9491bcd 100644
--- a/sheep/object_cache.c
+++ b/sheep/object_cache.c
@@ -60,12 +60,15 @@ struct object_cache_entry {
 	struct rb_node node;
 	struct list_head dirty_list;
 	struct list_head lru_list;
+
+	pthread_rwlock_t lock;
 };
 
 struct object_cache {
 	uint32_t vid;
+	uint32_t dirty_count;
+	uatomic_bool in_push;
 	struct hlist_node hash;
-
 	struct rb_root lru_tree;
 	struct list_head lru_head; /* Per VDI LRU list for reclaimer */
 	struct list_head dirty_head;
@@ -178,13 +181,15 @@ static struct object_cache_entry *lru_tree_search(struct rb_root *root,
 }
 
 static inline void
-free_cache_entry(struct object_cache_entry *entry,
-	      struct rb_root *lru_tree)
+free_cache_entry(struct object_cache_entry *entry)
 {
-	rb_erase(&entry->node, lru_tree);
+	struct object_cache *oc = entry->oc;
+
+	rb_erase(&entry->node, &oc->lru_tree);
 	list_del_init(&entry->lru_list);
 	if (!list_empty(&entry->dirty_list))
 		list_del_init(&entry->dirty_list);
+	pthread_rwlock_destroy(&entry->lock);
 	free(entry);
 }
 
@@ -300,53 +305,6 @@ static int read_cache_object(struct object_cache_entry *entry, void *buf,
 	return ret;
 }
 
-static int write_cache_object(struct object_cache_entry *entry, void *buf,
-			      size_t count, off_t offset, bool create,
-			      bool writeback)
-{
-	uint32_t vid = entry->oc->vid, idx = entry_idx(entry);
-	uint64_t oid = idx_to_oid(vid, idx);
-	struct object_cache *oc = entry->oc;
-	struct sd_req hdr;
-	int ret;
-
-	ret = write_cache_object_noupdate(vid, idx, buf, count, offset);
-	if (ret != SD_RES_SUCCESS)
-		return ret;
-
-	pthread_rwlock_wrlock(&oc->lock);
-	if (writeback) {
-		entry->bmap |= calc_object_bmap(count, offset);
-		if (list_empty(&entry->dirty_list))
-			list_add_tail(&entry->dirty_list,
-				      &oc->dirty_head);
-	}
-	list_move_tail(&entry->lru_list, &oc->lru_head);
-	pthread_rwlock_unlock(&oc->lock);
-
-	if (writeback)
-		goto out;
-
-	if (create)
-		sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ);
-	else
-		sd_init_req(&hdr, SD_OP_WRITE_OBJ);
-	hdr.flags =  SD_FLAG_CMD_WRITE;
-	hdr.data_length = count;
-
-	hdr.obj.oid = oid;
-	hdr.obj.offset = offset;
-
-	ret = exec_local_req(&hdr, buf);
-	if (ret != SD_RES_SUCCESS) {
-		sd_eprintf("failed to write object %" PRIx64 ", %x\n",
-			oid, ret);
-		return ret;
-	}
-out:
-	return ret;
-}
-
 static int push_cache_object(uint32_t vid, uint32_t idx, uint64_t bmap,
 			     bool create)
 {
@@ -407,6 +365,134 @@ out:
 	return ret;
 }
 
+struct push_work {
+	struct work work;
+	struct object_cache *oc;
+};
+
+/*
+ * Try to push dirty objects in the background to boost VM flush requests
+ * such as fsync().
+ *
+ * This doesn't violate the flush semantics because the flush request will
+ * still be served synchronously and guaranteed to drain all the dirty objects.
+ */
+static void do_async_push(struct work *work)
+{
+	struct push_work *pw = container_of(work, struct push_work, work);
+	struct object_cache *oc = pw->oc;
+	struct object_cache_entry *entry;
+
+	sd_dprintf("%"PRIx32"\n", oc->vid);
+next:
+	pthread_rwlock_wrlock(&oc->lock);
+	if (list_empty(&oc->dirty_head)) {
+		pthread_rwlock_unlock(&oc->lock);
+		return;
+	}
+	entry = list_first_entry(&oc->dirty_head, typeof(*entry), dirty_list);
+	uatomic_inc(&entry->refcnt);
+	uatomic_dec(&oc->dirty_count);
+	pthread_rwlock_unlock(&oc->lock);
+
+	pthread_rwlock_rdlock(&entry->lock);
+
+	assert(push_cache_object(oc->vid, entry_idx(entry), entry->bmap,
+				 !!(entry->idx & CACHE_CREATE_BIT))
+	       == SD_RES_SUCCESS);
+	entry->idx &= ~CACHE_CREATE_BIT;
+	entry->bmap = 0;
+
+	pthread_rwlock_wrlock(&oc->lock);
+	list_del_init(&entry->dirty_list);
+	pthread_rwlock_unlock(&oc->lock);
+
+	pthread_rwlock_unlock(&entry->lock);
+
+	uatomic_dec(&entry->refcnt);
+	goto next;
+}
+
+static void async_push_done(struct work *work)
+{
+	struct push_work *pw = container_of(work, struct push_work, work);
+
+	sd_dprintf("%"PRIx32"\n", pw->oc->vid);
+	uatomic_set_false(&pw->oc->in_push);
+	free(pw);
+}
+
+/* Must be called with oc->lock held */
+static void add_to_dirty_list(struct object_cache_entry *entry)
+{
+	struct object_cache *oc = entry->oc;
+	struct push_work *pw;
+	uint32_t dc;
+
+	list_add_tail(&entry->dirty_list, &oc->dirty_head);
+/* We want to push as soon as possible to reduce dirty count */
+#define PUSH_THRESHOLD 4
+	dc = uatomic_add_return(&oc->dirty_count, 1);
+	if (dc >= PUSH_THRESHOLD && uatomic_set_true(&oc->in_push) &&
+	    sys->status == SD_STATUS_OK) {
+		pw = xzalloc(sizeof(struct push_work));
+		pw->work.fn = do_async_push;
+		pw->work.done = async_push_done;
+		pw->oc = oc;
+		queue_work(sys->push_wqueue, &pw->work);
+	}
+}
+
+static int write_cache_object(struct object_cache_entry *entry, void *buf,
+			      size_t count, off_t offset, bool create,
+			      bool writeback)
+{
+	uint32_t vid = entry->oc->vid, idx = entry_idx(entry);
+	uint64_t oid = idx_to_oid(vid, idx);
+	struct object_cache *oc = entry->oc;
+	struct sd_req hdr;
+	int ret;
+
+	pthread_rwlock_wrlock(&entry->lock);
+	ret = write_cache_object_noupdate(vid, idx, buf, count, offset);
+	if (ret != SD_RES_SUCCESS) {
+		pthread_rwlock_unlock(&entry->lock);
+		return ret;
+	}
+
+	pthread_rwlock_wrlock(&oc->lock);
+	if (writeback) {
+		entry->bmap |= calc_object_bmap(count, offset);
+		if (list_empty(&entry->dirty_list))
+			add_to_dirty_list(entry);
+	}
+	list_move_tail(&entry->lru_list, &oc->lru_head);
+	pthread_rwlock_unlock(&oc->lock);
+
+	pthread_rwlock_unlock(&entry->lock);
+	if (writeback)
+		goto out;
+
+	if (create)
+		sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ);
+	else
+		sd_init_req(&hdr, SD_OP_WRITE_OBJ);
+	hdr.flags =  SD_FLAG_CMD_WRITE;
+	hdr.data_length = count;
+
+	hdr.obj.oid = oid;
+	hdr.obj.offset = offset;
+
+	ret = exec_local_req(&hdr, buf);
+	if (ret != SD_RES_SUCCESS) {
+		sd_eprintf("failed to write object %" PRIx64 ", %x\n",
+			oid, ret);
+		return ret;
+	}
+out:
+	return ret;
+}
+
 /*
  * The reclaim algorithm is similar to Linux kernel's page cache:
  *  - only tries to reclaim 'clean' object, which doesn't have any dirty updates,
@@ -441,7 +527,7 @@ static void do_reclaim_object(struct object_cache *oc)
 		if (remove_cache_object(oc, entry_idx(entry))
 		    != SD_RES_SUCCESS)
 			continue;
-		free_cache_entry(entry, &oc->lru_tree);
+		free_cache_entry(entry);
 		cap = uatomic_sub_return(&gcache.capacity, CACHE_OBJECT_SIZE);
 		sd_dprintf("%"PRIx64" reclaimed. capacity:%"PRId32"\n",
 			oid, cap);
@@ -590,6 +676,7 @@ alloc_cache_entry(struct object_cache *oc, uint32_t idx)
 	entry->idx = idx;
 	INIT_LIST_HEAD(&entry->dirty_list);
 	INIT_LIST_HEAD(&entry->lru_list);
+	pthread_rwlock_init(&entry->lock, NULL);
 
 	return entry;
 }
@@ -607,7 +694,7 @@ static void add_to_lru_cache(struct object_cache *oc, uint32_t idx, bool create)
 	if (create) {
 		entry->bmap = UINT64_MAX;
 		entry->idx |= CACHE_CREATE_BIT;
-		list_add(&entry->dirty_list, &oc->dirty_head);
+		add_to_dirty_list(entry);
 	}
 	pthread_rwlock_unlock(&oc->lock);
 }
@@ -767,23 +854,25 @@ err:
 }
 
 /* Push back all the dirty objects to sheep cluster storage */
-static int object_cache_push(struct object_cache *oc)
+static int object_cache_sync_push(struct object_cache *oc)
 {
 	struct object_cache_entry *entry, *t;
-
 	int ret = SD_RES_SUCCESS;
 
 	pthread_rwlock_wrlock(&oc->lock);
 	list_for_each_entry_safe(entry, t, &oc->dirty_head, dirty_list) {
-		ret = push_cache_object(oc->vid, entry_idx(entry), entry->bmap,
-					!!(entry->idx & CACHE_CREATE_BIT));
-		if (ret != SD_RES_SUCCESS)
-			goto push_failed;
+		if (uatomic_read(&entry->refcnt) > 0)
+			/* async pusher has been handling it */
+			continue;
+
+		assert(push_cache_object(oc->vid, entry_idx(entry), entry->bmap,
+					!!(entry->idx & CACHE_CREATE_BIT))
+		       == SD_RES_SUCCESS);
 		entry->idx &= ~CACHE_CREATE_BIT;
 		entry->bmap = 0;
+		uatomic_dec(&oc->dirty_count);
 		list_del_init(&entry->dirty_list);
 	}
-push_failed:
 	pthread_rwlock_unlock(&oc->lock);
 	return ret;
 }
@@ -819,7 +908,7 @@ void object_cache_delete(uint32_t vid)
 
 	pthread_rwlock_wrlock(&cache->lock);
 	list_for_each_entry_safe(entry, t, &cache->lru_head, lru_list) {
-		free_cache_entry(entry, &cache->lru_tree);
+		free_cache_entry(entry);
 		uatomic_sub(&gcache.capacity, CACHE_OBJECT_SIZE);
 	}
 	pthread_rwlock_unlock(&cache->lock);
@@ -1050,7 +1139,7 @@ int object_cache_flush_vdi(const struct request *req)
 		return SD_RES_SUCCESS;
 	}
 
-	return object_cache_push(cache);
+	return object_cache_sync_push(cache);
 }
 
 int object_cache_flush_and_del(const struct request *req)
@@ -1081,7 +1170,7 @@ void object_cache_remove(uint64_t oid)
 	entry = lru_tree_search(&oc->lru_tree, idx);
 	if (!entry)
 		goto out;
-	free_cache_entry(entry, &oc->lru_tree);
+	free_cache_entry(entry);
 out:
 	pthread_rwlock_unlock(&oc->lock);
 	return;
diff --git a/sheep/sheep.c b/sheep/sheep.c
index 61e166c..06ed685 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -375,7 +375,8 @@ static int init_work_queues(void)
 	sys->sockfd_wqueue = init_work_queue("sockfd", true);
 	if (is_object_cache_enabled()) {
 		sys->reclaim_wqueue = init_work_queue("reclaim", true);
-		if (!sys->reclaim_wqueue)
+		sys->push_wqueue = init_work_queue("push", false);
+		if (!sys->reclaim_wqueue || !sys->push_wqueue)
 			return -1;
 	}
 	if (!sys->gateway_wqueue || !sys->io_wqueue || !sys->recovery_wqueue ||
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index b19ca03..cc0ea39 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -114,6 +114,7 @@ struct cluster_info {
 	struct work_queue *block_wqueue;
 	struct work_queue *sockfd_wqueue;
 	struct work_queue *reclaim_wqueue;
+	struct work_queue *push_wqueue;
 
 #define CACHE_TYPE_OBJECT 0x1
 #define CACHE_TYPE_DISK   0x2
-- 
1.7.9.5