[Sheepdog] [PATCH 2/2] object cache: introduce async flush

Liu Yuan namei.unix at gmail.com
Mon Apr 2 10:21:11 CEST 2012


From: Liu Yuan <tailai.ly at taobao.com>

We flush dirty objects asynchronously by default to achieve the best
performance. If users prefer strong consistency over performance, they
can launch sheep with the -S or --sync option.

We need async flush because:
	1) some applications are response-time sensitive; writing back dirty
	bits in the guest hurts response time because the guest needs to await
	its completion, which is a considerably long operation in the sheep cluster.
	2) some applications are memory and CPU intensive and have little concern
	for disk data (e.g. they only use the disk to store application logs).
	3) People simply prefer performance over consistency.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 sheep/object_cache.c |   23 +++++++++++++++++++++++
 sheep/ops.c          |   34 ++++++++++++++++++++++++++++++++--
 sheep/sheep.c        |    9 ++++++++-
 sheep/sheep_priv.h   |    2 ++
 4 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/sheep/object_cache.c b/sheep/object_cache.c
index 389dc6d..ed10b86 100644
--- a/sheep/object_cache.c
+++ b/sheep/object_cache.c
@@ -203,9 +203,20 @@ static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t coun
 		flags |= O_DIRECT;
 
 	fd = open(p.buf, flags, def_fmode);
+	if (flock(fd, LOCK_EX) < 0) {
+		ret = SD_RES_EIO;
+		eprintf("%m\n");
+		goto out;
+	}
 	size = xpwrite(fd, buf, count, offset);
+	if (flock(fd, LOCK_UN) < 0) {
+		ret = SD_RES_EIO;
+		eprintf("%m\n");
+		goto out;
+	}
 	if (size != count)
 		ret = SD_RES_EIO;
+out:
 	close(fd);
 	strbuf_release(&p);
 	return ret;
@@ -225,9 +236,21 @@ static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count
 		flags |= O_DIRECT;
 
 	fd = open(p.buf, flags, def_fmode);
+	if (flock(fd, LOCK_SH) < 0) {
+		ret = SD_RES_EIO;
+		eprintf("%m\n");
+		goto out;
+	}
+
 	size = xpread(fd, buf, count, offset);
+	if (flock(fd, LOCK_UN) < 0) {
+		ret = SD_RES_EIO;
+		eprintf("%m\n");
+		goto out;
+	}
 	if (size != count)
 		ret = SD_RES_EIO;
+out:
 	close(fd);
 	strbuf_release(&p);
 	return ret;
diff --git a/sheep/ops.c b/sheep/ops.c
index bedafbe..7fd5a18 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -51,6 +51,11 @@ struct sd_op_template {
 	int (*process_main)(const struct sd_req *req, struct sd_rsp *rsp, void *data);
 };
 
+struct flush_work {
+	struct object_cache *cache;
+	struct work work;
+};
+
 static int cluster_new_vdi(const struct sd_req *req, struct sd_rsp *rsp,
 			   void *data)
 {
@@ -476,6 +481,22 @@ static int local_get_snap_file(const struct sd_req *req, struct sd_rsp *rsp,
 	return ret;
 }
 
+static void flush_vdi_fn(struct work *work)
+{
+	struct flush_work *fw = container_of(work, struct flush_work, work);
+
+	dprintf("flush vdi %"PRIx32"\n", fw->cache->vid);
+	if (object_cache_push(fw->cache) != SD_RES_SUCCESS)
+		eprintf("failed to flush vdi %"PRIx32"\n", fw->cache->vid);
+}
+
+static void flush_vdi_done(struct work *work)
+{
+	struct flush_work *fw = container_of(work, struct flush_work, work);
+	dprintf("flush vdi %"PRIx32" done\n", fw->cache->vid);
+	free(fw);
+}
+
 static int local_flush_vdi(const struct sd_req *req, struct sd_rsp *rsp, void *data)
 {
 	struct sd_obj_req *hdr = (struct sd_obj_req *)req;
@@ -483,8 +504,17 @@ static int local_flush_vdi(const struct sd_req *req, struct sd_rsp *rsp, void *d
 	uint32_t vid = oid_to_vid(oid);
 	struct object_cache *cache = find_object_cache(vid, 0);
 
-	if (cache)
-		return object_cache_push(cache);
+	if (cache) {
+		if (sys->sync_flush)
+			return object_cache_push(cache);
+		else {
+			struct flush_work *fw = xmalloc(sizeof(*fw));
+			fw->work.fn = flush_vdi_fn;
+			fw->work.done = flush_vdi_done;
+			fw->cache = cache;
+			queue_work(sys->flush_wqueue, &fw->work);
+		}
+	}
 
 	return SD_RES_SUCCESS;
 }
diff --git a/sheep/sheep.c b/sheep/sheep.c
index 6d64a40..ef28f87 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -59,6 +59,7 @@ Options:\n\
   -l, --loglevel          specify the level of logging detail\n\
   -d, --debug             include debug messages in the log\n\
   -D, --directio          use direct IO when accessing the object from cache or backend store\n\
+  -S, --sync              flush the object cache synchronously\n\
   -z, --zone              specify the zone id\n\
   -v, --vnodes            specify the number of virtual nodes\n\
   -c, --cluster           specify the cluster driver\n\
@@ -132,6 +133,10 @@ int main(int argc, char **argv)
 			dprintf("direct IO mode\n");
 			sys->use_directio = 1;
 			break;
+		case 'S':
+			dprintf("sync flush\n");
+			sys->sync_flush = 1;
+			break;
 		case 'z':
 			zone = strtol(optarg, &p, 10);
 			if (optarg == p || zone < 0 || UINT32_MAX < zone) {
@@ -215,8 +220,10 @@ int main(int argc, char **argv)
 	sys->io_wqueue = init_work_queue(NR_IO_WORKER_THREAD);
 	sys->recovery_wqueue = init_work_queue(1);
 	sys->deletion_wqueue = init_work_queue(1);
+	sys->flush_wqueue = init_work_queue(1);
 	if (!sys->cpg_wqueue || !sys->gateway_wqueue || !sys->io_wqueue ||
-	    !sys->recovery_wqueue || !sys->deletion_wqueue)
+	    !sys->recovery_wqueue || !sys->deletion_wqueue ||
+	    !sys->flush_wqueue)
 		exit(1);
 
 	vprintf(SDOG_NOTICE, "sheepdog daemon (version %s) started\n", PACKAGE_VERSION);
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index a316e2c..b932e83 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -148,12 +148,14 @@ struct cluster_info {
 	uint32_t recovered_epoch;
 
 	int use_directio;
+	uint8_t sync_flush;
 
 	struct work_queue *cpg_wqueue;
 	struct work_queue *gateway_wqueue;
 	struct work_queue *io_wqueue;
 	struct work_queue *deletion_wqueue;
 	struct work_queue *recovery_wqueue;
+	struct work_queue *flush_wqueue;
 };
 
 struct siocb {
-- 
1.7.8.2




More information about the sheepdog mailing list