[sheepdog] [PATCH v0, RFC] sheep: writeback cache semantics in backend store

Hitoshi Mitake h.mitake at gmail.com
Thu Aug 16 13:45:57 CEST 2012


From: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>

Hi sheepdog list, nice to meet you.

This patch implements writeback cache semantics in the backend
store of sheep. The current backend store, farm, calls open() with
O_DSYNC, so every object write causes a slow disk access. This
overhead is unnecessary, because the current qemu block driver
already invokes SD_OP_FLUSH_VDI explicitly for the object cache.
Flushing the disk cache when SD_OP_FLUSH_VDI is invoked, instead of
on every object write, is enough for current sheep.

To improve performance by reducing needless disk access, this patch
adds a new inter-sheep operation, SD_OP_FLUSH_PEER. This operation is
used in a situation like this:
qemu sends SD_OP_FLUSH_VDI -> gateway sheep sends SD_OP_FLUSH_PEER ->
other sheep
Sheep that receive SD_OP_FLUSH_PEER flush their disk cache with the
syncfs() system call.


Below is the evaluation result with dbench:

 Before applying this patch, without -s (O_SYNC) option:
 Throughput 13.9269 MB/sec  1 clients  1 procs  max_latency=818.428 ms
 Before applying this patch, with -s option:
 Throughput 2.76792 MB/sec (sync open)  1 clients  1 procs
 max_latency=291.670 ms

 After applying this patch, without -s option:
 Throughput 29.7306 MB/sec  1 clients  1 procs  max_latency=1344.463 ms
 After applying this patch, with -s option:
 Throughput 4.29357 MB/sec (sync open)  1 clients  1 procs
 max_latency=450.045 ms


This patch adds a new command-line option, -W, to sheep. With -W, sheep
uses writeback cache semantics in the backend store. I added this new
option mainly for easy testing and evaluation. If writeback cache
semantics turn out to be more suitable as the default than the previous
writethrough semantics, I'll remove this option.

This patch may contain lots of bad code because I'm new to sheepdog.
I'd like to hear your comments.

Cc: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>

---
 include/internal_proto.h |    1 +
 sheep/farm/farm.c        |    3 +++
 sheep/gateway.c          |    2 +-
 sheep/ops.c              |   44 ++++++++++++++++++++++++++++++++++++++++++++
 sheep/sheep.c            |    7 ++++++-
 sheep/sheep_priv.h       |    5 +++++
 6 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/include/internal_proto.h b/include/internal_proto.h
index 83d98f1..f34ef92 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -63,6 +63,7 @@
 #define SD_OP_ENABLE_RECOVER 0xA8
 #define SD_OP_DISABLE_RECOVER 0xA9
 #define SD_OP_INFO_RECOVER 0xAA
+#define SD_OP_FLUSH_PEER 0xAB
 
 /* internal flags for hdr.flags, must be above 0x80 */
 #define SD_FLAG_CMD_RECOVERY 0x0080
diff --git a/sheep/farm/farm.c b/sheep/farm/farm.c
index 7eeae9a..991c009 100644
--- a/sheep/farm/farm.c
+++ b/sheep/farm/farm.c
@@ -362,6 +362,9 @@ static int farm_init(char *p)
 	iocb.epoch = sys->epoch ? sys->epoch - 1 : 0;
 	farm_cleanup_sys_obj(&iocb);

+	if (sys->store_writeback)
+		def_open_flags &= ~O_DSYNC;
+
 	return SD_RES_SUCCESS;
 err:
 	return SD_RES_EIO;
diff --git a/sheep/gateway.c b/sheep/gateway.c
index bdbd08c..79fdd07 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -225,7 +225,7 @@ static inline void gateway_init_fwd_hdr(struct sd_req *fwd, struct sd_req *hdr)
 	fwd->proto_ver = SD_SHEEP_PROTO_VER;
 }
 
-static int gateway_forward_request(struct request *req)
+int gateway_forward_request(struct request *req)
 {
 	int i, err_ret = SD_RES_SUCCESS, ret, local = -1;
 	unsigned wlen;
diff --git a/sheep/ops.c b/sheep/ops.c
index 8ca8748..0ec4b63 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -22,6 +22,11 @@
 #include <sys/stat.h>
 #include <pthread.h>
 
+#include <asm/unistd.h>                /* for __NR_syncfs */
+#ifndef __NR_syncfs
+#define __NR_syncfs 306
+#endif
+
 #include "sheep_priv.h"
 #include "strbuf.h"
 #include "trace/trace.h"
@@ -584,6 +589,9 @@ static int local_get_snap_file(struct request *req)
 
 static int local_flush_vdi(struct request *req)
 {
+	if (sys->store_writeback)
+		gateway_forward_request(req);
+
 	if (!sys->enable_write_cache)
 		return SD_RES_SUCCESS;
 	return object_cache_flush_vdi(req);
@@ -837,6 +845,35 @@ out:
 	return ret;
 }
 
+static int syncfs(int fd)
+{
+	return syscall(__NR_syncfs, fd);
+}
+
+int peer_flush_dcache(struct request *req)
+{
+	int fd;
+	struct sd_req *hdr = &req->rq;
+	uint64_t oid = hdr->obj.oid;
+	char path[PATH_MAX];
+
+	sprintf(path, "%s%016"PRIx64, obj_path, oid);
+	fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		eprintf("error at open() %s, %s\n", path, strerror(errno));
+		return SD_RES_NO_OBJ;
+	}
+
+	if (syncfs(fd)) {
+		eprintf("error at syncfs(), %s\n", strerror(errno));
+		return SD_RES_EIO;
+	}
+
+	close(fd);
+
+	return SD_RES_SUCCESS;
+}
+
 static struct sd_op_template sd_ops[] = {
 
 	/* cluster operations */
@@ -1089,6 +1126,12 @@ static struct sd_op_template sd_ops[] = {
 		.type = SD_OP_TYPE_LOCAL,
 		.process_main = local_info_recover,
 	},
+
+	[SD_OP_FLUSH_PEER] = {
+		.name = "FLUSH_PEER",
+		.type = SD_OP_TYPE_PEER,
+		.process_work = peer_flush_dcache,
+	},
 };
 
 struct sd_op_template *get_sd_op(uint8_t opcode)
@@ -1174,6 +1217,7 @@ static int map_table[] = {
 	[SD_OP_READ_OBJ] = SD_OP_READ_PEER,
 	[SD_OP_WRITE_OBJ] = SD_OP_WRITE_PEER,
 	[SD_OP_REMOVE_OBJ] = SD_OP_REMOVE_PEER,
+	[SD_OP_FLUSH_VDI] = SD_OP_FLUSH_PEER,
 };
 
 int gateway_to_peer_opcode(int opcode)
diff --git a/sheep/sheep.c b/sheep/sheep.c
index c743184..9c7dc7f 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -50,10 +50,11 @@ static struct option const long_options[] = {
 	{"enable-cache", required_argument, NULL, 'w'},
 	{"zone", required_argument, NULL, 'z'},
 	{"pidfile", required_argument, NULL, 'P'},
+	{"writeback", no_argument, NULL, 'W'},
 	{NULL, 0, NULL, 0},
 };
 
-static const char *short_options = "c:dDfghl:op:P:s:w:y:z:";
+static const char *short_options = "c:dDfghl:op:P:s:w:y:z:W";
 
 static void usage(int status)
 {
@@ -79,6 +80,7 @@ Options:\n\
   -w, --enable-cache      enable object cache and specify the max size (M) and mode\n\
   -y, --myaddr            specify the address advertised to other sheep\n\
   -z, --zone              specify the zone id\n\
+  -W, --writeback         use writeback semantics in backend store\n\
 ", PACKAGE_VERSION, program_name);
 	exit(status);
 }
@@ -310,6 +312,9 @@ int main(int argc, char **argv)
 
 			sys->cdrv_option = get_cdrv_option(sys->cdrv, optarg);
 			break;
+		case 'W':
+			sys->store_writeback = 1;
+			break;
 		case 'h':
 			usage(0);
 			break;
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 857cf87..a74e669 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -118,6 +118,8 @@ struct cluster_info {
 	struct work_queue *block_wqueue;
 	struct work_queue *sockfd_wqueue;
 	struct work_queue *reclaim_wqueue;
+
+	int store_writeback;
 };
 
 struct siocb {
@@ -325,6 +327,7 @@ int peer_read_obj(struct request *req);
 int peer_write_obj(struct request *req);
 int peer_create_and_write_obj(struct request *req);
 int peer_remove_obj(struct request *req);
+int peer_flush_dcache(struct request *req);
 
 /* object_cache */
 
@@ -357,4 +360,6 @@ struct sockfd *sheep_get_sockfd(struct node_id *);
 void sheep_put_sockfd(struct node_id *, struct sockfd *);
 void sheep_del_sockfd(struct node_id *, struct sockfd *);
 
+int gateway_forward_request(struct request *req);
+
 #endif
-- 
1.7.2.5





More information about the sheepdog mailing list