[sheepdog] [PATCH v2] sheep: remove disk write cache

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Tue Mar 5 10:19:29 CET 2013


This removes the disk cache from the 'w' option because it performs
poorly on writethrough I/O requests due to the many syncfs calls it
issues.  Currently, the object cache doesn't break block semantics
across live migration and VM shutdown, so it will fit most users.

This breaks backward compatibility.  The syntax will be as follows:

 $ sheep -w size=<size>[,directio][,dir=<dir>]

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---

Changes from v1:
 - clean up more unnecessary functions
 - remove "object:" check

 configure.ac         |  2 +-
 sheep/farm/farm.c    |  1 -
 sheep/gateway.c      | 33 +++++++++-------------------
 sheep/object_cache.c |  2 +-
 sheep/ops.c          | 33 +++-------------------------
 sheep/plain_store.c  | 35 ++----------------------------
 sheep/request.c      |  2 +-
 sheep/sheep.c        | 61 +++++++---------------------------------------------
 sheep/sheep_priv.h   | 17 +--------------
 sheep/store.c        |  4 ++--
 tests/018            |  2 +-
 tests/019            |  2 +-
 tests/020            |  2 +-
 tests/044            |  2 +-
 tests/049            |  4 ++--
 15 files changed, 35 insertions(+), 167 deletions(-)

diff --git a/configure.ac b/configure.ac
index e610f12..5b8bb4b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -124,7 +124,7 @@ AC_FUNC_VPRINTF
 AC_CHECK_FUNCS([alarm alphasort atexit bzero dup2 endgrent endpwent fcntl \
 		getcwd getpeerucred getpeereid gettimeofday inet_ntoa memmove \
 		memset mkdir scandir select socket strcasecmp strchr strdup \
-		strerror strrchr strspn strstr syncfs])
+		strerror strrchr strspn strstr])
 
 AC_CONFIG_FILES([Makefile
 		collie/Makefile
diff --git a/sheep/farm/farm.c b/sheep/farm/farm.c
index 91d4bc4..b45b53f 100644
--- a/sheep/farm/farm.c
+++ b/sheep/farm/farm.c
@@ -290,7 +290,6 @@ static struct store_driver farm = {
 	.format = default_format,
 	.purge_obj = default_purge_obj,
 	.remove_object = default_remove_object,
-	.flush = default_flush,
 };
 
 add_store_driver(farm);
diff --git a/sheep/gateway.c b/sheep/gateway.c
index 71f87e4..0f6bfc1 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -39,7 +39,7 @@ int gateway_read_obj(struct request *req)
 	uint64_t oid = req->rq.obj.oid;
 	int nr_copies, j;
 
-	if (is_object_cache_enabled() && !req->local &&
+	if (sys->enable_object_cache && !req->local &&
 	    !bypass_object_cache(req)) {
 		ret = object_cache_handle_request(req);
 		goto out;
@@ -234,28 +234,20 @@ write_info_advance(struct write_info *wi, const struct node_id *nid,
 	wi->nr_sent++;
 }
 
-static int init_target_nodes(struct request *req, bool all_node,
-			     uint64_t oid, const struct sd_node **target_nodes)
+static int init_target_nodes(struct request *req, uint64_t oid,
+			     const struct sd_node **target_nodes)
 {
-	int i, nr_to_send;
+	int nr_to_send;
 	const struct vnode_info *vinfo = req->vinfo;
 
-	if (all_node) {
-		nr_to_send = vinfo->nr_nodes;
-		for (i = 0; i < nr_to_send; i++)
-			target_nodes[i] = &vinfo->nodes[i];
-
-		return nr_to_send;
-	}
-
 	nr_to_send = get_req_copy_number(req);
 	oid_to_nodes(vinfo->vnodes, vinfo->nr_vnodes, oid, nr_to_send,
-		vinfo->nodes, target_nodes);
+		     vinfo->nodes, target_nodes);
 
 	return nr_to_send;
 }
 
-static int gateway_forward_request(struct request *req, bool all_node)
+static int gateway_forward_request(struct request *req)
 {
 	int i, err_ret = SD_RES_SUCCESS, ret, local = -1;
 	unsigned wlen;
@@ -272,7 +264,7 @@ static int gateway_forward_request(struct request *req, bool all_node)
 	op = get_sd_op(hdr.opcode);
 
 	wlen = hdr.data_length;
-	nr_to_send = init_target_nodes(req, all_node, oid, target_nodes);
+	nr_to_send = init_target_nodes(req, oid, target_nodes);
 	write_info_init(&wi, nr_to_send);
 
 	for (i = 0; i < nr_to_send; i++) {
@@ -327,7 +319,7 @@ int gateway_write_obj(struct request *req)
 	if (!bypass_object_cache(req))
 		return object_cache_handle_request(req);
 
-	return gateway_forward_request(req, false);
+	return gateway_forward_request(req);
 }
 
 int gateway_create_and_write_obj(struct request *req)
@@ -335,15 +327,10 @@ int gateway_create_and_write_obj(struct request *req)
 	if (!bypass_object_cache(req))
 		return object_cache_handle_request(req);
 
-	return gateway_forward_request(req, false);
+	return gateway_forward_request(req);
 }
 
 int gateway_remove_obj(struct request *req)
 {
-	return gateway_forward_request(req, false);
-}
-
-int gateway_flush_nodes(struct request *req)
-{
-	return gateway_forward_request(req, true);
+	return gateway_forward_request(req);
 }
diff --git a/sheep/object_cache.c b/sheep/object_cache.c
index 214ce1c..0781c28 100644
--- a/sheep/object_cache.c
+++ b/sheep/object_cache.c
@@ -1006,7 +1006,7 @@ bool bypass_object_cache(const struct request *req)
 {
 	uint64_t oid = req->rq.obj.oid;
 
-	if (!is_object_cache_enabled() || req->local)
+	if (!sys->enable_object_cache || req->local)
 		return true;
 
 	if (req->rq.flags & SD_FLAG_CMD_DIRECT) {
diff --git a/sheep/ops.c b/sheep/ops.c
index ef74871..e9ad3f1 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -187,7 +187,7 @@ static int post_cluster_del_vdi(const struct sd_req *req, struct sd_rsp *rsp,
 	struct cache_deletion_work *dw;
 	int ret = rsp->result;
 
-	if (!is_object_cache_enabled())
+	if (!sys->enable_object_cache)
 		return ret;
 
 	dw = xzalloc(sizeof(*dw));
@@ -700,26 +700,19 @@ static int local_flush_vdi(struct request *req)
 {
 	int ret = SD_RES_INVALID_PARMS;
 
-	if (is_object_cache_enabled()) {
+	if (sys->enable_object_cache) {
 		uint32_t vid = oid_to_vid(req->rq.obj.oid);
 		ret = object_cache_flush_vdi(vid);
 		if (ret != SD_RES_SUCCESS)
 			return ret;
 	}
 
-	if (is_disk_cache_enabled()) {
-		struct sd_req hdr;
-
-		sd_init_req(&hdr, SD_OP_FLUSH_NODES);
-		return exec_local_req(&hdr, NULL);
-	}
-
 	return ret;
 }
 
 static int local_flush_and_del(struct request *req)
 {
-	if (!is_object_cache_enabled())
+	if (!sys->enable_object_cache)
 		return SD_RES_SUCCESS;
 	return object_cache_flush_and_del(req);
 }
@@ -903,14 +896,6 @@ out:
 	return ret;
 }
 
-int peer_flush(struct request *req)
-{
-	if (sys->gateway_only)
-		return SD_RES_SUCCESS;
-
-	return sd_store->flush();
-}
-
 static struct sd_op_template sd_ops[] = {
 
 	/* cluster operations */
@@ -1187,17 +1172,6 @@ static struct sd_op_template sd_ops[] = {
 		.type = SD_OP_TYPE_CLUSTER,
 		.process_main = cluster_disable_recover,
 	},
-
-	[SD_OP_FLUSH_PEER] = {
-		.name = "FLUSH_PEER",
-		.type = SD_OP_TYPE_PEER,
-		.process_work = peer_flush,
-	},
-	[SD_OP_FLUSH_NODES] = {
-		.name = "FLUSH_NODES",
-		.type = SD_OP_TYPE_GATEWAY,
-		.process_work = gateway_flush_nodes,
-	},
 };
 
 const struct sd_op_template *get_sd_op(uint8_t opcode)
@@ -1283,7 +1257,6 @@ static int map_table[] = {
 	[SD_OP_READ_OBJ] = SD_OP_READ_PEER,
 	[SD_OP_WRITE_OBJ] = SD_OP_WRITE_PEER,
 	[SD_OP_REMOVE_OBJ] = SD_OP_REMOVE_PEER,
-	[SD_OP_FLUSH_NODES] = SD_OP_FLUSH_PEER,
 };
 
 int gateway_to_peer_opcode(int opcode)
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 3ef22eb..357a0a6 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -24,9 +24,7 @@ static int get_open_flags(uint64_t oid, bool create, int fl)
 {
 	int flags = O_DSYNC | O_RDWR;
 
-	if ((fl & SD_FLAG_CMD_CACHE && is_disk_cache_enabled()) ||
-	    uatomic_is_true(&sys->use_journal) ||
-	    sys->nosync == true)
+	if (uatomic_is_true(&sys->use_journal) || sys->nosync == true)
 		flags &= ~O_DSYNC;
 
 	/*
@@ -475,7 +473,7 @@ int default_format(void)
 		sd_eprintf("%m");
 		return SD_RES_EIO;
 	}
-	if (is_object_cache_enabled())
+	if (sys->enable_object_cache)
 		object_cache_format();
 
 	return SD_RES_SUCCESS;
@@ -505,34 +503,6 @@ int default_purge_obj(void)
 	return for_each_object_in_wd(move_object_to_stale_dir, true, &tgt_epoch);
 }
 
-#ifndef HAVE_SYNCFS
-static int syncfs(int fd)
-{
-	sync();
-	return 0;
-}
-#endif
-
-int default_flush(void)
-{
-	int fd, ret = SD_RES_SUCCESS;
-
-	fd = open(obj_path, O_RDONLY);
-	if (fd < 0) {
-		sd_eprintf("error at open() %s, %m", obj_path);
-		return SD_RES_NO_OBJ;
-	}
-
-	if (syncfs(fd)) {
-		sd_eprintf("error at syncfs(), %m");
-		ret = SD_RES_EIO;
-	}
-
-	close(fd);
-
-	return ret;
-}
-
 static struct store_driver plain_store = {
 	.name = "plain",
 	.init = default_init,
@@ -546,7 +516,6 @@ static struct store_driver plain_store = {
 	.format = default_format,
 	.remove_object = default_remove_object,
 	.purge_obj = default_purge_obj,
-	.flush = default_flush,
 };
 
 add_store_driver(plain_store);
diff --git a/sheep/request.c b/sheep/request.c
index 6821167..3ebdff7 100644
--- a/sheep/request.c
+++ b/sheep/request.c
@@ -297,7 +297,7 @@ static void queue_gateway_request(struct request *req)
 	 * Even if it doesn't exist in cache, we'll rely on cache layer to pull
 	 * it.
 	 */
-	if (is_object_cache_enabled())
+	if (sys->enable_object_cache)
 		goto queue_work;
 
 	if (req->local_oid)
diff --git a/sheep/sheep.c b/sheep/sheep.c
index aa2a769..38fe350 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -59,7 +59,7 @@ static struct sd_option sheep_options[] = {
 	{'s', "disk-space", true, "specify the free disk space in megabytes"},
 	{'u', "upgrade", false, "upgrade to the latest data layout"},
 	{'v', "version", false, "show the version"},
-	{'w', "write-cache", true, "specify the cache type"},
+	{'w', "enable-cache", true, "enable object cache"},
 	{'y', "myaddr", true, "specify the address advertised to other sheep"},
 	{'z', "zone", true, "specify the zone id"},
 	{ 0, NULL, false, NULL },
@@ -240,7 +240,6 @@ static void object_cache_dir_set(char *s)
 static void _object_cache_set(char *s)
 {
 	int i;
-	static bool first = true;
 
 	struct object_cache_arg {
 		const char *name;
@@ -254,12 +253,6 @@ static void _object_cache_set(char *s)
 		{ NULL, NULL },
 	};
 
-	if (first) {
-		assert(!strcmp(s, "object"));
-		first = false;
-		return;
-	}
-
 	for (i = 0; object_cache_args[i].name; i++) {
 		const char *n = object_cache_args[i].name;
 
@@ -273,52 +266,14 @@ static void _object_cache_set(char *s)
 	exit(1);
 }
 
-static void object_cache_set(char *s)
-{
-	sys->enabled_cache_type |= CACHE_TYPE_OBJECT;
-	parse_arg(s, ":", _object_cache_set);
-}
-
-static void disk_cache_set(char *s)
-{
-	assert(!strcmp(s, "disk"));
-	sys->enabled_cache_type |= CACHE_TYPE_DISK;
-}
-
-static void do_cache_type(char *s)
-{
-	int i;
-
-	struct cache_type {
-		const char *name;
-		void (*set)(char *);
-	};
-	struct cache_type cache_types[] = {
-		{ "object", object_cache_set },
-		{ "disk", disk_cache_set },
-		{ NULL, NULL },
-	};
-
-	for (i = 0; cache_types[i].name; i++) {
-		const char *n = cache_types[i].name;
-
-		if (!strncmp(s, n, strlen(n))) {
-			cache_types[i].set(s);
-			return;
-		}
-	}
-
-	fprintf(stderr, "invalid cache type: %s\n", s);
-	exit(1);
-}
-
-static void init_cache_type(char *arg)
+static void object_cache_set(char *arg)
 {
+	sys->enable_object_cache = true;
 	sys->object_cache_size = 0;
 
-	parse_arg(arg, ",", do_cache_type);
+	parse_arg(arg, ",", _object_cache_set);
 
-	if (is_object_cache_enabled() && sys->object_cache_size == 0) {
+	if (sys->object_cache_size == 0) {
 		fprintf(stderr, "object cache size is not set\n");
 		exit(1);
 	}
@@ -385,7 +340,7 @@ static int init_work_queues(void)
 	sys->deletion_wqueue = init_ordered_work_queue("deletion");
 	sys->block_wqueue = init_ordered_work_queue("block");
 	sys->sockfd_wqueue = init_ordered_work_queue("sockfd");
-	if (is_object_cache_enabled()) {
+	if (sys->enable_object_cache) {
 		sys->oc_reclaim_wqueue = init_ordered_work_queue("oc_reclaim");
 		sys->oc_push_wqueue = init_work_queue("oc_push", WQ_DYNAMIC);
 		if (!sys->oc_reclaim_wqueue || !sys->oc_push_wqueue)
@@ -542,7 +497,7 @@ int main(int argc, char **argv)
 			sys->cdrv_option = get_cdrv_option(sys->cdrv, optarg);
 			break;
 		case 'w':
-			init_cache_type(optarg);
+			object_cache_set(optarg);
 			break;
 		case 'i':
 			parse_arg(optarg, ",", init_io_arg);
@@ -692,7 +647,7 @@ int main(int argc, char **argv)
 			exit(1);
 	}
 
-	if (is_object_cache_enabled()) {
+	if (sys->enable_object_cache) {
 		if (!strlen(ocpath))
 			/* use object cache internally */
 			memcpy(ocpath, dir, strlen(dir));
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index c0fefb4..eaa326f 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -117,9 +117,7 @@ struct cluster_info {
 	struct work_queue *oc_reclaim_wqueue;
 	struct work_queue *oc_push_wqueue;
 
-#define CACHE_TYPE_OBJECT 0x1
-#define CACHE_TYPE_DISK   0x2
-	int enabled_cache_type;
+	bool enable_object_cache;
 
 	uint32_t object_cache_size;
 	bool object_cache_directio;
@@ -169,7 +167,6 @@ struct store_driver {
 	int (*cleanup)(void);
 	int (*restore)(const struct siocb *);
 	int (*get_snap_file)(struct siocb *);
-	int (*flush)(void);
 };
 
 int default_init(const char *p);
@@ -369,14 +366,12 @@ int gateway_read_obj(struct request *req);
 int gateway_write_obj(struct request *req);
 int gateway_create_and_write_obj(struct request *req);
 int gateway_remove_obj(struct request *req);
-int gateway_flush_nodes(struct request *req);
 
 /* backend store */
 int peer_read_obj(struct request *req);
 int peer_write_obj(struct request *req);
 int peer_create_and_write_obj(struct request *req);
 int peer_remove_obj(struct request *req);
-int peer_flush(struct request *req);
 
 int default_flush(void);
 
@@ -416,16 +411,6 @@ void sheep_del_sockfd(const struct node_id *, struct sockfd *);
 int sheep_exec_req(const struct node_id *nid, struct sd_req *hdr, void *data);
 bool sheep_need_retry(uint32_t epoch);
 
-static inline bool is_object_cache_enabled(void)
-{
-	return !!(sys->enabled_cache_type & CACHE_TYPE_OBJECT);
-}
-
-static inline bool is_disk_cache_enabled(void)
-{
-	return !!(sys->enabled_cache_type & CACHE_TYPE_DISK);
-}
-
 /* journal_file.c */
 int journal_file_init(const char *path, size_t size, bool skip);
 int journal_file_write(uint64_t oid, const char *buf, size_t size, off_t, bool);
diff --git a/sheep/store.c b/sheep/store.c
index dbc687b..2a4301a 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -364,7 +364,7 @@ int write_object(uint64_t oid, char *data, unsigned int datalen,
 	struct sd_req hdr;
 	int ret;
 
-	if (is_object_cache_enabled() && object_is_cached(oid)) {
+	if (sys->enable_object_cache && object_is_cached(oid)) {
 		ret = object_cache_write(oid, data, datalen, offset,
 					 flags, create);
 		if (ret == SD_RES_NO_CACHE)
@@ -427,7 +427,7 @@ int read_object(uint64_t oid, char *data, unsigned int datalen,
 {
 	int ret;
 
-	if (is_object_cache_enabled() && object_is_cached(oid)) {
+	if (sys->enable_object_cache && object_is_cached(oid)) {
 		ret = object_cache_read(oid, data, datalen, offset);
 		if (ret != SD_RES_SUCCESS) {
 			sd_eprintf("try forward read %"PRIx64" %"PRIx32, oid,
diff --git a/tests/018 b/tests/018
index cd2cfd7..d8af69b 100755
--- a/tests/018
+++ b/tests/018
@@ -16,7 +16,7 @@ status=1        # failure is the default!
 _cleanup
 
 for i in `seq 0 2`; do
-    _start_sheep $i "-w object:size=100"
+    _start_sheep $i "-w size=100"
 done
 
 _wait_for_sheep "3"
diff --git a/tests/019 b/tests/019
index f3862fb..98fc2b8 100755
--- a/tests/019
+++ b/tests/019
@@ -16,7 +16,7 @@ status=1        # failure is the default!
 _cleanup
 
 for i in `seq 0 2`; do
-    _start_sheep $i "-w object:size=100"
+    _start_sheep $i "-w size=100"
 done
 
 _wait_for_sheep "3"
diff --git a/tests/020 b/tests/020
index e9943dd..a5fccf8 100755
--- a/tests/020
+++ b/tests/020
@@ -16,7 +16,7 @@ status=1        # failure is the default!
 _cleanup
 
 for i in `seq 0 2`; do
-    _start_sheep $i "-w object:size=20"
+    _start_sheep $i "-w size=20"
 done
 
 _wait_for_sheep "3"
diff --git a/tests/044 b/tests/044
index dcb8be3..a774bb5 100755
--- a/tests/044
+++ b/tests/044
@@ -20,7 +20,7 @@ fi
 _cleanup
 
 for i in 0 1 2; do
-    _start_sheep $i '-s 4096 -w object:size=1000'
+    _start_sheep $i '-s 4096 -w size=1000'
 done
 
 _wait_for_sheep 3
diff --git a/tests/049 b/tests/049
index 79b9bd7..184b3a6 100755
--- a/tests/049
+++ b/tests/049
@@ -16,7 +16,7 @@ status=1        # failure is the default!
 _cleanup
 
 for i in `seq 0 2`; do
-    _start_sheep $i "-w object:size=30"
+    _start_sheep $i "-w size=30"
 done
 
 _wait_for_sheep 3
@@ -32,7 +32,7 @@ sleep 2
 
 #trigger an object reclaim at startup
 for i in `seq 0 2`; do
-    _start_sheep $i "-w object:size=10"
+    _start_sheep $i "-w size=10"
 done
 
 _wait_for_sheep 3
-- 
1.8.1.3.566.gaa39828




More information about the sheepdog mailing list