[sheepdog] [PATCH] sheep: add simple_store driver again

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Thu Aug 23 16:58:37 CEST 2012


Currently, the farm driver has a fatal problem in object recovery: it
blocks I/O requests for a long time while moving stale objects to the
backend store.  Here is a test script that demonstrates the problem.

==
sheep -d /store/0 -z 0 -p 7000
sleep 2
collie cluster format -c 1 -b $1
collie vdi create test 100G -P
sheep -d /store/1 -z 1 -p 7001
sleep 2
time collie vdi list
==

The result is as follows:
==
$ ./farm.sh farm
using backend farm store
  Name        Id    Size    Used  Shared    Creation time   VDI id  Tag
  test         1  100 GB  100 GB  0.0 MB 2012-08-23 12:16   7c2b25  

real    18m36.962s
user    0m0.108s
sys     0m0.000s
==

To fix this problem, we need to move stale objects in the worker
thread.  I've spent several days trying to fix it, but it seems to
need a lot of work, and it looks difficult to stabilize the change in
a short time.

That brings me to another approach: adding a lightweight storage driver
'simple_store' as a workaround.  (Perhaps we should use a different
name to distinguish it from the previous simple_store.)

------------------------------------------ >8
This introduces a storage driver 'simple_store' based on the current
storage interface.  The design of the new simple_store is similar to
that of the farm driver.  The main difference is that farm uses the
sha1 based backend store for stale objects, but simple_store uses a
flat directory for them.  With this design, simple_store can move
objects from the working directory to the backend store efficiently
with rename(2).

Here are the pros and cons of simple_store.

Pros:
 - faster recovery
 - smaller and simpler
 - would be a good example to introduce other storage drivers

Cons:
 - cluster snapshot is not supported
 - stale objects are not deduplicated
 - there is no sha1 verification

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---

With simple_store, the above test result becomes as follows:

$ ./farm.sh simple
using backend simple store
  Name        Id    Size    Used  Shared    Creation time   VDI id  Tag
  test         1  100 GB  100 GB  0.0 MB 2012-08-23 11:29   7c2b25

real    0m1.016s
user    0m0.092s
sys     0m0.000s

For users who don't need the rich features of farm, simple_store would
work very well.


 sheep/Makefile.am    |    3 +-
 sheep/simple_store.c |  381 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 383 insertions(+), 1 deletions(-)
 create mode 100644 sheep/simple_store.c

diff --git a/sheep/Makefile.am b/sheep/Makefile.am
index fe15d94..0ddc2b2 100644
--- a/sheep/Makefile.am
+++ b/sheep/Makefile.am
@@ -26,7 +26,8 @@ sbin_PROGRAMS		= sheep
 
 sheep_SOURCES		= sheep.c group.c request.c gateway.c store.c vdi.c work.c \
 			  journal.c ops.c recovery.c cluster/local.c \
-			  object_cache.c object_list_cache.c sockfd_cache.c
+			  object_cache.c object_list_cache.c sockfd_cache.c \
+			  simple_store.c
 
 if BUILD_COROSYNC
 sheep_SOURCES		+= cluster/corosync.c
diff --git a/sheep/simple_store.c b/sheep/simple_store.c
new file mode 100644
index 0000000..eddf305
--- /dev/null
+++ b/sheep/simple_store.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (C) 2012 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <dirent.h>
+
+#include "sheep_priv.h"
+
+static char stale_dir[PATH_MAX]; /* "<p>/.stale", filled in by simple_store_init(p) */
+
+static int def_open_flags = O_DIRECT | O_DSYNC | O_RDWR; /* callers clear O_DIRECT for non-data objects */
+
+/*
+ * Build the working-directory path of an object: obj_path immediately
+ * followed by the oid printed as 16 zero-padded hex digits.
+ *
+ * 'path' must point to a buffer of at least PATH_MAX bytes (every caller in
+ * this file passes a char[PATH_MAX]).  The output is bounded, so an overlong
+ * obj_path is truncated instead of overflowing the buffer.
+ *
+ * Returns the value of snprintf(), i.e. the length the full path would have.
+ */
+static int get_obj_path(uint64_t oid, char *path)
+{
+	return snprintf(path, PATH_MAX, "%s%016" PRIx64, obj_path, oid);
+}
+
+/*
+ * Build the path of the temporary file used while an object is being
+ * written atomically: the object's working path with a ".tmp" suffix.
+ *
+ * 'path' must point to a buffer of at least PATH_MAX bytes (every caller in
+ * this file passes a char[PATH_MAX]).  The output is bounded, so an overlong
+ * obj_path is truncated instead of overflowing the buffer.
+ *
+ * Returns the value of snprintf(), i.e. the length the full path would have.
+ */
+static int get_tmp_obj_path(uint64_t oid, char *path)
+{
+	return snprintf(path, PATH_MAX, "%s%016"PRIx64".tmp", obj_path, oid);
+}
+
+/*
+ * Build the path of an object inside the stale directory:
+ * "<stale_dir>/<oid as 16 zero-padded hex digits>".
+ *
+ * 'path' must point to a buffer of at least PATH_MAX bytes (every caller in
+ * this file passes a char[PATH_MAX]).  The output is bounded, so the result
+ * is truncated instead of overflowing the buffer.
+ *
+ * Returns the value of snprintf(), i.e. the length the full path would have.
+ */
+static int get_stale_obj_path(uint64_t oid, char *path)
+{
+	return snprintf(path, PATH_MAX, "%s/%016"PRIx64, stale_dir, oid);
+}
+
+/*
+ * Call func() once for every object file found in obj_path.
+ *
+ * Directory entries are filtered before the callback runs:
+ *  - names starting with '.' are skipped;
+ *  - names that are not entirely a hexadecimal number are skipped, so a
+ *    leftover "<oid>.tmp" file from simple_store_atomic_put() is not
+ *    mistaken for the object whose oid it starts with;
+ *  - the values 0 and ULLONG_MAX are skipped.
+ *
+ * Returns SD_RES_EIO if obj_path cannot be opened.  Otherwise iteration
+ * stops at the first callback that does not return SD_RES_SUCCESS and that
+ * value is returned; SD_RES_SUCCESS means every callback succeeded.
+ */
+static int for_each_objects(int (*func)(uint64_t oid))
+{
+	DIR *dir;
+	struct dirent *d;
+	uint64_t oid;
+	char *end;
+	int ret = SD_RES_SUCCESS;
+
+	dir = opendir(obj_path);
+	if (!dir)
+		return SD_RES_EIO;
+
+	while ((d = readdir(dir))) {
+		if (d->d_name[0] == '.')
+			continue;
+
+		oid = strtoull(d->d_name, &end, 16);
+		/* reject empty conversions and names with a trailing suffix */
+		if (end == d->d_name || *end != '\0')
+			continue;
+		if (oid == 0 || oid == ULLONG_MAX)
+			continue;
+
+		ret = func(oid);
+		if (ret != SD_RES_SUCCESS)
+			break;
+	}
+	closedir(dir);
+	return ret;
+}
+
+/*
+ * Report whether the object is present in the working directory and is both
+ * readable and writable.  Returns 1 if so, 0 otherwise; any access() failure
+ * other than ENOENT is logged.
+ */
+static int simple_store_exist(uint64_t oid)
+{
+	char obj[PATH_MAX];
+
+	get_obj_path(oid, obj);
+	if (access(obj, R_OK | W_OK) == 0)
+		return 1;
+
+	/* a missing object is the normal negative answer; log anything else */
+	if (errno != ENOENT)
+		eprintf("%m\n");
+	return 0;
+}
+
+/*
+ * Translate the errno of a failed object open into an SD_RES_* code.
+ *
+ * ENOENT becomes SD_RES_NO_OBJ as long as obj_path itself can still be
+ * stat()ed; if it cannot, the store is reported as corrupted and SD_RES_EIO
+ * is returned.  Every other error is logged and mapped to SD_RES_EIO.
+ */
+static int err_to_sderr(uint64_t oid, int err)
+{
+	struct stat st;
+
+	if (err == ENOENT) {
+		if (stat(obj_path, &st) < 0) {
+			eprintf("corrupted\n");
+			return SD_RES_EIO;
+		}
+
+		dprintf("object %016" PRIx64 " not found locally\n", oid);
+		return SD_RES_NO_OBJ;
+	}
+
+	eprintf("%m\n");
+	return SD_RES_EIO;
+}
+
+/*
+ * Write iocb->length bytes from iocb->buf at iocb->offset into the object.
+ *
+ * The request is refused with SD_RES_OLD_NODE_VER when iocb->epoch is older
+ * than sys_epoch().  With 'create' set the file is created/truncated and,
+ * unless SD_FLAG_CMD_COW is set in iocb->flags, preallocated to
+ * get_objsize(oid) before the data is written.
+ *
+ * Returns SD_RES_SUCCESS, the code from err_to_sderr() if the open fails,
+ * the prealloc() result if that fails, or SD_RES_EIO on a short write.
+ */
+static int simple_store_write(uint64_t oid, struct siocb *iocb, int create)
+{
+	char path[PATH_MAX];
+	int open_flags = def_open_flags;
+	int fd, ret;
+	ssize_t written;
+
+	if (iocb->epoch < sys_epoch()) {
+		dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch());
+		return SD_RES_OLD_NODE_VER;
+	}
+
+	/* O_DIRECT is only kept for data objects */
+	if (!is_data_obj(oid))
+		open_flags &= ~O_DIRECT;
+	if (create)
+		open_flags |= O_CREAT | O_TRUNC;
+
+	get_obj_path(oid, path);
+	fd = open(path, open_flags, def_fmode);
+	if (fd < 0)
+		return err_to_sderr(oid, errno);
+
+	if (create && !(iocb->flags & SD_FLAG_CMD_COW)) {
+		ret = prealloc(fd, get_objsize(oid));
+		if (ret != SD_RES_SUCCESS) {
+			close(fd);
+			return ret;
+		}
+	}
+
+	written = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
+	if (written != iocb->length) {
+		eprintf("%m\n");
+		ret = SD_RES_EIO;
+	} else {
+		ret = SD_RES_SUCCESS;
+	}
+
+	close(fd);
+	return ret;
+}
+
+/*
+ * Discard all stale objects by removing the stale directory recursively and
+ * recreating it empty.  The result of rmdir_r() is not checked; only a
+ * failing mkdir() is reported, as SD_RES_EIO.  'iocb' is unused.
+ */
+static int simple_store_cleanup(struct siocb *iocb)
+{
+	rmdir_r(stale_dir);
+
+	if (mkdir(stale_dir, 0755) == 0)
+		return SD_RES_SUCCESS;
+
+	eprintf("%m\n");
+	return SD_RES_EIO;
+}
+
+/*
+ * for_each_objects() callback used at start-up: add the object to the object
+ * list cache and, for VDI objects, mark the VDI id as in use in
+ * sys->vdi_inuse.  Always returns SD_RES_SUCCESS.
+ */
+static int init_objlist_and_vdi_bitmap(uint64_t oid)
+{
+	objlist_cache_insert(oid);
+
+	if (!is_vdi_obj(oid))
+		return SD_RES_SUCCESS;
+
+	vprintf(SDOG_DEBUG, "found the VDI object %" PRIx64 "\n", oid);
+	set_bit(oid_to_vid(oid), sys->vdi_inuse);
+
+	return SD_RES_SUCCESS;
+}
+
+/*
+ * Initialise the driver: make sure "<p>/.stale" exists (an already existing
+ * directory is fine), then walk the existing objects to populate the object
+ * list cache and the VDI in-use bitmap.
+ *
+ * Returns SD_RES_EIO if the stale directory cannot be created, otherwise the
+ * result of for_each_objects().
+ */
+static int simple_store_init(char *p)
+{
+	dprintf("use simple store driver\n");
+
+	snprintf(stale_dir, sizeof(stale_dir), "%s/.stale", p);
+
+	/* EEXIST just means a previous run already created the directory */
+	if (mkdir(stale_dir, 0755) < 0 && errno != EEXIST) {
+		eprintf("%m\n");
+		return SD_RES_EIO;
+	}
+
+	return for_each_objects(init_objlist_and_vdi_bitmap);
+}
+
+/*
+ * Read iocb->length bytes at iocb->offset from the file at 'path' into
+ * iocb->buf.  'oid' only selects the open flags (O_DIRECT is dropped for
+ * non-data objects) and is passed on for error reporting.
+ *
+ * Returns SD_RES_SUCCESS, the code from err_to_sderr() if the open fails,
+ * or SD_RES_EIO on a short read.
+ */
+static int simple_store_read_from_path(uint64_t oid, char *path,
+				       struct siocb *iocb)
+{
+	int open_flags = def_open_flags;
+	int fd;
+	ssize_t nread;
+
+	if (!is_data_obj(oid))
+		open_flags &= ~O_DIRECT;
+
+	fd = open(path, open_flags);
+	if (fd < 0)
+		return err_to_sderr(oid, errno);
+
+	nread = xpread(fd, iocb->buf, iocb->length, iocb->offset);
+	close(fd);
+
+	if (nread != iocb->length)
+		return SD_RES_EIO;
+
+	return SD_RES_SUCCESS;
+}
+
+/*
+ * Read an object, looking in the working directory first.  Only when it is
+ * missing there (SD_RES_NO_OBJ) and the request carries an epoch older than
+ * sys_epoch() is the copy in the stale directory tried instead.
+ *
+ * Returns the result of the last read attempt.
+ */
+static int simple_store_read(uint64_t oid, struct siocb *iocb)
+{
+	char path[PATH_MAX];
+	int ret;
+
+	get_obj_path(oid, path);
+	ret = simple_store_read_from_path(oid, path, iocb);
+
+	/* no fallback unless the object is missing and the request is old */
+	if (ret != SD_RES_NO_OBJ || iocb->epoch >= sys_epoch())
+		return ret;
+
+	get_stale_obj_path(oid, path);
+	return simple_store_read_from_path(oid, path, iocb);
+}
+
+/*
+ * Store a whole object atomically: write iocb->length bytes from iocb->buf
+ * to "<object path>.tmp", then rename() the temporary file over the object
+ * path.
+ *
+ * If the write or the rename fails, the temporary file is unlinked so that
+ * no half-written "<oid>.tmp" file stays behind in the object directory.
+ *
+ * Returns SD_RES_SUCCESS, or SD_RES_EIO if the temporary file cannot be
+ * opened, fully written, or renamed.
+ */
+static int simple_store_atomic_put(uint64_t oid, struct siocb *iocb)
+{
+	char path[PATH_MAX], tmp_path[PATH_MAX];
+	int flags = def_open_flags | O_CREAT;
+	int ret, fd;
+	uint32_t len = iocb->length;
+
+	get_obj_path(oid, path);
+	get_tmp_obj_path(oid, tmp_path);
+
+	if (!is_data_obj(oid))
+		flags &= ~O_DIRECT;
+	fd = open(tmp_path, flags, def_fmode);
+	if (fd < 0) {
+		eprintf("failed to open %s: %m\n", tmp_path);
+		return SD_RES_EIO;
+	}
+
+	ret = xwrite(fd, iocb->buf, len);
+	if (ret != len) {
+		eprintf("failed to write object. %m\n");
+		goto err;
+	}
+
+	if (rename(tmp_path, path) < 0) {
+		eprintf("failed to rename %s to %s: %m\n", tmp_path, path);
+		goto err;
+	}
+	dprintf("%"PRIx64"\n", oid);
+
+	close(fd);
+	return SD_RES_SUCCESS;
+err:
+	/* errors are already logged above; drop the partial temporary file */
+	unlink(tmp_path);
+	close(fd);
+	return SD_RES_EIO;
+}
+
+/*
+ * Bring an object back from the stale directory into the working directory
+ * with rename().  'tgt_epoch' is only used in the debug message; 'iocb' is
+ * unused.
+ *
+ * Returns SD_RES_SUCCESS, or SD_RES_EIO if the rename fails.
+ */
+static int simple_store_link(uint64_t oid, struct siocb *iocb, uint32_t tgt_epoch)
+{
+	char path[PATH_MAX], stale_path[PATH_MAX];
+
+	/* tgt_epoch is a uint32_t, so it is printed with PRIu32 rather than %d */
+	dprintf("try link %"PRIx64" from snapshot with epoch %"PRIu32"\n",
+		oid, tgt_epoch);
+
+	get_obj_path(oid, path);
+	get_stale_obj_path(oid, stale_path);
+
+	if (rename(stale_path, path) < 0) {
+		eprintf("%m\n");
+		return SD_RES_EIO;
+	}
+
+	return SD_RES_SUCCESS;
+}
+
+/*
+ * Decide whether an object is stale on this node: it is stale when none of
+ * the vnodes that oid_to_vnodes() selects for it under the current vnode
+ * info is local.  The vnode info reference taken here is released before
+ * returning.
+ */
+static bool oid_stale(uint64_t oid)
+{
+	struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
+	struct vnode_info *vinfo;
+	int nr_copies, i;
+	bool stale = true;
+
+	vinfo = get_vnode_info();
+	nr_copies = get_nr_copies(vinfo);
+
+	oid_to_vnodes(vinfo->vnodes, vinfo->nr_vnodes, oid,
+		      nr_copies, obj_vnodes);
+
+	/* the first local vnode settles the answer */
+	for (i = 0; stale && i < nr_copies; i++) {
+		if (vnode_is_local(obj_vnodes[i]))
+			stale = false;
+	}
+
+	put_vnode_info(vinfo);
+	return stale;
+}
+
+/*
+ * for_each_objects() callback: if oid_stale() reports the object as stale,
+ * rename() it from the working directory into the stale directory.
+ *
+ * Returns SD_RES_SUCCESS when nothing had to be done or the move succeeded,
+ * SD_RES_EIO if the rename fails.
+ */
+static int move_object_to_stale_dir(uint64_t oid)
+{
+	char src[PATH_MAX], dst[PATH_MAX];
+
+	if (oid_stale(oid)) {
+		get_obj_path(oid, src);
+		get_stale_obj_path(oid, dst);
+
+		if (rename(src, dst) < 0) {
+			eprintf("%s:%m\n", src);
+			return SD_RES_EIO;
+		}
+
+		dprintf("moved object %"PRIx64"\n", oid);
+	}
+
+	return SD_RES_SUCCESS;
+}
+
+/*
+ * Hook run when recovery ends: move every object that is now stale into the
+ * stale directory.  Nothing is done when old_epoch is 0.  'old_vnode_info'
+ * is unused.
+ *
+ * Returns SD_RES_SUCCESS or the first failure reported by for_each_objects().
+ */
+static int simple_store_end_recover(uint32_t old_epoch,
+				    struct vnode_info *old_vnode_info)
+{
+	if (old_epoch != 0)
+		return for_each_objects(move_object_to_stale_dir);
+
+	return SD_RES_SUCCESS;
+}
+
+/*
+ * Format the store: remove obj_path recursively, recreate it empty and
+ * record "simple" as the cluster store.  'iocb' is unused.
+ *
+ * Returns SD_RES_SUCCESS, or SD_RES_EIO if obj_path cannot be removed (a
+ * missing directory is not an error), cannot be recreated, or if
+ * set_cluster_store() fails.
+ */
+static int simple_store_format(struct siocb *iocb)
+{
+	/*
+	 * Signed on purpose: the rmdir_r() result is compared with -ENOENT and
+	 * negated for strerror(), i.e. it is handled as a negative errno value.
+	 */
+	int ret;
+	const char name[] = "simple";
+
+	dprintf("try get a clean store\n");
+	ret = rmdir_r(obj_path);
+	if (ret && ret != -ENOENT) {
+		eprintf("failed to remove %s: %s\n", obj_path, strerror(-ret));
+		return SD_RES_EIO;
+	}
+	if (mkdir(obj_path, def_dmode) < 0) {
+		eprintf("%m\n");
+		return SD_RES_EIO;
+	}
+
+	if (set_cluster_store(name) < 0)
+		return SD_RES_EIO;
+
+	return SD_RES_SUCCESS;
+}
+
+/*
+ * Delete an object from the working directory.
+ *
+ * Returns SD_RES_SUCCESS, SD_RES_NO_OBJ if the file does not exist, or
+ * SD_RES_EIO (after logging) for any other unlink() failure.
+ */
+static int simple_store_remove_object(uint64_t oid)
+{
+	char path[PATH_MAX];
+
+	get_obj_path(oid, path);
+
+	if (unlink(path) == 0)
+		return SD_RES_SUCCESS;
+
+	if (errno == ENOENT)
+		return SD_RES_NO_OBJ;
+
+	eprintf("%m\n");
+	return SD_RES_EIO;
+}
+
+/*
+ * Remove every object in the working directory by running
+ * simple_store_remove_object() over for_each_objects().  Returns whatever
+ * for_each_objects() returns, i.e. the first failure stops the purge.
+ */
+static int simple_store_purge_obj(void)
+{
+	int ret;
+
+	ret = for_each_objects(simple_store_remove_object);
+	return ret;
+}
+
+/*
+ * Driver table for the "simple" backend: each store_driver hook is bound to
+ * the matching simple_store_* function of this file, and the driver is then
+ * registered with add_store_driver().
+ */
+struct store_driver simple_store = {
+	.name = "simple",
+
+	/* store set-up and tear-down */
+	.init = simple_store_init,
+	.format = simple_store_format,
+	.cleanup = simple_store_cleanup,
+	.purge_obj = simple_store_purge_obj,
+
+	/* per-object operations */
+	.exist = simple_store_exist,
+	.read = simple_store_read,
+	.write = simple_store_write,
+	.atomic_put = simple_store_atomic_put,
+	.remove_object = simple_store_remove_object,
+
+	/* recovery */
+	.link = simple_store_link,
+	.end_recover = simple_store_end_recover,
+};
+
+add_store_driver(simple_store);
-- 
1.7.2.5




More information about the sheepdog mailing list