[sheepdog] [PATCH] sheep: add simple_store driver again
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Thu Aug 23 16:58:37 CEST 2012
Currently, the farm driver has a fatal problem in object recovery; it
blocks I/O requests for a long time while moving stale objects to the
backend store. Here is a test script that demonstrates it.
==
sheep -d /store/0 -z 0 -p 7000
sleep 2
collie cluster format -c 1 -b $1
collie vdi create test 100G -P
sheep -d /store/1 -z 1 -p 7001
sleep 2
time collie vdi list
==
The result is as follows:
==
$ ./farm.sh farm
using backend farm store
Name Id Size Used Shared Creation time VDI id Tag
test 1 100 GB 100 GB 0.0 MB 2012-08-23 12:16 7c2b25
real 18m36.962s
user 0m0.108s
sys 0m0.000s
==
To fix this problem, we need to move stale objects in the worker
thread. I've spent several days trying to fix it, but it seems to
need a lot of work, and it looks difficult to stabilize the change in a
short time.
That brings me to another approach: adding a lightweight storage driver
'simple_store' as a workaround. (Perhaps we should use a different
name to distinguish it from the previous simple_store.)
------------------------------------------ >8
This introduces a storage driver 'simple_store' based on the current
storage interface. The design of the new simple_store is similar to
that of the farm driver. The main difference is that farm uses the
sha1 based backend store for stale objects, but simple_store uses a
flat directory for them. With this design, simple_store can move
objects from the working directory to the backend store efficiently
with rename(2).
Here are the pros and cons of simple_store.
Pros:
- faster recovery
- smaller and simpler
- would be a good example to introduce other storage drivers
Cons:
- cluster snapshot is not supported
- stale objects are not deduplicated
- there is no sha1 verification
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
With simple_store, the above test result becomes as follows:
$ ./farm.sh simple
using backend simple store
Name Id Size Used Shared Creation time VDI id Tag
test 1 100 GB 100 GB 0.0 MB 2012-08-23 11:29 7c2b25
real 0m1.016s
user 0m0.092s
sys 0m0.000s
For users who don't need rich features of farm, simple_store would
work very well.
sheep/Makefile.am | 3 +-
sheep/simple_store.c | 381 ++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 383 insertions(+), 1 deletions(-)
create mode 100644 sheep/simple_store.c
diff --git a/sheep/Makefile.am b/sheep/Makefile.am
index fe15d94..0ddc2b2 100644
--- a/sheep/Makefile.am
+++ b/sheep/Makefile.am
@@ -26,7 +26,8 @@ sbin_PROGRAMS = sheep
sheep_SOURCES = sheep.c group.c request.c gateway.c store.c vdi.c work.c \
journal.c ops.c recovery.c cluster/local.c \
- object_cache.c object_list_cache.c sockfd_cache.c
+ object_cache.c object_list_cache.c sockfd_cache.c \
+ simple_store.c
if BUILD_COROSYNC
sheep_SOURCES += cluster/corosync.c
diff --git a/sheep/simple_store.c b/sheep/simple_store.c
new file mode 100644
index 0000000..eddf305
--- /dev/null
+++ b/sheep/simple_store.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (C) 2012 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <dirent.h>
+
+#include "sheep_priv.h"
+
+static char stale_dir[PATH_MAX]; /* directory stale objects are renamed into; filled in by simple_store_init() */
+
+static int def_open_flags = O_DIRECT | O_DSYNC | O_RDWR; /* base open(2) flags; O_DIRECT is cleared for non-data objects */
+
+/*
+ * Format the working-directory path of object 'oid' into 'path'.
+ *
+ * 'path' must be at least PATH_MAX bytes long (every caller in this file
+ * passes a PATH_MAX-sized array).  The output is truncated instead of
+ * overflowing the buffer if obj_path is unexpectedly long.
+ *
+ * Returns the snprintf() result, i.e. the length of the complete path.
+ */
+static int get_obj_path(uint64_t oid, char *path)
+{
+	return snprintf(path, PATH_MAX, "%s%016" PRIx64, obj_path, oid);
+}
+
+/*
+ * Format the path of the temporary file used while atomically writing
+ * object 'oid': the object's working-directory path with a ".tmp" suffix.
+ *
+ * 'path' must be at least PATH_MAX bytes long (every caller in this file
+ * passes a PATH_MAX-sized array).  The output is truncated instead of
+ * overflowing the buffer if obj_path is unexpectedly long.
+ *
+ * Returns the snprintf() result, i.e. the length of the complete path.
+ */
+static int get_tmp_obj_path(uint64_t oid, char *path)
+{
+	return snprintf(path, PATH_MAX, "%s%016"PRIx64".tmp", obj_path, oid);
+}
+
+/*
+ * Format the path object 'oid' has (or would have) inside the stale
+ * directory into 'path'.
+ *
+ * 'path' must be at least PATH_MAX bytes long (every caller in this file
+ * passes a PATH_MAX-sized array).  The output is truncated instead of
+ * overflowing the buffer if stale_dir is unexpectedly long.
+ *
+ * Returns the snprintf() result, i.e. the length of the complete path.
+ */
+static int get_stale_obj_path(uint64_t oid, char *path)
+{
+	return snprintf(path, PATH_MAX, "%s/%016"PRIx64, stale_dir, oid);
+}
+
+/*
+ * Call func() once for every object found in the working directory
+ * (obj_path).
+ *
+ * Directory entries are skipped when
+ *  - the name starts with '.',
+ *  - the name is not entirely a hexadecimal number; this keeps leftover
+ *    "<oid>.tmp" files created by simple_store_atomic_put() from being
+ *    reported as if they were the object itself, or
+ *  - the parsed id is 0 or ULLONG_MAX.
+ *
+ * Iteration stops at the first func() result other than SD_RES_SUCCESS.
+ *
+ * Returns SD_RES_EIO when the directory cannot be opened, otherwise the
+ * result of the last func() call (SD_RES_SUCCESS if none was made).
+ */
+static int for_each_objects(int (*func)(uint64_t oid))
+{
+	DIR *dir;
+	struct dirent *d;
+	char *end;
+	uint64_t oid;
+	int ret = SD_RES_SUCCESS;
+
+	dir = opendir(obj_path);
+	if (!dir)
+		return SD_RES_EIO;
+
+	while ((d = readdir(dir))) {
+		if (d->d_name[0] == '.')
+			continue;
+
+		oid = strtoull(d->d_name, &end, 16);
+		/* reject names with no digits or with a trailing suffix */
+		if (end == d->d_name || *end != '\0')
+			continue;
+		if (oid == 0 || oid == ULLONG_MAX)
+			continue;
+
+		ret = func(oid);
+		if (ret != SD_RES_SUCCESS)
+			break;
+	}
+	closedir(dir);
+	return ret;
+}
+
+/*
+ * Report whether object 'oid' is present in the working directory.
+ *
+ * Returns 1 when the object file is accessible for both reading and
+ * writing, 0 otherwise.  Any access(2) failure other than ENOENT is
+ * logged.
+ */
+static int simple_store_exist(uint64_t oid)
+{
+	char obj_file[PATH_MAX];
+
+	get_obj_path(oid, obj_file);
+	if (access(obj_file, R_OK | W_OK) == 0)
+		return 1;
+
+	if (errno != ENOENT)
+		eprintf("%m\n");
+	return 0;
+}
+
+/*
+ * Translate the errno value 'err' of a failed open(2) on object 'oid'
+ * into a sheepdog result code.
+ *
+ * ENOENT means SD_RES_NO_OBJ as long as the working directory itself can
+ * still be stat(2)ed; if it cannot, the store is reported as corrupted.
+ * Every other error is logged and mapped to SD_RES_EIO.
+ */
+static int err_to_sderr(uint64_t oid, int err)
+{
+	struct stat st;
+
+	if (err == ENOENT) {
+		if (stat(obj_path, &st) >= 0) {
+			dprintf("object %016" PRIx64 " not found locally\n",
+				oid);
+			return SD_RES_NO_OBJ;
+		}
+
+		eprintf("corrupted\n");
+		return SD_RES_EIO;
+	}
+
+	eprintf("%m\n");
+	return SD_RES_EIO;
+}
+
+/*
+ * Write iocb->length bytes from iocb->buf at iocb->offset into object
+ * 'oid' in the working directory.
+ *
+ * Requests whose epoch is older than the system epoch are refused with
+ * SD_RES_OLD_NODE_VER.  When 'create' is non-zero the file is created
+ * (and truncated), and, unless SD_FLAG_CMD_COW is set, preallocated to
+ * the full object size before the data is written.
+ *
+ * Returns SD_RES_SUCCESS, the err_to_sderr() translation of an open(2)
+ * failure, the prealloc() result on preallocation failure, or SD_RES_EIO
+ * on a short write.
+ */
+static int simple_store_write(uint64_t oid, struct siocb *iocb, int create)
+{
+	char path[PATH_MAX];
+	int open_flags = def_open_flags;
+	int fd, ret;
+	ssize_t written;
+
+	if (iocb->epoch < sys_epoch()) {
+		dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch());
+		return SD_RES_OLD_NODE_VER;
+	}
+
+	if (!is_data_obj(oid))
+		open_flags &= ~O_DIRECT;
+	if (create)
+		open_flags |= O_CREAT | O_TRUNC;
+
+	get_obj_path(oid, path);
+	fd = open(path, open_flags, def_fmode);
+	if (fd < 0)
+		return err_to_sderr(oid, errno);
+
+	ret = SD_RES_SUCCESS;
+	if (create && !(iocb->flags & SD_FLAG_CMD_COW))
+		ret = prealloc(fd, get_objsize(oid));
+
+	if (ret == SD_RES_SUCCESS) {
+		written = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
+		if (written != iocb->length) {
+			eprintf("%m\n");
+			ret = SD_RES_EIO;
+		}
+	}
+
+	close(fd);
+	return ret;
+}
+
+/*
+ * Discard all stale objects: remove the stale directory recursively and
+ * create it again empty.  'iocb' is not used.
+ *
+ * Returns SD_RES_SUCCESS, or SD_RES_EIO when the directory cannot be
+ * re-created.
+ */
+static int simple_store_cleanup(struct siocb *iocb)
+{
+	int err;
+
+	rmdir_r(stale_dir);
+
+	err = mkdir(stale_dir, 0755);
+	if (err >= 0)
+		return SD_RES_SUCCESS;
+
+	eprintf("%m\n");
+	return SD_RES_EIO;
+}
+
+/*
+ * for_each_objects() callback used at start-up: insert 'oid' into the
+ * object list cache and, for VDI objects, mark the VDI id as in use in
+ * sys->vdi_inuse.  Always returns SD_RES_SUCCESS.
+ */
+static int init_objlist_and_vdi_bitmap(uint64_t oid)
+{
+	objlist_cache_insert(oid);
+
+	if (!is_vdi_obj(oid))
+		return SD_RES_SUCCESS;
+
+	vprintf(SDOG_DEBUG, "found the VDI object %" PRIx64 "\n", oid);
+	set_bit(oid_to_vid(oid), sys->vdi_inuse);
+
+	return SD_RES_SUCCESS;
+}
+
+/*
+ * Initialise the driver: record "<p>/.stale" as the stale directory,
+ * create it if it does not exist yet, then register every object found
+ * in the working directory via init_objlist_and_vdi_bitmap().
+ *
+ * Returns SD_RES_EIO when the stale directory cannot be created,
+ * otherwise the result of the directory walk.
+ */
+static int simple_store_init(char *p)
+{
+	dprintf("use simple store driver\n");
+
+	snprintf(stale_dir, sizeof(stale_dir), "%s/.stale", p);
+
+	/* an already existing stale directory is fine */
+	if (mkdir(stale_dir, 0755) < 0 && errno != EEXIST) {
+		eprintf("%m\n");
+		return SD_RES_EIO;
+	}
+
+	return for_each_objects(init_objlist_and_vdi_bitmap);
+}
+
+/*
+ * Read iocb->length bytes at iocb->offset from the file 'path' into
+ * iocb->buf.  'oid' selects the open flags (O_DIRECT is dropped for
+ * non-data objects) and is passed on for error reporting.
+ *
+ * Returns SD_RES_SUCCESS, the err_to_sderr() translation of an open(2)
+ * failure, or SD_RES_EIO on a short read.
+ */
+static int simple_store_read_from_path(uint64_t oid, char *path,
+				       struct siocb *iocb)
+{
+	int open_flags, fd, ret;
+	ssize_t nread;
+
+	open_flags = is_data_obj(oid) ? def_open_flags
+				      : (def_open_flags & ~O_DIRECT);
+
+	fd = open(path, open_flags);
+	if (fd < 0)
+		return err_to_sderr(oid, errno);
+
+	nread = xpread(fd, iocb->buf, iocb->length, iocb->offset);
+	ret = (nread == iocb->length) ? SD_RES_SUCCESS : SD_RES_EIO;
+
+	close(fd);
+	return ret;
+}
+
+/*
+ * Read object 'oid' as described by 'iocb'.
+ *
+ * The working directory is tried first.  Only when the object is missing
+ * there and the request's epoch is older than the system epoch is the
+ * stale directory tried as well.
+ */
+static int simple_store_read(uint64_t oid, struct siocb *iocb)
+{
+	char path[PATH_MAX];
+	int ret;
+
+	get_obj_path(oid, path);
+	ret = simple_store_read_from_path(oid, path, iocb);
+	if (ret != SD_RES_NO_OBJ || iocb->epoch >= sys_epoch())
+		return ret;
+
+	/* try to read from the stale directory */
+	get_stale_obj_path(oid, path);
+	return simple_store_read_from_path(oid, path, iocb);
+}
+
+/*
+ * Store iocb->length bytes from iocb->buf as object 'oid' atomically: the
+ * data is written to "<object path>.tmp" first and then rename(2)d over
+ * the final object path.
+ *
+ * If the write or the rename fails, the temporary file is removed so that
+ * no half-written "<oid>.tmp" file is left behind in the working
+ * directory.
+ *
+ * Returns SD_RES_SUCCESS, or SD_RES_EIO on any failure.
+ */
+static int simple_store_atomic_put(uint64_t oid, struct siocb *iocb)
+{
+	char path[PATH_MAX], tmp_path[PATH_MAX];
+	int flags = def_open_flags | O_CREAT;
+	int ret = SD_RES_EIO, fd;
+	uint32_t len = iocb->length;
+
+	get_obj_path(oid, path);
+	get_tmp_obj_path(oid, tmp_path);
+
+	if (!is_data_obj(oid))
+		flags &= ~O_DIRECT;
+	fd = open(tmp_path, flags, def_fmode);
+	if (fd < 0) {
+		eprintf("failed to open %s: %m\n", tmp_path);
+		return SD_RES_EIO;
+	}
+
+	ret = xwrite(fd, iocb->buf, len);
+	if (ret != len) {
+		eprintf("failed to write object. %m\n");
+		ret = SD_RES_EIO;
+		goto out;
+	}
+
+	ret = rename(tmp_path, path);
+	if (ret < 0) {
+		eprintf("failed to rename %s to %s: %m\n", tmp_path, path);
+		ret = SD_RES_EIO;
+		goto out;
+	}
+	dprintf("%"PRIx64"\n", oid);
+	ret = SD_RES_SUCCESS;
+out:
+	/* on success the temporary name no longer exists (it was renamed) */
+	if (ret != SD_RES_SUCCESS)
+		unlink(tmp_path);
+	close(fd);
+	return ret;
+}
+
+/*
+ * Bring object 'oid' back from the stale directory into the working
+ * directory with rename(2).
+ *
+ * 'tgt_epoch' is only used in the debug message and 'iocb' is not used.
+ *
+ * Returns SD_RES_SUCCESS, or SD_RES_EIO when the rename fails.
+ */
+static int simple_store_link(uint64_t oid, struct siocb *iocb, uint32_t tgt_epoch)
+{
+	char path[PATH_MAX], stale_path[PATH_MAX];
+
+	/* tgt_epoch is a uint32_t, so print it with PRIu32 rather than %d */
+	dprintf("try link %"PRIx64" from snapshot with epoch %"PRIu32"\n",
+		oid, tgt_epoch);
+
+	get_obj_path(oid, path);
+	get_stale_obj_path(oid, stale_path);
+
+	if (rename(stale_path, path) < 0) {
+		eprintf("%m\n");
+		return SD_RES_EIO;
+	}
+
+	return SD_RES_SUCCESS;
+}
+
+/*
+ * Decide whether object 'oid' is stale on this node: it is stale when
+ * none of the vnodes chosen for it by oid_to_vnodes() under the current
+ * vnode info is local.
+ */
+static bool oid_stale(uint64_t oid)
+{
+	struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
+	struct vnode_info *vinfo = get_vnode_info();
+	int nr_copies = get_nr_copies(vinfo);
+	bool stale = true;
+	int i;
+
+	oid_to_vnodes(vinfo->vnodes, vinfo->nr_vnodes, oid,
+		      nr_copies, obj_vnodes);
+
+	/* stop at the first local vnode */
+	for (i = 0; stale && i < nr_copies; i++) {
+		if (vnode_is_local(obj_vnodes[i]))
+			stale = false;
+	}
+
+	put_vnode_info(vinfo);
+	return stale;
+}
+
+/*
+ * for_each_objects() callback: if object 'oid' is stale on this node
+ * (see oid_stale()), rename(2) it from the working directory into the
+ * stale directory.  Objects that are not stale are left untouched.
+ *
+ * Returns SD_RES_SUCCESS, or SD_RES_EIO when the rename fails.
+ */
+static int move_object_to_stale_dir(uint64_t oid)
+{
+	char src[PATH_MAX], dst[PATH_MAX];
+
+	if (oid_stale(oid)) {
+		get_obj_path(oid, src);
+		get_stale_obj_path(oid, dst);
+
+		if (rename(src, dst) < 0) {
+			eprintf("%s:%m\n", src);
+			return SD_RES_EIO;
+		}
+
+		dprintf("moved object %"PRIx64"\n", oid);
+	}
+
+	return SD_RES_SUCCESS;
+}
+
+/*
+ * End-of-recovery hook: move every object that is now stale out of the
+ * working directory.  Nothing is done when 'old_epoch' is 0;
+ * 'old_vnode_info' is not used.
+ */
+static int simple_store_end_recover(uint32_t old_epoch,
+				    struct vnode_info *old_vnode_info)
+{
+	return old_epoch == 0 ? SD_RES_SUCCESS
+			      : for_each_objects(move_object_to_stale_dir);
+}
+
+/*
+ * Format the store: remove the working directory recursively, create it
+ * again empty and record "simple" as the cluster's store driver.
+ * 'iocb' is not used.
+ *
+ * Returns SD_RES_SUCCESS, or SD_RES_EIO when any step fails.
+ */
+static int simple_store_format(struct siocb *iocb)
+{
+	/*
+	 * Signed on purpose: the value is compared with -ENOENT and negated
+	 * for strerror(), i.e. it is handled as a negative errno.
+	 */
+	int ret;
+	const char name[] = "simple";
+
+	dprintf("try get a clean store\n");
+	ret = rmdir_r(obj_path);
+	if (ret && ret != -ENOENT) {
+		eprintf("failed to remove %s: %s\n", obj_path, strerror(-ret));
+		return SD_RES_EIO;
+	}
+	if (mkdir(obj_path, def_dmode) < 0) {
+		eprintf("%m\n");
+		return SD_RES_EIO;
+	}
+
+	if (set_cluster_store(name) < 0)
+		return SD_RES_EIO;
+
+	return SD_RES_SUCCESS;
+}
+
+/*
+ * Delete object 'oid' from the working directory.
+ *
+ * Returns SD_RES_SUCCESS, SD_RES_NO_OBJ when the object file does not
+ * exist, or SD_RES_EIO (after logging) on any other unlink(2) failure.
+ */
+static int simple_store_remove_object(uint64_t oid)
+{
+	char obj_file[PATH_MAX];
+
+	get_obj_path(oid, obj_file);
+
+	if (unlink(obj_file) == 0)
+		return SD_RES_SUCCESS;
+
+	if (errno == ENOENT)
+		return SD_RES_NO_OBJ;
+
+	eprintf("%m\n");
+	return SD_RES_EIO;
+}
+
+/*
+ * Remove the objects in the working directory by running
+ * simple_store_remove_object() over it; returns the result of that walk.
+ */
+static int simple_store_purge_obj(void)
+{
+	int ret = for_each_objects(simple_store_remove_object);
+
+	return ret;
+}
+
+/* Operation table of the "simple" store driver. */
+struct store_driver simple_store = {
+	.name		= "simple",
+
+	/* store life cycle */
+	.init		= simple_store_init,
+	.format		= simple_store_format,
+	.cleanup	= simple_store_cleanup,
+	.end_recover	= simple_store_end_recover,
+
+	/* per-object operations */
+	.exist		= simple_store_exist,
+	.read		= simple_store_read,
+	.write		= simple_store_write,
+	.atomic_put	= simple_store_atomic_put,
+	.link		= simple_store_link,
+	.remove_object	= simple_store_remove_object,
+	.purge_obj	= simple_store_purge_obj,
+};
+
+add_store_driver(simple_store);
--
1.7.2.5
More information about the sheepdog
mailing list