[sheepdog] [PATCH 1/2 v2] sheep: add new storage type "tree"

Saeki Masaki saeki.masaki at po.ntts.co.jp
Tue Mar 17 10:03:26 CET 2015


Currently, sheepdog stores all objects in a single directory such as "/var/lib/sheepdog/obj".
This flat layout becomes hard to manage as the cluster grows, because the directory accumulates a massive number of files.

In particular, inode objects carry special information about a VDI,
so it is preferable to store them separately.

This patch adds a new store driver named "tree".
It separates inode objects from data objects.

To use it, specify the --store option when formatting the cluster:

dog cluster format --store tree
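
As an illustration only (derived from make_tree_dir(), make_stale_dir() and
get_store_path() in this patch; the base path depends on the local setup),
the resulting on-disk layout looks roughly like:

    /var/lib/sheepdog/obj/meta/       inode, vmstate and vdi attribute objects
    /var/lib/sheepdog/obj/00/ .. ff/  data objects, bucketed by get_tree(oid)
    /var/lib/sheepdog/obj/.stale/     stale objects kept from older epochs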

v2: refactor to use the common functions shared by store drivers
    use check_store_type() to identify the tree store driver

Signed-off-by: Masaki Saeki <saeki.masaki at po.ntts.co.jp>
---
 sheep/Makefile.am         |    2 +-
 sheep/sheep_priv.h        |   21 ++
 sheep/store/common.c      |    5 +
 sheep/store/md.c          |   14 +
 sheep/store/plain_store.c |    1 +
 sheep/store/tree_store.c  |  757 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 799 insertions(+), 1 deletions(-)
 create mode 100644 sheep/store/tree_store.c

diff --git a/sheep/Makefile.am b/sheep/Makefile.am
index 3ddd761..9dedb03 100644
--- a/sheep/Makefile.am
+++ b/sheep/Makefile.am
@@ -28,7 +28,7 @@ sheep_SOURCES		= sheep.c group.c request.c gateway.c vdi.c \
 			  journal.c ops.c recovery.c cluster/local.c \
 			  object_cache.c object_list_cache.c \
 			  store/common.c store/md.c \
-			  store/plain_store.c \
+			  store/plain_store.c store/tree_store.c \
 			  config.c migrate.c
 
 if BUILD_HTTP
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index e58901f..51e686f 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -84,6 +84,11 @@ enum REQUST_STATUS {
 	REQUEST_DROPPED
 };
 
+enum store_id {
+	PLAIN_STORE,
+	TREE_STORE
+};
+
 struct request_iocb {
 	uint32_t count;
 	int efd;
@@ -235,6 +240,7 @@ struct vdi_info {
 
 struct store_driver {
 	struct list_node list;
+	enum store_id id;
 	const char *name;
 	int (*init)(void);
 	bool (*exist)(uint64_t oid, uint8_t ec_index);
@@ -269,6 +275,20 @@ int default_format(void);
 int default_remove_object(uint64_t oid, uint8_t ec_index);
 int default_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1);
 int default_purge_obj(void);
+
+int tree_init(void);
+bool tree_exist(uint64_t oid, uint8_t ec_index);
+int tree_create_and_write(uint64_t oid, const struct siocb *iocb);
+int tree_write(uint64_t oid, const struct siocb *iocb);
+int tree_read(uint64_t oid, const struct siocb *iocb);
+int tree_link(uint64_t oid, uint32_t tgt_epoch);
+int tree_update_epoch(uint32_t epoch);
+int tree_cleanup(void);
+int tree_format(void);
+int tree_remove_object(uint64_t oid, uint8_t ec_index);
+int tree_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1);
+int tree_purge_obj(void);
+
 int for_each_object_in_wd(int (*func)(uint64_t, const char *, uint32_t,
 				      uint8_t, struct vnode_info *, void *),
 			  bool, void *);
@@ -404,6 +424,7 @@ void queue_cluster_request(struct request *req);
 int prepare_iocb(uint64_t oid, const struct siocb *iocb, bool create);
 int err_to_sderr(const char *path, uint64_t oid, int err);
 int discard(int fd, uint64_t start, uint32_t end);
+bool check_store_type(enum store_id id);
 
 int update_epoch_log(uint32_t epoch, struct sd_node *nodes, size_t nr_nodes);
 int inc_and_log_epoch(void);
diff --git a/sheep/store/common.c b/sheep/store/common.c
index 8959392..aa2858d 100644
--- a/sheep/store/common.c
+++ b/sheep/store/common.c
@@ -102,6 +102,11 @@ int discard(int fd, uint64_t start, uint32_t end)
 	return ret;
 }
 
+bool check_store_type(enum store_id id)
+{
+	return (sd_store->id == id);
+}
+
 int update_epoch_log(uint32_t epoch, struct sd_node *nodes, size_t nr_nodes)
 {
 	int ret, len, nodes_len;
diff --git a/sheep/store/md.c b/sheep/store/md.c
index 87ab759..ed95c98 100644
--- a/sheep/store/md.c
+++ b/sheep/store/md.c
@@ -212,6 +212,20 @@ static int for_each_object_in_path(const char *path,
 		if (unlikely(!strncmp(d->d_name, ".", 1)))
 			continue;
 
+		/* recurse into tree store driver subdirectories */
+		if (check_store_type(TREE_STORE)) {
+			struct stat s;
+
+			snprintf(file_name, sizeof(file_name),
+				 "%s/%s", path, d->d_name);
+			if (stat(file_name, &s) == 0 &&
+			    S_ISDIR(s.st_mode)) {
+				ret = for_each_object_in_path(file_name,
+					func, cleanup, vinfo, arg);
+				continue;
+			}
+		}
+
 		sd_debug("%s, %s", path, d->d_name);
 		oid = strtoull(d->d_name, NULL, 16);
 		if (oid == 0 || oid == ULLONG_MAX)
diff --git a/sheep/store/plain_store.c b/sheep/store/plain_store.c
index 0239684..9787293 100644
--- a/sheep/store/plain_store.c
+++ b/sheep/store/plain_store.c
@@ -658,6 +658,7 @@ int default_purge_obj(void)
 }
 
 static struct store_driver plain_store = {
+	.id = PLAIN_STORE,
 	.name = "plain",
 	.init = default_init,
 	.exist = default_exist,
diff --git a/sheep/store/tree_store.c b/sheep/store/tree_store.c
new file mode 100644
index 0000000..441fdf3
--- /dev/null
+++ b/sheep/store/tree_store.c
@@ -0,0 +1,757 @@
+/*
+ * Copyright (C) 2012,2015 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <libgen.h>
+
+#include "sheep_priv.h"
+
+static inline int get_tree(uint64_t oid)
+{
+	return (int)((oid << 24) >> 56);
+}
+
+static int get_store_path(uint64_t oid, uint8_t ec_index, char *path)
+{
+	char tree_path[PATH_MAX];
+
+	if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
+		snprintf(tree_path, PATH_MAX, "%s/meta",
+			 md_get_object_dir(oid));
+	} else {
+		snprintf(tree_path, PATH_MAX, "%s/%02x",
+			 md_get_object_dir(oid), get_tree(oid));
+	}
+
+	if (is_erasure_oid(oid)) {
+		if (unlikely(ec_index >= SD_MAX_COPIES))
+			panic("invalid ec_index %d", ec_index);
+		return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d",
+				tree_path, oid, ec_index);
+	}
+
+	return snprintf(path, PATH_MAX, "%s/%016" PRIx64, tree_path, oid);
+}
+
+static int get_store_tmp_path(uint64_t oid, uint8_t ec_index, char *path)
+{
+	char tree_path[PATH_MAX];
+
+	if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
+		snprintf(tree_path, PATH_MAX, "%s/meta",
+			 md_get_object_dir(oid));
+	} else {
+		snprintf(tree_path, PATH_MAX, "%s/%02x",
+			 md_get_object_dir(oid), get_tree(oid));
+	}
+
+	if (is_erasure_oid(oid)) {
+		if (unlikely(ec_index >= SD_MAX_COPIES))
+			panic("invalid ec_index %d", ec_index);
+		return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d.tmp",
+				tree_path, oid, ec_index);
+	}
+
+	return snprintf(path, PATH_MAX, "%s/%016" PRIx64".tmp",
+			tree_path, oid);
+}
+
+static int get_store_stale_path(uint64_t oid, uint32_t epoch, uint8_t ec_index,
+				char *path)
+{
+	return md_get_stale_path(oid, epoch, ec_index, path);
+}
+
+/*
+ * Check if oid is stored on this node (if oid is in the wrong place, it will
+ * be moved to the correct location after this call in an MD setup).
+ */
+bool tree_exist(uint64_t oid, uint8_t ec_index)
+{
+	char path[PATH_MAX];
+
+	get_store_path(oid, ec_index, path);
+
+	return md_exist(oid, ec_index, path);
+}
+
+/* Trim zero blocks of the beginning and end of the object. */
+static int tree_trim(int fd, uint64_t oid, const struct siocb *iocb,
+			uint64_t *poffset, uint32_t *plen)
+{
+	trim_zero_blocks(iocb->buf, poffset, plen);
+
+	if (iocb->offset < *poffset) {
+		sd_debug("discard between %d, %ld, %" PRIx64, iocb->offset,
+			 *poffset, oid);
+
+		if (discard(fd, iocb->offset, *poffset) < 0)
+			return -1;
+	}
+
+	if (*poffset + *plen < iocb->offset + iocb->length) {
+		uint64_t end = iocb->offset + iocb->length;
+		uint32_t object_size = get_vdi_object_size(oid_to_vid(oid));
+		if (end == get_objsize(oid, object_size))
+			/* This is necessary to punch the last block */
+			end = round_up(end, BLOCK_SIZE);
+		sd_debug("discard between %ld, %ld, %" PRIx64, *poffset + *plen,
+			 end, oid);
+
+		if (discard(fd, *poffset + *plen, end) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+int tree_write(uint64_t oid, const struct siocb *iocb)
+{
+	int flags = prepare_iocb(oid, iocb, false), fd,
+	    ret = SD_RES_SUCCESS;
+	char path[PATH_MAX];
+	ssize_t size;
+	uint32_t len = iocb->length;
+	uint64_t offset = iocb->offset;
+	static bool trim_is_supported = true;
+
+	if (iocb->epoch < sys_epoch()) {
+		sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch());
+		return SD_RES_OLD_NODE_VER;
+	}
+
+	if (uatomic_is_true(&sys->use_journal) &&
+	    unlikely(journal_write_store(oid, iocb->buf, iocb->length,
+					 iocb->offset, false))
+	    != SD_RES_SUCCESS) {
+		sd_err("turn off journaling");
+		uatomic_set_false(&sys->use_journal);
+		flags |= O_DSYNC;
+		sync();
+	}
+
+	get_store_path(oid, iocb->ec_index, path);
+
+	/*
+	 * Make sure oid is in the right place because it might be misplaced
+	 * due to 'shutdown/restart with fewer/more disks' or any bugs. We need
+	 * to call err_to_sderr() to return EIO if the disk is broken.
+	 */
+	if (!tree_exist(oid, iocb->ec_index))
+		return err_to_sderr(path, oid, ENOENT);
+
+	fd = open(path, flags, sd_def_fmode);
+	if (unlikely(fd < 0))
+		return err_to_sderr(path, oid, errno);
+
+	if (trim_is_supported && is_sparse_object(oid)) {
+		if (tree_trim(fd, oid, iocb, &offset, &len) < 0) {
+			trim_is_supported = false;
+			offset = iocb->offset;
+			len = iocb->length;
+		}
+	}
+
+	size = xpwrite(fd, iocb->buf, len, offset);
+	if (unlikely(size != len)) {
+		sd_err("failed to write object %"PRIx64", path=%s, offset=%"
+		       PRId32", size=%"PRId32", result=%zd, %m", oid, path,
+		       iocb->offset, iocb->length, size);
+		ret = err_to_sderr(path, oid, errno);
+		goto out;
+	}
+out:
+	close(fd);
+	return ret;
+}
+
+static int make_tree_dir(const char *path)
+{
+	int i;
+	char p[PATH_MAX];
+
+	snprintf(p, PATH_MAX, "%s/meta", path);
+	if (xmkdir(p, sd_def_dmode) < 0) {
+		sd_err("%s failed, %m", p);
+		return SD_RES_EIO;
+	}
+
+	for (i = 0; i < 256; i++) {
+		snprintf(p, PATH_MAX, "%s/%02x", path, i);
+		if (xmkdir(p, sd_def_dmode) < 0) {
+			sd_err("%s failed, %m", p);
+			return SD_RES_EIO;
+		}
+	}
+
+	return SD_RES_SUCCESS;
+}
+
+static int make_stale_dir(const char *path)
+{
+	char p[PATH_MAX];
+
+	snprintf(p, PATH_MAX, "%s/.stale", path);
+	if (xmkdir(p, sd_def_dmode) < 0) {
+		sd_err("%s failed, %m", p);
+		return SD_RES_EIO;
+	}
+
+	return SD_RES_SUCCESS;
+}
+
+static int purge_dir(const char *path)
+{
+	if (purge_directory(path) < 0)
+		return SD_RES_EIO;
+
+	return SD_RES_SUCCESS;
+}
+
+static int purge_stale_dir(const char *path)
+{
+	char p[PATH_MAX];
+
+	snprintf(p, PATH_MAX, "%s/.stale", path);
+
+	if (purge_directory_async(p) < 0)
+		return SD_RES_EIO;
+
+	return SD_RES_SUCCESS;
+}
+
+int tree_cleanup(void)
+{
+	int ret;
+
+	ret = for_each_obj_path(purge_stale_dir);
+	if (ret != SD_RES_SUCCESS)
+		return ret;
+
+	return SD_RES_SUCCESS;
+}
+
+static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch)
+{
+	int ret;
+	struct sd_inode *inode = xzalloc(SD_INODE_HEADER_SIZE);
+	struct siocb iocb = {
+		.epoch = epoch,
+		.buf = inode,
+		.length = SD_INODE_HEADER_SIZE,
+	};
+
+	ret = tree_read(oid, &iocb);
+	if (ret != SD_RES_SUCCESS) {
+		sd_err("failed to read inode header %" PRIx64 " %" PRId32
+		       " at %s", oid, epoch, wd);
+		goto out;
+	}
+	add_vdi_state_unordered(oid_to_vid(oid), inode->nr_copies,
+		      vdi_is_snapshot(inode), inode->copy_policy,
+		      inode->block_size_shift, inode->parent_vdi_id);
+
+	if (inode->name[0] == '\0')
+		atomic_set_bit(oid_to_vid(oid), sys->vdi_deleted);
+
+	atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse);
+
+	ret = SD_RES_SUCCESS;
+out:
+	free(inode);
+	return ret;
+}
+
+static int init_objlist_and_vdi_bitmap(uint64_t oid, const char *wd,
+				       uint32_t epoch, uint8_t ec_index,
+				       struct vnode_info *vinfo,
+				       void *arg)
+{
+	int ret;
+	char path[PATH_MAX];
+	objlist_cache_insert(oid);
+
+	snprintf(path, PATH_MAX, "%s/meta", wd);
+
+	if (is_vdi_obj(oid)) {
+		sd_debug("found the VDI object %" PRIx64" epoch %"PRIu32
+			 " at %s", oid, epoch, path);
+		ret = init_vdi_state(oid, path, epoch);
+		if (ret != SD_RES_SUCCESS)
+			return ret;
+	}
+	return SD_RES_SUCCESS;
+}
+
+int tree_init(void)
+{
+	int ret;
+
+	sd_debug("use tree store driver");
+	ret = for_each_obj_path(make_tree_dir);
+	if (ret != SD_RES_SUCCESS)
+		return ret;
+
+	ret = for_each_obj_path(make_stale_dir);
+	if (ret != SD_RES_SUCCESS)
+		return ret;
+
+
+	for_each_object_in_stale(init_objlist_and_vdi_bitmap, NULL);
+
+	return for_each_object_in_wd(init_objlist_and_vdi_bitmap, true, NULL);
+}
+
+static int tree_read_from_path(uint64_t oid, const char *path,
+				  const struct siocb *iocb)
+{
+	int flags = prepare_iocb(oid, iocb, false), fd,
+	    ret = SD_RES_SUCCESS;
+	ssize_t size;
+
+	/*
+	 * Make sure oid is in the right place because it might be misplaced
+	 * due to 'shutdown/restart with fewer disks' or any bugs. We need to
+	 * call err_to_sderr() to return EIO if the disk is broken.
+	 *
+	 * For stale path, get_store_stale_path already does tree_exist job.
+	 */
+	if (!is_stale_path(path) && !tree_exist(oid, iocb->ec_index))
+		return err_to_sderr(path, oid, ENOENT);
+
+	fd = open(path, flags);
+	if (fd < 0)
+		return err_to_sderr(path, oid, errno);
+
+	size = xpread(fd, iocb->buf, iocb->length, iocb->offset);
+	if (size < 0) {
+		sd_err("failed to read object %"PRIx64", path=%s, offset=%"
+		       PRId32", size=%"PRId32", result=%zd, %m", oid, path,
+		       iocb->offset, iocb->length, size);
+		ret = err_to_sderr(path, oid, errno);
+	}
+	close(fd);
+	return ret;
+}
+
+int tree_read(uint64_t oid, const struct siocb *iocb)
+{
+	int ret;
+	char path[PATH_MAX];
+
+	get_store_path(oid, iocb->ec_index, path);
+	ret = tree_read_from_path(oid, path, iocb);
+
+	/*
+	 * If the request is against the older epoch, try to read from
+	 * the stale directory
+	 */
+	if (ret == SD_RES_NO_OBJ && iocb->epoch > 0 &&
+	    iocb->epoch < sys_epoch()) {
+		get_store_stale_path(oid, iocb->epoch, iocb->ec_index, path);
+		ret = tree_read_from_path(oid, path, iocb);
+	}
+
+	return ret;
+}
+
+int tree_create_and_write(uint64_t oid, const struct siocb *iocb)
+{
+	char path[PATH_MAX], tmp_path[PATH_MAX], *dir;
+	int flags = prepare_iocb(oid, iocb, true);
+	int ret, fd;
+	uint32_t len = iocb->length;
+	uint32_t object_size = 0;
+	size_t obj_size;
+	uint64_t offset = iocb->offset;
+
+	sd_debug("%"PRIx64, oid);
+	get_store_path(oid, iocb->ec_index, path);
+	get_store_tmp_path(oid, iocb->ec_index, tmp_path);
+
+	if (uatomic_is_true(&sys->use_journal) &&
+	    journal_write_store(oid, iocb->buf, iocb->length,
+				iocb->offset, true)
+	    != SD_RES_SUCCESS) {
+		sd_err("turn off journaling");
+		uatomic_set_false(&sys->use_journal);
+		flags |= O_SYNC;
+		sync();
+	}
+
+	fd = open(tmp_path, flags, sd_def_fmode);
+	if (fd < 0) {
+		if (errno == EEXIST) {
+			/*
+			 * This happens if node membership changes during object
+			 * creation; while gateway retries a CREATE request,
+			 * recovery process could also recover the object at the
+			 * same time.  They should try to write the same data,
+			 * so it is okay to simply return success here.
+			 */
+			sd_debug("%s exists", tmp_path);
+			return SD_RES_SUCCESS;
+		}
+
+		sd_err("failed to open %s: %m", tmp_path);
+		return err_to_sderr(path, oid, errno);
+	}
+
+	obj_size = get_store_objsize(oid);
+
+	trim_zero_blocks(iocb->buf, &offset, &len);
+
+	object_size = get_vdi_object_size(oid_to_vid(oid));
+
+	if (offset != 0 || len != get_objsize(oid, object_size)) {
+		if (is_sparse_object(oid))
+			ret = xftruncate(fd, obj_size);
+		else
+			ret = prealloc(fd, obj_size);
+		if (ret < 0) {
+			ret = err_to_sderr(path, oid, errno);
+			goto out;
+		}
+	}
+
+	ret = xpwrite(fd, iocb->buf, len, offset);
+	if (ret != len) {
+		sd_err("failed to write object. %m");
+		ret = err_to_sderr(path, oid, errno);
+		goto out;
+	}
+
+	ret = rename(tmp_path, path);
+	if (ret < 0) {
+		sd_err("failed to rename %s to %s: %m", tmp_path, path);
+		ret = err_to_sderr(path, oid, errno);
+		goto out;
+	}
+
+	close(fd);
+
+	if (uatomic_is_true(&sys->use_journal) || sys->nosync == true) {
+		objlist_cache_insert(oid);
+		return SD_RES_SUCCESS;
+	}
+
+	pstrcpy(tmp_path, sizeof(tmp_path), path);
+	dir = dirname(tmp_path);
+	fd = open(dir, O_DIRECTORY | O_RDONLY);
+	if (fd < 0) {
+		sd_err("failed to open directory %s: %m", dir);
+		return err_to_sderr(path, oid, errno);
+	}
+
+	if (fsync(fd) != 0) {
+		sd_err("failed to write directory %s: %m", dir);
+		ret = err_to_sderr(path, oid, errno);
+		close(fd);
+		if (unlink(path) != 0)
+			sd_err("failed to unlink %s: %m", path);
+		return ret;
+	}
+	close(fd);
+	objlist_cache_insert(oid);
+	return SD_RES_SUCCESS;
+
+out:
+	if (unlink(tmp_path) != 0)
+		sd_err("failed to unlink %s: %m", tmp_path);
+	close(fd);
+	return ret;
+}
+
+int tree_link(uint64_t oid, uint32_t tgt_epoch)
+{
+	char path[PATH_MAX], stale_path[PATH_MAX], tree_path[PATH_MAX];
+
+	if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
+		snprintf(tree_path, PATH_MAX, "%s/meta",
+			 md_get_object_dir(oid));
+	} else {
+		snprintf(tree_path, PATH_MAX, "%s/%02x",
+			 md_get_object_dir(oid), get_tree(oid));
+	}
+
+	sd_debug("try link %"PRIx64" from snapshot with epoch %d", oid,
+		 tgt_epoch);
+
+	snprintf(path, PATH_MAX, "%s/%016"PRIx64, tree_path, oid);
+	get_store_stale_path(oid, tgt_epoch, 0, stale_path);
+
+	if (link(stale_path, path) < 0) {
+		/*
+		 * Recovery thread and main thread might try to recover the
+		 * same object and we might get EEXIST in such case.
+		 */
+		if (errno == EEXIST)
+			goto out;
+
+		sd_debug("failed to link from %s to %s, %m", stale_path, path);
+		return err_to_sderr(path, oid, errno);
+	}
+out:
+	return SD_RES_SUCCESS;
+}
+
+/*
+ * For a replicated object, if any of the replicas belongs to this node, we
+ * consider it not stale.
+ *
+ * For an erasure coded object, every copy is unique; if it migrates to another
+ * node because of hash ring changes (its index changes even if some other copy
+ * belongs to this node), we consider it stale.
+ */
+static bool oid_stale(uint64_t oid, int ec_index, struct vnode_info *vinfo)
+{
+	uint32_t i, nr_copies;
+	const struct sd_vnode *v;
+	bool ret = true;
+	const struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
+
+	nr_copies = get_obj_copy_number(oid, vinfo->nr_zones);
+	oid_to_vnodes(oid, &vinfo->vroot, nr_copies, obj_vnodes);
+	for (i = 0; i < nr_copies; i++) {
+		v = obj_vnodes[i];
+		if (vnode_is_local(v)) {
+			if (ec_index < SD_MAX_COPIES) {
+				if (i == ec_index)
+					ret = false;
+			} else {
+				ret = false;
+			}
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int move_object_to_stale_dir(uint64_t oid, const char *wd,
+				    uint32_t epoch, uint8_t ec_index,
+				    struct vnode_info *vinfo, void *arg)
+{
+	char path[PATH_MAX], stale_path[PATH_MAX], tree_path[PATH_MAX];
+	uint32_t tgt_epoch = *(uint32_t *)arg;
+
+	if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
+		snprintf(tree_path, PATH_MAX, "%s/meta",
+			 md_get_object_dir(oid));
+	} else {
+		snprintf(tree_path, PATH_MAX, "%s/%02x",
+			 md_get_object_dir(oid), get_tree(oid));
+	}
+
+	/* ec_index from md.c is reliable so we can directly use it */
+	if (ec_index < SD_MAX_COPIES) {
+		snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d",
+			 tree_path, oid, ec_index);
+		snprintf(stale_path, PATH_MAX,
+			 "%s/.stale/%016"PRIx64"_%d.%"PRIu32,
+			 md_get_object_dir(oid), oid, ec_index, tgt_epoch);
+	} else {
+		snprintf(path, PATH_MAX, "%s/%016" PRIx64,
+			 tree_path, oid);
+		snprintf(stale_path, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32,
+			 md_get_object_dir(oid), oid, tgt_epoch);
+	}
+
+	if (unlikely(rename(path, stale_path) < 0)) {
+		sd_err("failed to move stale object %" PRIX64 " to %s, %m", oid,
+		       path);
+		return SD_RES_EIO;
+	}
+	sd_debug("moved object %"PRIx64, oid);
+	return SD_RES_SUCCESS;
+}
+
+static int check_stale_objects(uint64_t oid, const char *wd, uint32_t epoch,
+			       uint8_t ec_index, struct vnode_info *vinfo,
+			       void *arg)
+{
+	if (oid_stale(oid, ec_index, vinfo))
+		return move_object_to_stale_dir(oid, wd, 0, ec_index,
+						NULL, arg);
+
+	return SD_RES_SUCCESS;
+}
+
+int tree_update_epoch(uint32_t epoch)
+{
+	assert(epoch);
+	return for_each_object_in_wd(check_stale_objects, false, &epoch);
+}
+
+int tree_format(void)
+{
+	unsigned ret;
+
+	sd_debug("try get a clean store");
+	ret = for_each_obj_path(purge_dir);
+	if (ret != SD_RES_SUCCESS)
+		return ret;
+
+	if (sys->enable_object_cache)
+		object_cache_format();
+
+	return SD_RES_SUCCESS;
+}
+
+int tree_remove_object(uint64_t oid, uint8_t ec_index)
+{
+	char path[PATH_MAX];
+
+	if (uatomic_is_true(&sys->use_journal))
+		journal_remove_object(oid);
+
+	get_store_path(oid, ec_index, path);
+
+	if (unlink(path) < 0) {
+		if (errno == ENOENT)
+			return SD_RES_NO_OBJ;
+
+		sd_err("failed, %s, %m", path);
+		return SD_RES_EIO;
+	}
+
+	return SD_RES_SUCCESS;
+}
+
+#define SHA1NAME "user.obj.sha1"
+
+static int get_object_sha1(const char *path, uint8_t *sha1)
+{
+	if (getxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE)
+	    != SHA1_DIGEST_SIZE) {
+		if (errno == ENODATA)
+			sd_debug("sha1 is not cached yet, %s", path);
+		else
+			sd_err("fail to get xattr, %s", path);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int set_object_sha1(const char *path, const uint8_t *sha1)
+{
+	int ret;
+
+	ret = setxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE, 0);
+	if (ret < 0)
+		sd_err("fail to set sha1, %s", path);
+
+	return ret;
+}
+
+static int get_object_path(uint64_t oid, uint32_t epoch, char *path,
+			   size_t size)
+{
+	char tree_path[PATH_MAX];
+
+	if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
+		snprintf(tree_path, PATH_MAX, "%s/meta",
+			 md_get_object_dir(oid));
+	} else {
+		snprintf(tree_path, PATH_MAX, "%s/%02x",
+			 md_get_object_dir(oid), get_tree(oid));
+	}
+
+	if (tree_exist(oid, 0)) {
+		snprintf(path, PATH_MAX, "%s/%016"PRIx64,
+			 tree_path, oid);
+	} else {
+		get_store_stale_path(oid, epoch, 0, path);
+		if (access(path, F_OK) < 0) {
+			if (errno == ENOENT)
+				return SD_RES_NO_OBJ;
+			return SD_RES_EIO;
+		}
+
+	}
+
+	return SD_RES_SUCCESS;
+}
+
+int tree_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1)
+{
+	int ret;
+	void *buf;
+	struct siocb iocb = {};
+	uint32_t length;
+	bool is_readonly_obj = oid_is_readonly(oid);
+	char path[PATH_MAX];
+
+	ret = get_object_path(oid, epoch, path, sizeof(path));
+	if (ret != SD_RES_SUCCESS)
+		return ret;
+
+	if (is_readonly_obj) {
+		if (get_object_sha1(path, sha1) == 0) {
+			sd_debug("use cached sha1 digest %s",
+				 sha1_to_hex(sha1));
+			return SD_RES_SUCCESS;
+		}
+	}
+
+	length = get_store_objsize(oid);
+	buf = valloc(length);
+	if (buf == NULL)
+		return SD_RES_NO_MEM;
+
+	iocb.epoch = epoch;
+	iocb.buf = buf;
+	iocb.length = length;
+
+	ret = tree_read_from_path(oid, path, &iocb);
+	if (ret != SD_RES_SUCCESS) {
+		free(buf);
+		return ret;
+	}
+
+	get_buffer_sha1(buf, length, sha1);
+	free(buf);
+
+	sd_debug("the message digest of %"PRIx64" at epoch %d is %s", oid,
+		 epoch, sha1_to_hex(sha1));
+
+	if (is_readonly_obj)
+		set_object_sha1(path, sha1);
+
+	return ret;
+}
+
+int tree_purge_obj(void)
+{
+	uint32_t tgt_epoch = get_latest_epoch();
+
+	return for_each_object_in_wd(move_object_to_stale_dir, true,
+				     &tgt_epoch);
+}
+
+static struct store_driver tree_store = {
+	.id = TREE_STORE,
+	.name = "tree",
+	.init = tree_init,
+	.exist = tree_exist,
+	.create_and_write = tree_create_and_write,
+	.write = tree_write,
+	.read = tree_read,
+	.link = tree_link,
+	.update_epoch = tree_update_epoch,
+	.cleanup = tree_cleanup,
+	.format = tree_format,
+	.remove_object = tree_remove_object,
+	.get_hash = tree_get_hash,
+	.purge_obj = tree_purge_obj,
+};
+
+add_store_driver(tree_store);
-- 
1.7.1


-- 
NTT Software Corporation
Cloud Business Division
First Business Unit
Masaki Saeki
TEL: 045-212-7393
FAX: 045-662-7856
Mail: saeki.masaki at po.ntts.co.jp
--



