[sheepdog] [PATCH 1/2 v2] sheep : add new strage type "tree"
Liu Yuan
namei.unix at gmail.com
Wed Mar 18 02:44:29 CET 2015
On Tue, Mar 17, 2015 at 06:03:26PM +0900, Saeki Masaki wrote:
> Current sheepdog stores whole objects in single directory like "/var/lib/sheepdog/obj"
> This mechanism is difficult to handle massive files when increasing cluster volume.
>
> In particular, inode object having special informations about VDI,
> so it is preferable to divide.
>
> new storage type named "tree"
> It separates the inode object and data object.
>
> How to use ,
> specify the --store option at the time format
>
> dog cluster format --store tree
>
> v2: refactor using common functions for store driver
> use check_store_type to identify tree store_driver
>
> Signed-off-by: Masaki Saeki <saeki.masaki at po.ntts.co.jp>
> ---
> sheep/Makefile.am | 2 +-
> sheep/sheep_priv.h | 21 ++
> sheep/store/common.c | 5 +
> sheep/store/md.c | 14 +
> sheep/store/plain_store.c | 1 +
> sheep/store/tree_store.c | 757 +++++++++++++++++++++++++++++++++++++++++++++
> 6 files changed, 799 insertions(+), 1 deletions(-)
> create mode 100644 sheep/store/tree_store.c
>
> diff --git a/sheep/Makefile.am b/sheep/Makefile.am
> index 3ddd761..9dedb03 100644
> --- a/sheep/Makefile.am
> +++ b/sheep/Makefile.am
> @@ -28,7 +28,7 @@ sheep_SOURCES = sheep.c group.c request.c gateway.c vdi.c \
> journal.c ops.c recovery.c cluster/local.c \
> object_cache.c object_list_cache.c \
> store/common.c store/md.c \
> - store/plain_store.c \
> + store/plain_store.c store/tree_store.c \
> config.c migrate.c
>
> if BUILD_HTTP
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index e58901f..51e686f 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -84,6 +84,11 @@ enum REQUST_STATUS {
> REQUEST_DROPPED
> };
>
> +enum store_id {
> + PLAIN_STORE,
> + TREE_STORE
> +};
> +
> struct request_iocb {
> uint32_t count;
> int efd;
> @@ -235,6 +240,7 @@ struct vdi_info {
>
> struct store_driver {
> struct list_node list;
> + enum store_id id;
> const char *name;
> int (*init)(void);
> bool (*exist)(uint64_t oid, uint8_t ec_index);
> @@ -269,6 +275,20 @@ int default_format(void);
> int default_remove_object(uint64_t oid, uint8_t ec_index);
> int default_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1);
> int default_purge_obj(void);
> +
> +int tree_init(void);
> +bool tree_exist(uint64_t oid, uint8_t ec_index);
> +int tree_create_and_write(uint64_t oid, const struct siocb *iocb);
> +int tree_write(uint64_t oid, const struct siocb *iocb);
> +int tree_read(uint64_t oid, const struct siocb *iocb);
> +int tree_link(uint64_t oid, uint32_t tgt_epoch);
> +int tree_update_epoch(uint32_t epoch);
> +int tree_cleanup(void);
> +int tree_format(void);
> +int tree_remove_object(uint64_t oid, uint8_t ec_index);
> +int tree_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1);
> +int tree_purge_obj(void);
> +
> int for_each_object_in_wd(int (*func)(uint64_t, const char *, uint32_t,
> uint8_t, struct vnode_info *, void *),
> bool, void *);
> @@ -404,6 +424,7 @@ void queue_cluster_request(struct request *req);
> int prepare_iocb(uint64_t oid, const struct siocb *iocb, bool create);
> int err_to_sderr(const char *path, uint64_t oid, int err);
> int discard(int fd, uint64_t start, uint32_t end);
> +bool check_store_type(enum store_id id);
It would be better to rename check_store_type() to store_id_match(enum store_id).
>
> int update_epoch_log(uint32_t epoch, struct sd_node *nodes, size_t nr_nodes);
> int inc_and_log_epoch(void);
> diff --git a/sheep/store/common.c b/sheep/store/common.c
> index 8959392..aa2858d 100644
> --- a/sheep/store/common.c
> +++ b/sheep/store/common.c
> @@ -102,6 +102,11 @@ int discard(int fd, uint64_t start, uint32_t end)
> return ret;
> }
>
> +bool check_store_type(enum store_id id)
> +{
> + return (sd_store->id == id);
> +}
> +
> int update_epoch_log(uint32_t epoch, struct sd_node *nodes, size_t nr_nodes)
> {
> int ret, len, nodes_len;
> diff --git a/sheep/store/md.c b/sheep/store/md.c
> index 87ab759..ed95c98 100644
> --- a/sheep/store/md.c
> +++ b/sheep/store/md.c
> @@ -212,6 +212,20 @@ static int for_each_object_in_path(const char *path,
> if (unlikely(!strncmp(d->d_name, ".", 1)))
> continue;
I think the for_each_object_in_path() family could be moved to common.c.
> + /* recursive call for tree store driver sub directories*/
> + if (check_store_type(TREE_STORE)) {
> + struct stat s;
> +
> + snprintf(file_name, sizeof(file_name),
> + "%s/%s", path, d->d_name);
> + stat(file_name, &s);
> + if (S_ISDIR(s.st_mode)) {
> + ret = for_each_object_in_path(file_name,
> + func, cleanup, vinfo, arg);
> + continue;
> + }
> + }
> +
> sd_debug("%s, %s", path, d->d_name);
> oid = strtoull(d->d_name, NULL, 16);
> if (oid == 0 || oid == ULLONG_MAX)
> diff --git a/sheep/store/plain_store.c b/sheep/store/plain_store.c
> index 0239684..9787293 100644
> --- a/sheep/store/plain_store.c
> +++ b/sheep/store/plain_store.c
> @@ -658,6 +658,7 @@ int default_purge_obj(void)
> }
>
> static struct store_driver plain_store = {
> + .id = PLAIN_STORE,
> .name = "plain",
> .init = default_init,
> .exist = default_exist,
> diff --git a/sheep/store/tree_store.c b/sheep/store/tree_store.c
> new file mode 100644
> index 0000000..441fdf3
> --- /dev/null
> +++ b/sheep/store/tree_store.c
> @@ -0,0 +1,757 @@
> +/*
> + * Copyright (C) 2012,2015 Nippon Telegraph and Telephone Corporation.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License version
> + * 2 as published by the Free Software Foundation.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <libgen.h>
> +
> +#include "sheep_priv.h"
> +
> +static inline int get_tree(uint64_t oid)
> +{
> + return (int)((oid << 24) >> 56);
> +}
What is get_tree()? The name is ambiguous, and 24 and 56 are magic numbers.
Replacing them with macros is the traditional approach.
> +
> +static int get_store_path(uint64_t oid, uint8_t ec_index, char *path)
> +{
> + char tree_path[PATH_MAX];
> +
> + if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
> + snprintf(tree_path, PATH_MAX, "%s/meta",
> + md_get_object_dir(oid));
> + } else {
> + snprintf(tree_path, PATH_MAX, "%s/%02x",
> + md_get_object_dir(oid), get_tree(oid));
> + }
> +
> + if (is_erasure_oid(oid)) {
> + if (unlikely(ec_index >= SD_MAX_COPIES))
> + panic("invalid ec_index %d", ec_index);
> + return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d",
> + tree_path, oid, ec_index);
> + }
> +
> + return snprintf(path, PATH_MAX, "%s/%016" PRIx64, tree_path, oid);
> +}
> +
> +static int get_store_tmp_path(uint64_t oid, uint8_t ec_index, char *path)
> +{
> + char tree_path[PATH_MAX];
> +
> + if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
> + snprintf(tree_path, PATH_MAX, "%s/meta",
> + md_get_object_dir(oid));
> + } else {
> + snprintf(tree_path, PATH_MAX, "%s/%02x",
> + md_get_object_dir(oid), get_tree(oid));
> + }
> +
> + if (is_erasure_oid(oid)) {
> + if (unlikely(ec_index >= SD_MAX_COPIES))
> + panic("invalid ec_index %d", ec_index);
> + return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d.tmp",
> + tree_path, oid, ec_index);
> + }
> +
> + return snprintf(path, PATH_MAX, "%s/%016" PRIx64".tmp",
> + tree_path, oid);
> +}
> +
> +static int get_store_stale_path(uint64_t oid, uint32_t epoch, uint8_t ec_index,
> + char *path)
> +{
> + return md_get_stale_path(oid, epoch, ec_index, path);
> +}
> +
> +/*
> + * Check if oid is in this nodes (if oid is in the wrong place, it will be moved
> + * to the correct one after this call in a MD setup.
> + */
> +bool tree_exist(uint64_t oid, uint8_t ec_index)
> +{
> + char path[PATH_MAX];
> +
> + get_store_path(oid, ec_index, path);
> +
> + return md_exist(oid, ec_index, path);
> +}
> +
> +/* Trim zero blocks of the beginning and end of the object. */
> +static int tree_trim(int fd, uint64_t oid, const struct siocb *iocb,
> + uint64_t *poffset, uint32_t *plen)
> +{
> + trim_zero_blocks(iocb->buf, poffset, plen);
> +
> + if (iocb->offset < *poffset) {
> + sd_debug("discard between %d, %ld, %" PRIx64, iocb->offset,
> + *poffset, oid);
> +
> + if (discard(fd, iocb->offset, *poffset) < 0)
> + return -1;
> + }
> +
> + if (*poffset + *plen < iocb->offset + iocb->length) {
> + uint64_t end = iocb->offset + iocb->length;
> + uint32_t object_size = get_vdi_object_size(oid_to_vid(oid));
> + if (end == get_objsize(oid, object_size))
> + /* This is necessary to punch the last block */
> + end = round_up(end, BLOCK_SIZE);
> + sd_debug("discard between %ld, %ld, %" PRIx64, *poffset + *plen,
> + end, oid);
> +
> + if (discard(fd, *poffset + *plen, end) < 0)
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> +int tree_write(uint64_t oid, const struct siocb *iocb)
> +{
> + int flags = prepare_iocb(oid, iocb, false), fd,
> + ret = SD_RES_SUCCESS;
> + char path[PATH_MAX];
> + ssize_t size;
> + uint32_t len = iocb->length;
> + uint64_t offset = iocb->offset;
> + static bool trim_is_supported = true;
> +
> + if (iocb->epoch < sys_epoch()) {
> + sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch());
> + return SD_RES_OLD_NODE_VER;
> + }
> +
> + if (uatomic_is_true(&sys->use_journal) &&
> + unlikely(journal_write_store(oid, iocb->buf, iocb->length,
> + iocb->offset, false))
> + != SD_RES_SUCCESS) {
> + sd_err("turn off journaling");
> + uatomic_set_false(&sys->use_journal);
> + flags |= O_DSYNC;
> + sync();
> + }
> +
> + get_store_path(oid, iocb->ec_index, path);
> +
> + /*
> + * Make sure oid is in the right place because oid might be misplaced
> + * in a wrong place, due to 'shutdown/restart with less/more disks' or
> + * any bugs. We need call err_to_sderr() to return EIO if disk is broken
> + */
> + if (!tree_exist(oid, iocb->ec_index))
> + return err_to_sderr(path, oid, ENOENT);
> +
> + fd = open(path, flags, sd_def_fmode);
> + if (unlikely(fd < 0))
> + return err_to_sderr(path, oid, errno);
> +
> + if (trim_is_supported && is_sparse_object(oid)) {
> + if (tree_trim(fd, oid, iocb, &offset, &len) < 0) {
> + trim_is_supported = false;
> + offset = iocb->offset;
> + len = iocb->length;
> + }
> + }
> +
> + size = xpwrite(fd, iocb->buf, len, offset);
> + if (unlikely(size != len)) {
> + sd_err("failed to write object %"PRIx64", path=%s, offset=%"
> + PRId32", size=%"PRId32", result=%zd, %m", oid, path,
> + iocb->offset, iocb->length, size);
> + ret = err_to_sderr(path, oid, errno);
> + goto out;
> + }
> +out:
> + close(fd);
> + return ret;
> +}
> +
> +static int make_tree_dir(const char *path)
> +{
> + int i;
> + char p[PATH_MAX];
> +
> + snprintf(p, PATH_MAX, "%s/meta", path);
> + if (xmkdir(p, sd_def_dmode) < 0) {
> + sd_err("%s failed, %m", p);
> + return SD_RES_EIO;
> + }
> +
> + for (i = 0 ; i < 256 ; i++) {
> + snprintf(p, PATH_MAX, "%s/%02x", path, i);
> + if (xmkdir(p, sd_def_dmode) < 0) {
> + sd_err("%s failed, %m", p);
> + return SD_RES_EIO;
> + }
> + }
> +
> + return SD_RES_SUCCESS;
> +}
> +
> +static int make_stale_dir(const char *path)
> +{
> + char p[PATH_MAX];
> +
> + snprintf(p, PATH_MAX, "%s/.stale", path);
> + if (xmkdir(p, sd_def_dmode) < 0) {
> + sd_err("%s failed, %m", p);
> + return SD_RES_EIO;
> + }
> +
> + return SD_RES_SUCCESS;
> +}
> +
> +static int purge_dir(const char *path)
> +{
> + if (purge_directory(path) < 0)
> + return SD_RES_EIO;
> +
> + return SD_RES_SUCCESS;
> +}
> +
> +static int purge_stale_dir(const char *path)
> +{
> + char p[PATH_MAX];
> +
> + snprintf(p, PATH_MAX, "%s/.stale", path);
> +
> + if (purge_directory_async(p) < 0)
> + return SD_RES_EIO;
> +
> + return SD_RES_SUCCESS;
> +}
> +
> +int tree_cleanup(void)
> +{
> + int ret;
> +
> + ret = for_each_obj_path(purge_stale_dir);
> + if (ret != SD_RES_SUCCESS)
> + return ret;
> +
> + return SD_RES_SUCCESS;
> +}
> +
> +static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch)
> +{
> + int ret;
> + struct sd_inode *inode = xzalloc(SD_INODE_HEADER_SIZE);
> + struct siocb iocb = {
> + .epoch = epoch,
> + .buf = inode,
> + .length = SD_INODE_HEADER_SIZE,
> + };
> +
> + ret = tree_read(oid, &iocb);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("failed to read inode header %" PRIx64 " %" PRId32
> + "wat %s", oid, epoch, wd);
> + goto out;
> + }
> + add_vdi_state_unordered(oid_to_vid(oid), inode->nr_copies,
> + vdi_is_snapshot(inode), inode->copy_policy,
> + inode->block_size_shift, inode->parent_vdi_id);
> +
> + if (inode->name[0] == '\0')
> + atomic_set_bit(oid_to_vid(oid), sys->vdi_deleted);
> +
> + atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse);
> +
> + ret = SD_RES_SUCCESS;
> +out:
> + free(inode);
> + return ret;
> +}
> +
> +static int init_objlist_and_vdi_bitmap(uint64_t oid, const char *wd,
> + uint32_t epoch, uint8_t ec_index,
> + struct vnode_info *vinfo,
> + void *arg)
> +{
> + int ret;
> + char path[PATH_MAX];
> + objlist_cache_insert(oid);
> +
> + snprintf(path, PATH_MAX, "%s/meta", wd);
> +
> + if (is_vdi_obj(oid)) {
> + sd_debug("found the VDI object %" PRIx64" epoch %"PRIu32
> + " at %s", oid, epoch, path);
> + ret = init_vdi_state(oid, path, epoch);
> + if (ret != SD_RES_SUCCESS)
> + return ret;
> + }
> + return SD_RES_SUCCESS;
> +}
> +
> +int tree_init(void)
> +{
> + int ret;
> +
> + sd_debug("use tree store driver");
> + ret = for_each_obj_path(make_tree_dir);
> + if (ret != SD_RES_SUCCESS)
> + return ret;
> +
> + ret = for_each_obj_path(make_stale_dir);
> + if (ret != SD_RES_SUCCESS)
> + return ret;
> +
> +
> + for_each_object_in_stale(init_objlist_and_vdi_bitmap, NULL);
> +
> + return for_each_object_in_wd(init_objlist_and_vdi_bitmap, true, NULL);
> +}
> +
> +static int tree_read_from_path(uint64_t oid, const char *path,
> + const struct siocb *iocb)
> +{
> + int flags = prepare_iocb(oid, iocb, false), fd,
> + ret = SD_RES_SUCCESS;
> + ssize_t size;
> +
> + /*
> + * Make sure oid is in the right place because oid might be misplaced
> + * in a wrong place, due to 'shutdown/restart with less disks' or any
> + * bugs. We need call err_to_sderr() to return EIO if disk is broken.
> + *
> + * For stale path, get_store_stale_path already does tree_exist job.
> + */
> + if (!is_stale_path(path) && !tree_exist(oid, iocb->ec_index))
> + return err_to_sderr(path, oid, ENOENT);
> +
> + fd = open(path, flags);
> + if (fd < 0)
> + return err_to_sderr(path, oid, errno);
> +
> + size = xpread(fd, iocb->buf, iocb->length, iocb->offset);
> + if (size < 0) {
> + sd_err("failed to read object %"PRIx64", path=%s, offset=%"
> + PRId32", size=%"PRId32", result=%zd, %m", oid, path,
> + iocb->offset, iocb->length, size);
> + ret = err_to_sderr(path, oid, errno);
> + }
> + close(fd);
> + return ret;
> +}
> +
> +int tree_read(uint64_t oid, const struct siocb *iocb)
> +{
> + int ret;
> + char path[PATH_MAX];
> +
> + get_store_path(oid, iocb->ec_index, path);
> + ret = tree_read_from_path(oid, path, iocb);
> +
> + /*
> + * If the request is against the older epoch, try to read from
> + * the stale directory
> + */
> + if (ret == SD_RES_NO_OBJ && iocb->epoch > 0 &&
> + iocb->epoch < sys_epoch()) {
> + get_store_stale_path(oid, iocb->epoch, iocb->ec_index, path);
> + ret = tree_read_from_path(oid, path, iocb);
> + }
> +
> + return ret;
> +}
> +
> +int tree_create_and_write(uint64_t oid, const struct siocb *iocb)
> +{
> + char path[PATH_MAX], tmp_path[PATH_MAX], *dir;
> + int flags = prepare_iocb(oid, iocb, true);
> + int ret, fd;
> + uint32_t len = iocb->length;
> + uint32_t object_size = 0;
> + size_t obj_size;
> + uint64_t offset = iocb->offset;
> +
> + sd_debug("%"PRIx64, oid);
> + get_store_path(oid, iocb->ec_index, path);
> + get_store_tmp_path(oid, iocb->ec_index, tmp_path);
> +
> + if (uatomic_is_true(&sys->use_journal) &&
> + journal_write_store(oid, iocb->buf, iocb->length,
> + iocb->offset, true)
> + != SD_RES_SUCCESS) {
> + sd_err("turn off journaling");
> + uatomic_set_false(&sys->use_journal);
> + flags |= O_SYNC;
> + sync();
> + }
> +
> + fd = open(tmp_path, flags, sd_def_fmode);
> + if (fd < 0) {
> + if (errno == EEXIST) {
> + /*
> + * This happens if node membership changes during object
> + * creation; while gateway retries a CREATE request,
> + * recovery process could also recover the object at the
> + * same time. They should try to write the same date,
> + * so it is okay to simply return success here.
> + */
> + sd_debug("%s exists", tmp_path);
> + return SD_RES_SUCCESS;
> + }
> +
> + sd_err("failed to open %s: %m", tmp_path);
> + return err_to_sderr(path, oid, errno);
> + }
> +
> + obj_size = get_store_objsize(oid);
> +
> + trim_zero_blocks(iocb->buf, &offset, &len);
> +
> + object_size = get_vdi_object_size(oid_to_vid(oid));
> +
> + if (offset != 0 || len != get_objsize(oid, object_size)) {
> + if (is_sparse_object(oid))
> + ret = xftruncate(fd, obj_size);
> + else
> + ret = prealloc(fd, obj_size);
> + if (ret < 0) {
> + ret = err_to_sderr(path, oid, errno);
> + goto out;
> + }
> + }
> +
> + ret = xpwrite(fd, iocb->buf, len, offset);
> + if (ret != len) {
> + sd_err("failed to write object. %m");
> + ret = err_to_sderr(path, oid, errno);
> + goto out;
> + }
> +
> + ret = rename(tmp_path, path);
> + if (ret < 0) {
> + sd_err("failed to rename %s to %s: %m", tmp_path, path);
> + ret = err_to_sderr(path, oid, errno);
> + goto out;
> + }
> +
> + close(fd);
> +
> + if (uatomic_is_true(&sys->use_journal) || sys->nosync == true) {
> + objlist_cache_insert(oid);
> + return SD_RES_SUCCESS;
> + }
> +
> + pstrcpy(tmp_path, sizeof(tmp_path), path);
> + dir = dirname(tmp_path);
> + fd = open(dir, O_DIRECTORY | O_RDONLY);
> + if (fd < 0) {
> + sd_err("failed to open directory %s: %m", dir);
> + return err_to_sderr(path, oid, errno);
> + }
> +
> + if (fsync(fd) != 0) {
> + sd_err("failed to write directory %s: %m", dir);
> + ret = err_to_sderr(path, oid, errno);
> + close(fd);
> + if (unlink(path) != 0)
> + sd_err("failed to unlink %s: %m", path);
> + return ret;
> + }
> + close(fd);
> + objlist_cache_insert(oid);
> + return SD_RES_SUCCESS;
> +
> +out:
> + if (unlink(tmp_path) != 0)
> + sd_err("failed to unlink %s: %m", tmp_path);
> + close(fd);
> + return ret;
> +}
> +
> +int tree_link(uint64_t oid, uint32_t tgt_epoch)
> +{
> + char path[PATH_MAX], stale_path[PATH_MAX], tree_path[PATH_MAX];
> +
> + if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
> + snprintf(tree_path, PATH_MAX, "%s/meta",
> + md_get_object_dir(oid));
> + } else {
> + snprintf(tree_path, PATH_MAX, "%s/%02x",
> + md_get_object_dir(oid), get_tree(oid));
> + }
> +
> + sd_debug("try link %"PRIx64" from snapshot with epoch %d", oid,
> + tgt_epoch);
> +
> + snprintf(path, PATH_MAX, "%s/%016"PRIx64, tree_path, oid);
> + get_store_stale_path(oid, tgt_epoch, 0, stale_path);
> +
> + if (link(stale_path, path) < 0) {
> + /*
> + * Recovery thread and main thread might try to recover the
> + * same object and we might get EEXIST in such case.
> + */
> + if (errno == EEXIST)
> + goto out;
> +
> + sd_debug("failed to link from %s to %s, %m", stale_path, path);
> + return err_to_sderr(path, oid, errno);
> + }
> +out:
> + return SD_RES_SUCCESS;
> +}
> +
> +/*
> + * For replicated object, if any of the replica belongs to this node, we
> + * consider it not stale.
> + *
> + * For erasure coded object, since every copy is unique and if it migrates to
> + * other node(index gets changed even it has some other copy belongs to it)
> + * because of hash ring changes, we consider it stale.
> + */
> +static bool oid_stale(uint64_t oid, int ec_index, struct vnode_info *vinfo)
> +{
> + uint32_t i, nr_copies;
> + const struct sd_vnode *v;
> + bool ret = true;
> + const struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
> +
> + nr_copies = get_obj_copy_number(oid, vinfo->nr_zones);
> + oid_to_vnodes(oid, &vinfo->vroot, nr_copies, obj_vnodes);
> + for (i = 0; i < nr_copies; i++) {
> + v = obj_vnodes[i];
> + if (vnode_is_local(v)) {
> + if (ec_index < SD_MAX_COPIES) {
> + if (i == ec_index)
> + ret = false;
> + } else {
> + ret = false;
> + }
> + break;
> + }
> + }
> +
> + return ret;
> +}
> +
> +static int move_object_to_stale_dir(uint64_t oid, const char *wd,
> + uint32_t epoch, uint8_t ec_index,
> + struct vnode_info *vinfo, void *arg)
> +{
> + char path[PATH_MAX], stale_path[PATH_MAX], tree_path[PATH_MAX];
> + uint32_t tgt_epoch = *(uint32_t *)arg;
> +
> + if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
> + snprintf(tree_path, PATH_MAX, "%s/meta",
> + md_get_object_dir(oid));
> + } else {
> + snprintf(tree_path, PATH_MAX, "%s/%02x",
> + md_get_object_dir(oid), get_tree(oid));
> + }
> +
> + /* ec_index from md.c is reliable so we can directly use it */
> + if (ec_index < SD_MAX_COPIES) {
> + snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d",
> + tree_path, oid, ec_index);
> + snprintf(stale_path, PATH_MAX,
> + "%s/.stale/%016"PRIx64"_%d.%"PRIu32,
> + md_get_object_dir(oid), oid, ec_index, tgt_epoch);
> + } else {
> + snprintf(path, PATH_MAX, "%s/%016" PRIx64,
> + tree_path, oid);
> + snprintf(stale_path, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32,
> + md_get_object_dir(oid), oid, tgt_epoch);
> + }
> +
> + if (unlikely(rename(path, stale_path)) < 0) {
> + sd_err("failed to move stale object %" PRIX64 " to %s, %m", oid,
> + path);
> + return SD_RES_EIO;
> + }
> + sd_debug("moved object %"PRIx64, oid);
> + return SD_RES_SUCCESS;
> +}
> +
> +static int check_stale_objects(uint64_t oid, const char *wd, uint32_t epoch,
> + uint8_t ec_index, struct vnode_info *vinfo,
> + void *arg)
> +{
> + if (oid_stale(oid, ec_index, vinfo))
> + return move_object_to_stale_dir(oid, wd, 0, ec_index,
> + NULL, arg);
> +
> + return SD_RES_SUCCESS;
> +}
> +
> +int tree_update_epoch(uint32_t epoch)
> +{
> + assert(epoch);
> + return for_each_object_in_wd(check_stale_objects, false, &epoch);
> +}
> +
> +int tree_format(void)
> +{
> + unsigned ret;
> +
> + sd_debug("try get a clean store");
> + ret = for_each_obj_path(purge_dir);
> + if (ret != SD_RES_SUCCESS)
> + return ret;
> +
> + if (sys->enable_object_cache)
> + object_cache_format();
> +
> + return SD_RES_SUCCESS;
> +}
> +
> +int tree_remove_object(uint64_t oid, uint8_t ec_index)
> +{
> + char path[PATH_MAX];
> +
> + if (uatomic_is_true(&sys->use_journal))
> + journal_remove_object(oid);
> +
> + get_store_path(oid, ec_index, path);
> +
> + if (unlink(path) < 0) {
> + if (errno == ENOENT)
> + return SD_RES_NO_OBJ;
> +
> + sd_err("failed, %s, %m", path);
> + return SD_RES_EIO;
> + }
> +
> + return SD_RES_SUCCESS;
> +}
> +
> +#define SHA1NAME "user.obj.sha1"
> +
> +static int get_object_sha1(const char *path, uint8_t *sha1)
> +{
> + if (getxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE)
> + != SHA1_DIGEST_SIZE) {
> + if (errno == ENODATA)
> + sd_debug("sha1 is not cached yet, %s", path);
> + else
> + sd_err("fail to get xattr, %s", path);
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> +static int set_object_sha1(const char *path, const uint8_t *sha1)
> +{
> + int ret;
> +
> + ret = setxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE, 0);
> + if (ret < 0)
> + sd_err("fail to set sha1, %s", path);
> +
> + return ret;
> +}
> +
> +static int get_object_path(uint64_t oid, uint32_t epoch, char *path,
> + size_t size)
> +{
> + char tree_path[PATH_MAX];
> +
> + if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
> + snprintf(tree_path, PATH_MAX, "%s/meta",
> + md_get_object_dir(oid));
> + } else {
> + snprintf(tree_path, PATH_MAX, "%s/%02x",
> + md_get_object_dir(oid), get_tree(oid));
> + }
> +
> + if (tree_exist(oid, 0)) {
> + snprintf(path, PATH_MAX, "%s/%016"PRIx64,
> + tree_path, oid);
> + } else {
> + get_store_stale_path(oid, epoch, 0, path);
> + if (access(path, F_OK) < 0) {
> + if (errno == ENOENT)
> + return SD_RES_NO_OBJ;
> + return SD_RES_EIO;
> + }
> +
> + }
> +
> + return SD_RES_SUCCESS;
> +}
> +
> +int tree_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1)
> +{
> + int ret;
> + void *buf;
> + struct siocb iocb = {};
> + uint32_t length;
> + bool is_readonly_obj = oid_is_readonly(oid);
> + char path[PATH_MAX];
> +
> + ret = get_object_path(oid, epoch, path, sizeof(path));
> + if (ret != SD_RES_SUCCESS)
> + return ret;
> +
> + if (is_readonly_obj) {
> + if (get_object_sha1(path, sha1) == 0) {
> + sd_debug("use cached sha1 digest %s",
> + sha1_to_hex(sha1));
> + return SD_RES_SUCCESS;
> + }
> + }
> +
> + length = get_store_objsize(oid);
> + buf = valloc(length);
> + if (buf == NULL)
> + return SD_RES_NO_MEM;
> +
> + iocb.epoch = epoch;
> + iocb.buf = buf;
> + iocb.length = length;
> +
> + ret = tree_read_from_path(oid, path, &iocb);
> + if (ret != SD_RES_SUCCESS) {
> + free(buf);
> + return ret;
> + }
> +
> + get_buffer_sha1(buf, length, sha1);
> + free(buf);
> +
> + sd_debug("the message digest of %"PRIx64" at epoch %d is %s", oid,
> + epoch, sha1_to_hex(sha1));
> +
> + if (is_readonly_obj)
> + set_object_sha1(path, sha1);
> +
> + return ret;
> +}
> +
> +int tree_purge_obj(void)
> +{
> + uint32_t tgt_epoch = get_latest_epoch();
> +
> + return for_each_object_in_wd(move_object_to_stale_dir, true,
> + &tgt_epoch);
> +}
> +
> +static struct store_driver tree_store = {
> + .id = TREE_STORE,
> + .name = "tree",
> + .init = tree_init,
> + .exist = tree_exist,
> + .create_and_write = tree_create_and_write,
> + .write = tree_write,
> + .read = tree_read,
> + .link = tree_link,
> + .update_epoch = tree_update_epoch,
> + .cleanup = tree_cleanup,
> + .format = tree_format,
> + .remove_object = tree_remove_object,
> + .get_hash = tree_get_hash,
> + .purge_obj = tree_purge_obj,
> +};
> +
> +add_store_driver(tree_store);
It seems that tree_store.c and plain_store.c still share a large portion of
their code. It is okay to keep the duplication for fast development, but I hope
we can later move as much of the shared code as possible into common.c.
Thanks,
Yuan
More information about the sheepdog
mailing list