[sheepdog] [PATCH 1/2 v2] sheep : add new storage type "tree"
Saeki Masaki
saeki.masaki at po.ntts.co.jp
Wed Mar 18 03:50:09 CET 2015
On 2015/03/18 10:44, Liu Yuan wrote:
> On Tue, Mar 17, 2015 at 06:03:26PM +0900, Saeki Masaki wrote:
>> Currently sheepdog stores all objects in a single directory such as "/var/lib/sheepdog/obj".
>> This makes it difficult to handle a massive number of files as the cluster volume grows.
>>
>> In particular, inode objects hold special information about VDIs,
>> so it is preferable to store them separately.
>>
>> This patch adds a new storage type named "tree".
>> It separates inode objects from data objects.
>>
>> How to use:
>> specify the --store option when formatting the cluster
>>
>> dog cluster format --store tree
>>
>> v2: refactor using common functions for store driver
>> use check_store_type to identify tree store_driver
>>
>> Signed-off-by: Masaki Saeki <saeki.masaki at po.ntts.co.jp>
>> ---
>> sheep/Makefile.am | 2 +-
>> sheep/sheep_priv.h | 21 ++
>> sheep/store/common.c | 5 +
>> sheep/store/md.c | 14 +
>> sheep/store/plain_store.c | 1 +
>> sheep/store/tree_store.c | 757 +++++++++++++++++++++++++++++++++++++++++++++
>> 6 files changed, 799 insertions(+), 1 deletions(-)
>> create mode 100644 sheep/store/tree_store.c
>>
>> diff --git a/sheep/Makefile.am b/sheep/Makefile.am
>> index 3ddd761..9dedb03 100644
>> --- a/sheep/Makefile.am
>> +++ b/sheep/Makefile.am
>> @@ -28,7 +28,7 @@ sheep_SOURCES = sheep.c group.c request.c gateway.c vdi.c \
>> journal.c ops.c recovery.c cluster/local.c \
>> object_cache.c object_list_cache.c \
>> store/common.c store/md.c \
>> - store/plain_store.c \
>> + store/plain_store.c store/tree_store.c \
>> config.c migrate.c
>>
>> if BUILD_HTTP
>> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
>> index e58901f..51e686f 100644
>> --- a/sheep/sheep_priv.h
>> +++ b/sheep/sheep_priv.h
>> @@ -84,6 +84,11 @@ enum REQUST_STATUS {
>> REQUEST_DROPPED
>> };
>>
>> +enum store_id {
>> + PLAIN_STORE,
>> + TREE_STORE
>> +};
>> +
>> struct request_iocb {
>> uint32_t count;
>> int efd;
>> @@ -235,6 +240,7 @@ struct vdi_info {
>>
>> struct store_driver {
>> struct list_node list;
>> + enum store_id id;
>> const char *name;
>> int (*init)(void);
>> bool (*exist)(uint64_t oid, uint8_t ec_index);
>> @@ -269,6 +275,20 @@ int default_format(void);
>> int default_remove_object(uint64_t oid, uint8_t ec_index);
>> int default_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1);
>> int default_purge_obj(void);
>> +
>> +int tree_init(void);
>> +bool tree_exist(uint64_t oid, uint8_t ec_index);
>> +int tree_create_and_write(uint64_t oid, const struct siocb *iocb);
>> +int tree_write(uint64_t oid, const struct siocb *iocb);
>> +int tree_read(uint64_t oid, const struct siocb *iocb);
>> +int tree_link(uint64_t oid, uint32_t tgt_epoch);
>> +int tree_update_epoch(uint32_t epoch);
>> +int tree_cleanup(void);
>> +int tree_format(void);
>> +int tree_remove_object(uint64_t oid, uint8_t ec_index);
>> +int tree_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1);
>> +int tree_purge_obj(void);
>> +
>> int for_each_object_in_wd(int (*func)(uint64_t, const char *, uint32_t,
>> uint8_t, struct vnode_info *, void *),
>> bool, void *);
>> @@ -404,6 +424,7 @@ void queue_cluster_request(struct request *req);
>> int prepare_iocb(uint64_t oid, const struct siocb *iocb, bool create);
>> int err_to_sderr(const char *path, uint64_t oid, int err);
>> int discard(int fd, uint64_t start, uint32_t end);
>> +bool check_store_type(enum store_id id);
>
> Better rename check_store_type as store_id_match(enum store_id)
>
>>
>> int update_epoch_log(uint32_t epoch, struct sd_node *nodes, size_t nr_nodes);
>> int inc_and_log_epoch(void);
>> diff --git a/sheep/store/common.c b/sheep/store/common.c
>> index 8959392..aa2858d 100644
>> --- a/sheep/store/common.c
>> +++ b/sheep/store/common.c
>> @@ -102,6 +102,11 @@ int discard(int fd, uint64_t start, uint32_t end)
>> return ret;
>> }
>>
>> +bool check_store_type(enum store_id id)
>> +{
>> + return (sd_store->id == id);
>> +}
>> +
>> int update_epoch_log(uint32_t epoch, struct sd_node *nodes, size_t nr_nodes)
>> {
>> int ret, len, nodes_len;
>> diff --git a/sheep/store/md.c b/sheep/store/md.c
>> index 87ab759..ed95c98 100644
>> --- a/sheep/store/md.c
>> +++ b/sheep/store/md.c
>> @@ -212,6 +212,20 @@ static int for_each_object_in_path(const char *path,
>> if (unlikely(!strncmp(d->d_name, ".", 1)))
>> continue;
>
> I think for_each_object_in_path family might be moved to common.c
>
>> + /* recursive call for tree store driver sub directories*/
>> + if (check_store_type(TREE_STORE)) {
>> + struct stat s;
>> +
>> + snprintf(file_name, sizeof(file_name),
>> + "%s/%s", path, d->d_name);
>> + stat(file_name, &s);
>> + if (S_ISDIR(s.st_mode)) {
>> + ret = for_each_object_in_path(file_name,
>> + func, cleanup, vinfo, arg);
>> + continue;
>> + }
>> + }
>> +
>> sd_debug("%s, %s", path, d->d_name);
>> oid = strtoull(d->d_name, NULL, 16);
>> if (oid == 0 || oid == ULLONG_MAX)
>> diff --git a/sheep/store/plain_store.c b/sheep/store/plain_store.c
>> index 0239684..9787293 100644
>> --- a/sheep/store/plain_store.c
>> +++ b/sheep/store/plain_store.c
>> @@ -658,6 +658,7 @@ int default_purge_obj(void)
>> }
>>
>> static struct store_driver plain_store = {
>> + .id = PLAIN_STORE,
>> .name = "plain",
>> .init = default_init,
>> .exist = default_exist,
>> diff --git a/sheep/store/tree_store.c b/sheep/store/tree_store.c
>> new file mode 100644
>> index 0000000..441fdf3
>> --- /dev/null
>> +++ b/sheep/store/tree_store.c
>> @@ -0,0 +1,757 @@
>> +/*
>> + * Copyright (C) 2012,2015 Nippon Telegraph and Telephone Corporation.
>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU General Public License version
>> + * 2 as published by the Free Software Foundation.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include <libgen.h>
>> +
>> +#include "sheep_priv.h"
>> +
>> +static inline int get_tree(uint64_t oid)
>> +{
>> + return (int)((oid << 24) >> 56);
>> +}
>
> What is get_tree()? The naming is ambiguous, and 24 and 56 are too magic.
> Replacing them with macros is the traditional approach.
>
>> +
>> +static int get_store_path(uint64_t oid, uint8_t ec_index, char *path)
>> +{
>> + char tree_path[PATH_MAX];
>> +
>> + if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
>> + snprintf(tree_path, PATH_MAX, "%s/meta",
>> + md_get_object_dir(oid));
>> + } else {
>> + snprintf(tree_path, PATH_MAX, "%s/%02x",
>> + md_get_object_dir(oid), get_tree(oid));
>> + }
>> +
>> + if (is_erasure_oid(oid)) {
>> + if (unlikely(ec_index >= SD_MAX_COPIES))
>> + panic("invalid ec_index %d", ec_index);
>> + return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d",
>> + tree_path, oid, ec_index);
>> + }
>> +
>> + return snprintf(path, PATH_MAX, "%s/%016" PRIx64, tree_path, oid);
>> +}
>> +
>> +static int get_store_tmp_path(uint64_t oid, uint8_t ec_index, char *path)
>> +{
>> + char tree_path[PATH_MAX];
>> +
>> + if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
>> + snprintf(tree_path, PATH_MAX, "%s/meta",
>> + md_get_object_dir(oid));
>> + } else {
>> + snprintf(tree_path, PATH_MAX, "%s/%02x",
>> + md_get_object_dir(oid), get_tree(oid));
>> + }
>> +
>> + if (is_erasure_oid(oid)) {
>> + if (unlikely(ec_index >= SD_MAX_COPIES))
>> + panic("invalid ec_index %d", ec_index);
>> + return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d.tmp",
>> + tree_path, oid, ec_index);
>> + }
>> +
>> + return snprintf(path, PATH_MAX, "%s/%016" PRIx64".tmp",
>> + tree_path, oid);
>> +}
>> +
>> +static int get_store_stale_path(uint64_t oid, uint32_t epoch, uint8_t ec_index,
>> + char *path)
>> +{
>> + return md_get_stale_path(oid, epoch, ec_index, path);
>> +}
>> +
>> +/*
>> + * Check if oid is in this nodes (if oid is in the wrong place, it will be moved
>> + * to the correct one after this call in a MD setup.
>> + */
>> +bool tree_exist(uint64_t oid, uint8_t ec_index)
>> +{
>> + char path[PATH_MAX];
>> +
>> + get_store_path(oid, ec_index, path);
>> +
>> + return md_exist(oid, ec_index, path);
>> +}
>> +
>> +/* Trim zero blocks of the beginning and end of the object. */
>> +static int tree_trim(int fd, uint64_t oid, const struct siocb *iocb,
>> + uint64_t *poffset, uint32_t *plen)
>> +{
>> + trim_zero_blocks(iocb->buf, poffset, plen);
>> +
>> + if (iocb->offset < *poffset) {
>> + sd_debug("discard between %d, %ld, %" PRIx64, iocb->offset,
>> + *poffset, oid);
>> +
>> + if (discard(fd, iocb->offset, *poffset) < 0)
>> + return -1;
>> + }
>> +
>> + if (*poffset + *plen < iocb->offset + iocb->length) {
>> + uint64_t end = iocb->offset + iocb->length;
>> + uint32_t object_size = get_vdi_object_size(oid_to_vid(oid));
>> + if (end == get_objsize(oid, object_size))
>> + /* This is necessary to punch the last block */
>> + end = round_up(end, BLOCK_SIZE);
>> + sd_debug("discard between %ld, %ld, %" PRIx64, *poffset + *plen,
>> + end, oid);
>> +
>> + if (discard(fd, *poffset + *plen, end) < 0)
>> + return -1;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +int tree_write(uint64_t oid, const struct siocb *iocb)
>> +{
>> + int flags = prepare_iocb(oid, iocb, false), fd,
>> + ret = SD_RES_SUCCESS;
>> + char path[PATH_MAX];
>> + ssize_t size;
>> + uint32_t len = iocb->length;
>> + uint64_t offset = iocb->offset;
>> + static bool trim_is_supported = true;
>> +
>> + if (iocb->epoch < sys_epoch()) {
>> + sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch());
>> + return SD_RES_OLD_NODE_VER;
>> + }
>> +
>> + if (uatomic_is_true(&sys->use_journal) &&
>> + unlikely(journal_write_store(oid, iocb->buf, iocb->length,
>> + iocb->offset, false))
>> + != SD_RES_SUCCESS) {
>> + sd_err("turn off journaling");
>> + uatomic_set_false(&sys->use_journal);
>> + flags |= O_DSYNC;
>> + sync();
>> + }
>> +
>> + get_store_path(oid, iocb->ec_index, path);
>> +
>> + /*
>> + * Make sure oid is in the right place because oid might be misplaced
>> + * in a wrong place, due to 'shutdown/restart with less/more disks' or
>> + * any bugs. We need call err_to_sderr() to return EIO if disk is broken
>> + */
>> + if (!tree_exist(oid, iocb->ec_index))
>> + return err_to_sderr(path, oid, ENOENT);
>> +
>> + fd = open(path, flags, sd_def_fmode);
>> + if (unlikely(fd < 0))
>> + return err_to_sderr(path, oid, errno);
>> +
>> + if (trim_is_supported && is_sparse_object(oid)) {
>> + if (tree_trim(fd, oid, iocb, &offset, &len) < 0) {
>> + trim_is_supported = false;
>> + offset = iocb->offset;
>> + len = iocb->length;
>> + }
>> + }
>> +
>> + size = xpwrite(fd, iocb->buf, len, offset);
>> + if (unlikely(size != len)) {
>> + sd_err("failed to write object %"PRIx64", path=%s, offset=%"
>> + PRId32", size=%"PRId32", result=%zd, %m", oid, path,
>> + iocb->offset, iocb->length, size);
>> + ret = err_to_sderr(path, oid, errno);
>> + goto out;
>> + }
>> +out:
>> + close(fd);
>> + return ret;
>> +}
>> +
>> +static int make_tree_dir(const char *path)
>> +{
>> + int i;
>> + char p[PATH_MAX];
>> +
>> + snprintf(p, PATH_MAX, "%s/meta", path);
>> + if (xmkdir(p, sd_def_dmode) < 0) {
>> + sd_err("%s failed, %m", p);
>> + return SD_RES_EIO;
>> + }
>> +
>> + for (i = 0 ; i < 256 ; i++) {
>> + snprintf(p, PATH_MAX, "%s/%02x", path, i);
>> + if (xmkdir(p, sd_def_dmode) < 0) {
>> + sd_err("%s failed, %m", p);
>> + return SD_RES_EIO;
>> + }
>> + }
>> +
>> + return SD_RES_SUCCESS;
>> +}
>> +
>> +static int make_stale_dir(const char *path)
>> +{
>> + char p[PATH_MAX];
>> +
>> + snprintf(p, PATH_MAX, "%s/.stale", path);
>> + if (xmkdir(p, sd_def_dmode) < 0) {
>> + sd_err("%s failed, %m", p);
>> + return SD_RES_EIO;
>> + }
>> +
>> + return SD_RES_SUCCESS;
>> +}
>> +
>> +static int purge_dir(const char *path)
>> +{
>> + if (purge_directory(path) < 0)
>> + return SD_RES_EIO;
>> +
>> + return SD_RES_SUCCESS;
>> +}
>> +
>> +static int purge_stale_dir(const char *path)
>> +{
>> + char p[PATH_MAX];
>> +
>> + snprintf(p, PATH_MAX, "%s/.stale", path);
>> +
>> + if (purge_directory_async(p) < 0)
>> + return SD_RES_EIO;
>> +
>> + return SD_RES_SUCCESS;
>> +}
>> +
>> +int tree_cleanup(void)
>> +{
>> + int ret;
>> +
>> + ret = for_each_obj_path(purge_stale_dir);
>> + if (ret != SD_RES_SUCCESS)
>> + return ret;
>> +
>> + return SD_RES_SUCCESS;
>> +}
>> +
>> +static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch)
>> +{
>> + int ret;
>> + struct sd_inode *inode = xzalloc(SD_INODE_HEADER_SIZE);
>> + struct siocb iocb = {
>> + .epoch = epoch,
>> + .buf = inode,
>> + .length = SD_INODE_HEADER_SIZE,
>> + };
>> +
>> + ret = tree_read(oid, &iocb);
>> + if (ret != SD_RES_SUCCESS) {
>> + sd_err("failed to read inode header %" PRIx64 " %" PRId32
>> + "wat %s", oid, epoch, wd);
>> + goto out;
>> + }
>> + add_vdi_state_unordered(oid_to_vid(oid), inode->nr_copies,
>> + vdi_is_snapshot(inode), inode->copy_policy,
>> + inode->block_size_shift, inode->parent_vdi_id);
>> +
>> + if (inode->name[0] == '\0')
>> + atomic_set_bit(oid_to_vid(oid), sys->vdi_deleted);
>> +
>> + atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse);
>> +
>> + ret = SD_RES_SUCCESS;
>> +out:
>> + free(inode);
>> + return ret;
>> +}
>> +
>> +static int init_objlist_and_vdi_bitmap(uint64_t oid, const char *wd,
>> + uint32_t epoch, uint8_t ec_index,
>> + struct vnode_info *vinfo,
>> + void *arg)
>> +{
>> + int ret;
>> + char path[PATH_MAX];
>> + objlist_cache_insert(oid);
>> +
>> + snprintf(path, PATH_MAX, "%s/meta", wd);
>> +
>> + if (is_vdi_obj(oid)) {
>> + sd_debug("found the VDI object %" PRIx64" epoch %"PRIu32
>> + " at %s", oid, epoch, path);
>> + ret = init_vdi_state(oid, path, epoch);
>> + if (ret != SD_RES_SUCCESS)
>> + return ret;
>> + }
>> + return SD_RES_SUCCESS;
>> +}
>> +
>> +int tree_init(void)
>> +{
>> + int ret;
>> +
>> + sd_debug("use tree store driver");
>> + ret = for_each_obj_path(make_tree_dir);
>> + if (ret != SD_RES_SUCCESS)
>> + return ret;
>> +
>> + ret = for_each_obj_path(make_stale_dir);
>> + if (ret != SD_RES_SUCCESS)
>> + return ret;
>> +
>> +
>> + for_each_object_in_stale(init_objlist_and_vdi_bitmap, NULL);
>> +
>> + return for_each_object_in_wd(init_objlist_and_vdi_bitmap, true, NULL);
>> +}
>> +
>> +static int tree_read_from_path(uint64_t oid, const char *path,
>> + const struct siocb *iocb)
>> +{
>> + int flags = prepare_iocb(oid, iocb, false), fd,
>> + ret = SD_RES_SUCCESS;
>> + ssize_t size;
>> +
>> + /*
>> + * Make sure oid is in the right place because oid might be misplaced
>> + * in a wrong place, due to 'shutdown/restart with less disks' or any
>> + * bugs. We need call err_to_sderr() to return EIO if disk is broken.
>> + *
>> + * For stale path, get_store_stale_path already does tree_exist job.
>> + */
>> + if (!is_stale_path(path) && !tree_exist(oid, iocb->ec_index))
>> + return err_to_sderr(path, oid, ENOENT);
>> +
>> + fd = open(path, flags);
>> + if (fd < 0)
>> + return err_to_sderr(path, oid, errno);
>> +
>> + size = xpread(fd, iocb->buf, iocb->length, iocb->offset);
>> + if (size < 0) {
>> + sd_err("failed to read object %"PRIx64", path=%s, offset=%"
>> + PRId32", size=%"PRId32", result=%zd, %m", oid, path,
>> + iocb->offset, iocb->length, size);
>> + ret = err_to_sderr(path, oid, errno);
>> + }
>> + close(fd);
>> + return ret;
>> +}
>> +
>> +int tree_read(uint64_t oid, const struct siocb *iocb)
>> +{
>> + int ret;
>> + char path[PATH_MAX];
>> +
>> + get_store_path(oid, iocb->ec_index, path);
>> + ret = tree_read_from_path(oid, path, iocb);
>> +
>> + /*
>> + * If the request is against the older epoch, try to read from
>> + * the stale directory
>> + */
>> + if (ret == SD_RES_NO_OBJ && iocb->epoch > 0 &&
>> + iocb->epoch < sys_epoch()) {
>> + get_store_stale_path(oid, iocb->epoch, iocb->ec_index, path);
>> + ret = tree_read_from_path(oid, path, iocb);
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +int tree_create_and_write(uint64_t oid, const struct siocb *iocb)
>> +{
>> + char path[PATH_MAX], tmp_path[PATH_MAX], *dir;
>> + int flags = prepare_iocb(oid, iocb, true);
>> + int ret, fd;
>> + uint32_t len = iocb->length;
>> + uint32_t object_size = 0;
>> + size_t obj_size;
>> + uint64_t offset = iocb->offset;
>> +
>> + sd_debug("%"PRIx64, oid);
>> + get_store_path(oid, iocb->ec_index, path);
>> + get_store_tmp_path(oid, iocb->ec_index, tmp_path);
>> +
>> + if (uatomic_is_true(&sys->use_journal) &&
>> + journal_write_store(oid, iocb->buf, iocb->length,
>> + iocb->offset, true)
>> + != SD_RES_SUCCESS) {
>> + sd_err("turn off journaling");
>> + uatomic_set_false(&sys->use_journal);
>> + flags |= O_SYNC;
>> + sync();
>> + }
>> +
>> + fd = open(tmp_path, flags, sd_def_fmode);
>> + if (fd < 0) {
>> + if (errno == EEXIST) {
>> + /*
>> + * This happens if node membership changes during object
>> + * creation; while gateway retries a CREATE request,
>> + * recovery process could also recover the object at the
>> + * same time. They should try to write the same date,
>> + * so it is okay to simply return success here.
>> + */
>> + sd_debug("%s exists", tmp_path);
>> + return SD_RES_SUCCESS;
>> + }
>> +
>> + sd_err("failed to open %s: %m", tmp_path);
>> + return err_to_sderr(path, oid, errno);
>> + }
>> +
>> + obj_size = get_store_objsize(oid);
>> +
>> + trim_zero_blocks(iocb->buf, &offset, &len);
>> +
>> + object_size = get_vdi_object_size(oid_to_vid(oid));
>> +
>> + if (offset != 0 || len != get_objsize(oid, object_size)) {
>> + if (is_sparse_object(oid))
>> + ret = xftruncate(fd, obj_size);
>> + else
>> + ret = prealloc(fd, obj_size);
>> + if (ret < 0) {
>> + ret = err_to_sderr(path, oid, errno);
>> + goto out;
>> + }
>> + }
>> +
>> + ret = xpwrite(fd, iocb->buf, len, offset);
>> + if (ret != len) {
>> + sd_err("failed to write object. %m");
>> + ret = err_to_sderr(path, oid, errno);
>> + goto out;
>> + }
>> +
>> + ret = rename(tmp_path, path);
>> + if (ret < 0) {
>> + sd_err("failed to rename %s to %s: %m", tmp_path, path);
>> + ret = err_to_sderr(path, oid, errno);
>> + goto out;
>> + }
>> +
>> + close(fd);
>> +
>> + if (uatomic_is_true(&sys->use_journal) || sys->nosync == true) {
>> + objlist_cache_insert(oid);
>> + return SD_RES_SUCCESS;
>> + }
>> +
>> + pstrcpy(tmp_path, sizeof(tmp_path), path);
>> + dir = dirname(tmp_path);
>> + fd = open(dir, O_DIRECTORY | O_RDONLY);
>> + if (fd < 0) {
>> + sd_err("failed to open directory %s: %m", dir);
>> + return err_to_sderr(path, oid, errno);
>> + }
>> +
>> + if (fsync(fd) != 0) {
>> + sd_err("failed to write directory %s: %m", dir);
>> + ret = err_to_sderr(path, oid, errno);
>> + close(fd);
>> + if (unlink(path) != 0)
>> + sd_err("failed to unlink %s: %m", path);
>> + return ret;
>> + }
>> + close(fd);
>> + objlist_cache_insert(oid);
>> + return SD_RES_SUCCESS;
>> +
>> +out:
>> + if (unlink(tmp_path) != 0)
>> + sd_err("failed to unlink %s: %m", tmp_path);
>> + close(fd);
>> + return ret;
>> +}
>> +
>> +int tree_link(uint64_t oid, uint32_t tgt_epoch)
>> +{
>> + char path[PATH_MAX], stale_path[PATH_MAX], tree_path[PATH_MAX];
>> +
>> + if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
>> + snprintf(tree_path, PATH_MAX, "%s/meta",
>> + md_get_object_dir(oid));
>> + } else {
>> + snprintf(tree_path, PATH_MAX, "%s/%02x",
>> + md_get_object_dir(oid), get_tree(oid));
>> + }
>> +
>> + sd_debug("try link %"PRIx64" from snapshot with epoch %d", oid,
>> + tgt_epoch);
>> +
>> + snprintf(path, PATH_MAX, "%s/%016"PRIx64, tree_path, oid);
>> + get_store_stale_path(oid, tgt_epoch, 0, stale_path);
>> +
>> + if (link(stale_path, path) < 0) {
>> + /*
>> + * Recovery thread and main thread might try to recover the
>> + * same object and we might get EEXIST in such case.
>> + */
>> + if (errno == EEXIST)
>> + goto out;
>> +
>> + sd_debug("failed to link from %s to %s, %m", stale_path, path);
>> + return err_to_sderr(path, oid, errno);
>> + }
>> +out:
>> + return SD_RES_SUCCESS;
>> +}
>> +
>> +/*
>> + * For replicated object, if any of the replica belongs to this node, we
>> + * consider it not stale.
>> + *
>> + * For erasure coded object, since every copy is unique and if it migrates to
>> + * other node(index gets changed even it has some other copy belongs to it)
>> + * because of hash ring changes, we consider it stale.
>> + */
>> +static bool oid_stale(uint64_t oid, int ec_index, struct vnode_info *vinfo)
>> +{
>> + uint32_t i, nr_copies;
>> + const struct sd_vnode *v;
>> + bool ret = true;
>> + const struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
>> +
>> + nr_copies = get_obj_copy_number(oid, vinfo->nr_zones);
>> + oid_to_vnodes(oid, &vinfo->vroot, nr_copies, obj_vnodes);
>> + for (i = 0; i < nr_copies; i++) {
>> + v = obj_vnodes[i];
>> + if (vnode_is_local(v)) {
>> + if (ec_index < SD_MAX_COPIES) {
>> + if (i == ec_index)
>> + ret = false;
>> + } else {
>> + ret = false;
>> + }
>> + break;
>> + }
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +static int move_object_to_stale_dir(uint64_t oid, const char *wd,
>> + uint32_t epoch, uint8_t ec_index,
>> + struct vnode_info *vinfo, void *arg)
>> +{
>> + char path[PATH_MAX], stale_path[PATH_MAX], tree_path[PATH_MAX];
>> + uint32_t tgt_epoch = *(uint32_t *)arg;
>> +
>> + if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
>> + snprintf(tree_path, PATH_MAX, "%s/meta",
>> + md_get_object_dir(oid));
>> + } else {
>> + snprintf(tree_path, PATH_MAX, "%s/%02x",
>> + md_get_object_dir(oid), get_tree(oid));
>> + }
>> +
>> + /* ec_index from md.c is reliable so we can directly use it */
>> + if (ec_index < SD_MAX_COPIES) {
>> + snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d",
>> + tree_path, oid, ec_index);
>> + snprintf(stale_path, PATH_MAX,
>> + "%s/.stale/%016"PRIx64"_%d.%"PRIu32,
>> + md_get_object_dir(oid), oid, ec_index, tgt_epoch);
>> + } else {
>> + snprintf(path, PATH_MAX, "%s/%016" PRIx64,
>> + tree_path, oid);
>> + snprintf(stale_path, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32,
>> + md_get_object_dir(oid), oid, tgt_epoch);
>> + }
>> +
>> + if (unlikely(rename(path, stale_path)) < 0) {
>> + sd_err("failed to move stale object %" PRIX64 " to %s, %m", oid,
>> + path);
>> + return SD_RES_EIO;
>> + }
>> + sd_debug("moved object %"PRIx64, oid);
>> + return SD_RES_SUCCESS;
>> +}
>> +
>> +static int check_stale_objects(uint64_t oid, const char *wd, uint32_t epoch,
>> + uint8_t ec_index, struct vnode_info *vinfo,
>> + void *arg)
>> +{
>> + if (oid_stale(oid, ec_index, vinfo))
>> + return move_object_to_stale_dir(oid, wd, 0, ec_index,
>> + NULL, arg);
>> +
>> + return SD_RES_SUCCESS;
>> +}
>> +
>> +int tree_update_epoch(uint32_t epoch)
>> +{
>> + assert(epoch);
>> + return for_each_object_in_wd(check_stale_objects, false, &epoch);
>> +}
>> +
>> +int tree_format(void)
>> +{
>> + unsigned ret;
>> +
>> + sd_debug("try get a clean store");
>> + ret = for_each_obj_path(purge_dir);
>> + if (ret != SD_RES_SUCCESS)
>> + return ret;
>> +
>> + if (sys->enable_object_cache)
>> + object_cache_format();
>> +
>> + return SD_RES_SUCCESS;
>> +}
>> +
>> +int tree_remove_object(uint64_t oid, uint8_t ec_index)
>> +{
>> + char path[PATH_MAX];
>> +
>> + if (uatomic_is_true(&sys->use_journal))
>> + journal_remove_object(oid);
>> +
>> + get_store_path(oid, ec_index, path);
>> +
>> + if (unlink(path) < 0) {
>> + if (errno == ENOENT)
>> + return SD_RES_NO_OBJ;
>> +
>> + sd_err("failed, %s, %m", path);
>> + return SD_RES_EIO;
>> + }
>> +
>> + return SD_RES_SUCCESS;
>> +}
>> +
>> +#define SHA1NAME "user.obj.sha1"
>> +
>> +static int get_object_sha1(const char *path, uint8_t *sha1)
>> +{
>> + if (getxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE)
>> + != SHA1_DIGEST_SIZE) {
>> + if (errno == ENODATA)
>> + sd_debug("sha1 is not cached yet, %s", path);
>> + else
>> + sd_err("fail to get xattr, %s", path);
>> + return -1;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int set_object_sha1(const char *path, const uint8_t *sha1)
>> +{
>> + int ret;
>> +
>> + ret = setxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE, 0);
>> + if (ret < 0)
>> + sd_err("fail to set sha1, %s", path);
>> +
>> + return ret;
>> +}
>> +
>> +static int get_object_path(uint64_t oid, uint32_t epoch, char *path,
>> + size_t size)
>> +{
>> + char tree_path[PATH_MAX];
>> +
>> + if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
>> + snprintf(tree_path, PATH_MAX, "%s/meta",
>> + md_get_object_dir(oid));
>> + } else {
>> + snprintf(tree_path, PATH_MAX, "%s/%02x",
>> + md_get_object_dir(oid), get_tree(oid));
>> + }
>> +
>> + if (tree_exist(oid, 0)) {
>> + snprintf(path, PATH_MAX, "%s/%016"PRIx64,
>> + tree_path, oid);
>> + } else {
>> + get_store_stale_path(oid, epoch, 0, path);
>> + if (access(path, F_OK) < 0) {
>> + if (errno == ENOENT)
>> + return SD_RES_NO_OBJ;
>> + return SD_RES_EIO;
>> + }
>> +
>> + }
>> +
>> + return SD_RES_SUCCESS;
>> +}
>> +
>> +int tree_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1)
>> +{
>> + int ret;
>> + void *buf;
>> + struct siocb iocb = {};
>> + uint32_t length;
>> + bool is_readonly_obj = oid_is_readonly(oid);
>> + char path[PATH_MAX];
>> +
>> + ret = get_object_path(oid, epoch, path, sizeof(path));
>> + if (ret != SD_RES_SUCCESS)
>> + return ret;
>> +
>> + if (is_readonly_obj) {
>> + if (get_object_sha1(path, sha1) == 0) {
>> + sd_debug("use cached sha1 digest %s",
>> + sha1_to_hex(sha1));
>> + return SD_RES_SUCCESS;
>> + }
>> + }
>> +
>> + length = get_store_objsize(oid);
>> + buf = valloc(length);
>> + if (buf == NULL)
>> + return SD_RES_NO_MEM;
>> +
>> + iocb.epoch = epoch;
>> + iocb.buf = buf;
>> + iocb.length = length;
>> +
>> + ret = tree_read_from_path(oid, path, &iocb);
>> + if (ret != SD_RES_SUCCESS) {
>> + free(buf);
>> + return ret;
>> + }
>> +
>> + get_buffer_sha1(buf, length, sha1);
>> + free(buf);
>> +
>> + sd_debug("the message digest of %"PRIx64" at epoch %d is %s", oid,
>> + epoch, sha1_to_hex(sha1));
>> +
>> + if (is_readonly_obj)
>> + set_object_sha1(path, sha1);
>> +
>> + return ret;
>> +}
>> +
>> +int tree_purge_obj(void)
>> +{
>> + uint32_t tgt_epoch = get_latest_epoch();
>> +
>> + return for_each_object_in_wd(move_object_to_stale_dir, true,
>> + &tgt_epoch);
>> +}
>> +
>> +static struct store_driver tree_store = {
>> + .id = TREE_STORE,
>> + .name = "tree",
>> + .init = tree_init,
>> + .exist = tree_exist,
>> + .create_and_write = tree_create_and_write,
>> + .write = tree_write,
>> + .read = tree_read,
>> + .link = tree_link,
>> + .update_epoch = tree_update_epoch,
>> + .cleanup = tree_cleanup,
>> + .format = tree_format,
>> + .remove_object = tree_remove_object,
>> + .get_hash = tree_get_hash,
>> + .purge_obj = tree_purge_obj,
>> +};
>> +
>> +add_store_driver(tree_store);
>
> It seems that tree.c and plain.c still share a great portion of lines. It is
> okay to keep them for fast development, but I hope later we can drag as many
> lines as possible to common.c.
>
> Thanks,
> Yuan
>
Thank you for your advice.
I'll make a patch reflecting your comments.
Thanks, Saeki.
More information about the sheepdog
mailing list