[sheepdog] [PATCH 1/2 v2] sheep : add new storage type "tree"

Liu Yuan namei.unix at gmail.com
Wed Mar 18 02:44:29 CET 2015


On Tue, Mar 17, 2015 at 06:03:26PM +0900, Saeki Masaki wrote:
> Current sheepdog stores whole objects in single directory like "/var/lib/sheepdog/obj"
> This mechanism is difficult to handle massive files when increasing cluster volume.
> 
> In particular, inode object having special informations about VDI,
> so it is preferable to divide.
> 
> new storage type named "tree"
> It separates the inode object and data object.
> 
> How to use ,
> specify the --store option at the time format
> 
> dog cluster format --store tree
> 
> v2: refactor using common functions for store driver
>     use check_store_type to identify tree store_driver
> 
> Signed-off-by: Masaki Saeki <saeki.masaki at po.ntts.co.jp>
> ---
>  sheep/Makefile.am         |    2 +-
>  sheep/sheep_priv.h        |   21 ++
>  sheep/store/common.c      |    5 +
>  sheep/store/md.c          |   14 +
>  sheep/store/plain_store.c |    1 +
>  sheep/store/tree_store.c  |  757 +++++++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 799 insertions(+), 1 deletions(-)
>  create mode 100644 sheep/store/tree_store.c
> 
> diff --git a/sheep/Makefile.am b/sheep/Makefile.am
> index 3ddd761..9dedb03 100644
> --- a/sheep/Makefile.am
> +++ b/sheep/Makefile.am
> @@ -28,7 +28,7 @@ sheep_SOURCES		= sheep.c group.c request.c gateway.c vdi.c \
>  			  journal.c ops.c recovery.c cluster/local.c \
>  			  object_cache.c object_list_cache.c \
>  			  store/common.c store/md.c \
> -			  store/plain_store.c \
> +			  store/plain_store.c store/tree_store.c \
>  			  config.c migrate.c
>  
>  if BUILD_HTTP
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index e58901f..51e686f 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -84,6 +84,11 @@ enum REQUST_STATUS {
>  	REQUEST_DROPPED
>  };
>  
> +enum store_id {
> +	PLAIN_STORE,
> +	TREE_STORE
> +};
> +
>  struct request_iocb {
>  	uint32_t count;
>  	int efd;
> @@ -235,6 +240,7 @@ struct vdi_info {
>  
>  struct store_driver {
>  	struct list_node list;
> +	enum store_id id;
>  	const char *name;
>  	int (*init)(void);
>  	bool (*exist)(uint64_t oid, uint8_t ec_index);
> @@ -269,6 +275,20 @@ int default_format(void);
>  int default_remove_object(uint64_t oid, uint8_t ec_index);
>  int default_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1);
>  int default_purge_obj(void);
> +
> +int tree_init(void);
> +bool tree_exist(uint64_t oid, uint8_t ec_index);
> +int tree_create_and_write(uint64_t oid, const struct siocb *iocb);
> +int tree_write(uint64_t oid, const struct siocb *iocb);
> +int tree_read(uint64_t oid, const struct siocb *iocb);
> +int tree_link(uint64_t oid, uint32_t tgt_epoch);
> +int tree_update_epoch(uint32_t epoch);
> +int tree_cleanup(void);
> +int tree_format(void);
> +int tree_remove_object(uint64_t oid, uint8_t ec_index);
> +int tree_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1);
> +int tree_purge_obj(void);
> +
>  int for_each_object_in_wd(int (*func)(uint64_t, const char *, uint32_t,
>  				      uint8_t, struct vnode_info *, void *),
>  			  bool, void *);
> @@ -404,6 +424,7 @@ void queue_cluster_request(struct request *req);
>  int prepare_iocb(uint64_t oid, const struct siocb *iocb, bool create);
>  int err_to_sderr(const char *path, uint64_t oid, int err);
>  int discard(int fd, uint64_t start, uint32_t end);
> +bool check_store_type(enum store_id id);

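It would be better to rename check_store_type() to store_id_match(enum store_id),
since the function simply reports whether the current store driver's id matches
the given one.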

>  
>  int update_epoch_log(uint32_t epoch, struct sd_node *nodes, size_t nr_nodes);
>  int inc_and_log_epoch(void);
> diff --git a/sheep/store/common.c b/sheep/store/common.c
> index 8959392..aa2858d 100644
> --- a/sheep/store/common.c
> +++ b/sheep/store/common.c
> @@ -102,6 +102,11 @@ int discard(int fd, uint64_t start, uint32_t end)
>  	return ret;
>  }
>  
> +bool check_store_type(enum store_id id)
> +{
> +	return (sd_store->id == id);
> +}
> +
>  int update_epoch_log(uint32_t epoch, struct sd_node *nodes, size_t nr_nodes)
>  {
>  	int ret, len, nodes_len;
> diff --git a/sheep/store/md.c b/sheep/store/md.c
> index 87ab759..ed95c98 100644
> --- a/sheep/store/md.c
> +++ b/sheep/store/md.c
> @@ -212,6 +212,20 @@ static int for_each_object_in_path(const char *path,
>  		if (unlikely(!strncmp(d->d_name, ".", 1)))
>  			continue;

I think the for_each_object_in_path() family of functions could be moved to
common.c.

> +		/* recursive call for tree store driver sub directories*/
> +		if (check_store_type(TREE_STORE)) {
> +			struct stat s;
> +
> +			snprintf(file_name, sizeof(file_name),
> +				 "%s/%s", path, d->d_name);
> +			stat(file_name, &s);
> +			if (S_ISDIR(s.st_mode)) {
> +				ret = for_each_object_in_path(file_name,
> +					func, cleanup, vinfo, arg);
> +				continue;
> +			}
> +		}
> +
>  		sd_debug("%s, %s", path, d->d_name);
>  		oid = strtoull(d->d_name, NULL, 16);
>  		if (oid == 0 || oid == ULLONG_MAX)
> diff --git a/sheep/store/plain_store.c b/sheep/store/plain_store.c
> index 0239684..9787293 100644
> --- a/sheep/store/plain_store.c
> +++ b/sheep/store/plain_store.c
> @@ -658,6 +658,7 @@ int default_purge_obj(void)
>  }
>  
>  static struct store_driver plain_store = {
> +	.id = PLAIN_STORE,
>  	.name = "plain",
>  	.init = default_init,
>  	.exist = default_exist,
> diff --git a/sheep/store/tree_store.c b/sheep/store/tree_store.c
> new file mode 100644
> index 0000000..441fdf3
> --- /dev/null
> +++ b/sheep/store/tree_store.c
> @@ -0,0 +1,757 @@
> +/*
> + * Copyright (C) 2012,2015 Nippon Telegraph and Telephone Corporation.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License version
> + * 2 as published by the Free Software Foundation.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <libgen.h>
> +
> +#include "sheep_priv.h"
> +
> +static inline int get_tree(uint64_t oid)
> +{
> +	return (int)((oid << 24) >> 56);
> +}

What is get_tree()? The name is ambiguous, and the constants 24 and 56 are
magic numbers. The traditional approach is to replace them with macros.

> +
> +static int get_store_path(uint64_t oid, uint8_t ec_index, char *path)
> +{
> +	char tree_path[PATH_MAX];
> +
> +	if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
> +		snprintf(tree_path, PATH_MAX, "%s/meta",
> +			 md_get_object_dir(oid));
> +	} else {
> +		snprintf(tree_path, PATH_MAX, "%s/%02x",
> +			 md_get_object_dir(oid), get_tree(oid));
> +	}
> +
> +	if (is_erasure_oid(oid)) {
> +		if (unlikely(ec_index >= SD_MAX_COPIES))
> +			panic("invalid ec_index %d", ec_index);
> +		return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d",
> +				tree_path, oid, ec_index);
> +	}
> +
> +	return snprintf(path, PATH_MAX, "%s/%016" PRIx64, tree_path, oid);
> +}
> +
> +static int get_store_tmp_path(uint64_t oid, uint8_t ec_index, char *path)
> +{
> +	char tree_path[PATH_MAX];
> +
> +	if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
> +		snprintf(tree_path, PATH_MAX, "%s/meta",
> +			 md_get_object_dir(oid));
> +	} else {
> +		snprintf(tree_path, PATH_MAX, "%s/%02x",
> +			 md_get_object_dir(oid), get_tree(oid));
> +	}
> +
> +	if (is_erasure_oid(oid)) {
> +		if (unlikely(ec_index >= SD_MAX_COPIES))
> +			panic("invalid ec_index %d", ec_index);
> +		return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d.tmp",
> +				tree_path, oid, ec_index);
> +	}
> +
> +	return snprintf(path, PATH_MAX, "%s/%016" PRIx64".tmp",
> +			tree_path, oid);
> +}
> +
> +static int get_store_stale_path(uint64_t oid, uint32_t epoch, uint8_t ec_index,
> +				char *path)
> +{
> +	return md_get_stale_path(oid, epoch, ec_index, path);
> +}
> +
> +/*
> + * Check if oid is in this nodes (if oid is in the wrong place, it will be moved
> + * to the correct one after this call in a MD setup.
> + */
> +bool tree_exist(uint64_t oid, uint8_t ec_index)
> +{
> +	char path[PATH_MAX];
> +
> +	get_store_path(oid, ec_index, path);
> +
> +	return md_exist(oid, ec_index, path);
> +}
> +
> +/* Trim zero blocks of the beginning and end of the object. */
> +static int tree_trim(int fd, uint64_t oid, const struct siocb *iocb,
> +			uint64_t *poffset, uint32_t *plen)
> +{
> +	trim_zero_blocks(iocb->buf, poffset, plen);
> +
> +	if (iocb->offset < *poffset) {
> +		sd_debug("discard between %d, %ld, %" PRIx64, iocb->offset,
> +			 *poffset, oid);
> +
> +		if (discard(fd, iocb->offset, *poffset) < 0)
> +			return -1;
> +	}
> +
> +	if (*poffset + *plen < iocb->offset + iocb->length) {
> +		uint64_t end = iocb->offset + iocb->length;
> +		uint32_t object_size = get_vdi_object_size(oid_to_vid(oid));
> +		if (end == get_objsize(oid, object_size))
> +			/* This is necessary to punch the last block */
> +			end = round_up(end, BLOCK_SIZE);
> +		sd_debug("discard between %ld, %ld, %" PRIx64, *poffset + *plen,
> +			 end, oid);
> +
> +		if (discard(fd, *poffset + *plen, end) < 0)
> +			return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +int tree_write(uint64_t oid, const struct siocb *iocb)
> +{
> +	int flags = prepare_iocb(oid, iocb, false), fd,
> +	    ret = SD_RES_SUCCESS;
> +	char path[PATH_MAX];
> +	ssize_t size;
> +	uint32_t len = iocb->length;
> +	uint64_t offset = iocb->offset;
> +	static bool trim_is_supported = true;
> +
> +	if (iocb->epoch < sys_epoch()) {
> +		sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch());
> +		return SD_RES_OLD_NODE_VER;
> +	}
> +
> +	if (uatomic_is_true(&sys->use_journal) &&
> +	    unlikely(journal_write_store(oid, iocb->buf, iocb->length,
> +					 iocb->offset, false))
> +	    != SD_RES_SUCCESS) {
> +		sd_err("turn off journaling");
> +		uatomic_set_false(&sys->use_journal);
> +		flags |= O_DSYNC;
> +		sync();
> +	}
> +
> +	get_store_path(oid, iocb->ec_index, path);
> +
> +	/*
> +	 * Make sure oid is in the right place because oid might be misplaced
> +	 * in a wrong place, due to 'shutdown/restart with less/more disks' or
> +	 * any bugs. We need call err_to_sderr() to return EIO if disk is broken
> +	 */
> +	if (!tree_exist(oid, iocb->ec_index))
> +		return err_to_sderr(path, oid, ENOENT);
> +
> +	fd = open(path, flags, sd_def_fmode);
> +	if (unlikely(fd < 0))
> +		return err_to_sderr(path, oid, errno);
> +
> +	if (trim_is_supported && is_sparse_object(oid)) {
> +		if (tree_trim(fd, oid, iocb, &offset, &len) < 0) {
> +			trim_is_supported = false;
> +			offset = iocb->offset;
> +			len = iocb->length;
> +		}
> +	}
> +
> +	size = xpwrite(fd, iocb->buf, len, offset);
> +	if (unlikely(size != len)) {
> +		sd_err("failed to write object %"PRIx64", path=%s, offset=%"
> +		       PRId32", size=%"PRId32", result=%zd, %m", oid, path,
> +		       iocb->offset, iocb->length, size);
> +		ret = err_to_sderr(path, oid, errno);
> +		goto out;
> +	}
> +out:
> +	close(fd);
> +	return ret;
> +}
> +
> +static int make_tree_dir(const char *path)
> +{
> +	int i;
> +	char p[PATH_MAX];
> +
> +	snprintf(p, PATH_MAX, "%s/meta", path);
> +	if (xmkdir(p, sd_def_dmode) < 0) {
> +		sd_err("%s failed, %m", p);
> +		return SD_RES_EIO;
> +	}
> +
> +	for (i = 0 ; i < 256 ; i++) {
> +		snprintf(p, PATH_MAX, "%s/%02x", path, i);
> +		if (xmkdir(p, sd_def_dmode) < 0) {
> +			sd_err("%s failed, %m", p);
> +			return SD_RES_EIO;
> +		}
> +	}
> +
> +	return SD_RES_SUCCESS;
> +}
> +
> +static int make_stale_dir(const char *path)
> +{
> +	char p[PATH_MAX];
> +
> +	snprintf(p, PATH_MAX, "%s/.stale", path);
> +	if (xmkdir(p, sd_def_dmode) < 0) {
> +		sd_err("%s failed, %m", p);
> +		return SD_RES_EIO;
> +	}
> +
> +	return SD_RES_SUCCESS;
> +}
> +
> +static int purge_dir(const char *path)
> +{
> +	if (purge_directory(path) < 0)
> +		return SD_RES_EIO;
> +
> +	return SD_RES_SUCCESS;
> +}
> +
> +static int purge_stale_dir(const char *path)
> +{
> +	char p[PATH_MAX];
> +
> +	snprintf(p, PATH_MAX, "%s/.stale", path);
> +
> +	if (purge_directory_async(p) < 0)
> +		return SD_RES_EIO;
> +
> +	return SD_RES_SUCCESS;
> +}
> +
> +int tree_cleanup(void)
> +{
> +	int ret;
> +
> +	ret = for_each_obj_path(purge_stale_dir);
> +	if (ret != SD_RES_SUCCESS)
> +		return ret;
> +
> +	return SD_RES_SUCCESS;
> +}
> +
> +static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch)
> +{
> +	int ret;
> +	struct sd_inode *inode = xzalloc(SD_INODE_HEADER_SIZE);
> +	struct siocb iocb = {
> +		.epoch = epoch,
> +		.buf = inode,
> +		.length = SD_INODE_HEADER_SIZE,
> +	};
> +
> +	ret = tree_read(oid, &iocb);
> +	if (ret != SD_RES_SUCCESS) {
> +		sd_err("failed to read inode header %" PRIx64 " %" PRId32
> +		       "wat %s", oid, epoch, wd);
> +		goto out;
> +	}
> +	add_vdi_state_unordered(oid_to_vid(oid), inode->nr_copies,
> +		      vdi_is_snapshot(inode), inode->copy_policy,
> +		      inode->block_size_shift, inode->parent_vdi_id);
> +
> +	if (inode->name[0] == '\0')
> +		atomic_set_bit(oid_to_vid(oid), sys->vdi_deleted);
> +
> +	atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse);
> +
> +	ret = SD_RES_SUCCESS;
> +out:
> +	free(inode);
> +	return ret;
> +}
> +
> +static int init_objlist_and_vdi_bitmap(uint64_t oid, const char *wd,
> +				       uint32_t epoch, uint8_t ec_index,
> +				       struct vnode_info *vinfo,
> +				       void *arg)
> +{
> +	int ret;
> +	char path[PATH_MAX];
> +	objlist_cache_insert(oid);
> +
> +	snprintf(path, PATH_MAX, "%s/meta", wd);
> +
> +	if (is_vdi_obj(oid)) {
> +		sd_debug("found the VDI object %" PRIx64" epoch %"PRIu32
> +			 " at %s", oid, epoch, path);
> +		ret = init_vdi_state(oid, path, epoch);
> +		if (ret != SD_RES_SUCCESS)
> +			return ret;
> +	}
> +	return SD_RES_SUCCESS;
> +}
> +
> +int tree_init(void)
> +{
> +	int ret;
> +
> +	sd_debug("use tree store driver");
> +	ret = for_each_obj_path(make_tree_dir);
> +	if (ret != SD_RES_SUCCESS)
> +		return ret;
> +
> +	ret = for_each_obj_path(make_stale_dir);
> +	if (ret != SD_RES_SUCCESS)
> +		return ret;
> +
> +
> +	for_each_object_in_stale(init_objlist_and_vdi_bitmap, NULL);
> +
> +	return for_each_object_in_wd(init_objlist_and_vdi_bitmap, true, NULL);
> +}
> +
> +static int tree_read_from_path(uint64_t oid, const char *path,
> +				  const struct siocb *iocb)
> +{
> +	int flags = prepare_iocb(oid, iocb, false), fd,
> +	    ret = SD_RES_SUCCESS;
> +	ssize_t size;
> +
> +	/*
> +	 * Make sure oid is in the right place because oid might be misplaced
> +	 * in a wrong place, due to 'shutdown/restart with less disks' or any
> +	 * bugs. We need call err_to_sderr() to return EIO if disk is broken.
> +	 *
> +	 * For stale path, get_store_stale_path already does tree_exist job.
> +	 */
> +	if (!is_stale_path(path) && !tree_exist(oid, iocb->ec_index))
> +		return err_to_sderr(path, oid, ENOENT);
> +
> +	fd = open(path, flags);
> +	if (fd < 0)
> +		return err_to_sderr(path, oid, errno);
> +
> +	size = xpread(fd, iocb->buf, iocb->length, iocb->offset);
> +	if (size < 0) {
> +		sd_err("failed to read object %"PRIx64", path=%s, offset=%"
> +		       PRId32", size=%"PRId32", result=%zd, %m", oid, path,
> +		       iocb->offset, iocb->length, size);
> +		ret = err_to_sderr(path, oid, errno);
> +	}
> +	close(fd);
> +	return ret;
> +}
> +
> +int tree_read(uint64_t oid, const struct siocb *iocb)
> +{
> +	int ret;
> +	char path[PATH_MAX];
> +
> +	get_store_path(oid, iocb->ec_index, path);
> +	ret = tree_read_from_path(oid, path, iocb);
> +
> +	/*
> +	 * If the request is against the older epoch, try to read from
> +	 * the stale directory
> +	 */
> +	if (ret == SD_RES_NO_OBJ && iocb->epoch > 0 &&
> +	    iocb->epoch < sys_epoch()) {
> +		get_store_stale_path(oid, iocb->epoch, iocb->ec_index, path);
> +		ret = tree_read_from_path(oid, path, iocb);
> +	}
> +
> +	return ret;
> +}
> +
> +int tree_create_and_write(uint64_t oid, const struct siocb *iocb)
> +{
> +	char path[PATH_MAX], tmp_path[PATH_MAX], *dir;
> +	int flags = prepare_iocb(oid, iocb, true);
> +	int ret, fd;
> +	uint32_t len = iocb->length;
> +	uint32_t object_size = 0;
> +	size_t obj_size;
> +	uint64_t offset = iocb->offset;
> +
> +	sd_debug("%"PRIx64, oid);
> +	get_store_path(oid, iocb->ec_index, path);
> +	get_store_tmp_path(oid, iocb->ec_index, tmp_path);
> +
> +	if (uatomic_is_true(&sys->use_journal) &&
> +	    journal_write_store(oid, iocb->buf, iocb->length,
> +				iocb->offset, true)
> +	    != SD_RES_SUCCESS) {
> +		sd_err("turn off journaling");
> +		uatomic_set_false(&sys->use_journal);
> +		flags |= O_SYNC;
> +		sync();
> +	}
> +
> +	fd = open(tmp_path, flags, sd_def_fmode);
> +	if (fd < 0) {
> +		if (errno == EEXIST) {
> +			/*
> +			 * This happens if node membership changes during object
> +			 * creation; while gateway retries a CREATE request,
> +			 * recovery process could also recover the object at the
> +			 * same time.  They should try to write the same date,
> +			 * so it is okay to simply return success here.
> +			 */
> +			sd_debug("%s exists", tmp_path);
> +			return SD_RES_SUCCESS;
> +		}
> +
> +		sd_err("failed to open %s: %m", tmp_path);
> +		return err_to_sderr(path, oid, errno);
> +	}
> +
> +	obj_size = get_store_objsize(oid);
> +
> +	trim_zero_blocks(iocb->buf, &offset, &len);
> +
> +	object_size = get_vdi_object_size(oid_to_vid(oid));
> +
> +	if (offset != 0 || len != get_objsize(oid, object_size)) {
> +		if (is_sparse_object(oid))
> +			ret = xftruncate(fd, obj_size);
> +		else
> +			ret = prealloc(fd, obj_size);
> +		if (ret < 0) {
> +			ret = err_to_sderr(path, oid, errno);
> +			goto out;
> +		}
> +	}
> +
> +	ret = xpwrite(fd, iocb->buf, len, offset);
> +	if (ret != len) {
> +		sd_err("failed to write object. %m");
> +		ret = err_to_sderr(path, oid, errno);
> +		goto out;
> +	}
> +
> +	ret = rename(tmp_path, path);
> +	if (ret < 0) {
> +		sd_err("failed to rename %s to %s: %m", tmp_path, path);
> +		ret = err_to_sderr(path, oid, errno);
> +		goto out;
> +	}
> +
> +	close(fd);
> +
> +	if (uatomic_is_true(&sys->use_journal) || sys->nosync == true) {
> +		objlist_cache_insert(oid);
> +		return SD_RES_SUCCESS;
> +	}
> +
> +	pstrcpy(tmp_path, sizeof(tmp_path), path);
> +	dir = dirname(tmp_path);
> +	fd = open(dir, O_DIRECTORY | O_RDONLY);
> +	if (fd < 0) {
> +		sd_err("failed to open directory %s: %m", dir);
> +		return err_to_sderr(path, oid, errno);
> +	}
> +
> +	if (fsync(fd) != 0) {
> +		sd_err("failed to write directory %s: %m", dir);
> +		ret = err_to_sderr(path, oid, errno);
> +		close(fd);
> +		if (unlink(path) != 0)
> +			sd_err("failed to unlink %s: %m", path);
> +		return ret;
> +	}
> +	close(fd);
> +	objlist_cache_insert(oid);
> +	return SD_RES_SUCCESS;
> +
> +out:
> +	if (unlink(tmp_path) != 0)
> +		sd_err("failed to unlink %s: %m", tmp_path);
> +	close(fd);
> +	return ret;
> +}
> +
> +int tree_link(uint64_t oid, uint32_t tgt_epoch)
> +{
> +	char path[PATH_MAX], stale_path[PATH_MAX], tree_path[PATH_MAX];
> +
> +	if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
> +		snprintf(tree_path, PATH_MAX, "%s/meta",
> +			 md_get_object_dir(oid));
> +	} else {
> +		snprintf(tree_path, PATH_MAX, "%s/%02x",
> +			 md_get_object_dir(oid), get_tree(oid));
> +	}
> +
> +	sd_debug("try link %"PRIx64" from snapshot with epoch %d", oid,
> +		 tgt_epoch);
> +
> +	snprintf(path, PATH_MAX, "%s/%016"PRIx64, tree_path, oid);
> +	get_store_stale_path(oid, tgt_epoch, 0, stale_path);
> +
> +	if (link(stale_path, path) < 0) {
> +		/*
> +		 * Recovery thread and main thread might try to recover the
> +		 * same object and we might get EEXIST in such case.
> +		 */
> +		if (errno == EEXIST)
> +			goto out;
> +
> +		sd_debug("failed to link from %s to %s, %m", stale_path, path);
> +		return err_to_sderr(path, oid, errno);
> +	}
> +out:
> +	return SD_RES_SUCCESS;
> +}
> +
> +/*
> + * For replicated object, if any of the replica belongs to this node, we
> + * consider it not stale.
> + *
> + * For erasure coded object, since every copy is unique and if it migrates to
> + * other node(index gets changed even it has some other copy belongs to it)
> + * because of hash ring changes, we consider it stale.
> + */
> +static bool oid_stale(uint64_t oid, int ec_index, struct vnode_info *vinfo)
> +{
> +	uint32_t i, nr_copies;
> +	const struct sd_vnode *v;
> +	bool ret = true;
> +	const struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
> +
> +	nr_copies = get_obj_copy_number(oid, vinfo->nr_zones);
> +	oid_to_vnodes(oid, &vinfo->vroot, nr_copies, obj_vnodes);
> +	for (i = 0; i < nr_copies; i++) {
> +		v = obj_vnodes[i];
> +		if (vnode_is_local(v)) {
> +			if (ec_index < SD_MAX_COPIES) {
> +				if (i == ec_index)
> +					ret = false;
> +			} else {
> +				ret = false;
> +			}
> +			break;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +static int move_object_to_stale_dir(uint64_t oid, const char *wd,
> +				    uint32_t epoch, uint8_t ec_index,
> +				    struct vnode_info *vinfo, void *arg)
> +{
> +	char path[PATH_MAX], stale_path[PATH_MAX], tree_path[PATH_MAX];
> +	uint32_t tgt_epoch = *(uint32_t *)arg;
> +
> +	if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
> +		snprintf(tree_path, PATH_MAX, "%s/meta",
> +			 md_get_object_dir(oid));
> +	} else {
> +		snprintf(tree_path, PATH_MAX, "%s/%02x",
> +			 md_get_object_dir(oid), get_tree(oid));
> +	}
> +
> +	/* ec_index from md.c is reliable so we can directly use it */
> +	if (ec_index < SD_MAX_COPIES) {
> +		snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d",
> +			 tree_path, oid, ec_index);
> +		snprintf(stale_path, PATH_MAX,
> +			 "%s/.stale/%016"PRIx64"_%d.%"PRIu32,
> +			 md_get_object_dir(oid), oid, ec_index, tgt_epoch);
> +	} else {
> +		snprintf(path, PATH_MAX, "%s/%016" PRIx64,
> +			 tree_path, oid);
> +		snprintf(stale_path, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32,
> +			 md_get_object_dir(oid), oid, tgt_epoch);
> +	}
> +
> +	if (unlikely(rename(path, stale_path)) < 0) {
> +		sd_err("failed to move stale object %" PRIX64 " to %s, %m", oid,
> +		       path);
> +		return SD_RES_EIO;
> +	}
> +	sd_debug("moved object %"PRIx64, oid);
> +	return SD_RES_SUCCESS;
> +}
> +
> +static int check_stale_objects(uint64_t oid, const char *wd, uint32_t epoch,
> +			       uint8_t ec_index, struct vnode_info *vinfo,
> +			       void *arg)
> +{
> +	if (oid_stale(oid, ec_index, vinfo))
> +		return move_object_to_stale_dir(oid, wd, 0, ec_index,
> +						NULL, arg);
> +
> +	return SD_RES_SUCCESS;
> +}
> +
> +int tree_update_epoch(uint32_t epoch)
> +{
> +	assert(epoch);
> +	return for_each_object_in_wd(check_stale_objects, false, &epoch);
> +}
> +
> +int tree_format(void)
> +{
> +	unsigned ret;
> +
> +	sd_debug("try get a clean store");
> +	ret = for_each_obj_path(purge_dir);
> +	if (ret != SD_RES_SUCCESS)
> +		return ret;
> +
> +	if (sys->enable_object_cache)
> +		object_cache_format();
> +
> +	return SD_RES_SUCCESS;
> +}
> +
> +int tree_remove_object(uint64_t oid, uint8_t ec_index)
> +{
> +	char path[PATH_MAX];
> +
> +	if (uatomic_is_true(&sys->use_journal))
> +		journal_remove_object(oid);
> +
> +	get_store_path(oid, ec_index, path);
> +
> +	if (unlink(path) < 0) {
> +		if (errno == ENOENT)
> +			return SD_RES_NO_OBJ;
> +
> +		sd_err("failed, %s, %m", path);
> +		return SD_RES_EIO;
> +	}
> +
> +	return SD_RES_SUCCESS;
> +}
> +
> +#define SHA1NAME "user.obj.sha1"
> +
> +static int get_object_sha1(const char *path, uint8_t *sha1)
> +{
> +	if (getxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE)
> +	    != SHA1_DIGEST_SIZE) {
> +		if (errno == ENODATA)
> +			sd_debug("sha1 is not cached yet, %s", path);
> +		else
> +			sd_err("fail to get xattr, %s", path);
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +static int set_object_sha1(const char *path, const uint8_t *sha1)
> +{
> +	int ret;
> +
> +	ret = setxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE, 0);
> +	if (ret < 0)
> +		sd_err("fail to set sha1, %s", path);
> +
> +	return ret;
> +}
> +
> +static int get_object_path(uint64_t oid, uint32_t epoch, char *path,
> +			   size_t size)
> +{
> +	char tree_path[PATH_MAX];
> +
> +	if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
> +		snprintf(tree_path, PATH_MAX, "%s/meta",
> +			 md_get_object_dir(oid));
> +	} else {
> +		snprintf(tree_path, PATH_MAX, "%s/%02x",
> +			 md_get_object_dir(oid), get_tree(oid));
> +	}
> +
> +	if (tree_exist(oid, 0)) {
> +		snprintf(path, PATH_MAX, "%s/%016"PRIx64,
> +			 tree_path, oid);
> +	} else {
> +		get_store_stale_path(oid, epoch, 0, path);
> +		if (access(path, F_OK) < 0) {
> +			if (errno == ENOENT)
> +				return SD_RES_NO_OBJ;
> +			return SD_RES_EIO;
> +		}
> +
> +	}
> +
> +	return SD_RES_SUCCESS;
> +}
> +
> +int tree_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1)
> +{
> +	int ret;
> +	void *buf;
> +	struct siocb iocb = {};
> +	uint32_t length;
> +	bool is_readonly_obj = oid_is_readonly(oid);
> +	char path[PATH_MAX];
> +
> +	ret = get_object_path(oid, epoch, path, sizeof(path));
> +	if (ret != SD_RES_SUCCESS)
> +		return ret;
> +
> +	if (is_readonly_obj) {
> +		if (get_object_sha1(path, sha1) == 0) {
> +			sd_debug("use cached sha1 digest %s",
> +				 sha1_to_hex(sha1));
> +			return SD_RES_SUCCESS;
> +		}
> +	}
> +
> +	length = get_store_objsize(oid);
> +	buf = valloc(length);
> +	if (buf == NULL)
> +		return SD_RES_NO_MEM;
> +
> +	iocb.epoch = epoch;
> +	iocb.buf = buf;
> +	iocb.length = length;
> +
> +	ret = tree_read_from_path(oid, path, &iocb);
> +	if (ret != SD_RES_SUCCESS) {
> +		free(buf);
> +		return ret;
> +	}
> +
> +	get_buffer_sha1(buf, length, sha1);
> +	free(buf);
> +
> +	sd_debug("the message digest of %"PRIx64" at epoch %d is %s", oid,
> +		 epoch, sha1_to_hex(sha1));
> +
> +	if (is_readonly_obj)
> +		set_object_sha1(path, sha1);
> +
> +	return ret;
> +}
> +
> +int tree_purge_obj(void)
> +{
> +	uint32_t tgt_epoch = get_latest_epoch();
> +
> +	return for_each_object_in_wd(move_object_to_stale_dir, true,
> +				     &tgt_epoch);
> +}
> +
> +static struct store_driver tree_store = {
> +	.id = TREE_STORE,
> +	.name = "tree",
> +	.init = tree_init,
> +	.exist = tree_exist,
> +	.create_and_write = tree_create_and_write,
> +	.write = tree_write,
> +	.read = tree_read,
> +	.link = tree_link,
> +	.update_epoch = tree_update_epoch,
> +	.cleanup = tree_cleanup,
> +	.format = tree_format,
> +	.remove_object = tree_remove_object,
> +	.get_hash = tree_get_hash,
> +	.purge_obj = tree_purge_obj,
> +};
> +
> +add_store_driver(tree_store);

It seems that tree_store.c and plain_store.c still share a large portion of
their code. It is okay to keep the duplication for now for the sake of fast
development, but I hope that later we can move as much of the shared code as
possible into common.c.

Thanks,
Yuan



More information about the sheepdog mailing list