[sheepdog] [PATCH 1/2 v2] sheep : add new storage type "tree"

Saeki Masaki saeki.masaki at po.ntts.co.jp
Wed Mar 18 03:50:09 CET 2015


On 2015/03/18 10:44, Liu Yuan wrote:
> On Tue, Mar 17, 2015 at 06:03:26PM +0900, Saeki Masaki wrote:
>> Currently, sheepdog stores all objects in a single directory such as "/var/lib/sheepdog/obj".
>> This makes it difficult to handle a massive number of files as the cluster volume grows.
>>
>> In particular, inode objects hold special information about VDIs,
>> so it is preferable to store them separately.
>>
>> new storage type named "tree"
>> It separates the inode object and data object.
>>
>> How to use:
>> specify the --store option at format time:
>>
>> dog cluster format --store tree
>>
>> v2: refactor using common functions for store driver
>>      use check_store_type to identify tree store_driver
>>
>> Signed-off-by: Masaki Saeki <saeki.masaki at po.ntts.co.jp>
>> ---
>>   sheep/Makefile.am         |    2 +-
>>   sheep/sheep_priv.h        |   21 ++
>>   sheep/store/common.c      |    5 +
>>   sheep/store/md.c          |   14 +
>>   sheep/store/plain_store.c |    1 +
>>   sheep/store/tree_store.c  |  757 +++++++++++++++++++++++++++++++++++++++++++++
>>   6 files changed, 799 insertions(+), 1 deletions(-)
>>   create mode 100644 sheep/store/tree_store.c
>>
>> diff --git a/sheep/Makefile.am b/sheep/Makefile.am
>> index 3ddd761..9dedb03 100644
>> --- a/sheep/Makefile.am
>> +++ b/sheep/Makefile.am
>> @@ -28,7 +28,7 @@ sheep_SOURCES		= sheep.c group.c request.c gateway.c vdi.c \
>>   			  journal.c ops.c recovery.c cluster/local.c \
>>   			  object_cache.c object_list_cache.c \
>>   			  store/common.c store/md.c \
>> -			  store/plain_store.c \
>> +			  store/plain_store.c store/tree_store.c \
>>   			  config.c migrate.c
>>
>>   if BUILD_HTTP
>> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
>> index e58901f..51e686f 100644
>> --- a/sheep/sheep_priv.h
>> +++ b/sheep/sheep_priv.h
>> @@ -84,6 +84,11 @@ enum REQUST_STATUS {
>>   	REQUEST_DROPPED
>>   };
>>
>> +enum store_id {
>> +	PLAIN_STORE,
>> +	TREE_STORE
>> +};
>> +
>>   struct request_iocb {
>>   	uint32_t count;
>>   	int efd;
>> @@ -235,6 +240,7 @@ struct vdi_info {
>>
>>   struct store_driver {
>>   	struct list_node list;
>> +	enum store_id id;
>>   	const char *name;
>>   	int (*init)(void);
>>   	bool (*exist)(uint64_t oid, uint8_t ec_index);
>> @@ -269,6 +275,20 @@ int default_format(void);
>>   int default_remove_object(uint64_t oid, uint8_t ec_index);
>>   int default_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1);
>>   int default_purge_obj(void);
>> +
>> +int tree_init(void);
>> +bool tree_exist(uint64_t oid, uint8_t ec_index);
>> +int tree_create_and_write(uint64_t oid, const struct siocb *iocb);
>> +int tree_write(uint64_t oid, const struct siocb *iocb);
>> +int tree_read(uint64_t oid, const struct siocb *iocb);
>> +int tree_link(uint64_t oid, uint32_t tgt_epoch);
>> +int tree_update_epoch(uint32_t epoch);
>> +int tree_cleanup(void);
>> +int tree_format(void);
>> +int tree_remove_object(uint64_t oid, uint8_t ec_index);
>> +int tree_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1);
>> +int tree_purge_obj(void);
>> +
>>   int for_each_object_in_wd(int (*func)(uint64_t, const char *, uint32_t,
>>   				      uint8_t, struct vnode_info *, void *),
>>   			  bool, void *);
>> @@ -404,6 +424,7 @@ void queue_cluster_request(struct request *req);
>>   int prepare_iocb(uint64_t oid, const struct siocb *iocb, bool create);
>>   int err_to_sderr(const char *path, uint64_t oid, int err);
>>   int discard(int fd, uint64_t start, uint32_t end);
>> +bool check_store_type(enum store_id id);
>
> Better rename check_store_type as store_id_match(enum store_id)
>
>>
>>   int update_epoch_log(uint32_t epoch, struct sd_node *nodes, size_t nr_nodes);
>>   int inc_and_log_epoch(void);
>> diff --git a/sheep/store/common.c b/sheep/store/common.c
>> index 8959392..aa2858d 100644
>> --- a/sheep/store/common.c
>> +++ b/sheep/store/common.c
>> @@ -102,6 +102,11 @@ int discard(int fd, uint64_t start, uint32_t end)
>>   	return ret;
>>   }
>>
>> +bool check_store_type(enum store_id id)
>> +{
>> +	return (sd_store->id == id);
>> +}
>> +
>>   int update_epoch_log(uint32_t epoch, struct sd_node *nodes, size_t nr_nodes)
>>   {
>>   	int ret, len, nodes_len;
>> diff --git a/sheep/store/md.c b/sheep/store/md.c
>> index 87ab759..ed95c98 100644
>> --- a/sheep/store/md.c
>> +++ b/sheep/store/md.c
>> @@ -212,6 +212,20 @@ static int for_each_object_in_path(const char *path,
>>   		if (unlikely(!strncmp(d->d_name, ".", 1)))
>>   			continue;
>
> I think for_each_object_in_path family might be moved to common.c
>
>> +		/* recursive call for tree store driver sub directories*/
>> +		if (check_store_type(TREE_STORE)) {
>> +			struct stat s;
>> +
>> +			snprintf(file_name, sizeof(file_name),
>> +				 "%s/%s", path, d->d_name);
>> +			stat(file_name, &s);
>> +			if (S_ISDIR(s.st_mode)) {
>> +				ret = for_each_object_in_path(file_name,
>> +					func, cleanup, vinfo, arg);
>> +				continue;
>> +			}
>> +		}
>> +
>>   		sd_debug("%s, %s", path, d->d_name);
>>   		oid = strtoull(d->d_name, NULL, 16);
>>   		if (oid == 0 || oid == ULLONG_MAX)
>> diff --git a/sheep/store/plain_store.c b/sheep/store/plain_store.c
>> index 0239684..9787293 100644
>> --- a/sheep/store/plain_store.c
>> +++ b/sheep/store/plain_store.c
>> @@ -658,6 +658,7 @@ int default_purge_obj(void)
>>   }
>>
>>   static struct store_driver plain_store = {
>> +	.id = PLAIN_STORE,
>>   	.name = "plain",
>>   	.init = default_init,
>>   	.exist = default_exist,
>> diff --git a/sheep/store/tree_store.c b/sheep/store/tree_store.c
>> new file mode 100644
>> index 0000000..441fdf3
>> --- /dev/null
>> +++ b/sheep/store/tree_store.c
>> @@ -0,0 +1,757 @@
>> +/*
>> + * Copyright (C) 2012,2015 Nippon Telegraph and Telephone Corporation.
>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU General Public License version
>> + * 2 as published by the Free Software Foundation.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include <libgen.h>
>> +
>> +#include "sheep_priv.h"
>> +
>> +static inline int get_tree(uint64_t oid)
>> +{
>> +	return (int)((oid << 24) >> 56);
>> +}
>
> What is get_tree()? The naming is ambiguous, and 24 and 56 are magic numbers.
> Replacing them with macros is the traditional approach.
>
>> +
>> +static int get_store_path(uint64_t oid, uint8_t ec_index, char *path)
>> +{
>> +	char tree_path[PATH_MAX];
>> +
>> +	if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
>> +		snprintf(tree_path, PATH_MAX, "%s/meta",
>> +			 md_get_object_dir(oid));
>> +	} else {
>> +		snprintf(tree_path, PATH_MAX, "%s/%02x",
>> +			 md_get_object_dir(oid), get_tree(oid));
>> +	}
>> +
>> +	if (is_erasure_oid(oid)) {
>> +		if (unlikely(ec_index >= SD_MAX_COPIES))
>> +			panic("invalid ec_index %d", ec_index);
>> +		return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d",
>> +				tree_path, oid, ec_index);
>> +	}
>> +
>> +	return snprintf(path, PATH_MAX, "%s/%016" PRIx64, tree_path, oid);
>> +}
>> +
>> +static int get_store_tmp_path(uint64_t oid, uint8_t ec_index, char *path)
>> +{
>> +	char tree_path[PATH_MAX];
>> +
>> +	if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
>> +		snprintf(tree_path, PATH_MAX, "%s/meta",
>> +			 md_get_object_dir(oid));
>> +	} else {
>> +		snprintf(tree_path, PATH_MAX, "%s/%02x",
>> +			 md_get_object_dir(oid), get_tree(oid));
>> +	}
>> +
>> +	if (is_erasure_oid(oid)) {
>> +		if (unlikely(ec_index >= SD_MAX_COPIES))
>> +			panic("invalid ec_index %d", ec_index);
>> +		return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d.tmp",
>> +				tree_path, oid, ec_index);
>> +	}
>> +
>> +	return snprintf(path, PATH_MAX, "%s/%016" PRIx64".tmp",
>> +			tree_path, oid);
>> +}
>> +
>> +static int get_store_stale_path(uint64_t oid, uint32_t epoch, uint8_t ec_index,
>> +				char *path)
>> +{
>> +	return md_get_stale_path(oid, epoch, ec_index, path);
>> +}
>> +
>> +/*
>> + * Check if oid is in this nodes (if oid is in the wrong place, it will be moved
>> + * to the correct one after this call in a MD setup.
>> + */
>> +bool tree_exist(uint64_t oid, uint8_t ec_index)
>> +{
>> +	char path[PATH_MAX];
>> +
>> +	get_store_path(oid, ec_index, path);
>> +
>> +	return md_exist(oid, ec_index, path);
>> +}
>> +
>> +/* Trim zero blocks of the beginning and end of the object. */
>> +static int tree_trim(int fd, uint64_t oid, const struct siocb *iocb,
>> +			uint64_t *poffset, uint32_t *plen)
>> +{
>> +	trim_zero_blocks(iocb->buf, poffset, plen);
>> +
>> +	if (iocb->offset < *poffset) {
>> +		sd_debug("discard between %d, %ld, %" PRIx64, iocb->offset,
>> +			 *poffset, oid);
>> +
>> +		if (discard(fd, iocb->offset, *poffset) < 0)
>> +			return -1;
>> +	}
>> +
>> +	if (*poffset + *plen < iocb->offset + iocb->length) {
>> +		uint64_t end = iocb->offset + iocb->length;
>> +		uint32_t object_size = get_vdi_object_size(oid_to_vid(oid));
>> +		if (end == get_objsize(oid, object_size))
>> +			/* This is necessary to punch the last block */
>> +			end = round_up(end, BLOCK_SIZE);
>> +		sd_debug("discard between %ld, %ld, %" PRIx64, *poffset + *plen,
>> +			 end, oid);
>> +
>> +		if (discard(fd, *poffset + *plen, end) < 0)
>> +			return -1;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +int tree_write(uint64_t oid, const struct siocb *iocb)
>> +{
>> +	int flags = prepare_iocb(oid, iocb, false), fd,
>> +	    ret = SD_RES_SUCCESS;
>> +	char path[PATH_MAX];
>> +	ssize_t size;
>> +	uint32_t len = iocb->length;
>> +	uint64_t offset = iocb->offset;
>> +	static bool trim_is_supported = true;
>> +
>> +	if (iocb->epoch < sys_epoch()) {
>> +		sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch());
>> +		return SD_RES_OLD_NODE_VER;
>> +	}
>> +
>> +	if (uatomic_is_true(&sys->use_journal) &&
>> +	    unlikely(journal_write_store(oid, iocb->buf, iocb->length,
>> +					 iocb->offset, false))
>> +	    != SD_RES_SUCCESS) {
>> +		sd_err("turn off journaling");
>> +		uatomic_set_false(&sys->use_journal);
>> +		flags |= O_DSYNC;
>> +		sync();
>> +	}
>> +
>> +	get_store_path(oid, iocb->ec_index, path);
>> +
>> +	/*
>> +	 * Make sure oid is in the right place because oid might be misplaced
>> +	 * in a wrong place, due to 'shutdown/restart with less/more disks' or
>> +	 * any bugs. We need call err_to_sderr() to return EIO if disk is broken
>> +	 */
>> +	if (!tree_exist(oid, iocb->ec_index))
>> +		return err_to_sderr(path, oid, ENOENT);
>> +
>> +	fd = open(path, flags, sd_def_fmode);
>> +	if (unlikely(fd < 0))
>> +		return err_to_sderr(path, oid, errno);
>> +
>> +	if (trim_is_supported && is_sparse_object(oid)) {
>> +		if (tree_trim(fd, oid, iocb, &offset, &len) < 0) {
>> +			trim_is_supported = false;
>> +			offset = iocb->offset;
>> +			len = iocb->length;
>> +		}
>> +	}
>> +
>> +	size = xpwrite(fd, iocb->buf, len, offset);
>> +	if (unlikely(size != len)) {
>> +		sd_err("failed to write object %"PRIx64", path=%s, offset=%"
>> +		       PRId32", size=%"PRId32", result=%zd, %m", oid, path,
>> +		       iocb->offset, iocb->length, size);
>> +		ret = err_to_sderr(path, oid, errno);
>> +		goto out;
>> +	}
>> +out:
>> +	close(fd);
>> +	return ret;
>> +}
>> +
>> +static int make_tree_dir(const char *path)
>> +{
>> +	int i;
>> +	char p[PATH_MAX];
>> +
>> +	snprintf(p, PATH_MAX, "%s/meta", path);
>> +	if (xmkdir(p, sd_def_dmode) < 0) {
>> +		sd_err("%s failed, %m", p);
>> +		return SD_RES_EIO;
>> +	}
>> +
>> +	for (i = 0 ; i < 256 ; i++) {
>> +		snprintf(p, PATH_MAX, "%s/%02x", path, i);
>> +		if (xmkdir(p, sd_def_dmode) < 0) {
>> +			sd_err("%s failed, %m", p);
>> +			return SD_RES_EIO;
>> +		}
>> +	}
>> +
>> +	return SD_RES_SUCCESS;
>> +}
>> +
>> +static int make_stale_dir(const char *path)
>> +{
>> +	char p[PATH_MAX];
>> +
>> +	snprintf(p, PATH_MAX, "%s/.stale", path);
>> +	if (xmkdir(p, sd_def_dmode) < 0) {
>> +		sd_err("%s failed, %m", p);
>> +		return SD_RES_EIO;
>> +	}
>> +
>> +	return SD_RES_SUCCESS;
>> +}
>> +
>> +static int purge_dir(const char *path)
>> +{
>> +	if (purge_directory(path) < 0)
>> +		return SD_RES_EIO;
>> +
>> +	return SD_RES_SUCCESS;
>> +}
>> +
>> +static int purge_stale_dir(const char *path)
>> +{
>> +	char p[PATH_MAX];
>> +
>> +	snprintf(p, PATH_MAX, "%s/.stale", path);
>> +
>> +	if (purge_directory_async(p) < 0)
>> +		return SD_RES_EIO;
>> +
>> +	return SD_RES_SUCCESS;
>> +}
>> +
>> +int tree_cleanup(void)
>> +{
>> +	int ret;
>> +
>> +	ret = for_each_obj_path(purge_stale_dir);
>> +	if (ret != SD_RES_SUCCESS)
>> +		return ret;
>> +
>> +	return SD_RES_SUCCESS;
>> +}
>> +
>> +static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch)
>> +{
>> +	int ret;
>> +	struct sd_inode *inode = xzalloc(SD_INODE_HEADER_SIZE);
>> +	struct siocb iocb = {
>> +		.epoch = epoch,
>> +		.buf = inode,
>> +		.length = SD_INODE_HEADER_SIZE,
>> +	};
>> +
>> +	ret = tree_read(oid, &iocb);
>> +	if (ret != SD_RES_SUCCESS) {
>> +		sd_err("failed to read inode header %" PRIx64 " %" PRId32
>> +		       "wat %s", oid, epoch, wd);
>> +		goto out;
>> +	}
>> +	add_vdi_state_unordered(oid_to_vid(oid), inode->nr_copies,
>> +		      vdi_is_snapshot(inode), inode->copy_policy,
>> +		      inode->block_size_shift, inode->parent_vdi_id);
>> +
>> +	if (inode->name[0] == '\0')
>> +		atomic_set_bit(oid_to_vid(oid), sys->vdi_deleted);
>> +
>> +	atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse);
>> +
>> +	ret = SD_RES_SUCCESS;
>> +out:
>> +	free(inode);
>> +	return ret;
>> +}
>> +
>> +static int init_objlist_and_vdi_bitmap(uint64_t oid, const char *wd,
>> +				       uint32_t epoch, uint8_t ec_index,
>> +				       struct vnode_info *vinfo,
>> +				       void *arg)
>> +{
>> +	int ret;
>> +	char path[PATH_MAX];
>> +	objlist_cache_insert(oid);
>> +
>> +	snprintf(path, PATH_MAX, "%s/meta", wd);
>> +
>> +	if (is_vdi_obj(oid)) {
>> +		sd_debug("found the VDI object %" PRIx64" epoch %"PRIu32
>> +			 " at %s", oid, epoch, path);
>> +		ret = init_vdi_state(oid, path, epoch);
>> +		if (ret != SD_RES_SUCCESS)
>> +			return ret;
>> +	}
>> +	return SD_RES_SUCCESS;
>> +}
>> +
>> +int tree_init(void)
>> +{
>> +	int ret;
>> +
>> +	sd_debug("use tree store driver");
>> +	ret = for_each_obj_path(make_tree_dir);
>> +	if (ret != SD_RES_SUCCESS)
>> +		return ret;
>> +
>> +	ret = for_each_obj_path(make_stale_dir);
>> +	if (ret != SD_RES_SUCCESS)
>> +		return ret;
>> +
>> +
>> +	for_each_object_in_stale(init_objlist_and_vdi_bitmap, NULL);
>> +
>> +	return for_each_object_in_wd(init_objlist_and_vdi_bitmap, true, NULL);
>> +}
>> +
>> +static int tree_read_from_path(uint64_t oid, const char *path,
>> +				  const struct siocb *iocb)
>> +{
>> +	int flags = prepare_iocb(oid, iocb, false), fd,
>> +	    ret = SD_RES_SUCCESS;
>> +	ssize_t size;
>> +
>> +	/*
>> +	 * Make sure oid is in the right place because oid might be misplaced
>> +	 * in a wrong place, due to 'shutdown/restart with less disks' or any
>> +	 * bugs. We need call err_to_sderr() to return EIO if disk is broken.
>> +	 *
>> +	 * For stale path, get_store_stale_path already does tree_exist job.
>> +	 */
>> +	if (!is_stale_path(path) && !tree_exist(oid, iocb->ec_index))
>> +		return err_to_sderr(path, oid, ENOENT);
>> +
>> +	fd = open(path, flags);
>> +	if (fd < 0)
>> +		return err_to_sderr(path, oid, errno);
>> +
>> +	size = xpread(fd, iocb->buf, iocb->length, iocb->offset);
>> +	if (size < 0) {
>> +		sd_err("failed to read object %"PRIx64", path=%s, offset=%"
>> +		       PRId32", size=%"PRId32", result=%zd, %m", oid, path,
>> +		       iocb->offset, iocb->length, size);
>> +		ret = err_to_sderr(path, oid, errno);
>> +	}
>> +	close(fd);
>> +	return ret;
>> +}
>> +
>> +int tree_read(uint64_t oid, const struct siocb *iocb)
>> +{
>> +	int ret;
>> +	char path[PATH_MAX];
>> +
>> +	get_store_path(oid, iocb->ec_index, path);
>> +	ret = tree_read_from_path(oid, path, iocb);
>> +
>> +	/*
>> +	 * If the request is against the older epoch, try to read from
>> +	 * the stale directory
>> +	 */
>> +	if (ret == SD_RES_NO_OBJ && iocb->epoch > 0 &&
>> +	    iocb->epoch < sys_epoch()) {
>> +		get_store_stale_path(oid, iocb->epoch, iocb->ec_index, path);
>> +		ret = tree_read_from_path(oid, path, iocb);
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +int tree_create_and_write(uint64_t oid, const struct siocb *iocb)
>> +{
>> +	char path[PATH_MAX], tmp_path[PATH_MAX], *dir;
>> +	int flags = prepare_iocb(oid, iocb, true);
>> +	int ret, fd;
>> +	uint32_t len = iocb->length;
>> +	uint32_t object_size = 0;
>> +	size_t obj_size;
>> +	uint64_t offset = iocb->offset;
>> +
>> +	sd_debug("%"PRIx64, oid);
>> +	get_store_path(oid, iocb->ec_index, path);
>> +	get_store_tmp_path(oid, iocb->ec_index, tmp_path);
>> +
>> +	if (uatomic_is_true(&sys->use_journal) &&
>> +	    journal_write_store(oid, iocb->buf, iocb->length,
>> +				iocb->offset, true)
>> +	    != SD_RES_SUCCESS) {
>> +		sd_err("turn off journaling");
>> +		uatomic_set_false(&sys->use_journal);
>> +		flags |= O_SYNC;
>> +		sync();
>> +	}
>> +
>> +	fd = open(tmp_path, flags, sd_def_fmode);
>> +	if (fd < 0) {
>> +		if (errno == EEXIST) {
>> +			/*
>> +			 * This happens if node membership changes during object
>> +			 * creation; while gateway retries a CREATE request,
>> +			 * recovery process could also recover the object at the
>> +			 * same time.  They should try to write the same date,
>> +			 * so it is okay to simply return success here.
>> +			 */
>> +			sd_debug("%s exists", tmp_path);
>> +			return SD_RES_SUCCESS;
>> +		}
>> +
>> +		sd_err("failed to open %s: %m", tmp_path);
>> +		return err_to_sderr(path, oid, errno);
>> +	}
>> +
>> +	obj_size = get_store_objsize(oid);
>> +
>> +	trim_zero_blocks(iocb->buf, &offset, &len);
>> +
>> +	object_size = get_vdi_object_size(oid_to_vid(oid));
>> +
>> +	if (offset != 0 || len != get_objsize(oid, object_size)) {
>> +		if (is_sparse_object(oid))
>> +			ret = xftruncate(fd, obj_size);
>> +		else
>> +			ret = prealloc(fd, obj_size);
>> +		if (ret < 0) {
>> +			ret = err_to_sderr(path, oid, errno);
>> +			goto out;
>> +		}
>> +	}
>> +
>> +	ret = xpwrite(fd, iocb->buf, len, offset);
>> +	if (ret != len) {
>> +		sd_err("failed to write object. %m");
>> +		ret = err_to_sderr(path, oid, errno);
>> +		goto out;
>> +	}
>> +
>> +	ret = rename(tmp_path, path);
>> +	if (ret < 0) {
>> +		sd_err("failed to rename %s to %s: %m", tmp_path, path);
>> +		ret = err_to_sderr(path, oid, errno);
>> +		goto out;
>> +	}
>> +
>> +	close(fd);
>> +
>> +	if (uatomic_is_true(&sys->use_journal) || sys->nosync == true) {
>> +		objlist_cache_insert(oid);
>> +		return SD_RES_SUCCESS;
>> +	}
>> +
>> +	pstrcpy(tmp_path, sizeof(tmp_path), path);
>> +	dir = dirname(tmp_path);
>> +	fd = open(dir, O_DIRECTORY | O_RDONLY);
>> +	if (fd < 0) {
>> +		sd_err("failed to open directory %s: %m", dir);
>> +		return err_to_sderr(path, oid, errno);
>> +	}
>> +
>> +	if (fsync(fd) != 0) {
>> +		sd_err("failed to write directory %s: %m", dir);
>> +		ret = err_to_sderr(path, oid, errno);
>> +		close(fd);
>> +		if (unlink(path) != 0)
>> +			sd_err("failed to unlink %s: %m", path);
>> +		return ret;
>> +	}
>> +	close(fd);
>> +	objlist_cache_insert(oid);
>> +	return SD_RES_SUCCESS;
>> +
>> +out:
>> +	if (unlink(tmp_path) != 0)
>> +		sd_err("failed to unlink %s: %m", tmp_path);
>> +	close(fd);
>> +	return ret;
>> +}
>> +
>> +int tree_link(uint64_t oid, uint32_t tgt_epoch)
>> +{
>> +	char path[PATH_MAX], stale_path[PATH_MAX], tree_path[PATH_MAX];
>> +
>> +	if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
>> +		snprintf(tree_path, PATH_MAX, "%s/meta",
>> +			 md_get_object_dir(oid));
>> +	} else {
>> +		snprintf(tree_path, PATH_MAX, "%s/%02x",
>> +			 md_get_object_dir(oid), get_tree(oid));
>> +	}
>> +
>> +	sd_debug("try link %"PRIx64" from snapshot with epoch %d", oid,
>> +		 tgt_epoch);
>> +
>> +	snprintf(path, PATH_MAX, "%s/%016"PRIx64, tree_path, oid);
>> +	get_store_stale_path(oid, tgt_epoch, 0, stale_path);
>> +
>> +	if (link(stale_path, path) < 0) {
>> +		/*
>> +		 * Recovery thread and main thread might try to recover the
>> +		 * same object and we might get EEXIST in such case.
>> +		 */
>> +		if (errno == EEXIST)
>> +			goto out;
>> +
>> +		sd_debug("failed to link from %s to %s, %m", stale_path, path);
>> +		return err_to_sderr(path, oid, errno);
>> +	}
>> +out:
>> +	return SD_RES_SUCCESS;
>> +}
>> +
>> +/*
>> + * For replicated object, if any of the replica belongs to this node, we
>> + * consider it not stale.
>> + *
>> + * For erasure coded object, since every copy is unique and if it migrates to
>> + * other node(index gets changed even it has some other copy belongs to it)
>> + * because of hash ring changes, we consider it stale.
>> + */
>> +static bool oid_stale(uint64_t oid, int ec_index, struct vnode_info *vinfo)
>> +{
>> +	uint32_t i, nr_copies;
>> +	const struct sd_vnode *v;
>> +	bool ret = true;
>> +	const struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
>> +
>> +	nr_copies = get_obj_copy_number(oid, vinfo->nr_zones);
>> +	oid_to_vnodes(oid, &vinfo->vroot, nr_copies, obj_vnodes);
>> +	for (i = 0; i < nr_copies; i++) {
>> +		v = obj_vnodes[i];
>> +		if (vnode_is_local(v)) {
>> +			if (ec_index < SD_MAX_COPIES) {
>> +				if (i == ec_index)
>> +					ret = false;
>> +			} else {
>> +				ret = false;
>> +			}
>> +			break;
>> +		}
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +static int move_object_to_stale_dir(uint64_t oid, const char *wd,
>> +				    uint32_t epoch, uint8_t ec_index,
>> +				    struct vnode_info *vinfo, void *arg)
>> +{
>> +	char path[PATH_MAX], stale_path[PATH_MAX], tree_path[PATH_MAX];
>> +	uint32_t tgt_epoch = *(uint32_t *)arg;
>> +
>> +	if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
>> +		snprintf(tree_path, PATH_MAX, "%s/meta",
>> +			 md_get_object_dir(oid));
>> +	} else {
>> +		snprintf(tree_path, PATH_MAX, "%s/%02x",
>> +			 md_get_object_dir(oid), get_tree(oid));
>> +	}
>> +
>> +	/* ec_index from md.c is reliable so we can directly use it */
>> +	if (ec_index < SD_MAX_COPIES) {
>> +		snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d",
>> +			 tree_path, oid, ec_index);
>> +		snprintf(stale_path, PATH_MAX,
>> +			 "%s/.stale/%016"PRIx64"_%d.%"PRIu32,
>> +			 md_get_object_dir(oid), oid, ec_index, tgt_epoch);
>> +	} else {
>> +		snprintf(path, PATH_MAX, "%s/%016" PRIx64,
>> +			 tree_path, oid);
>> +		snprintf(stale_path, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32,
>> +			 md_get_object_dir(oid), oid, tgt_epoch);
>> +	}
>> +
>> +	if (unlikely(rename(path, stale_path)) < 0) {
>> +		sd_err("failed to move stale object %" PRIX64 " to %s, %m", oid,
>> +		       path);
>> +		return SD_RES_EIO;
>> +	}
>> +	sd_debug("moved object %"PRIx64, oid);
>> +	return SD_RES_SUCCESS;
>> +}
>> +
>> +static int check_stale_objects(uint64_t oid, const char *wd, uint32_t epoch,
>> +			       uint8_t ec_index, struct vnode_info *vinfo,
>> +			       void *arg)
>> +{
>> +	if (oid_stale(oid, ec_index, vinfo))
>> +		return move_object_to_stale_dir(oid, wd, 0, ec_index,
>> +						NULL, arg);
>> +
>> +	return SD_RES_SUCCESS;
>> +}
>> +
>> +int tree_update_epoch(uint32_t epoch)
>> +{
>> +	assert(epoch);
>> +	return for_each_object_in_wd(check_stale_objects, false, &epoch);
>> +}
>> +
>> +int tree_format(void)
>> +{
>> +	unsigned ret;
>> +
>> +	sd_debug("try get a clean store");
>> +	ret = for_each_obj_path(purge_dir);
>> +	if (ret != SD_RES_SUCCESS)
>> +		return ret;
>> +
>> +	if (sys->enable_object_cache)
>> +		object_cache_format();
>> +
>> +	return SD_RES_SUCCESS;
>> +}
>> +
>> +int tree_remove_object(uint64_t oid, uint8_t ec_index)
>> +{
>> +	char path[PATH_MAX];
>> +
>> +	if (uatomic_is_true(&sys->use_journal))
>> +		journal_remove_object(oid);
>> +
>> +	get_store_path(oid, ec_index, path);
>> +
>> +	if (unlink(path) < 0) {
>> +		if (errno == ENOENT)
>> +			return SD_RES_NO_OBJ;
>> +
>> +		sd_err("failed, %s, %m", path);
>> +		return SD_RES_EIO;
>> +	}
>> +
>> +	return SD_RES_SUCCESS;
>> +}
>> +
>> +#define SHA1NAME "user.obj.sha1"
>> +
>> +static int get_object_sha1(const char *path, uint8_t *sha1)
>> +{
>> +	if (getxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE)
>> +	    != SHA1_DIGEST_SIZE) {
>> +		if (errno == ENODATA)
>> +			sd_debug("sha1 is not cached yet, %s", path);
>> +		else
>> +			sd_err("fail to get xattr, %s", path);
>> +		return -1;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int set_object_sha1(const char *path, const uint8_t *sha1)
>> +{
>> +	int ret;
>> +
>> +	ret = setxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE, 0);
>> +	if (ret < 0)
>> +		sd_err("fail to set sha1, %s", path);
>> +
>> +	return ret;
>> +}
>> +
>> +static int get_object_path(uint64_t oid, uint32_t epoch, char *path,
>> +			   size_t size)
>> +{
>> +	char tree_path[PATH_MAX];
>> +
>> +	if (is_vdi_obj(oid) || is_vmstate_obj(oid) || is_vdi_attr_obj(oid)) {
>> +		snprintf(tree_path, PATH_MAX, "%s/meta",
>> +			 md_get_object_dir(oid));
>> +	} else {
>> +		snprintf(tree_path, PATH_MAX, "%s/%02x",
>> +			 md_get_object_dir(oid), get_tree(oid));
>> +	}
>> +
>> +	if (tree_exist(oid, 0)) {
>> +		snprintf(path, PATH_MAX, "%s/%016"PRIx64,
>> +			 tree_path, oid);
>> +	} else {
>> +		get_store_stale_path(oid, epoch, 0, path);
>> +		if (access(path, F_OK) < 0) {
>> +			if (errno == ENOENT)
>> +				return SD_RES_NO_OBJ;
>> +			return SD_RES_EIO;
>> +		}
>> +
>> +	}
>> +
>> +	return SD_RES_SUCCESS;
>> +}
>> +
>> +int tree_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1)
>> +{
>> +	int ret;
>> +	void *buf;
>> +	struct siocb iocb = {};
>> +	uint32_t length;
>> +	bool is_readonly_obj = oid_is_readonly(oid);
>> +	char path[PATH_MAX];
>> +
>> +	ret = get_object_path(oid, epoch, path, sizeof(path));
>> +	if (ret != SD_RES_SUCCESS)
>> +		return ret;
>> +
>> +	if (is_readonly_obj) {
>> +		if (get_object_sha1(path, sha1) == 0) {
>> +			sd_debug("use cached sha1 digest %s",
>> +				 sha1_to_hex(sha1));
>> +			return SD_RES_SUCCESS;
>> +		}
>> +	}
>> +
>> +	length = get_store_objsize(oid);
>> +	buf = valloc(length);
>> +	if (buf == NULL)
>> +		return SD_RES_NO_MEM;
>> +
>> +	iocb.epoch = epoch;
>> +	iocb.buf = buf;
>> +	iocb.length = length;
>> +
>> +	ret = tree_read_from_path(oid, path, &iocb);
>> +	if (ret != SD_RES_SUCCESS) {
>> +		free(buf);
>> +		return ret;
>> +	}
>> +
>> +	get_buffer_sha1(buf, length, sha1);
>> +	free(buf);
>> +
>> +	sd_debug("the message digest of %"PRIx64" at epoch %d is %s", oid,
>> +		 epoch, sha1_to_hex(sha1));
>> +
>> +	if (is_readonly_obj)
>> +		set_object_sha1(path, sha1);
>> +
>> +	return ret;
>> +}
>> +
>> +int tree_purge_obj(void)
>> +{
>> +	uint32_t tgt_epoch = get_latest_epoch();
>> +
>> +	return for_each_object_in_wd(move_object_to_stale_dir, true,
>> +				     &tgt_epoch);
>> +}
>> +
>> +static struct store_driver tree_store = {
>> +	.id = TREE_STORE,
>> +	.name = "tree",
>> +	.init = tree_init,
>> +	.exist = tree_exist,
>> +	.create_and_write = tree_create_and_write,
>> +	.write = tree_write,
>> +	.read = tree_read,
>> +	.link = tree_link,
>> +	.update_epoch = tree_update_epoch,
>> +	.cleanup = tree_cleanup,
>> +	.format = tree_format,
>> +	.remove_object = tree_remove_object,
>> +	.get_hash = tree_get_hash,
>> +	.purge_obj = tree_purge_obj,
>> +};
>> +
>> +add_store_driver(tree_store);
>
> It seems that tree.c and plain.c still share a great portion of lines. It is
> okay to keep them for fast development, but I hope later we can drag as many
> lines as possible to common.c.
>
> Thanks,
> Yuan
>

Thank you for your advice.
I'll prepare a patch reflecting your comments.

Thanks,Saeki.





More information about the sheepdog mailing list