[sheepdog] [PATCH 3/3] add selectable object_size support of VDI operation (1/2)

Hitoshi Mitake mitake.hitoshi at lab.ntt.co.jp
Thu Dec 11 09:29:31 CET 2014


At Tue,  9 Dec 2014 21:49:24 +0900,
Teruaki Ishizaki wrote:
> 
> Data object size was fixed to 4MB and was not selectable.
> This patch adds a feature to select the data object size of a VDI.
> 
> If you want to use an 8MB data object size, specify the number of shift bits.
> ex) dog vdi create -z 23 hogehoge 100M
> 
> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki at lab.ntt.co.jp>
> ---
>  dog/common.c                |    7 +-
>  dog/dog.h                   |    6 +-
>  dog/farm/farm.c             |   17 ++-
>  dog/vdi.c                   |  254 ++++++++++++++++++++++++++++++-------------
>  include/fec.h               |   12 +-
>  include/sheepdog_proto.h    |    7 +-
>  lib/fec.c                   |    9 +-
>  sheep/gateway.c             |    2 +-
>  sheep/group.c               |    3 +-
>  sheep/journal.c             |    5 +-
>  sheep/object_cache.c        |   27 +++--
>  sheep/ops.c                 |   14 ++-
>  sheep/plain_store.c         |   17 ++-
>  sheep/recovery.c            |    3 +-
>  sheep/sheep_priv.h          |    6 +-
>  sheep/vdi.c                 |   82 +++++++++++---
>  tests/unit/sheep/test_vdi.c |    6 +-
>  17 files changed, 336 insertions(+), 141 deletions(-)
> 
> diff --git a/dog/common.c b/dog/common.c
> index 2d8a173..11011a7 100644
> --- a/dog/common.c
> +++ b/dog/common.c
> @@ -365,7 +365,8 @@ void show_progress(uint64_t done, uint64_t total, bool raw)
>  	free(buf);
>  }
>  
> -size_t get_store_objsize(uint8_t copy_policy, uint64_t oid)
> +size_t get_store_objsize(uint8_t copy_policy, uint32_t object_size,
> +			 uint64_t oid)
>  {
>  	if (is_vdi_obj(oid))
>  		return SD_INODE_SIZE;
> @@ -375,9 +376,9 @@ size_t get_store_objsize(uint8_t copy_policy, uint64_t oid)
>  		int d;
>  
>  		ec_policy_to_dp(copy_policy, &d, NULL);
> -		return SD_DATA_OBJ_SIZE / d;
> +		return object_size / d;
>  	}
> -	return get_objsize(oid);
> +	return get_objsize(oid, object_size);
>  }
>  
>  bool is_erasure_oid(uint64_t oid, uint8_t policy)
> diff --git a/dog/dog.h b/dog/dog.h
> index 80becc6..d460a0b 100644
> --- a/dog/dog.h
> +++ b/dog/dog.h
> @@ -87,10 +87,12 @@ void confirm(const char *message);
>  void work_queue_wait(struct work_queue *q);
>  int do_vdi_create(const char *vdiname, int64_t vdi_size,
>  		  uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
> -		  uint8_t nr_copies, uint8_t copy_policy, uint8_t store_policy);
> +		  uint8_t nr_copies, uint8_t copy_policy,
> +		  uint8_t store_policy, uint32_t object_size);
>  int do_vdi_check(const struct sd_inode *inode);
>  void show_progress(uint64_t done, uint64_t total, bool raw);
> -size_t get_store_objsize(uint8_t copy_policy, uint64_t oid);
> +size_t get_store_objsize(uint8_t copy_policy, uint32_t object_size,
> +			 uint64_t oid);
>  bool is_erasure_oid(uint64_t oid, uint8_t policy);
>  uint8_t parse_copy(const char *str, uint8_t *copy_policy);
>  
> diff --git a/dog/farm/farm.c b/dog/farm/farm.c
> index 9414d42..c5fa40e 100644
> --- a/dog/farm/farm.c
> +++ b/dog/farm/farm.c
> @@ -38,6 +38,7 @@ struct active_vdi_entry {
>  	uint8_t  nr_copies;
>  	uint8_t copy_policy;
>  	uint8_t store_policy;
> +	uint32_t object_size;
>  };
>  
>  struct registered_obj_entry {
> @@ -77,6 +78,7 @@ static void update_active_vdi_entry(struct active_vdi_entry *vdi,
>  	vdi->nr_copies = new->nr_copies;
>  	vdi->copy_policy = new->copy_policy;
>  	vdi->store_policy = new->store_policy;
> +	vdi->object_size = (UINT32_C(1) << new->block_size_shift);
>  }
>  
>  static void add_active_vdi(struct sd_inode *new)
> @@ -131,7 +133,8 @@ static int create_active_vdis(void)
>  				  vdi->vdi_id, &new_vid,
>  				  false, vdi->nr_copies,
>  				  vdi->copy_policy,
> -				  vdi->store_policy) < 0)
> +				  vdi->store_policy,
> +				  vdi->object_size) < 0)
>  			return -1;
>  	}
>  	return 0;
> @@ -202,7 +205,7 @@ out:
>  }
>  
>  static int notify_vdi_add(uint32_t vdi_id, uint8_t nr_copies,
> -			  uint8_t copy_policy)
> +			  uint8_t copy_policy, uint32_t object_size)
>  {
>  	int ret;
>  	struct sd_req hdr;
> @@ -213,13 +216,14 @@ static int notify_vdi_add(uint32_t vdi_id, uint8_t nr_copies,
>  	hdr.vdi_state.new_vid = vdi_id;
>  	hdr.vdi_state.copies = nr_copies;
>  	hdr.vdi_state.copy_policy = copy_policy;
> +	hdr.vdi_state.object_size = object_size;
>  	hdr.vdi_state.set_bitmap = true;
>  
>  	ret = dog_exec_req(&sd_nid, &hdr, buf);
>  
>  	if (ret < 0)
> -		sd_err("Fail to notify vdi add event(%"PRIx32", %d)", vdi_id,
> -		       nr_copies);
> +		sd_err("Fail to notify vdi add event(%"PRIx32", %d"
> +		       ", %"PRIu32")", vdi_id, nr_copies, object_size);
>  	if (rsp->result != SD_RES_SUCCESS) {
>  		sd_err("%s", sd_strerror(rsp->result));
>  		ret = -1;
> @@ -261,7 +265,7 @@ static void do_save_object(struct work *work)
>  
>  	sw = container_of(work, struct snapshot_work, work);
>  
> -	size = get_objsize(sw->entry.oid);
> +	size = get_objsize(sw->entry.oid, sw->entry.object_size);
>  	buf = xmalloc(size);
>  
>  	if (dog_read_object(sw->entry.oid, buf, size, 0, true) < 0)
> @@ -413,7 +417,8 @@ static void do_load_object(struct work *work)
>  	vid = oid_to_vid(sw->entry.oid);
>  	if (register_vdi(vid)) {
>  		if (notify_vdi_add(vid, sw->entry.nr_copies,
> -				   sw->entry.copy_policy) < 0)
> +				   sw->entry.copy_policy,
> +				   sw->entry.object_size) < 0)
>  			goto error;
>  	}
>  
> diff --git a/dog/vdi.c b/dog/vdi.c
> index 5353062..3b0c408 100644
> --- a/dog/vdi.c
> +++ b/dog/vdi.c
> @@ -38,6 +38,8 @@ static struct sd_option vdi_options[] = {
>  	{'o', "oid", true, "specify the object id of the tracking object"},
>  	{'e', "exist", false, "only check objects exist or not,\n"
>  	 "                          neither comparing nor repairing"},
> +	{'z', "objsize", true, "specify the bit shift num for"
> +			       " data object size"},
>  	{ 0, NULL, false, NULL },
>  };
>  
> @@ -49,6 +51,7 @@ static struct vdi_cmd_data {
>  	bool delete;
>  	bool prealloc;
>  	int nr_copies;
> +	uint32_t object_size;
>  	bool writeback;
>  	int from_snapshot_id;
>  	char from_snapshot_tag[SD_MAX_VDI_TAG_LEN];
> @@ -67,6 +70,7 @@ struct get_vdi_info {
>  	uint32_t snapid;
>  	uint8_t nr_copies;
>  	uint8_t copy_policy;
> +	uint32_t object_size;
>  };
>  
>  int dog_bnode_writer(uint64_t oid, void *mem, unsigned int len, uint64_t offset,
> @@ -118,6 +122,7 @@ static void print_vdi_list(uint32_t vid, const char *name, const char *tag,
>  	struct tm tm;
>  	char dbuf[128];
>  	struct get_vdi_info *info = data;
> +	uint32_t object_size = (UINT32_C(1) << i->block_size_shift);
>  
>  	if (info && strcmp(name, info->name) != 0)
>  		return;
> @@ -143,23 +148,24 @@ static void print_vdi_list(uint32_t vid, const char *name, const char *tag,
>  				putchar('\\');
>  			putchar(*name++);
>  		}
> -		printf(" %d %s %s %s %s %" PRIx32 " %s %s\n", snapid,
> -		       strnumber(i->vdi_size),
> -		       strnumber(my_objs * SD_DATA_OBJ_SIZE),
> -		       strnumber(cow_objs * SD_DATA_OBJ_SIZE),
> +		printf(" %d %s %s %s %s %" PRIx32 " %s %s %" PRIu32 "\n",
> +		       snapid, strnumber(i->vdi_size),
> +		       strnumber(my_objs * object_size),
> +		       strnumber(cow_objs * object_size),
>  		       dbuf, vid,
>  		       redundancy_scheme(i->nr_copies, i->copy_policy),
> -		       i->tag);
> +		       i->tag, object_size);
>  	} else {
> -		printf("%c %-8s %5d %7s %7s %7s %s  %7" PRIx32 " %6s %13s\n",
> +		printf("%c %-8s %5d %7s %7s %7s %s  %7" PRIx32
> +		       " %6s %13s %7" PRIu32 "\n",
>  		       vdi_is_snapshot(i) ? 's' : (is_clone ? 'c' : ' '),
>  		       name, snapid,
>  		       strnumber(i->vdi_size),
> -		       strnumber(my_objs * SD_DATA_OBJ_SIZE),
> -		       strnumber(cow_objs * SD_DATA_OBJ_SIZE),
> +		       strnumber(my_objs * object_size),
> +		       strnumber(cow_objs * object_size),
>  		       dbuf, vid,
>  		       redundancy_scheme(i->nr_copies, i->copy_policy),
> -		       i->tag);
> +		       i->tag, object_size);
>  	}
>  }
>  
> @@ -282,7 +288,8 @@ static int vdi_list(int argc, char **argv)
>  	const char *vdiname = argv[optind];
>  
>  	if (!raw_output)
> -		printf("  Name        Id    Size    Used  Shared    Creation time   VDI id  Copies  Tag\n");
> +		printf("  Name        Id    Size    Used  Shared"
> +		       "    Creation time   VDI id  Copies  Tag    Obj Size\n");
>  
>  	if (vdiname) {
>  		struct get_vdi_info info;
> @@ -396,7 +403,8 @@ int read_vdi_obj(const char *vdiname, int snapid, const char *tag,
>  
>  int do_vdi_create(const char *vdiname, int64_t vdi_size,
>  		  uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
> -		  uint8_t nr_copies, uint8_t copy_policy, uint8_t store_policy)
> +		  uint8_t nr_copies, uint8_t copy_policy,
> +		  uint8_t store_policy, uint32_t object_size)
>  {
>  	struct sd_req hdr;
>  	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> @@ -416,6 +424,7 @@ int do_vdi_create(const char *vdiname, int64_t vdi_size,
>  	hdr.vdi.copies = nr_copies;
>  	hdr.vdi.copy_policy = copy_policy;
>  	hdr.vdi.store_policy = store_policy;
> +	hdr.vdi.object_size = object_size;
>  
>  	ret = dog_exec_req(&sd_nid, &hdr, buf);
>  	if (ret < 0)
> @@ -440,6 +449,8 @@ static int vdi_create(int argc, char **argv)
>  	uint32_t vid;
>  	uint64_t oid;
>  	uint32_t idx, max_idx;
> +	uint32_t object_size;
> +	uint64_t old_max_total_size = 0;
>  	struct sd_inode *inode = NULL;
>  	int ret;
>  
> @@ -451,10 +462,34 @@ static int vdi_create(int argc, char **argv)
>  	if (ret < 0)
>  		return EXIT_USAGE;
>  
> -	if (size > SD_OLD_MAX_VDI_SIZE && 0 == vdi_cmd_data.store_policy) {
> +	if (vdi_cmd_data.object_size)
> +		old_max_total_size =
> +			vdi_cmd_data.object_size * OLD_MAX_DATA_OBJS;
> +	else{
> +		struct sd_req hdr;
> +		struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> +		struct cluster_info cinfo;
> +		sd_init_req(&hdr, SD_OP_CLUSTER_INFO);
> +		hdr.data_length = sizeof(cinfo);
> +		ret = dog_exec_req(&sd_nid, &hdr, &cinfo);
> +		if (ret < 0) {
> +			sd_err("Fail to execute request: SD_OP_CLUSTER_INFO");
> +			ret = EXIT_FAILURE;
> +			goto out;
> +		}
> +		if (rsp->result != SD_RES_SUCCESS) {
> +			sd_err("%s", sd_strerror(rsp->result));
> +			ret = EXIT_FAILURE;
> +			goto out;
> +		}
> +		old_max_total_size = cinfo.object_size * OLD_MAX_DATA_OBJS;
> +	}

I cannot understand why the block size should be read before creating
the VDI. If sd_req->vdi.object_size is equal to 0, sheep can use its
default value from cinfo->object_size.

In addition, sd_inode already has a member, block_size_shift, for
representing the object size. The newly added members of cluster_info and
sd_req should hold the block size shift (a number of bits), not a size in bytes.

Thanks,
Hitoshi

> +
> +	if (size > old_max_total_size && 0 == vdi_cmd_data.store_policy) {
>  		sd_err("VDI size is larger than %s bytes, please use '-y' to "
> -		       "create a hyper volume with size up to %s bytes",
> -		       strnumber(SD_OLD_MAX_VDI_SIZE),
> +		       "create a hyper volume with size up to %s bytes"
> +		       " or use '-z' to create larger object size volume",
> +		       strnumber(old_max_total_size),
>  		       strnumber(SD_MAX_VDI_SIZE));
>  		return EXIT_USAGE;
>  	}
> @@ -466,7 +501,8 @@ static int vdi_create(int argc, char **argv)
>  
>  	ret = do_vdi_create(vdiname, size, 0, &vid, false,
>  			    vdi_cmd_data.nr_copies, vdi_cmd_data.copy_policy,
> -			    vdi_cmd_data.store_policy);
> +			    vdi_cmd_data.store_policy,
> +			    vdi_cmd_data.object_size);
>  	if (ret != EXIT_SUCCESS || !vdi_cmd_data.prealloc)
>  		goto out;
>  
> @@ -479,10 +515,11 @@ static int vdi_create(int argc, char **argv)
>  		ret = EXIT_FAILURE;
>  		goto out;
>  	}
> -	max_idx = DIV_ROUND_UP(size, SD_DATA_OBJ_SIZE);
> +	object_size = (UINT32_C(1) << inode->block_size_shift);
> +	max_idx = DIV_ROUND_UP(size, object_size);
>  
>  	for (idx = 0; idx < max_idx; idx++) {
> -		vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size);
> +		vdi_show_progress(idx * object_size, inode->vdi_size);
>  		oid = vid_to_data_oid(vid, idx);
>  
>  		ret = dog_write_object(oid, 0, NULL, 0, 0, 0, inode->nr_copies,
> @@ -499,7 +536,7 @@ static int vdi_create(int argc, char **argv)
>  			goto out;
>  		}
>  	}
> -	vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size);
> +	vdi_show_progress(idx * object_size, inode->vdi_size);
>  	ret = EXIT_SUCCESS;
>  
>  out:
> @@ -559,6 +596,7 @@ static int vdi_snapshot(int argc, char **argv)
>  {
>  	const char *vdiname = argv[optind++];
>  	uint32_t vid, new_vid;
> +	uint32_t object_size;
>  	int ret;
>  	char buf[SD_INODE_HEADER_SIZE];
>  	struct sd_inode *inode = (struct sd_inode *)buf;
> @@ -662,9 +700,10 @@ static int vdi_snapshot(int argc, char **argv)
>  	if (ret != SD_RES_SUCCESS)
>  		goto out;
>  
> +	object_size = (UINT32_C(1) << inode->block_size_shift);
>  	ret = do_vdi_create(vdiname, inode->vdi_size, vid, &new_vid, true,
>  			    inode->nr_copies, inode->copy_policy,
> -			    inode->store_policy);
> +			    inode->store_policy, object_size);
>  
>  	if (ret == EXIT_SUCCESS && verbose) {
>  		if (raw_output)
> @@ -691,6 +730,7 @@ static int vdi_clone(int argc, char **argv)
>  	uint32_t base_vid, new_vid, vdi_id;
>  	uint64_t oid;
>  	uint32_t idx, max_idx, ret;
> +	uint32_t object_size;
>  	struct sd_inode *inode = NULL, *new_inode = NULL;
>  	char *buf = NULL;
>  
> @@ -719,9 +759,10 @@ static int vdi_clone(int argc, char **argv)
>  	if (vdi_cmd_data.no_share == true)
>  		base_vid = 0;
>  
> +	object_size = (UINT32_C(1) << inode->block_size_shift);
>  	ret = do_vdi_create(dst_vdi, inode->vdi_size, base_vid, &new_vid, false,
>  			    inode->nr_copies, inode->copy_policy,
> -			    inode->store_policy);
> +			    inode->store_policy, object_size);
>  	if (ret != EXIT_SUCCESS ||
>  			(!vdi_cmd_data.prealloc && !vdi_cmd_data.no_share))
>  		goto out;
> @@ -732,23 +773,23 @@ static int vdi_clone(int argc, char **argv)
>  	if (ret != EXIT_SUCCESS)
>  		goto out;
>  
> -	buf = xzalloc(SD_DATA_OBJ_SIZE);
> +	buf = xzalloc(object_size);
>  	max_idx = count_data_objs(inode);
>  
>  	for (idx = 0; idx < max_idx; idx++) {
>  		size_t size;
>  
> -		vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size);
> +		vdi_show_progress(idx * object_size, inode->vdi_size);
>  		vdi_id = sd_inode_get_vid(inode, idx);
>  		if (vdi_id) {
>  			oid = vid_to_data_oid(vdi_id, idx);
> -			ret = dog_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0,
> +			ret = dog_read_object(oid, buf, object_size, 0,
>  					      true);
>  			if (ret) {
>  				ret = EXIT_FAILURE;
>  				goto out;
>  			}
> -			size = SD_DATA_OBJ_SIZE;
> +			size = object_size;
>  		} else {
>  			if (vdi_cmd_data.no_share && !vdi_cmd_data.prealloc)
>  				continue;
> @@ -772,7 +813,7 @@ static int vdi_clone(int argc, char **argv)
>  			goto out;
>  		}
>  	}
> -	vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size);
> +	vdi_show_progress(idx * object_size, inode->vdi_size);
>  	ret = EXIT_SUCCESS;
>  
>  out:
> @@ -952,6 +993,7 @@ static int vdi_rollback(int argc, char **argv)
>  {
>  	const char *vdiname = argv[optind++];
>  	uint32_t base_vid, new_vid;
> +	uint32_t object_size;
>  	int ret;
>  	char buf[SD_INODE_HEADER_SIZE];
>  	struct sd_inode *inode = (struct sd_inode *)buf;
> @@ -977,9 +1019,10 @@ static int vdi_rollback(int argc, char **argv)
>  		return EXIT_FAILURE;
>  	}
>  
> +	object_size = (UINT32_C(1) << inode->block_size_shift);
>  	ret = do_vdi_create(vdiname, inode->vdi_size, base_vid, &new_vid,
>  			     false, vdi_cmd_data.nr_copies, inode->copy_policy,
> -			     inode->store_policy);
> +			     inode->store_policy, object_size);
>  
>  	if (ret == EXIT_SUCCESS && verbose) {
>  		if (raw_output)
> @@ -1494,6 +1537,7 @@ static int vdi_read(int argc, char **argv)
>  	struct sd_inode *inode = NULL;
>  	uint64_t offset = 0, oid, done = 0, total = (uint64_t) -1;
>  	uint32_t vdi_id, idx;
> +	uint32_t object_size;
>  	unsigned int len;
>  	char *buf = NULL;
>  
> @@ -1509,25 +1553,27 @@ static int vdi_read(int argc, char **argv)
>  	}
>  
>  	inode = malloc(sizeof(*inode));
> -	buf = xmalloc(SD_DATA_OBJ_SIZE);
>  
>  	ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id,
>  			   vdi_cmd_data.snapshot_tag, NULL, inode,
>  			   SD_INODE_SIZE);
>  	if (ret != EXIT_SUCCESS)
> -		goto out;
> +		goto load_inode_err;
>  
>  	if (inode->vdi_size < offset) {
>  		sd_err("Read offset is beyond the end of the VDI");
>  		ret = EXIT_FAILURE;
> -		goto out;
> +		goto load_inode_err;
>  	}
>  
> +	object_size = (UINT32_C(1) << inode->block_size_shift);
> +	buf = xmalloc(object_size);
> +
>  	total = min(total, inode->vdi_size - offset);
> -	idx = offset / SD_DATA_OBJ_SIZE;
> -	offset %= SD_DATA_OBJ_SIZE;
> +	idx = offset / object_size;
> +	offset %= object_size;
>  	while (done < total) {
> -		len = min(total - done, SD_DATA_OBJ_SIZE - offset);
> +		len = min(total - done, object_size - offset);
>  		vdi_id = sd_inode_get_vid(inode, idx);
>  		if (vdi_id) {
>  			oid = vid_to_data_oid(vdi_id, idx);
> @@ -1554,8 +1600,9 @@ static int vdi_read(int argc, char **argv)
>  	fsync(STDOUT_FILENO);
>  	ret = EXIT_SUCCESS;
>  out:
> -	free(inode);
>  	free(buf);
> +load_inode_err:
> +	free(inode);
>  
>  	return ret;
>  }
> @@ -1564,6 +1611,7 @@ static int vdi_write(int argc, char **argv)
>  {
>  	const char *vdiname = argv[optind++];
>  	uint32_t vid, flags, vdi_id, idx;
> +	uint32_t object_size;
>  	int ret;
>  	struct sd_inode *inode = NULL;
>  	uint64_t offset = 0, oid, old_oid, done = 0, total = (uint64_t) -1;
> @@ -1583,26 +1631,28 @@ static int vdi_write(int argc, char **argv)
>  	}
>  
>  	inode = xmalloc(sizeof(*inode));
> -	buf = xmalloc(SD_DATA_OBJ_SIZE);
>  
>  	ret = read_vdi_obj(vdiname, 0, "", &vid, inode, SD_INODE_SIZE);
>  	if (ret != EXIT_SUCCESS)
> -		goto out;
> +		goto load_inode_err;
>  
>  	if (inode->vdi_size < offset) {
>  		sd_err("Write offset is beyond the end of the VDI");
>  		ret = EXIT_FAILURE;
> -		goto out;
> +		goto load_inode_err;
>  	}
>  
> +	object_size = (UINT32_C(1) << inode->block_size_shift);
> +	buf = xmalloc(object_size);
> +
>  	total = min(total, inode->vdi_size - offset);
> -	idx = offset / SD_DATA_OBJ_SIZE;
> -	offset %= SD_DATA_OBJ_SIZE;
> +	idx = offset / object_size;
> +	offset %= object_size;
>  	while (done < total) {
>  		create = false;
>  		old_oid = 0;
>  		flags = 0;
> -		len = min(total - done, SD_DATA_OBJ_SIZE - offset);
> +		len = min(total - done, object_size - offset);
>  
>  		vdi_id = sd_inode_get_vid(inode, idx);
>  		if (!vdi_id)
> @@ -1647,7 +1697,7 @@ static int vdi_write(int argc, char **argv)
>  		}
>  
>  		offset += len;
> -		if (offset == SD_DATA_OBJ_SIZE) {
> +		if (offset == object_size) {
>  			offset = 0;
>  			idx++;
>  		}
> @@ -1655,8 +1705,9 @@ static int vdi_write(int argc, char **argv)
>  	}
>  	ret = EXIT_SUCCESS;
>  out:
> -	free(inode);
>  	free(buf);
> +load_inode_err:
> +	free(inode);
>  
>  	return ret;
>  }
> @@ -1709,6 +1760,7 @@ struct vdi_check_info {
>  	uint64_t oid;
>  	uint8_t nr_copies;
>  	uint8_t copy_policy;
> +	uint32_t object_size;
>  	uint64_t total;
>  	uint64_t *done;
>  	int refcnt;
> @@ -1721,7 +1773,7 @@ struct vdi_check_info {
>  static void free_vdi_check_info(struct vdi_check_info *info)
>  {
>  	if (info->done) {
> -		*info->done += SD_DATA_OBJ_SIZE;
> +		*info->done += info->object_size;
>  		vdi_show_progress(*info->done, info->total);
>  	}
>  	free(info);
> @@ -1783,6 +1835,7 @@ static void vdi_check_object_work(struct work *work)
>  	if (is_erasure_oid(info->oid, info->copy_policy)) {
>  		sd_init_req(&hdr, SD_OP_READ_PEER);
>  		hdr.data_length = get_store_objsize(info->copy_policy,
> +						    info->object_size,
>  						    info->oid);
>  		hdr.obj.ec_index = vcw->ec_index;
>  		hdr.epoch = sd_epoch;
> @@ -1856,7 +1909,8 @@ static void check_erasure_object(struct vdi_check_info *info)
>  	struct fec *ctx = ec_init(d, dp);
>  	int miss_idx[dp], input_idx[dp];
>  	uint64_t oid = info->oid;
> -	size_t len = get_store_objsize(info->copy_policy, oid);
> +	size_t len = get_store_objsize(info->copy_policy,
> +				       info->object_size, oid);
>  	char *obj = xmalloc(len);
>  	uint8_t *input[dp];
>  
> @@ -1882,7 +1936,8 @@ static void check_erasure_object(struct vdi_check_info *info)
>  			uint8_t *ds[d];
>  			for (j = 0; j < d; j++)
>  				ds[j] = info->vcw[j].buf;
> -			ec_decode_buffer(ctx, ds, idx, obj, d + k);
> +			ec_decode_buffer(ctx, ds, idx, obj, d + k,
> +					 info->object_size);
>  			if (memcmp(obj, info->vcw[d + k].buf, len) != 0) {
>  				/* TODO repair the inconsistency */
>  				sd_err("object %"PRIx64" is inconsistent", oid);
> @@ -1900,7 +1955,8 @@ static void check_erasure_object(struct vdi_check_info *info)
>  
>  			for (i = 0; i < d; i++)
>  				ds[i] = input[i];
> -			ec_decode_buffer(ctx, ds, input_idx, obj, m);
> +			ec_decode_buffer(ctx, ds, input_idx, obj, m,
> +					 info->object_size);
>  			write_object_to(info->vcw[m].vnode, oid, obj,
>  					len, true, info->vcw[m].ec_index);
>  			fprintf(stdout, "fixed missing %"PRIx64", "
> @@ -2023,6 +2079,7 @@ struct check_arg {
>  	uint64_t *done;
>  	struct work_queue *wq;
>  	int nr_copies;
> +	uint32_t object_size;
>  };
>  
>  static void check_cb(struct sd_index *idx, void *arg, int ignore)
> @@ -2032,7 +2089,7 @@ static void check_cb(struct sd_index *idx, void *arg, int ignore)
>  
>  	if (idx->vdi_id) {
>  		oid = vid_to_data_oid(idx->vdi_id, idx->idx);
> -		*(carg->done) = (uint64_t)idx->idx * SD_DATA_OBJ_SIZE;
> +		*(carg->done) = (uint64_t)idx->idx * carg->object_size;
>  		vdi_show_progress(*(carg->done), carg->inode->vdi_size);
>  		queue_vdi_check_work(carg->inode, oid, NULL, carg->wq,
>  				     carg->nr_copies);
> @@ -2046,6 +2103,7 @@ int do_vdi_check(const struct sd_inode *inode)
>  	uint32_t vid;
>  	struct work_queue *wq;
>  	int nr_copies = min((int)inode->nr_copies, sd_zones_nr);
> +	uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
>  
>  	if (0 < inode->copy_policy && sd_zones_nr < (int)inode->nr_copies) {
>  		sd_err("ABORT: Not enough active zones for consistency-checking"
> @@ -2070,12 +2128,13 @@ int do_vdi_check(const struct sd_inode *inode)
>  				queue_vdi_check_work(inode, oid, &done, wq,
>  						     nr_copies);
>  			} else {
> -				done += SD_DATA_OBJ_SIZE;
> +				done += object_size;
>  				vdi_show_progress(done, inode->vdi_size);
>  			}
>  		}
>  	} else {
> -		struct check_arg arg = {inode, &done, wq, nr_copies};
> +		struct check_arg arg = {inode, &done, wq, nr_copies,
> +					object_size};
>  		sd_inode_index_walk(inode, check_cb, &arg);
>  		vdi_show_progress(inode->vdi_size, inode->vdi_size);
>  	}
> @@ -2125,11 +2184,12 @@ struct obj_backup {
>  	uint32_t offset;
>  	uint32_t length;
>  	uint32_t reserved;
> -	uint8_t data[SD_DATA_OBJ_SIZE];
> +	uint8_t *data;
>  };
>  
>  /* discards redundant area from backup data */
> -static void compact_obj_backup(struct obj_backup *backup, uint8_t *from_data)
> +static void compact_obj_backup(struct obj_backup *backup, uint8_t *from_data,
> +			       uint32_t object_size)
>  {
>  	uint8_t *p1, *p2;
>  
> @@ -2142,8 +2202,8 @@ static void compact_obj_backup(struct obj_backup *backup, uint8_t *from_data)
>  		backup->length -= SECTOR_SIZE;
>  	}
>  
> -	p1 = backup->data + SD_DATA_OBJ_SIZE - SECTOR_SIZE;
> -	p2 = from_data + SD_DATA_OBJ_SIZE - SECTOR_SIZE;
> +	p1 = backup->data + object_size - SECTOR_SIZE;
> +	p2 = from_data + object_size - SECTOR_SIZE;
>  	while (backup->length > 0 && memcmp(p1, p2, SECTOR_SIZE) == 0) {
>  		p1 -= SECTOR_SIZE;
>  		p2 -= SECTOR_SIZE;
> @@ -2152,29 +2212,29 @@ static void compact_obj_backup(struct obj_backup *backup, uint8_t *from_data)
>  }
>  
>  static int get_obj_backup(uint32_t idx, uint32_t from_vid, uint32_t to_vid,
> -			  struct obj_backup *backup)
> +			  struct obj_backup *backup, uint32_t object_size)
>  {
>  	int ret;
> -	uint8_t *from_data = xzalloc(SD_DATA_OBJ_SIZE);
> +	uint8_t *from_data = xzalloc(object_size);
>  
>  	backup->idx = idx;
>  	backup->offset = 0;
> -	backup->length = SD_DATA_OBJ_SIZE;
> +	backup->length = object_size;
>  
>  	if (to_vid) {
>  		ret = dog_read_object(vid_to_data_oid(to_vid, idx),
> -				      backup->data, SD_DATA_OBJ_SIZE, 0, true);
> +				      backup->data, object_size, 0, true);
>  		if (ret != SD_RES_SUCCESS) {
>  			sd_err("Failed to read object %" PRIx32 ", %d", to_vid,
>  			       idx);
>  			return EXIT_FAILURE;
>  		}
>  	} else
> -		memset(backup->data, 0, SD_DATA_OBJ_SIZE);
> +		memset(backup->data, 0, object_size);
>  
>  	if (from_vid) {
>  		ret = dog_read_object(vid_to_data_oid(from_vid, idx), from_data,
> -				      SD_DATA_OBJ_SIZE, 0, true);
> +				      object_size, 0, true);
>  		if (ret != SD_RES_SUCCESS) {
>  			sd_err("Failed to read object %" PRIx32 ", %d",
>  			       from_vid, idx);
> @@ -2182,7 +2242,7 @@ static int get_obj_backup(uint32_t idx, uint32_t from_vid, uint32_t to_vid,
>  		}
>  	}
>  
> -	compact_obj_backup(backup, from_data);
> +	compact_obj_backup(backup, from_data, object_size);
>  
>  	free(from_data);
>  
> @@ -2194,13 +2254,13 @@ static int vdi_backup(int argc, char **argv)
>  	const char *vdiname = argv[optind++];
>  	int ret = EXIT_SUCCESS;
>  	uint32_t idx, nr_objs;
> +	uint32_t object_size;
>  	struct sd_inode *from_inode = xzalloc(sizeof(*from_inode));
>  	struct sd_inode *to_inode = xzalloc(sizeof(*to_inode));
>  	struct backup_hdr hdr = {
>  		.version = VDI_BACKUP_FORMAT_VERSION,
>  		.magic = VDI_BACKUP_MAGIC,
>  	};
> -	struct obj_backup *backup = xzalloc(sizeof(*backup));
>  
>  	if ((!vdi_cmd_data.snapshot_id && !vdi_cmd_data.snapshot_tag[0]) ||
>  	    (!vdi_cmd_data.from_snapshot_id &&
> @@ -2214,21 +2274,25 @@ static int vdi_backup(int argc, char **argv)
>  			   vdi_cmd_data.from_snapshot_tag, NULL,
>  			   from_inode, SD_INODE_SIZE);
>  	if (ret != EXIT_SUCCESS)
> -		goto out;
> +		goto load_inode_err;
>  
>  	ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id,
>  			   vdi_cmd_data.snapshot_tag, NULL, to_inode,
>  			   SD_INODE_SIZE);
>  	if (ret != EXIT_SUCCESS)
> -		goto out;
> +		goto load_inode_err;
>  
>  	nr_objs = count_data_objs(to_inode);
>  
> +	struct obj_backup *backup = xzalloc(sizeof(*backup));
> +	object_size = (UINT32_C(1) << from_inode->block_size_shift);
> +	backup->data = xzalloc(sizeof(uint8_t) * object_size);
> +
>  	ret = xwrite(STDOUT_FILENO, &hdr, sizeof(hdr));
>  	if (ret < 0) {
>  		sd_err("failed to write backup header, %m");
>  		ret = EXIT_SYSFAIL;
> -		goto out;
> +		goto error;
>  	}
>  
>  	for (idx = 0; idx < nr_objs; idx++) {
> @@ -2238,9 +2302,10 @@ static int vdi_backup(int argc, char **argv)
>  		if (to_vid == 0 && from_vid == 0)
>  			continue;
>  
> -		ret = get_obj_backup(idx, from_vid, to_vid, backup);
> +		ret = get_obj_backup(idx, from_vid, to_vid,
> +				     backup, object_size);
>  		if (ret != EXIT_SUCCESS)
> -			goto out;
> +			goto error;
>  
>  		if (backup->length == 0)
>  			continue;
> @@ -2250,14 +2315,14 @@ static int vdi_backup(int argc, char **argv)
>  		if (ret < 0) {
>  			sd_err("failed to write backup data, %m");
>  			ret = EXIT_SYSFAIL;
> -			goto out;
> +			goto error;
>  		}
>  		ret = xwrite(STDOUT_FILENO, backup->data + backup->offset,
>  			     backup->length);
>  		if (ret < 0) {
>  			sd_err("failed to write backup data, %m");
>  			ret = EXIT_SYSFAIL;
> -			goto out;
> +			goto error;
>  		}
>  	}
>  
> @@ -2269,15 +2334,18 @@ static int vdi_backup(int argc, char **argv)
>  	if (ret < 0) {
>  		sd_err("failed to write end marker, %m");
>  		ret = EXIT_SYSFAIL;
> -		goto out;
> +		goto error;
>  	}
>  
>  	fsync(STDOUT_FILENO);
>  	ret = EXIT_SUCCESS;
> -out:
> +error:
> +	free(backup->data);
> +	free(backup);
> +load_inode_err:
>  	free(from_inode);
>  	free(to_inode);
> -	free(backup);
> +out:
>  	return ret;
>  }
>  
> @@ -2310,6 +2378,7 @@ static uint32_t do_restore(const char *vdiname, int snapid, const char *tag)
>  {
>  	int ret;
>  	uint32_t vid;
> +	uint32_t object_size;
>  	struct backup_hdr hdr;
>  	struct obj_backup *backup = xzalloc(sizeof(*backup));
>  	struct sd_inode *inode = xzalloc(sizeof(*inode));
> @@ -2329,9 +2398,10 @@ static uint32_t do_restore(const char *vdiname, int snapid, const char *tag)
>  	if (ret != EXIT_SUCCESS)
>  		goto out;
>  
> +	object_size = (UINT32_C(1) << inode->block_size_shift);
>  	ret = do_vdi_create(vdiname, inode->vdi_size, inode->vdi_id, &vid,
>  			    false, inode->nr_copies, inode->copy_policy,
> -			    inode->store_policy);
> +			    inode->store_policy, object_size);
>  	if (ret != EXIT_SUCCESS) {
>  		sd_err("Failed to read VDI");
>  		goto out;
> @@ -2435,12 +2505,15 @@ static int vdi_restore(int argc, char **argv)
>  out:
>  	if (need_current_recovery) {
>  		int recovery_ret;
> +		uint32_t object_size =
> +			(UINT32_C(1) << current_inode->block_size_shift);
>  		/* recreate the current vdi object */
>  		recovery_ret = do_vdi_create(vdiname, current_inode->vdi_size,
>  					     current_inode->parent_vdi_id, NULL,
>  					     true, current_inode->nr_copies,
>  					     current_inode->copy_policy,
> -					     current_inode->store_policy);
> +					     current_inode->store_policy,
> +					     object_size);
>  		if (recovery_ret != EXIT_SUCCESS) {
>  			sd_err("failed to resume the current vdi");
>  			ret = recovery_ret;
> @@ -2563,9 +2636,25 @@ static int vdi_cache_info(int argc, char **argv)
>  
>  	fprintf(stdout, "Name\tTag\tTotal\tDirty\tClean\n");
>  	for (i = 0; i < info.count; i++) {
> -		uint64_t total = info.caches[i].total * SD_DATA_OBJ_SIZE,
> -			 dirty = info.caches[i].dirty * SD_DATA_OBJ_SIZE,
> +		uint32_t object_size;
> +		uint32_t vid = info.caches[i].vid;
> +		struct sd_inode *inode = NULL;
> +		int r;
> +
> +		r = dog_read_object(vid_to_vdi_oid(vid), inode,
> +				    SD_INODE_HEADER_SIZE, 0, true);
> +		if (r != EXIT_SUCCESS)
> +			return r;
> +
> +		if (!inode->block_size_shift)
> +			return EXIT_FAILURE;
> +
> +		object_size = (UINT32_C(1) << inode->block_size_shift);
> +
> +		uint64_t total = info.caches[i].total * object_size,
> +			 dirty = info.caches[i].dirty * object_size,
>  			 clean = total - dirty;
> +
>  		char name[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
>  
>  		ret = vid_to_name_tag(info.caches[i].vid, name, tag);
> @@ -2955,7 +3044,7 @@ static struct subcommand vdi_cmd[] = {
>  	{"check", "<vdiname>", "seaphT", "check and repair image's consistency",
>  	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
>  	 vdi_check, vdi_options},
> -	{"create", "<vdiname> <size>", "PycaphrvT", "create an image",
> +	{"create", "<vdiname> <size>", "PycaphrvzT", "create an image",
>  	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
>  	 vdi_create, vdi_options},
>  	{"snapshot", "<vdiname>", "saphrvT", "create a snapshot",
> @@ -3023,6 +3112,7 @@ static struct subcommand vdi_cmd[] = {
>  static int vdi_parser(int ch, const char *opt)
>  {
>  	char *p;
> +	uint32_t object_size_shift_bit;
>  
>  	switch (ch) {
>  	case 'P':
> @@ -3101,6 +3191,20 @@ static int vdi_parser(int ch, const char *opt)
>  	case 'e':
>  		vdi_cmd_data.exist = true;
>  		break;
> +	case 'z':
> +		object_size_shift_bit = (uint32_t)atoi(opt);
> +		if (object_size_shift_bit > 31) {
> +			sd_err("Object Size is limited to 2^31."
> +			       " Please set shift bit lower than 31");
> +			exit(EXIT_FAILURE);
> +		}
> +		vdi_cmd_data.object_size =
> +				(UINT32_C(1) << object_size_shift_bit);
> +		if (!vdi_cmd_data.object_size) {
> +			sd_err("Invalid parameter %s", opt);
> +			exit(EXIT_FAILURE);
> +		}
> +		break;
>  	}
>  
>  	return 0;
> diff --git a/include/fec.h b/include/fec.h
> index 1ae32e4..b3ef8d8 100644
> --- a/include/fec.h
> +++ b/include/fec.h
> @@ -96,12 +96,12 @@ void fec_encode(const struct fec *code,
>  		size_t num_block_nums, size_t sz);
>  
>  void fec_decode_buffer(struct fec *ctx, uint8_t *input[], const int in_idx[],
> -		       char *buf, int idx);
> +		       char *buf, int idx, uint32_t object_size);
>  
>  /* for isa-l */
>  
>  void isa_decode_buffer(struct fec *ctx, uint8_t *input[], const int in_idx[],
> -		       char *buf, int idx);
> +		       char *buf, int idx, uint32_t object_size);
>  
>  /*
>   * @param inpkts an array of packets (size k); If a primary block, i, is present
> @@ -119,7 +119,6 @@ void fec_decode(const struct fec *code,
>  
>  /* Set data stripe as sector size to make VM happy */
>  #define SD_EC_DATA_STRIPE_SIZE (512) /* 512 Byte */
> -#define SD_EC_NR_STRIPE_PER_OBJECT (SD_DATA_OBJ_SIZE / SD_EC_DATA_STRIPE_SIZE)
>  #define SD_EC_MAX_STRIP (16)
>  
>  static inline int ec_policy_to_dp(uint8_t policy, int *d, int *p)
> @@ -205,11 +204,12 @@ static inline void ec_destroy(struct fec *ctx)
>  }
>  
>  static inline void ec_decode_buffer(struct fec *ctx, uint8_t *input[],
> -				    const int in_idx[], char *buf, int idx)
> +				    const int in_idx[], char *buf,
> +				    int idx, uint32_t object_size)
>  {
>  	if (cpu_has_ssse3)
> -		isa_decode_buffer(ctx, input, in_idx, buf, idx);
> +		isa_decode_buffer(ctx, input, in_idx, buf, idx, object_size);
>  	else
> -		fec_decode_buffer(ctx, input, in_idx, buf, idx);
> +		fec_decode_buffer(ctx, input, in_idx, buf, idx, object_size);
>  }
>  #endif
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index cbb65b6..5cdedf5 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -477,10 +477,11 @@ static inline bool is_data_obj(uint64_t oid)
>  
>  static inline size_t count_data_objs(const struct sd_inode *inode)
>  {
> -	return DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE);
> +	return DIV_ROUND_UP(inode->vdi_size,
> +			    (UINT32_C(1) << inode->block_size_shift));
>  }
>  
> -static inline size_t get_objsize(uint64_t oid)
> +static inline size_t get_objsize(uint64_t oid, uint32_t object_size)
>  {
>  	if (is_vdi_obj(oid))
>  		return SD_INODE_SIZE;
> @@ -494,7 +495,7 @@ static inline size_t get_objsize(uint64_t oid)
>  	if (is_ledger_object(oid))
>  		return SD_LEDGER_OBJ_SIZE;
>  
> -	return SD_DATA_OBJ_SIZE;



More information about the sheepdog mailing list