[sheepdog] [PATCH 3/3] add selectable object_size support of VDI operation (1/2)
Hitoshi Mitake
mitake.hitoshi at lab.ntt.co.jp
Thu Dec 11 09:29:31 CET 2014
At Tue, 9 Dec 2014 21:49:24 +0900,
Teruaki Ishizaki wrote:
>
> Data object size was fixed at 4MB and was not selectable.
> This patch adds a feature to select the data object size of a VDI.
>
> To use an 8MB data object size, specify the number of shift bits
> (object size = 2^shift bytes, so 23 means 8MB).
> e.g.) dog vdi create -z 23 hogehoge 100M
>
> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki at lab.ntt.co.jp>
> ---
> dog/common.c | 7 +-
> dog/dog.h | 6 +-
> dog/farm/farm.c | 17 ++-
> dog/vdi.c | 254 ++++++++++++++++++++++++++++++-------------
> include/fec.h | 12 +-
> include/sheepdog_proto.h | 7 +-
> lib/fec.c | 9 +-
> sheep/gateway.c | 2 +-
> sheep/group.c | 3 +-
> sheep/journal.c | 5 +-
> sheep/object_cache.c | 27 +++--
> sheep/ops.c | 14 ++-
> sheep/plain_store.c | 17 ++-
> sheep/recovery.c | 3 +-
> sheep/sheep_priv.h | 6 +-
> sheep/vdi.c | 82 +++++++++++---
> tests/unit/sheep/test_vdi.c | 6 +-
> 17 files changed, 336 insertions(+), 141 deletions(-)
>
> diff --git a/dog/common.c b/dog/common.c
> index 2d8a173..11011a7 100644
> --- a/dog/common.c
> +++ b/dog/common.c
> @@ -365,7 +365,8 @@ void show_progress(uint64_t done, uint64_t total, bool raw)
> free(buf);
> }
>
> -size_t get_store_objsize(uint8_t copy_policy, uint64_t oid)
> +size_t get_store_objsize(uint8_t copy_policy, uint32_t object_size,
> + uint64_t oid)
> {
> if (is_vdi_obj(oid))
> return SD_INODE_SIZE;
> @@ -375,9 +376,9 @@ size_t get_store_objsize(uint8_t copy_policy, uint64_t oid)
> int d;
>
> ec_policy_to_dp(copy_policy, &d, NULL);
> - return SD_DATA_OBJ_SIZE / d;
> + return object_size / d;
> }
> - return get_objsize(oid);
> + return get_objsize(oid, object_size);
> }
>
> bool is_erasure_oid(uint64_t oid, uint8_t policy)
> diff --git a/dog/dog.h b/dog/dog.h
> index 80becc6..d460a0b 100644
> --- a/dog/dog.h
> +++ b/dog/dog.h
> @@ -87,10 +87,12 @@ void confirm(const char *message);
> void work_queue_wait(struct work_queue *q);
> int do_vdi_create(const char *vdiname, int64_t vdi_size,
> uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
> - uint8_t nr_copies, uint8_t copy_policy, uint8_t store_policy);
> + uint8_t nr_copies, uint8_t copy_policy,
> + uint8_t store_policy, uint32_t object_size);
> int do_vdi_check(const struct sd_inode *inode);
> void show_progress(uint64_t done, uint64_t total, bool raw);
> -size_t get_store_objsize(uint8_t copy_policy, uint64_t oid);
> +size_t get_store_objsize(uint8_t copy_policy, uint32_t object_size,
> + uint64_t oid);
> bool is_erasure_oid(uint64_t oid, uint8_t policy);
> uint8_t parse_copy(const char *str, uint8_t *copy_policy);
>
> diff --git a/dog/farm/farm.c b/dog/farm/farm.c
> index 9414d42..c5fa40e 100644
> --- a/dog/farm/farm.c
> +++ b/dog/farm/farm.c
> @@ -38,6 +38,7 @@ struct active_vdi_entry {
> uint8_t nr_copies;
> uint8_t copy_policy;
> uint8_t store_policy;
> + uint32_t object_size;
> };
>
> struct registered_obj_entry {
> @@ -77,6 +78,7 @@ static void update_active_vdi_entry(struct active_vdi_entry *vdi,
> vdi->nr_copies = new->nr_copies;
> vdi->copy_policy = new->copy_policy;
> vdi->store_policy = new->store_policy;
> + vdi->object_size = (UINT32_C(1) << new->block_size_shift);
> }
>
> static void add_active_vdi(struct sd_inode *new)
> @@ -131,7 +133,8 @@ static int create_active_vdis(void)
> vdi->vdi_id, &new_vid,
> false, vdi->nr_copies,
> vdi->copy_policy,
> - vdi->store_policy) < 0)
> + vdi->store_policy,
> + vdi->object_size) < 0)
> return -1;
> }
> return 0;
> @@ -202,7 +205,7 @@ out:
> }
>
> static int notify_vdi_add(uint32_t vdi_id, uint8_t nr_copies,
> - uint8_t copy_policy)
> + uint8_t copy_policy, uint32_t object_size)
> {
> int ret;
> struct sd_req hdr;
> @@ -213,13 +216,14 @@ static int notify_vdi_add(uint32_t vdi_id, uint8_t nr_copies,
> hdr.vdi_state.new_vid = vdi_id;
> hdr.vdi_state.copies = nr_copies;
> hdr.vdi_state.copy_policy = copy_policy;
> + hdr.vdi_state.object_size = object_size;
> hdr.vdi_state.set_bitmap = true;
>
> ret = dog_exec_req(&sd_nid, &hdr, buf);
>
> if (ret < 0)
> - sd_err("Fail to notify vdi add event(%"PRIx32", %d)", vdi_id,
> - nr_copies);
> + sd_err("Fail to notify vdi add event(%"PRIx32", %d"
> + ", %"PRIu32")", vdi_id, nr_copies, object_size);
> if (rsp->result != SD_RES_SUCCESS) {
> sd_err("%s", sd_strerror(rsp->result));
> ret = -1;
> @@ -261,7 +265,7 @@ static void do_save_object(struct work *work)
>
> sw = container_of(work, struct snapshot_work, work);
>
> - size = get_objsize(sw->entry.oid);
> + size = get_objsize(sw->entry.oid, sw->entry.object_size);
> buf = xmalloc(size);
>
> if (dog_read_object(sw->entry.oid, buf, size, 0, true) < 0)
> @@ -413,7 +417,8 @@ static void do_load_object(struct work *work)
> vid = oid_to_vid(sw->entry.oid);
> if (register_vdi(vid)) {
> if (notify_vdi_add(vid, sw->entry.nr_copies,
> - sw->entry.copy_policy) < 0)
> + sw->entry.copy_policy,
> + sw->entry.object_size) < 0)
> goto error;
> }
>
> diff --git a/dog/vdi.c b/dog/vdi.c
> index 5353062..3b0c408 100644
> --- a/dog/vdi.c
> +++ b/dog/vdi.c
> @@ -38,6 +38,8 @@ static struct sd_option vdi_options[] = {
> {'o', "oid", true, "specify the object id of the tracking object"},
> {'e', "exist", false, "only check objects exist or not,\n"
> " neither comparing nor repairing"},
> + {'z', "objsize", true, "specify the bit shift num for"
> + " data object size"},
> { 0, NULL, false, NULL },
> };
>
> @@ -49,6 +51,7 @@ static struct vdi_cmd_data {
> bool delete;
> bool prealloc;
> int nr_copies;
> + uint32_t object_size;
> bool writeback;
> int from_snapshot_id;
> char from_snapshot_tag[SD_MAX_VDI_TAG_LEN];
> @@ -67,6 +70,7 @@ struct get_vdi_info {
> uint32_t snapid;
> uint8_t nr_copies;
> uint8_t copy_policy;
> + uint32_t object_size;
> };
>
> int dog_bnode_writer(uint64_t oid, void *mem, unsigned int len, uint64_t offset,
> @@ -118,6 +122,7 @@ static void print_vdi_list(uint32_t vid, const char *name, const char *tag,
> struct tm tm;
> char dbuf[128];
> struct get_vdi_info *info = data;
> + uint32_t object_size = (UINT32_C(1) << i->block_size_shift);
>
> if (info && strcmp(name, info->name) != 0)
> return;
> @@ -143,23 +148,24 @@ static void print_vdi_list(uint32_t vid, const char *name, const char *tag,
> putchar('\\');
> putchar(*name++);
> }
> - printf(" %d %s %s %s %s %" PRIx32 " %s %s\n", snapid,
> - strnumber(i->vdi_size),
> - strnumber(my_objs * SD_DATA_OBJ_SIZE),
> - strnumber(cow_objs * SD_DATA_OBJ_SIZE),
> + printf(" %d %s %s %s %s %" PRIx32 " %s %s %" PRIu32 "\n",
> + snapid, strnumber(i->vdi_size),
> + strnumber(my_objs * object_size),
> + strnumber(cow_objs * object_size),
> dbuf, vid,
> redundancy_scheme(i->nr_copies, i->copy_policy),
> - i->tag);
> + i->tag, object_size);
> } else {
> - printf("%c %-8s %5d %7s %7s %7s %s %7" PRIx32 " %6s %13s\n",
> + printf("%c %-8s %5d %7s %7s %7s %s %7" PRIx32
> + " %6s %13s %7" PRIu32 "\n",
> vdi_is_snapshot(i) ? 's' : (is_clone ? 'c' : ' '),
> name, snapid,
> strnumber(i->vdi_size),
> - strnumber(my_objs * SD_DATA_OBJ_SIZE),
> - strnumber(cow_objs * SD_DATA_OBJ_SIZE),
> + strnumber(my_objs * object_size),
> + strnumber(cow_objs * object_size),
> dbuf, vid,
> redundancy_scheme(i->nr_copies, i->copy_policy),
> - i->tag);
> + i->tag, object_size);
> }
> }
>
> @@ -282,7 +288,8 @@ static int vdi_list(int argc, char **argv)
> const char *vdiname = argv[optind];
>
> if (!raw_output)
> - printf(" Name Id Size Used Shared Creation time VDI id Copies Tag\n");
> + printf(" Name Id Size Used Shared"
> + " Creation time VDI id Copies Tag Obj Size\n");
>
> if (vdiname) {
> struct get_vdi_info info;
> @@ -396,7 +403,8 @@ int read_vdi_obj(const char *vdiname, int snapid, const char *tag,
>
> int do_vdi_create(const char *vdiname, int64_t vdi_size,
> uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
> - uint8_t nr_copies, uint8_t copy_policy, uint8_t store_policy)
> + uint8_t nr_copies, uint8_t copy_policy,
> + uint8_t store_policy, uint32_t object_size)
> {
> struct sd_req hdr;
> struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> @@ -416,6 +424,7 @@ int do_vdi_create(const char *vdiname, int64_t vdi_size,
> hdr.vdi.copies = nr_copies;
> hdr.vdi.copy_policy = copy_policy;
> hdr.vdi.store_policy = store_policy;
> + hdr.vdi.object_size = object_size;
>
> ret = dog_exec_req(&sd_nid, &hdr, buf);
> if (ret < 0)
> @@ -440,6 +449,8 @@ static int vdi_create(int argc, char **argv)
> uint32_t vid;
> uint64_t oid;
> uint32_t idx, max_idx;
> + uint32_t object_size;
> + uint64_t old_max_total_size = 0;
> struct sd_inode *inode = NULL;
> int ret;
>
> @@ -451,10 +462,34 @@ static int vdi_create(int argc, char **argv)
> if (ret < 0)
> return EXIT_USAGE;
>
> - if (size > SD_OLD_MAX_VDI_SIZE && 0 == vdi_cmd_data.store_policy) {
> + if (vdi_cmd_data.object_size)
> + old_max_total_size =
> + vdi_cmd_data.object_size * OLD_MAX_DATA_OBJS;
> + else{
> + struct sd_req hdr;
> + struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> + struct cluster_info cinfo;
> + sd_init_req(&hdr, SD_OP_CLUSTER_INFO);
> + hdr.data_length = sizeof(cinfo);
> + ret = dog_exec_req(&sd_nid, &hdr, &cinfo);
> + if (ret < 0) {
> + sd_err("Fail to execute request: SD_OP_CLUSTER_INFO");
> + ret = EXIT_FAILURE;
> + goto out;
> + }
> + if (rsp->result != SD_RES_SUCCESS) {
> + sd_err("%s", sd_strerror(rsp->result));
> + ret = EXIT_FAILURE;
> + goto out;
> + }
> + old_max_total_size = cinfo.object_size * OLD_MAX_DATA_OBJS;
> + }
I don't understand why the block size needs to be read before creating
the VDI. If sd_req->vdi.object_size is equal to 0, sheep can use its
default value from cinfo->object_size.
In addition, sd_inode already has a member, block_size_shift, for
representing the object size. The newly added members of cluster_info and
sd_req should hold the block size shift (a number of bits), not a size in bytes.
Thanks,
Hitoshi
> +
> + if (size > old_max_total_size && 0 == vdi_cmd_data.store_policy) {
> sd_err("VDI size is larger than %s bytes, please use '-y' to "
> - "create a hyper volume with size up to %s bytes",
> - strnumber(SD_OLD_MAX_VDI_SIZE),
> + "create a hyper volume with size up to %s bytes"
> + " or use '-z' to create larger object size volume",
> + strnumber(old_max_total_size),
> strnumber(SD_MAX_VDI_SIZE));
> return EXIT_USAGE;
> }
> @@ -466,7 +501,8 @@ static int vdi_create(int argc, char **argv)
>
> ret = do_vdi_create(vdiname, size, 0, &vid, false,
> vdi_cmd_data.nr_copies, vdi_cmd_data.copy_policy,
> - vdi_cmd_data.store_policy);
> + vdi_cmd_data.store_policy,
> + vdi_cmd_data.object_size);
> if (ret != EXIT_SUCCESS || !vdi_cmd_data.prealloc)
> goto out;
>
> @@ -479,10 +515,11 @@ static int vdi_create(int argc, char **argv)
> ret = EXIT_FAILURE;
> goto out;
> }
> - max_idx = DIV_ROUND_UP(size, SD_DATA_OBJ_SIZE);
> + object_size = (UINT32_C(1) << inode->block_size_shift);
> + max_idx = DIV_ROUND_UP(size, object_size);
>
> for (idx = 0; idx < max_idx; idx++) {
> - vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size);
> + vdi_show_progress(idx * object_size, inode->vdi_size);
> oid = vid_to_data_oid(vid, idx);
>
> ret = dog_write_object(oid, 0, NULL, 0, 0, 0, inode->nr_copies,
> @@ -499,7 +536,7 @@ static int vdi_create(int argc, char **argv)
> goto out;
> }
> }
> - vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size);
> + vdi_show_progress(idx * object_size, inode->vdi_size);
> ret = EXIT_SUCCESS;
>
> out:
> @@ -559,6 +596,7 @@ static int vdi_snapshot(int argc, char **argv)
> {
> const char *vdiname = argv[optind++];
> uint32_t vid, new_vid;
> + uint32_t object_size;
> int ret;
> char buf[SD_INODE_HEADER_SIZE];
> struct sd_inode *inode = (struct sd_inode *)buf;
> @@ -662,9 +700,10 @@ static int vdi_snapshot(int argc, char **argv)
> if (ret != SD_RES_SUCCESS)
> goto out;
>
> + object_size = (UINT32_C(1) << inode->block_size_shift);
> ret = do_vdi_create(vdiname, inode->vdi_size, vid, &new_vid, true,
> inode->nr_copies, inode->copy_policy,
> - inode->store_policy);
> + inode->store_policy, object_size);
>
> if (ret == EXIT_SUCCESS && verbose) {
> if (raw_output)
> @@ -691,6 +730,7 @@ static int vdi_clone(int argc, char **argv)
> uint32_t base_vid, new_vid, vdi_id;
> uint64_t oid;
> uint32_t idx, max_idx, ret;
> + uint32_t object_size;
> struct sd_inode *inode = NULL, *new_inode = NULL;
> char *buf = NULL;
>
> @@ -719,9 +759,10 @@ static int vdi_clone(int argc, char **argv)
> if (vdi_cmd_data.no_share == true)
> base_vid = 0;
>
> + object_size = (UINT32_C(1) << inode->block_size_shift);
> ret = do_vdi_create(dst_vdi, inode->vdi_size, base_vid, &new_vid, false,
> inode->nr_copies, inode->copy_policy,
> - inode->store_policy);
> + inode->store_policy, object_size);
> if (ret != EXIT_SUCCESS ||
> (!vdi_cmd_data.prealloc && !vdi_cmd_data.no_share))
> goto out;
> @@ -732,23 +773,23 @@ static int vdi_clone(int argc, char **argv)
> if (ret != EXIT_SUCCESS)
> goto out;
>
> - buf = xzalloc(SD_DATA_OBJ_SIZE);
> + buf = xzalloc(object_size);
> max_idx = count_data_objs(inode);
>
> for (idx = 0; idx < max_idx; idx++) {
> size_t size;
>
> - vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size);
> + vdi_show_progress(idx * object_size, inode->vdi_size);
> vdi_id = sd_inode_get_vid(inode, idx);
> if (vdi_id) {
> oid = vid_to_data_oid(vdi_id, idx);
> - ret = dog_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0,
> + ret = dog_read_object(oid, buf, object_size, 0,
> true);
> if (ret) {
> ret = EXIT_FAILURE;
> goto out;
> }
> - size = SD_DATA_OBJ_SIZE;
> + size = object_size;
> } else {
> if (vdi_cmd_data.no_share && !vdi_cmd_data.prealloc)
> continue;
> @@ -772,7 +813,7 @@ static int vdi_clone(int argc, char **argv)
> goto out;
> }
> }
> - vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size);
> + vdi_show_progress(idx * object_size, inode->vdi_size);
> ret = EXIT_SUCCESS;
>
> out:
> @@ -952,6 +993,7 @@ static int vdi_rollback(int argc, char **argv)
> {
> const char *vdiname = argv[optind++];
> uint32_t base_vid, new_vid;
> + uint32_t object_size;
> int ret;
> char buf[SD_INODE_HEADER_SIZE];
> struct sd_inode *inode = (struct sd_inode *)buf;
> @@ -977,9 +1019,10 @@ static int vdi_rollback(int argc, char **argv)
> return EXIT_FAILURE;
> }
>
> + object_size = (UINT32_C(1) << inode->block_size_shift);
> ret = do_vdi_create(vdiname, inode->vdi_size, base_vid, &new_vid,
> false, vdi_cmd_data.nr_copies, inode->copy_policy,
> - inode->store_policy);
> + inode->store_policy, object_size);
>
> if (ret == EXIT_SUCCESS && verbose) {
> if (raw_output)
> @@ -1494,6 +1537,7 @@ static int vdi_read(int argc, char **argv)
> struct sd_inode *inode = NULL;
> uint64_t offset = 0, oid, done = 0, total = (uint64_t) -1;
> uint32_t vdi_id, idx;
> + uint32_t object_size;
> unsigned int len;
> char *buf = NULL;
>
> @@ -1509,25 +1553,27 @@ static int vdi_read(int argc, char **argv)
> }
>
> inode = malloc(sizeof(*inode));
> - buf = xmalloc(SD_DATA_OBJ_SIZE);
>
> ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id,
> vdi_cmd_data.snapshot_tag, NULL, inode,
> SD_INODE_SIZE);
> if (ret != EXIT_SUCCESS)
> - goto out;
> + goto load_inode_err;
>
> if (inode->vdi_size < offset) {
> sd_err("Read offset is beyond the end of the VDI");
> ret = EXIT_FAILURE;
> - goto out;
> + goto load_inode_err;
> }
>
> + object_size = (UINT32_C(1) << inode->block_size_shift);
> + buf = xmalloc(object_size);
> +
> total = min(total, inode->vdi_size - offset);
> - idx = offset / SD_DATA_OBJ_SIZE;
> - offset %= SD_DATA_OBJ_SIZE;
> + idx = offset / object_size;
> + offset %= object_size;
> while (done < total) {
> - len = min(total - done, SD_DATA_OBJ_SIZE - offset);
> + len = min(total - done, object_size - offset);
> vdi_id = sd_inode_get_vid(inode, idx);
> if (vdi_id) {
> oid = vid_to_data_oid(vdi_id, idx);
> @@ -1554,8 +1600,9 @@ static int vdi_read(int argc, char **argv)
> fsync(STDOUT_FILENO);
> ret = EXIT_SUCCESS;
> out:
> - free(inode);
> free(buf);
> +load_inode_err:
> + free(inode);
>
> return ret;
> }
> @@ -1564,6 +1611,7 @@ static int vdi_write(int argc, char **argv)
> {
> const char *vdiname = argv[optind++];
> uint32_t vid, flags, vdi_id, idx;
> + uint32_t object_size;
> int ret;
> struct sd_inode *inode = NULL;
> uint64_t offset = 0, oid, old_oid, done = 0, total = (uint64_t) -1;
> @@ -1583,26 +1631,28 @@ static int vdi_write(int argc, char **argv)
> }
>
> inode = xmalloc(sizeof(*inode));
> - buf = xmalloc(SD_DATA_OBJ_SIZE);
>
> ret = read_vdi_obj(vdiname, 0, "", &vid, inode, SD_INODE_SIZE);
> if (ret != EXIT_SUCCESS)
> - goto out;
> + goto load_inode_err;
>
> if (inode->vdi_size < offset) {
> sd_err("Write offset is beyond the end of the VDI");
> ret = EXIT_FAILURE;
> - goto out;
> + goto load_inode_err;
> }
>
> + object_size = (UINT32_C(1) << inode->block_size_shift);
> + buf = xmalloc(object_size);
> +
> total = min(total, inode->vdi_size - offset);
> - idx = offset / SD_DATA_OBJ_SIZE;
> - offset %= SD_DATA_OBJ_SIZE;
> + idx = offset / object_size;
> + offset %= object_size;
> while (done < total) {
> create = false;
> old_oid = 0;
> flags = 0;
> - len = min(total - done, SD_DATA_OBJ_SIZE - offset);
> + len = min(total - done, object_size - offset);
>
> vdi_id = sd_inode_get_vid(inode, idx);
> if (!vdi_id)
> @@ -1647,7 +1697,7 @@ static int vdi_write(int argc, char **argv)
> }
>
> offset += len;
> - if (offset == SD_DATA_OBJ_SIZE) {
> + if (offset == object_size) {
> offset = 0;
> idx++;
> }
> @@ -1655,8 +1705,9 @@ static int vdi_write(int argc, char **argv)
> }
> ret = EXIT_SUCCESS;
> out:
> - free(inode);
> free(buf);
> +load_inode_err:
> + free(inode);
>
> return ret;
> }
> @@ -1709,6 +1760,7 @@ struct vdi_check_info {
> uint64_t oid;
> uint8_t nr_copies;
> uint8_t copy_policy;
> + uint32_t object_size;
> uint64_t total;
> uint64_t *done;
> int refcnt;
> @@ -1721,7 +1773,7 @@ struct vdi_check_info {
> static void free_vdi_check_info(struct vdi_check_info *info)
> {
> if (info->done) {
> - *info->done += SD_DATA_OBJ_SIZE;
> + *info->done += info->object_size;
> vdi_show_progress(*info->done, info->total);
> }
> free(info);
> @@ -1783,6 +1835,7 @@ static void vdi_check_object_work(struct work *work)
> if (is_erasure_oid(info->oid, info->copy_policy)) {
> sd_init_req(&hdr, SD_OP_READ_PEER);
> hdr.data_length = get_store_objsize(info->copy_policy,
> + info->object_size,
> info->oid);
> hdr.obj.ec_index = vcw->ec_index;
> hdr.epoch = sd_epoch;
> @@ -1856,7 +1909,8 @@ static void check_erasure_object(struct vdi_check_info *info)
> struct fec *ctx = ec_init(d, dp);
> int miss_idx[dp], input_idx[dp];
> uint64_t oid = info->oid;
> - size_t len = get_store_objsize(info->copy_policy, oid);
> + size_t len = get_store_objsize(info->copy_policy,
> + info->object_size, oid);
> char *obj = xmalloc(len);
> uint8_t *input[dp];
>
> @@ -1882,7 +1936,8 @@ static void check_erasure_object(struct vdi_check_info *info)
> uint8_t *ds[d];
> for (j = 0; j < d; j++)
> ds[j] = info->vcw[j].buf;
> - ec_decode_buffer(ctx, ds, idx, obj, d + k);
> + ec_decode_buffer(ctx, ds, idx, obj, d + k,
> + info->object_size);
> if (memcmp(obj, info->vcw[d + k].buf, len) != 0) {
> /* TODO repair the inconsistency */
> sd_err("object %"PRIx64" is inconsistent", oid);
> @@ -1900,7 +1955,8 @@ static void check_erasure_object(struct vdi_check_info *info)
>
> for (i = 0; i < d; i++)
> ds[i] = input[i];
> - ec_decode_buffer(ctx, ds, input_idx, obj, m);
> + ec_decode_buffer(ctx, ds, input_idx, obj, m,
> + info->object_size);
> write_object_to(info->vcw[m].vnode, oid, obj,
> len, true, info->vcw[m].ec_index);
> fprintf(stdout, "fixed missing %"PRIx64", "
> @@ -2023,6 +2079,7 @@ struct check_arg {
> uint64_t *done;
> struct work_queue *wq;
> int nr_copies;
> + uint32_t object_size;
> };
>
> static void check_cb(struct sd_index *idx, void *arg, int ignore)
> @@ -2032,7 +2089,7 @@ static void check_cb(struct sd_index *idx, void *arg, int ignore)
>
> if (idx->vdi_id) {
> oid = vid_to_data_oid(idx->vdi_id, idx->idx);
> - *(carg->done) = (uint64_t)idx->idx * SD_DATA_OBJ_SIZE;
> + *(carg->done) = (uint64_t)idx->idx * carg->object_size;
> vdi_show_progress(*(carg->done), carg->inode->vdi_size);
> queue_vdi_check_work(carg->inode, oid, NULL, carg->wq,
> carg->nr_copies);
> @@ -2046,6 +2103,7 @@ int do_vdi_check(const struct sd_inode *inode)
> uint32_t vid;
> struct work_queue *wq;
> int nr_copies = min((int)inode->nr_copies, sd_zones_nr);
> + uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
>
> if (0 < inode->copy_policy && sd_zones_nr < (int)inode->nr_copies) {
> sd_err("ABORT: Not enough active zones for consistency-checking"
> @@ -2070,12 +2128,13 @@ int do_vdi_check(const struct sd_inode *inode)
> queue_vdi_check_work(inode, oid, &done, wq,
> nr_copies);
> } else {
> - done += SD_DATA_OBJ_SIZE;
> + done += object_size;
> vdi_show_progress(done, inode->vdi_size);
> }
> }
> } else {
> - struct check_arg arg = {inode, &done, wq, nr_copies};
> + struct check_arg arg = {inode, &done, wq, nr_copies,
> + object_size};
> sd_inode_index_walk(inode, check_cb, &arg);
> vdi_show_progress(inode->vdi_size, inode->vdi_size);
> }
> @@ -2125,11 +2184,12 @@ struct obj_backup {
> uint32_t offset;
> uint32_t length;
> uint32_t reserved;
> - uint8_t data[SD_DATA_OBJ_SIZE];
> + uint8_t *data;
> };
>
> /* discards redundant area from backup data */
> -static void compact_obj_backup(struct obj_backup *backup, uint8_t *from_data)
> +static void compact_obj_backup(struct obj_backup *backup, uint8_t *from_data,
> + uint32_t object_size)
> {
> uint8_t *p1, *p2;
>
> @@ -2142,8 +2202,8 @@ static void compact_obj_backup(struct obj_backup *backup, uint8_t *from_data)
> backup->length -= SECTOR_SIZE;
> }
>
> - p1 = backup->data + SD_DATA_OBJ_SIZE - SECTOR_SIZE;
> - p2 = from_data + SD_DATA_OBJ_SIZE - SECTOR_SIZE;
> + p1 = backup->data + object_size - SECTOR_SIZE;
> + p2 = from_data + object_size - SECTOR_SIZE;
> while (backup->length > 0 && memcmp(p1, p2, SECTOR_SIZE) == 0) {
> p1 -= SECTOR_SIZE;
> p2 -= SECTOR_SIZE;
> @@ -2152,29 +2212,29 @@ static void compact_obj_backup(struct obj_backup *backup, uint8_t *from_data)
> }
>
> static int get_obj_backup(uint32_t idx, uint32_t from_vid, uint32_t to_vid,
> - struct obj_backup *backup)
> + struct obj_backup *backup, uint32_t object_size)
> {
> int ret;
> - uint8_t *from_data = xzalloc(SD_DATA_OBJ_SIZE);
> + uint8_t *from_data = xzalloc(object_size);
>
> backup->idx = idx;
> backup->offset = 0;
> - backup->length = SD_DATA_OBJ_SIZE;
> + backup->length = object_size;
>
> if (to_vid) {
> ret = dog_read_object(vid_to_data_oid(to_vid, idx),
> - backup->data, SD_DATA_OBJ_SIZE, 0, true);
> + backup->data, object_size, 0, true);
> if (ret != SD_RES_SUCCESS) {
> sd_err("Failed to read object %" PRIx32 ", %d", to_vid,
> idx);
> return EXIT_FAILURE;
> }
> } else
> - memset(backup->data, 0, SD_DATA_OBJ_SIZE);
> + memset(backup->data, 0, object_size);
>
> if (from_vid) {
> ret = dog_read_object(vid_to_data_oid(from_vid, idx), from_data,
> - SD_DATA_OBJ_SIZE, 0, true);
> + object_size, 0, true);
> if (ret != SD_RES_SUCCESS) {
> sd_err("Failed to read object %" PRIx32 ", %d",
> from_vid, idx);
> @@ -2182,7 +2242,7 @@ static int get_obj_backup(uint32_t idx, uint32_t from_vid, uint32_t to_vid,
> }
> }
>
> - compact_obj_backup(backup, from_data);
> + compact_obj_backup(backup, from_data, object_size);
>
> free(from_data);
>
> @@ -2194,13 +2254,13 @@ static int vdi_backup(int argc, char **argv)
> const char *vdiname = argv[optind++];
> int ret = EXIT_SUCCESS;
> uint32_t idx, nr_objs;
> + uint32_t object_size;
> struct sd_inode *from_inode = xzalloc(sizeof(*from_inode));
> struct sd_inode *to_inode = xzalloc(sizeof(*to_inode));
> struct backup_hdr hdr = {
> .version = VDI_BACKUP_FORMAT_VERSION,
> .magic = VDI_BACKUP_MAGIC,
> };
> - struct obj_backup *backup = xzalloc(sizeof(*backup));
>
> if ((!vdi_cmd_data.snapshot_id && !vdi_cmd_data.snapshot_tag[0]) ||
> (!vdi_cmd_data.from_snapshot_id &&
> @@ -2214,21 +2274,25 @@ static int vdi_backup(int argc, char **argv)
> vdi_cmd_data.from_snapshot_tag, NULL,
> from_inode, SD_INODE_SIZE);
> if (ret != EXIT_SUCCESS)
> - goto out;
> + goto load_inode_err;
>
> ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id,
> vdi_cmd_data.snapshot_tag, NULL, to_inode,
> SD_INODE_SIZE);
> if (ret != EXIT_SUCCESS)
> - goto out;
> + goto load_inode_err;
>
> nr_objs = count_data_objs(to_inode);
>
> + struct obj_backup *backup = xzalloc(sizeof(*backup));
> + object_size = (UINT32_C(1) << from_inode->block_size_shift);
> + backup->data = xzalloc(sizeof(uint8_t) * object_size);
> +
> ret = xwrite(STDOUT_FILENO, &hdr, sizeof(hdr));
> if (ret < 0) {
> sd_err("failed to write backup header, %m");
> ret = EXIT_SYSFAIL;
> - goto out;
> + goto error;
> }
>
> for (idx = 0; idx < nr_objs; idx++) {
> @@ -2238,9 +2302,10 @@ static int vdi_backup(int argc, char **argv)
> if (to_vid == 0 && from_vid == 0)
> continue;
>
> - ret = get_obj_backup(idx, from_vid, to_vid, backup);
> + ret = get_obj_backup(idx, from_vid, to_vid,
> + backup, object_size);
> if (ret != EXIT_SUCCESS)
> - goto out;
> + goto error;
>
> if (backup->length == 0)
> continue;
> @@ -2250,14 +2315,14 @@ static int vdi_backup(int argc, char **argv)
> if (ret < 0) {
> sd_err("failed to write backup data, %m");
> ret = EXIT_SYSFAIL;
> - goto out;
> + goto error;
> }
> ret = xwrite(STDOUT_FILENO, backup->data + backup->offset,
> backup->length);
> if (ret < 0) {
> sd_err("failed to write backup data, %m");
> ret = EXIT_SYSFAIL;
> - goto out;
> + goto error;
> }
> }
>
> @@ -2269,15 +2334,18 @@ static int vdi_backup(int argc, char **argv)
> if (ret < 0) {
> sd_err("failed to write end marker, %m");
> ret = EXIT_SYSFAIL;
> - goto out;
> + goto error;
> }
>
> fsync(STDOUT_FILENO);
> ret = EXIT_SUCCESS;
> -out:
> +error:
> + free(backup->data);
> + free(backup);
> +load_inode_err:
> free(from_inode);
> free(to_inode);
> - free(backup);
> +out:
> return ret;
> }
>
> @@ -2310,6 +2378,7 @@ static uint32_t do_restore(const char *vdiname, int snapid, const char *tag)
> {
> int ret;
> uint32_t vid;
> + uint32_t object_size;
> struct backup_hdr hdr;
> struct obj_backup *backup = xzalloc(sizeof(*backup));
> struct sd_inode *inode = xzalloc(sizeof(*inode));
> @@ -2329,9 +2398,10 @@ static uint32_t do_restore(const char *vdiname, int snapid, const char *tag)
> if (ret != EXIT_SUCCESS)
> goto out;
>
> + object_size = (UINT32_C(1) << inode->block_size_shift);
> ret = do_vdi_create(vdiname, inode->vdi_size, inode->vdi_id, &vid,
> false, inode->nr_copies, inode->copy_policy,
> - inode->store_policy);
> + inode->store_policy, object_size);
> if (ret != EXIT_SUCCESS) {
> sd_err("Failed to read VDI");
> goto out;
> @@ -2435,12 +2505,15 @@ static int vdi_restore(int argc, char **argv)
> out:
> if (need_current_recovery) {
> int recovery_ret;
> + uint32_t object_size =
> + (UINT32_C(1) << current_inode->block_size_shift);
> /* recreate the current vdi object */
> recovery_ret = do_vdi_create(vdiname, current_inode->vdi_size,
> current_inode->parent_vdi_id, NULL,
> true, current_inode->nr_copies,
> current_inode->copy_policy,
> - current_inode->store_policy);
> + current_inode->store_policy,
> + object_size);
> if (recovery_ret != EXIT_SUCCESS) {
> sd_err("failed to resume the current vdi");
> ret = recovery_ret;
> @@ -2563,9 +2636,25 @@ static int vdi_cache_info(int argc, char **argv)
>
> fprintf(stdout, "Name\tTag\tTotal\tDirty\tClean\n");
> for (i = 0; i < info.count; i++) {
> - uint64_t total = info.caches[i].total * SD_DATA_OBJ_SIZE,
> - dirty = info.caches[i].dirty * SD_DATA_OBJ_SIZE,
> + uint32_t object_size;
> + uint32_t vid = info.caches[i].vid;
> + struct sd_inode *inode = NULL;
> + int r;
> +
> + r = dog_read_object(vid_to_vdi_oid(vid), inode,
> + SD_INODE_HEADER_SIZE, 0, true);
> + if (r != EXIT_SUCCESS)
> + return r;
> +
> + if (!inode->block_size_shift)
> + return EXIT_FAILURE;
> +
> + object_size = (UINT32_C(1) << inode->block_size_shift);
> +
> + uint64_t total = info.caches[i].total * object_size,
> + dirty = info.caches[i].dirty * object_size,
> clean = total - dirty;
> +
> char name[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
>
> ret = vid_to_name_tag(info.caches[i].vid, name, tag);
> @@ -2955,7 +3044,7 @@ static struct subcommand vdi_cmd[] = {
> {"check", "<vdiname>", "seaphT", "check and repair image's consistency",
> NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
> vdi_check, vdi_options},
> - {"create", "<vdiname> <size>", "PycaphrvT", "create an image",
> + {"create", "<vdiname> <size>", "PycaphrvzT", "create an image",
> NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
> vdi_create, vdi_options},
> {"snapshot", "<vdiname>", "saphrvT", "create a snapshot",
> @@ -3023,6 +3112,7 @@ static struct subcommand vdi_cmd[] = {
> static int vdi_parser(int ch, const char *opt)
> {
> char *p;
> + uint32_t object_size_shift_bit;
>
> switch (ch) {
> case 'P':
> @@ -3101,6 +3191,20 @@ static int vdi_parser(int ch, const char *opt)
> case 'e':
> vdi_cmd_data.exist = true;
> break;
> + case 'z':
> + object_size_shift_bit = (uint32_t)atoi(opt);
> + if (object_size_shift_bit > 31) {
> + sd_err("Object Size is limited to 2^31."
> + " Please set shift bit lower than 31");
> + exit(EXIT_FAILURE);
> + }
> + vdi_cmd_data.object_size =
> + (UINT32_C(1) << object_size_shift_bit);
> + if (!vdi_cmd_data.object_size) {
> + sd_err("Invalid parameter %s", opt);
> + exit(EXIT_FAILURE);
> + }
> + break;
> }
>
> return 0;
> diff --git a/include/fec.h b/include/fec.h
> index 1ae32e4..b3ef8d8 100644
> --- a/include/fec.h
> +++ b/include/fec.h
> @@ -96,12 +96,12 @@ void fec_encode(const struct fec *code,
> size_t num_block_nums, size_t sz);
>
> void fec_decode_buffer(struct fec *ctx, uint8_t *input[], const int in_idx[],
> - char *buf, int idx);
> + char *buf, int idx, uint32_t object_size);
>
> /* for isa-l */
>
> void isa_decode_buffer(struct fec *ctx, uint8_t *input[], const int in_idx[],
> - char *buf, int idx);
> + char *buf, int idx, uint32_t object_size);
>
> /*
> * @param inpkts an array of packets (size k); If a primary block, i, is present
> @@ -119,7 +119,6 @@ void fec_decode(const struct fec *code,
>
> /* Set data stripe as sector size to make VM happy */
> #define SD_EC_DATA_STRIPE_SIZE (512) /* 512 Byte */
> -#define SD_EC_NR_STRIPE_PER_OBJECT (SD_DATA_OBJ_SIZE / SD_EC_DATA_STRIPE_SIZE)
> #define SD_EC_MAX_STRIP (16)
>
> static inline int ec_policy_to_dp(uint8_t policy, int *d, int *p)
> @@ -205,11 +204,12 @@ static inline void ec_destroy(struct fec *ctx)
> }
>
> static inline void ec_decode_buffer(struct fec *ctx, uint8_t *input[],
> - const int in_idx[], char *buf, int idx)
> + const int in_idx[], char *buf,
> + int idx, uint32_t object_size)
> {
> if (cpu_has_ssse3)
> - isa_decode_buffer(ctx, input, in_idx, buf, idx);
> + isa_decode_buffer(ctx, input, in_idx, buf, idx, object_size);
> else
> - fec_decode_buffer(ctx, input, in_idx, buf, idx);
> + fec_decode_buffer(ctx, input, in_idx, buf, idx, object_size);
> }
> #endif
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index cbb65b6..5cdedf5 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -477,10 +477,11 @@ static inline bool is_data_obj(uint64_t oid)
>
> static inline size_t count_data_objs(const struct sd_inode *inode)
> {
> - return DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE);
> + return DIV_ROUND_UP(inode->vdi_size,
> + (UINT32_C(1) << inode->block_size_shift));
> }
>
> -static inline size_t get_objsize(uint64_t oid)
> +static inline size_t get_objsize(uint64_t oid, uint32_t object_size)
> {
> if (is_vdi_obj(oid))
> return SD_INODE_SIZE;
> @@ -494,7 +495,7 @@ static inline size_t get_objsize(uint64_t oid)
> if (is_ledger_object(oid))
> return SD_LEDGER_OBJ_SIZE;
>
> - return SD_DATA_OBJ_SIZE;
More information about the sheepdog
mailing list