[sheepdog] [PATCH v5] sheepdog: selectable object size support
Hitoshi Mitake
mitake.hitoshi at gmail.com
Sun Feb 15 15:17:11 CET 2015
At Fri, 13 Feb 2015 18:20:53 +0900,
Teruaki Ishizaki wrote:
>
> Previously, the qemu block driver for sheepdog used a hard-coded VDI object size.
> This patch enables users to choose the VDI object size.
>
> When you start qemu, you don't need to specify any additional command option.
>
> But when you create a VDI with a non-default object size using the
> qemu-img command, you need to specify the object_size option.
>
> If you want to create a VDI with an 8MB object size,
> you need to specify the following command option.
>
> # qemu-img create -o object_size=8M sheepdog:test1 100M
>
> In addition, when you don't specify the object_size option to qemu-img,
> the default value of the sheepdog cluster is used for creating the VDI.
>
> # qemu-img create sheepdog:test2 100M
>
> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki at lab.ntt.co.jp>
> ---
> V5:
> - Change option from block_size_shift to object_size.
> - Change parse type to QEMU_OPT_SIZE.
> - Add operation to verify max VDI size for resizing.
> - Fall back to a 4MB object size when using an old Sheepdog.
>
> V4:
> - Limit a read/write buffer size for creating a preallocated VDI.
> - Replace a parse function for the block_size_shift option.
> - Fix an error message.
>
> V3:
> - Delete the needless operation of buffer.
> - Delete the needless operations of request header
>   for SD_OP_GET_CLUSTER_DEFAULT.
> - Fix coding style problems.
>
> V2:
> - Fix coding style problem (white space).
> - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
> - Initialize request header to use block_size_shift specified by user.
> ---
> block/sheepdog.c | 155 ++++++++++++++++++++++++++++++++++++++-------
> include/block/block_int.h | 1 +
> 2 files changed, 134 insertions(+), 22 deletions(-)
Looks good to me.
Acked-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
Thanks,
Hitoshi
>
> diff --git a/block/sheepdog.c b/block/sheepdog.c
> index be3176f..f6fe97e 100644
> --- a/block/sheepdog.c
> +++ b/block/sheepdog.c
> @@ -37,6 +37,7 @@
> #define SD_OP_READ_VDIS 0x15
> #define SD_OP_FLUSH_VDI 0x16
> #define SD_OP_DEL_VDI 0x17
> +#define SD_OP_GET_CLUSTER_DEFAULT 0x18
>
> #define SD_FLAG_CMD_WRITE 0x01
> #define SD_FLAG_CMD_COW 0x02
> @@ -91,6 +92,7 @@
> #define SD_NR_VDIS (1U << 24)
> #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
> #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
> +#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22
> /*
> * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and
> * (SD_EC_MAX_STRIP - 1) for parity strips
> @@ -167,7 +169,8 @@ typedef struct SheepdogVdiReq {
> uint32_t base_vdi_id;
> uint8_t copies;
> uint8_t copy_policy;
> - uint8_t reserved[2];
> + uint8_t store_policy;
> + uint8_t block_size_shift;
> uint32_t snapid;
> uint32_t type;
> uint32_t pad[2];
> @@ -186,6 +189,21 @@ typedef struct SheepdogVdiRsp {
> uint32_t pad[5];
> } SheepdogVdiRsp;
>
> +typedef struct SheepdogClusterRsp {
> + uint8_t proto_ver;
> + uint8_t opcode;
> + uint16_t flags;
> + uint32_t epoch;
> + uint32_t id;
> + uint32_t data_length;
> + uint32_t result;
> + uint8_t nr_copies;
> + uint8_t copy_policy;
> + uint8_t block_size_shift;
> + uint8_t __pad1;
> + uint32_t __pad2[6];
> +} SheepdogClusterRsp;
> +
> typedef struct SheepdogInode {
> char name[SD_MAX_VDI_LEN];
> char tag[SD_MAX_VDI_TAG_LEN];
> @@ -1544,6 +1562,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> hdr.vdi_size = s->inode.vdi_size;
> hdr.copy_policy = s->inode.copy_policy;
> hdr.copies = s->inode.nr_copies;
> + hdr.block_size_shift = s->inode.block_size_shift;
>
> ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
>
> @@ -1569,9 +1588,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> static int sd_prealloc(const char *filename, Error **errp)
> {
> BlockDriverState *bs = NULL;
> + BDRVSheepdogState *base = NULL;
> + unsigned long buf_size;
> uint32_t idx, max_idx;
> + uint32_t object_size;
> int64_t vdi_size;
> - void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
> + void *buf = NULL;
> int ret;
>
> ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
> @@ -1585,18 +1607,24 @@ static int sd_prealloc(const char *filename, Error **errp)
> ret = vdi_size;
> goto out;
> }
> - max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
> +
> + base = bs->opaque;
> + object_size = (UINT32_C(1) << base->inode.block_size_shift);
> + buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
> + buf = g_malloc0(buf_size);
> +
> + max_idx = DIV_ROUND_UP(vdi_size, buf_size);
>
> for (idx = 0; idx < max_idx; idx++) {
> /*
> * The created image can be a cloned image, so we need to read
> * a data from the source image.
> */
> - ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> + ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
> if (ret < 0) {
> goto out;
> }
> - ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> + ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
> if (ret < 0) {
> goto out;
> }
> @@ -1669,6 +1697,27 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
> return 0;
> }
>
> +static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
> +{
> + struct SheepdogInode *inode = &s->inode;
> + uint64_t object_size;
> + int obj_order;
> +
> + object_size = qemu_opt_get_size_del(opt, BLOCK_OPT_OBJECT_SIZE, 0);
> + if (object_size) {
> + if ((object_size - 1) & object_size) { /* not a power of 2? */
> + return -EINVAL;
> + }
> + obj_order = ffs(object_size) - 1;
> + if (obj_order < 20 || obj_order > 31) {
> + return -EINVAL;
> + }
> + inode->block_size_shift = (uint8_t)obj_order;
> + }
> +
> + return 0;
> +}
> +
> static int sd_create(const char *filename, QemuOpts *opts,
> Error **errp)
> {
> @@ -1679,6 +1728,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
> BDRVSheepdogState *s;
> char tag[SD_MAX_VDI_TAG_LEN];
> uint32_t snapid;
> + uint64_t max_vdi_size;
> bool prealloc = false;
>
> s = g_new0(BDRVSheepdogState, 1);
> @@ -1717,10 +1767,11 @@ static int sd_create(const char *filename, QemuOpts *opts,
> goto out;
> }
> }
> -
> - if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
> - error_setg(errp, "too big image size");
> - ret = -EINVAL;
> + ret = parse_block_size_shift(s, opts);
> + if (ret < 0) {
> + error_setg(errp, "Invalid object_size."
> + " obect_size needs to be power of 2"
> + " and be limited from 2^20 to 2^31");
> goto out;
> }
>
> @@ -1757,6 +1808,51 @@ static int sd_create(const char *filename, QemuOpts *opts,
> }
>
> s->aio_context = qemu_get_aio_context();
> +
> + /* if block_size_shift is not specified, get cluster default value */
> + if (s->inode.block_size_shift == 0) {
> + SheepdogVdiReq hdr;
> + SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
> + Error *local_err = NULL;
> + int fd;
> + unsigned int wlen = 0, rlen = 0;
> +
> + fd = connect_to_sdog(s, &local_err);
> + if (fd < 0) {
> + error_report("%s", error_get_pretty(local_err));
> + error_free(local_err);
> + ret = -EIO;
> + goto out;
> + }
> +
> + memset(&hdr, 0, sizeof(hdr));
> + hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
> + hdr.proto_ver = SD_PROTO_VER;
> +
> + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
> + NULL, &wlen, &rlen);
> + closesocket(fd);
> + if (ret) {
> + error_setg_errno(errp, -ret, "failed to get cluster default");
> + goto out;
> + }
> + if (rsp->result == SD_RES_SUCCESS) {
> + s->inode.block_size_shift = rsp->block_size_shift;
> + } else {
> + s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT;
> + }
> + }
> +
> + max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
> +
> + if (s->inode.vdi_size > max_vdi_size) {
> + error_setg(errp, "An image is too large."
> + " The maximum image size is %"PRIu64 "GB",
> + max_vdi_size / 1024 / 1024 / 1024);
> + ret = -EINVAL;
> + goto out;
> + }
> +
> ret = do_sd_create(s, &vid, 0, errp);
> if (ret) {
> goto out;
> @@ -1827,11 +1923,13 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
> BDRVSheepdogState *s = bs->opaque;
> int ret, fd;
> unsigned int datalen;
> + uint64_t max_vdi_size;
>
> + max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
> if (offset < s->inode.vdi_size) {
> error_report("shrinking is not supported");
> return -EINVAL;
> - } else if (offset > SD_MAX_VDI_SIZE) {
> + } else if (offset > max_vdi_size) {
> error_report("too big image size");
> return -EINVAL;
> }
> @@ -2013,9 +2111,10 @@ static int coroutine_fn sd_co_rw_vector(void *p)
> SheepdogAIOCB *acb = p;
> int ret = 0;
> unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
> - unsigned long idx = acb->sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE;
> + unsigned long idx;
> + uint32_t object_size;
> uint64_t oid;
> - uint64_t offset = (acb->sector_num * BDRV_SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
> + uint64_t offset;
> BDRVSheepdogState *s = acb->common.bs->opaque;
> SheepdogInode *inode = &s->inode;
> AIOReq *aio_req;
> @@ -2032,6 +2131,10 @@ static int coroutine_fn sd_co_rw_vector(void *p)
> }
> }
>
> + object_size = (UINT32_C(1) << inode->block_size_shift);
> + idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
> + offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size;
> +
> /*
> * Make sure we don't free the aiocb before we are done with all requests.
> * This additional reference is dropped at the end of this function.
> @@ -2045,7 +2148,7 @@ static int coroutine_fn sd_co_rw_vector(void *p)
>
> oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
>
> - len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
> + len = MIN(total - done, object_size - offset);
>
> switch (acb->aiocb_type) {
> case AIOCB_READ_UDATA:
> @@ -2069,7 +2172,7 @@ static int coroutine_fn sd_co_rw_vector(void *p)
> * We discard the object only when the whole object is
> * 1) allocated 2) trimmed. Otherwise, simply skip it.
> */
> - if (len != SD_DATA_OBJ_SIZE || inode->data_vdi_id[idx] == 0) {
> + if (len != object_size || inode->data_vdi_id[idx] == 0) {
> goto done;
> }
> break;
> @@ -2426,6 +2529,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
> uint64_t offset;
> uint32_t vdi_index;
> uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id;
> + uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift);
>
> fd = connect_to_sdog(s, &local_err);
> if (fd < 0) {
> @@ -2435,10 +2539,10 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
> }
>
> while (remaining) {
> - vdi_index = pos / SD_DATA_OBJ_SIZE;
> - offset = pos % SD_DATA_OBJ_SIZE;
> + vdi_index = pos / object_size;
> + offset = pos % object_size;
>
> - data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);
> + data_len = MIN(remaining, object_size - offset);
>
> vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index);
>
> @@ -2525,10 +2629,11 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
> {
> BDRVSheepdogState *s = bs->opaque;
> SheepdogInode *inode = &s->inode;
> + uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
> uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
> - unsigned long start = offset / SD_DATA_OBJ_SIZE,
> + unsigned long start = offset / object_size,
> end = DIV_ROUND_UP((sector_num + nb_sectors) *
> - BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE);
> + BDRV_SECTOR_SIZE, object_size);
> unsigned long idx;
> int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
>
> @@ -2547,7 +2652,7 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
> }
> }
>
> - *pnum = (idx - start) * SD_DATA_OBJ_SIZE / BDRV_SECTOR_SIZE;
> + *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE;
> if (*pnum > nb_sectors) {
> *pnum = nb_sectors;
> }
> @@ -2558,14 +2663,15 @@ static int64_t sd_get_allocated_file_size(BlockDriverState *bs)
> {
> BDRVSheepdogState *s = bs->opaque;
> SheepdogInode *inode = &s->inode;
> - unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE);
> + uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
> + unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size);
> uint64_t size = 0;
>
> for (i = 0; i < last; i++) {
> if (inode->data_vdi_id[i] == 0) {
> continue;
> }
> - size += SD_DATA_OBJ_SIZE;
> + size += object_size;
> }
> return size;
> }
> @@ -2594,6 +2700,11 @@ static QemuOptsList sd_create_opts = {
> .type = QEMU_OPT_STRING,
> .help = "Redundancy of the image"
> },
> + {
> + .name = BLOCK_OPT_OBJECT_SIZE,
> + .type = QEMU_OPT_SIZE,
> + .help = "Object size of the image"
> + },
> { /* end of list */ }
> }
> };
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 7ad1950..5e718a3 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -56,6 +56,7 @@
> #define BLOCK_OPT_ADAPTER_TYPE "adapter_type"
> #define BLOCK_OPT_REDUNDANCY "redundancy"
> #define BLOCK_OPT_NOCOW "nocow"
> +#define BLOCK_OPT_OBJECT_SIZE "object_size"
>
> #define BLOCK_PROBE_BUF_SIZE 512
>
> --
> 1.7.1
>
> --
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> https://lists.wpkg.org/mailman/listinfo/sheepdog
More information about the sheepdog mailing list