[sheepdog] [PATCH v3] sheepdog: selectable object size support
Kevin Wolf
kwolf at redhat.com
Fri Jan 23 14:53:19 CET 2015
Am 23.01.2015 um 09:24 hat Teruaki Ishizaki geschrieben:
> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> This patch enables users to handle "block_size_shift" value for
> calculating VDI object size.
>
> When you start qemu, you don't need to specify additional command option.
>
> But when you create the VDI which doesn't have default object size
> with qemu-img command, you specify block_size_shift option.
>
> If you want to create a VDI of 8MB(1 << 23) object size,
> you need to specify following command option.
>
> # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
>
> In addition, when you don't specify qemu-img command option,
> a default value of sheepdog cluster is used for creating VDI.
>
> # qemu-img create sheepdog:test2 100M
>
> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki at lab.ntt.co.jp>
> ---
> V3:
> - Delete the needless operation of buffer.
> - Delete the needless operations of request header
> for SD_OP_GET_CLUSTER_DEFAULT.
> - Fix coding style problems.
>
> V2:
> - Fix coding style problem (white space).
> - Add members, store_policy and block_size_shift to struct SheepdogVdiReq
> - Initialize request header to use block_size_shift specified by user.
> ---
> block/sheepdog.c | 140 ++++++++++++++++++++++++++++++++++++++-------
> include/block/block_int.h | 1 +
> 2 files changed, 119 insertions(+), 22 deletions(-)
>
> diff --git a/block/sheepdog.c b/block/sheepdog.c
> index be3176f..c9f06db 100644
> --- a/block/sheepdog.c
> +++ b/block/sheepdog.c
> @@ -37,6 +37,7 @@
> #define SD_OP_READ_VDIS 0x15
> #define SD_OP_FLUSH_VDI 0x16
> #define SD_OP_DEL_VDI 0x17
> +#define SD_OP_GET_CLUSTER_DEFAULT 0x18
>
> #define SD_FLAG_CMD_WRITE 0x01
> #define SD_FLAG_CMD_COW 0x02
> @@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
> uint32_t base_vdi_id;
> uint8_t copies;
> uint8_t copy_policy;
> - uint8_t reserved[2];
> + uint8_t store_policy;
> + uint8_t block_size_shift;
> uint32_t snapid;
> uint32_t type;
> uint32_t pad[2];
> @@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
> uint32_t pad[5];
> } SheepdogVdiRsp;
>
> +typedef struct SheepdogClusterRsp {
> + uint8_t proto_ver;
> + uint8_t opcode;
> + uint16_t flags;
> + uint32_t epoch;
> + uint32_t id;
> + uint32_t data_length;
> + uint32_t result;
> + uint8_t nr_copies;
> + uint8_t copy_policy;
> + uint8_t block_size_shift;
> + uint8_t __pad1;
> + uint32_t __pad2[6];
> +} SheepdogClusterRsp;
> +
> typedef struct SheepdogInode {
> char name[SD_MAX_VDI_LEN];
> char tag[SD_MAX_VDI_TAG_LEN];
> @@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> hdr.vdi_size = s->inode.vdi_size;
> hdr.copy_policy = s->inode.copy_policy;
> hdr.copies = s->inode.nr_copies;
> + hdr.block_size_shift = s->inode.block_size_shift;
>
> ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
>
> @@ -1569,9 +1587,11 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> static int sd_prealloc(const char *filename, Error **errp)
> {
> BlockDriverState *bs = NULL;
> + BDRVSheepdogState *base = NULL;
> uint32_t idx, max_idx;
> + uint32_t object_size;
> int64_t vdi_size;
> - void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
> + void *buf = NULL;
> int ret;
>
> ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
> @@ -1585,18 +1605,23 @@ static int sd_prealloc(const char *filename, Error **errp)
> ret = vdi_size;
> goto out;
> }
> - max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
> +
> + base = bs->opaque;
> + object_size = (UINT32_C(1) << base->inode.block_size_shift);
> + buf = g_malloc0(object_size);
If I understand correctly, block_size_shift can be up to 31, i.e. this
is a 2 GB allocation. Do you really think this is a good idea?
At least use g_try_malloc0() here, so that a memory allocation failure
doesn't crash qemu. (Same goes for all potentially huge allocations that
you make in the whole codebase.)
> + max_idx = DIV_ROUND_UP(vdi_size, object_size);
>
> for (idx = 0; idx < max_idx; idx++) {
> /*
> * The created image can be a cloned image, so we need to read
> * a data from the source image.
> */
> - ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> + ret = bdrv_pread(bs, idx * object_size, buf, object_size);
> if (ret < 0) {
> goto out;
> }
> - ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> + ret = bdrv_pwrite(bs, idx * object_size, buf, object_size);
> if (ret < 0) {
> goto out;
> }
> @@ -1610,7 +1635,9 @@ out_with_err_set:
> if (bs) {
> bdrv_unref(bs);
> }
> - g_free(buf);
> + if (buf) {
> + g_free(buf);
> + }
This is unnecessary. g_free(NULL) is valid, it does nothing.
> return ret;
> }
> @@ -1669,6 +1696,17 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
> return 0;
> }
>
> +static int parse_block_size_shift(BDRVSheepdogState *s, const char *opt)
> +{
> + struct SheepdogInode *inode = &s->inode;
> + inode->block_size_shift = (uint8_t)atoi(opt);
atoi() is best avoided, it has poor error handling.
But I think you don't need this parse function at all, see below.
> + if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> static int sd_create(const char *filename, QemuOpts *opts,
> Error **errp)
> {
> @@ -1679,6 +1717,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
> BDRVSheepdogState *s;
> char tag[SD_MAX_VDI_TAG_LEN];
> uint32_t snapid;
> + uint64_t max_vdi_size;
> bool prealloc = false;
>
> s = g_new0(BDRVSheepdogState, 1);
> @@ -1718,10 +1757,15 @@ static int sd_create(const char *filename, QemuOpts *opts,
> }
> }
>
> - if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
> - error_setg(errp, "too big image size");
> - ret = -EINVAL;
> - goto out;
> + g_free(buf);
> + buf = qemu_opt_get_del(opts, BLOCK_OPT_BLOCK_SIZE_SHIFT);
> + if (buf) {
> + ret = parse_block_size_shift(s, buf);
> + if (ret < 0) {
> + error_setg(errp, "Invalid block_size_shift:"
> + " '%s'. please use 20-31", buf);
> + goto out;
> + }
> }
You could use qemu_opt_get_number_del() here and get a number directly
from QemuOpts that you don't have to parse any more, if you defined the
option as QEMU_OPT_NUMBER instead of QEMU_OPT_STRING.
> if (backing_file) {
> @@ -1757,6 +1801,45 @@ static int sd_create(const char *filename, QemuOpts *opts,
> }
>
> s->aio_context = qemu_get_aio_context();
> +
> + /* if block_size_shift is not specified, get cluster default value */
> + if (s->inode.block_size_shift == 0) {
> + SheepdogVdiReq hdr;
> + SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
> + Error *local_err = NULL;
> + int fd;
> + unsigned int wlen = 0, rlen = 0;
> +
> + fd = connect_to_sdog(s, &local_err);
> + if (fd < 0) {
> + error_report("%s", error_get_pretty(local_err));
> + error_free(local_err);
> + ret = -EIO;
> + goto out;
> + }
> +
> + memset(&hdr, 0, sizeof(hdr));
> + hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
> + hdr.proto_ver = SD_PROTO_VER;
> +
> + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
> + NULL, &wlen, &rlen);
> + closesocket(fd);
> + if (ret) {
> + error_setg_errno(errp, -ret, "failed to get cluster default");
> + goto out;
> + }
> + s->inode.block_size_shift = rsp->block_size_shift;
> + }
> +
> + max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
> +
> + if (s->inode.vdi_size > max_vdi_size) {
> + error_setg(errp, "too big image size");
Please capitalise the first word in an error messages, i.e. "Too big
image size".
> + ret = -EINVAL;
> + goto out;
> + }
> +
> ret = do_sd_create(s, &vid, 0, errp);
> if (ret) {
> goto out;
Kevin
More information about the sheepdog
mailing list