[sheepdog] [PATCH v4] sheepdog: selectable object size support
Teruaki Ishizaki
ishizaki.teruaki at lab.ntt.co.jp
Tue Feb 10 09:22:02 CET 2015
(2015/02/10 12:10), Liu Yuan wrote:
> On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
>> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
>> This patch enables users to handle "block_size_shift" value for
>> calculating VDI object size.
>>
>> When you start qemu, you don't need to specify additional command option.
>>
>> But when you create the VDI which doesn't have default object size
>> with qemu-img command, you specify block_size_shift option.
>>
>> If you want to create a VDI of 8MB(1 << 23) object size,
>> you need to specify following command option.
>>
>> # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
>>
>> In addition, when you don't specify qemu-img command option,
>> a default value of sheepdog cluster is used for creating VDI.
>>
>> # qemu-img create sheepdog:test2 100M
>>
>> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki at lab.ntt.co.jp>
>> ---
>> V4:
>> - Limit a read/write buffer size for creating a preallocated VDI.
>> - Replace a parse function for the block_size_shift option.
>> - Fix an error message.
>>
>> V3:
>> - Delete the needless operation of buffer.
>> - Delete the needless operations of request header.
>> for SD_OP_GET_CLUSTER_DEFAULT.
>> - Fix coding style problems.
>>
>> V2:
>> - Fix coding style problem (white space).
>> - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
>> - Initialize request header to use block_size_shift specified by user.
>> ---
>> block/sheepdog.c | 138 ++++++++++++++++++++++++++++++++++++++-------
>> include/block/block_int.h | 1 +
>> 2 files changed, 119 insertions(+), 20 deletions(-)
>>
>> diff --git a/block/sheepdog.c b/block/sheepdog.c
>> index be3176f..a43b947 100644
>> --- a/block/sheepdog.c
>> +++ b/block/sheepdog.c
>> @@ -37,6 +37,7 @@
>> #define SD_OP_READ_VDIS 0x15
>> #define SD_OP_FLUSH_VDI 0x16
>> #define SD_OP_DEL_VDI 0x17
>> +#define SD_OP_GET_CLUSTER_DEFAULT 0x18
>>
>> #define SD_FLAG_CMD_WRITE 0x01
>> #define SD_FLAG_CMD_COW 0x02
>> @@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
>> uint32_t base_vdi_id;
>> uint8_t copies;
>> uint8_t copy_policy;
>> - uint8_t reserved[2];
>> + uint8_t store_policy;
>> + uint8_t block_size_shift;
>> uint32_t snapid;
>> uint32_t type;
>> uint32_t pad[2];
>> @@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
>> uint32_t pad[5];
>> } SheepdogVdiRsp;
>>
>> +typedef struct SheepdogClusterRsp {
>> + uint8_t proto_ver;
>> + uint8_t opcode;
>> + uint16_t flags;
>> + uint32_t epoch;
>> + uint32_t id;
>> + uint32_t data_length;
>> + uint32_t result;
>> + uint8_t nr_copies;
>> + uint8_t copy_policy;
>> + uint8_t block_size_shift;
>> + uint8_t __pad1;
>> + uint32_t __pad2[6];
>> +} SheepdogClusterRsp;
>> +
>> typedef struct SheepdogInode {
>> char name[SD_MAX_VDI_LEN];
>> char tag[SD_MAX_VDI_TAG_LEN];
>> @@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>> hdr.vdi_size = s->inode.vdi_size;
>> hdr.copy_policy = s->inode.copy_policy;
>> hdr.copies = s->inode.nr_copies;
>> + hdr.block_size_shift = s->inode.block_size_shift;
>>
>> ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
>>
>> @@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>> static int sd_prealloc(const char *filename, Error **errp)
>> {
>> BlockDriverState *bs = NULL;
>> + BDRVSheepdogState *base = NULL;
>> + unsigned long buf_size;
>> uint32_t idx, max_idx;
>> + uint32_t object_size;
>> int64_t vdi_size;
>> - void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
>> + void *buf = NULL;
>> int ret;
>>
>> ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
>> @@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
>> ret = vdi_size;
>> goto out;
>> }
>> - max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
>> +
>> + base = bs->opaque;
>> + object_size = (UINT32_C(1) << base->inode.block_size_shift);
>> + buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
>> + buf = g_malloc0(buf_size);
>> +
>> + max_idx = DIV_ROUND_UP(vdi_size, buf_size);
>>
>> for (idx = 0; idx < max_idx; idx++) {
>> /*
>> * The created image can be a cloned image, so we need to read
>> * a data from the source image.
>> */
>> - ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
>> + ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
>> if (ret < 0) {
>> goto out;
>> }
>> - ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
>> + ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
>> if (ret < 0) {
>> goto out;
>> }
>> @@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
>> return 0;
>> }
>>
>> +static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
>> +{
>> + struct SheepdogInode *inode = &s->inode;
>> + inode->block_size_shift =
>> + (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
>> + if (inode->block_size_shift == 0) {
>> + /* block_size_shift is set for cluster default value by sheepdog */
>> + return 0;
>> + } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
>> + return -EINVAL;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> static int sd_create(const char *filename, QemuOpts *opts,
>> Error **errp)
>> {
>> @@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
>> BDRVSheepdogState *s;
>> char tag[SD_MAX_VDI_TAG_LEN];
>> uint32_t snapid;
>> + uint64_t max_vdi_size;
>> bool prealloc = false;
>>
>> s = g_new0(BDRVSheepdogState, 1);
>> @@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
>> }
>> }
>>
>> - if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
>> - error_setg(errp, "too big image size");
>> - ret = -EINVAL;
>> + ret = parse_block_size_shift(s, opts);
>> + if (ret < 0) {
>> + error_setg(errp, "Invalid block_size_shift:"
>> + " '%s'. please use 20-31", buf);
>> goto out;
>> }
>>
>> @@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
>> }
>>
>> s->aio_context = qemu_get_aio_context();
>> +
>> + /* if block_size_shift is not specified, get cluster default value */
>> + if (s->inode.block_size_shift == 0) {
>
> Seems that we don't keep backward compatibility for new QEMU and old sheep that
> doesn't support SD_OP_GET_CLUSTER_DEFAULT.
Then, I'll add the operation that if block_size_shift from
SD_OP_GET_CLUSTER_DEFAULT is zero, block_size_shift is set to 22.
Is it OK?
>
> What will happen if old QEMU with new sheep that support block_size_shift? Most
> distributions will ship the old stable qemu that wouldn't be aware of
> block_size_shift.
If old QEMU with new sheep is used, VDI is created with cluster default
block_size_shift and accessed as 4MB object_size.
So I think that for backward compatibility, users must do cluster
format command with default block_size_shift 22 equal to
4MB object_size.
If users want to use for iSCSI target with changing block_size_shift,
they must specify the value to create VDI.
>
> I'd suggest we have 0 in block_size_shift represent 4MB to keep backward
> compatiblity.
>
Thanks,
Teruaki Ishizaki
More information about the sheepdog
mailing list