[sheepdog] [PATCH v1 2/2] sheep/http: add support for big object which is larger than SD_DATA_OBJ_SIZE
Liu Yuan
namei.unix at gmail.com
Sat Dec 7 08:09:50 CET 2013
On Fri, Dec 06, 2013 at 05:04:19PM +0800, Robin Dong wrote:
> From: Robin Dong <sanbai at taobao.com>
>
> Using hyper volume and extent structure of onode to store large number of
> objects size of which exceed SD_DATA_OBJ_SIZE.
>
> Signed-off-by: Robin Dong <sanbai at taobao.com>
> ---
> sheep/http/kv.c | 422 ++++++++++++++++++++++++++++++++++++++++-------------
> sheep/http/kv.h | 15 +-
> sheep/http/s3.c | 8 +-
> sheep/http/swift.c | 30 +---
> 4 files changed, 335 insertions(+), 140 deletions(-)
>
> diff --git a/sheep/http/kv.c b/sheep/http/kv.c
> index 55a7e24..68f0759 100644
> --- a/sheep/http/kv.c
> +++ b/sheep/http/kv.c
> @@ -21,6 +21,7 @@ struct bucket_inode {
> uint64_t obj_count;
> uint64_t bytes_used;
> uint32_t vdi_id; /* kv_onode stores in this vdi */
> + uint32_t data_vid; /* data of objects store in this vdi */
It seems that this data_vid isn't used.
> uint32_t pad;
> uint64_t reserved[SD_MAX_BUCKET_NAME/sizeof(uint64_t) - 3];
> };
> @@ -172,13 +173,13 @@ int kv_create_account(const char *account)
> return kv_create_hyper_volume(account, &vdi_id);
> }
>
> -typedef void (*list_cb)(struct http_request *req, const char *bucket,
> +typedef void (*list_bucket_cb)(struct http_request *req, const char *bucket,
> void *opaque);
>
> struct list_buckets_arg {
> struct http_request *req;
> const char *account;
> - list_cb cb;
> + list_bucket_cb cb;
> uint32_t bucket_counter;
> };
>
> @@ -360,7 +361,16 @@ static int delete_bucket(struct sd_inode *account_inode, uint64_t idx,
> bnode->vdi_id = 0;
> snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s",
> account_inode->name, bucket);
> -
> + /* delete vdi which store kv_onode */
> + ret = kv_delete_vdi(vdi_name);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to delete vdi %s", vdi_name);
> + ret = -1;
> + goto out;
> + }
> + /* delete vdi which store object data */
> + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator",
> + account_inode->name, bucket);
> ret = kv_delete_vdi(vdi_name);
> if (ret != SD_RES_SUCCESS) {
> sd_err("Failed to delete vdi %s", vdi_name);
> @@ -462,12 +472,29 @@ static int add_bucket(struct sd_inode *account_inode, uint64_t idx,
> bnode->bytes_used = 0;
> snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s",
> account_inode->name, bucket);
> + /* create vdi to store kv_onode */
> ret = kv_create_hyper_volume(vdi_name, &(bnode->vdi_id));
> if (ret != SD_RES_SUCCESS) {
> sd_err("Failed to create hyper volume %d", ret);
> ret = -1;
> goto out;
> }
> + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator",
> + account_inode->name, bucket);
> + /* create vdi to store objects */
> + ret = kv_create_hyper_volume(vdi_name, &(bnode->data_vid));
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to create hyper volume %d", ret);
> + ret = -1;
> + goto out;
> + }
> + ret = oalloc_init(bnode->data_vid);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to init allocator on %x",
> + bnode->data_vid);
> + ret = -1;
> + goto out;
> + }
> sd_debug("create hyper volume %s success", vdi_name);
> break;
> }
> @@ -629,7 +656,7 @@ int kv_delete_bucket(const char *account, const char *bucket)
> return SD_RES_SUCCESS;
> }
>
> -int kv_list_buckets(struct http_request *req, list_cb cb, void *opaque)
> +int kv_list_buckets(struct http_request *req, list_bucket_cb cb, void *opaque)
> {
> struct sd_inode account_inode;
> const char *account = (const char *)opaque;
> @@ -658,6 +685,34 @@ int kv_list_buckets(struct http_request *req, list_cb cb, void *opaque)
> return SD_RES_SUCCESS;
> }
>
> +/*
> + * A bucket contains two vdi: one (vdi_id) stores 'struct kv_onode' by hash
> + * algorithm and another one (data_vid) stores data of objects.
> + * The first vdi names "account/bucket" and the second vdi names
> + * "account/bucket/allocator".
> + *
> + * It manage space in data vdi by algorithm in oalloc.c.
> + *
> + * For example: bucket "fruit" with account 'coly' has two objects "banana"
> + * and "apple"
> + *
> + *
> + * --------------------- kv_onode -----------------------
> + * | |
> + * bucket vdi v v
> + * +-----------------+--+---------------------------+--------------------------+
> + * |name: coly/fruit |..|kv_onode_hdr (name: banana)|onode_extent: start, count|
> + * +-----------------+--+---------------------------+--------------------------+
> + * /
> + * /
> + * ------------
> + * /
> + * data_vid v
> + * +---------------------------+---+-----------------+
> + * |name: coly/fruit/allocator |...| data |
> + * +---------------------------+---+-----------------+
> + */
> +
> /* Object operations */
>
> /* 4 KB header of kv object index node */
> @@ -691,24 +746,71 @@ struct kv_onode {
> struct kv_onode_hdr hdr;
> union {
> uint8_t data[SD_DATA_OBJ_SIZE - sizeof(struct kv_onode_hdr)];
> - struct onode_extent *o_extent;
> + struct onode_extent o_extent[0];
> };
> };
>
> +typedef void (*list_object_cb)(struct http_request *req, const char *bucket,
> + const char *object, void *opaque);
> +
> +struct list_objects_arg {
> + struct http_request *req;
> + void *opaque;
> + const char *bucket;
> + list_object_cb cb;
> + uint32_t object_counter;
> +};
> +
> +static void list_objects_cb(void *data, enum btree_node_type type, void *arg)
> +{
> + struct sd_extent *ext;
> + struct list_objects_arg *loarg = arg;
> + struct kv_onode *onode = NULL;
> + uint64_t oid;
> + int ret;
> +
> + if (type == BTREE_EXT) {
> + ext = (struct sd_extent *)data;
> + if (!ext->vdi_id)
> + goto out;
> +
> + onode = xmalloc(SD_DATA_OBJ_SIZE);
> +
> + oid = vid_to_data_oid(ext->vdi_id, ext->idx);
> + ret = sd_read_object(oid, (char *)onode, SD_DATA_OBJ_SIZE, 0);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to read data object %lx", oid);
> + goto out;
> + }
> +
> + if (onode->hdr.name[0] == '\0')
> + goto out;
> + if (loarg->cb)
> + loarg->cb(loarg->req, loarg->bucket, onode->hdr.name,
> + loarg->opaque);
> + loarg->object_counter++;
> + }
> +out:
> + free(onode);
> +}
> +
> #define KV_ONODE_INLINE_SIZE (SD_DATA_OBJ_SIZE - sizeof(struct kv_onode_hdr))
>
> -static int kv_create_inlined_object(struct sd_inode *inode,
> - struct kv_onode *onode,
> - uint32_t vid, uint32_t idx,
> - bool overwrite)
> +static int kv_write_onode(struct sd_inode *inode, struct kv_onode *onode,
> + uint32_t vid, uint32_t idx, bool overwrite)
> {
> - uint64_t oid = vid_to_data_oid(vid, idx);
> + uint64_t oid = vid_to_data_oid(vid, idx), len;
> int ret;
>
> + if (onode->hdr.inlined)
> + len = onode->hdr.size;
> + else
> + len = sizeof(struct onode_extent) * onode->hdr.nr_extent;
> +
> if (overwrite) {
> sd_info("overwrite object %s", onode->hdr.name);
> ret = sd_write_object(oid, (char *)onode,
> - sizeof(onode->hdr) + onode->hdr.size,
> + sizeof(onode->hdr) + len,
> 0, false);
> if (ret != SD_RES_SUCCESS) {
> sd_err("failed to write object, %" PRIx64, oid);
> @@ -716,7 +818,7 @@ static int kv_create_inlined_object(struct sd_inode *inode,
> }
> } else {
> ret = sd_write_object(oid, (char *)onode,
> - sizeof(onode->hdr) + onode->hdr.size,
> + sizeof(onode->hdr) + len,
> 0, true);
> if (ret != SD_RES_SUCCESS) {
> sd_err("failed to create object, %" PRIx64, oid);
> @@ -735,13 +837,6 @@ out:
> return ret;
> }
>
> -static int kv_create_extented_object(struct sd_inode *inode,
> - struct kv_onode *onode,
> - uint32_t vid, uint32_t idx)
> -{
> - return SD_RES_SUCCESS;
> -}
> -
> /*
> * Create the object if the index isn't taken. Overwrite the object if it exists
> * Return SD_RES_OBJ_TAKEN if the index is taken by other object.
> @@ -778,48 +873,112 @@ static int do_kv_create_object(struct http_request *req,
> goto out;
> }
> }
> - if (onode->hdr.inlined)
> - ret = kv_create_inlined_object(inode, onode, vid, idx,
> - !!tmp_vid);
> - else
> - ret = kv_create_extented_object(inode, onode, vid, idx);
> +
> + ret = kv_write_onode(inode, onode, vid, idx, !!tmp_vid);
> + if (ret != SD_RES_SUCCESS)
> + sd_err("Failed to write onode");
> out:
> free(inode);
> return ret;
> }
>
> -int kv_create_object(struct http_request *req, const char *bucket,
> - const char *name)
> +int kv_create_object(struct http_request *req, const char *account,
> + const char *bucket, const char *name)
> {
> struct kv_onode *onode;
> - ssize_t size;
> + ssize_t size, total_size = 0;
> int ret;
> - uint64_t hval;
> - uint32_t vid;
> + uint64_t hval, start = 0, count, block, limit;
> + uint32_t vid, data_vid;
> struct timeval tv;
> + char vdi_name[SD_MAX_VDI_LEN];
> + char *data_buf = NULL;
>
> - ret = lookup_bucket(req, bucket, &vid);
> + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s", account, bucket);
> + ret = lookup_bucket(req, vdi_name, &vid);
> + if (ret < 0)
> + return ret;
> +
> + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator", account, bucket);
> + ret = lookup_bucket(req, vdi_name, &data_vid);
> if (ret < 0)
> return ret;
>
> onode = xzalloc(sizeof(*onode));
>
> + /* for inlined onode */
> + if (req->data_length <= KV_ONODE_INLINE_SIZE) {
> + onode->hdr.inlined = 1;
> + size = http_request_read(req, onode->data, sizeof(onode->data));
> + if (size < 0) {
> + sd_err("%s: bucket %s, object %s", sd_strerror(ret),
> + bucket, name);
> + http_response_header(req, INTERNAL_SERVER_ERROR);
> + ret = -1;
> + goto out;
> + }
> + total_size = size;
> + } else {
> + sd_debug("data_length: %lu, %lu", req->data_length,
> + SD_DATA_OBJ_SIZE);
> + count = (req->data_length + SD_DATA_OBJ_SIZE + 1) /
> + SD_DATA_OBJ_SIZE;
> + ret = oalloc_new_prepare(data_vid, &start, count);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to prepare allocation of %lu bytes!",
> + req->data_length);
> + ret = -1;
> + goto out;
> + }
> +
> + /* receive and write data at first, then write onode */
> + data_buf = xmalloc(SD_DATA_OBJ_SIZE);
> +
> + sd_debug("start: %lu, count: %lu", start, count);
> + for (block = start, limit = start + count;
> + block < limit; block++) {
> + sd_debug("block: %lu, limit: %lu", block, limit);
> + size = http_request_read(req, data_buf,
> + SD_DATA_OBJ_SIZE);
> + total_size += size;
> + ret = sd_write_object(vid_to_data_oid(data_vid, block),
> + data_buf, size, 0, true);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to write data object for %"
> + PRIx32" %s", data_vid, sd_strerror(ret));
> + goto out;
> + }
> + if (size < SD_DATA_OBJ_SIZE)
> + break;
> + }
> +
> + sd_debug("DATA_LENGTH: %lu, total size: %lu, last blocks: %lu",
> + req->data_length, total_size, start);
> +
> + sd_debug("finish start: %lu, count: %lu", start, count);
> + ret = oalloc_new_finish(data_vid, start, count);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to finish allocation of %lu bytes!",
> + req->data_length);
> + ret = -1;
> + goto out;
> + }
> +
> + onode->o_extent[0].vdi = data_vid;
It seems that onode_extent.vdi can be removed, because node->hdr.data_vid already provides it.
I also get a compile warning:
CC corosync.o
CC zookeeper.o
Built sheep
http/kv.c: In function ‘kv_delete_bucket’:
http/kv.c:422:6: warning: ‘buf’ may be used uninitialized in this function [-Wmaybe-uninitialized]
http/kv.c:319:8: note: ‘buf’ was declared here
char *buf;
^
CCLD sheep
Thanks
Yuan
More information about the sheepdog mailing list