[sheepdog] [PATCH v3 2/3] sheep/http: add support for big object which is larger than SD_DATA_OBJ_SIZE
Liu Yuan
namei.unix at gmail.com
Thu Dec 12 09:51:26 CET 2013
On Wed, Dec 11, 2013 at 06:14:42PM +0800, Robin Dong wrote:
> From: Robin Dong <sanbai at taobao.com>
>
> Using hyper volume and extent structure of onode to store large number of
> objects size of which exceed SD_DATA_OBJ_SIZE.
>
> Signed-off-by: Robin Dong <sanbai at taobao.com>
> ---
> sheep/http/kv.c | 427 +++++++++++++++++++++++++++++++++++++++--------------
> sheep/http/kv.h | 15 +-
> sheep/http/s3.c | 8 +-
> sheep/http/swift.c | 39 ++---
> 4 files changed, 344 insertions(+), 145 deletions(-)
>
> diff --git a/sheep/http/kv.c b/sheep/http/kv.c
> index 57b5cd3..fb66dfa 100644
> --- a/sheep/http/kv.c
> +++ b/sheep/http/kv.c
> @@ -21,6 +21,7 @@ struct bucket_inode_hdr {
> uint64_t obj_count;
> uint64_t bytes_used;
> uint32_t onode_vid;
> + uint32_t data_vid; /* data of objects store in this vdi */
> };
>
> struct bucket_inode {
> @@ -159,13 +160,13 @@ int kv_create_account(const char *account)
> return kv_create_hyper_volume(account, &vdi_id);
> }
>
> -typedef void (*list_cb)(struct http_request *req, const char *bucket,
> - void *opaque);
> +typedef void (*list_bucket_cb)(struct http_request *req, const char *bucket,
> + void *opaque);
>
> struct list_buckets_arg {
> struct http_request *req;
> void *opaque;
> - list_cb cb;
> + list_bucket_cb cb;
> uint32_t bucket_counter;
> };
>
> @@ -347,7 +348,16 @@ static int delete_bucket(struct sd_inode *account_inode, uint64_t idx,
> bnode->hdr.onode_vid = 0;
> snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s",
> account_inode->name, bucket);
> -
> + /* delete vdi which store kv_onode */
> + ret = kv_delete_vdi(vdi_name);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to delete vdi %s", vdi_name);
> + ret = -1;
> + goto out;
> + }
> + /* delete vdi which store object data */
> + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator",
> + account_inode->name, bucket);
> ret = kv_delete_vdi(vdi_name);
> if (ret != SD_RES_SUCCESS) {
> sd_err("Failed to delete vdi %s", vdi_name);
> @@ -449,12 +459,29 @@ static int add_bucket(struct sd_inode *account_inode, uint64_t idx,
> bnode->hdr.bytes_used = 0;
> snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s",
> account_inode->name, bucket);
> + /* create vdi to store kv_onode */
> ret = kv_create_hyper_volume(vdi_name, &(bnode->hdr.onode_vid));
> if (ret != SD_RES_SUCCESS) {
> sd_err("Failed to create hyper volume %d", ret);
> ret = -1;
> goto out;
> }
> + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator",
> + account_inode->name, bucket);
> + /* create vdi to store objects */
> + ret = kv_create_hyper_volume(vdi_name, &(bnode->hdr.data_vid));
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to create hyper volume %d", ret);
> + ret = -1;
> + goto out;
> + }
> + ret = oalloc_init(bnode->hdr.data_vid);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to init allocator on %x",
> + bnode->hdr.data_vid);
> + ret = -1;
> + goto out;
> + }
> sd_debug("create hyper volume %s success", vdi_name);
> break;
> }
> @@ -616,8 +643,8 @@ int kv_delete_bucket(const char *account, const char *bucket)
> return SD_RES_SUCCESS;
> }
>
> -int kv_list_buckets(struct http_request *req, const char *account, list_cb cb,
> - void *opaque)
> +int kv_list_buckets(struct http_request *req, const char *account,
> + list_bucket_cb cb, void *opaque)
> {
> struct sd_inode account_inode;
> uint32_t account_vid;
> @@ -645,6 +672,34 @@ int kv_list_buckets(struct http_request *req, const char *account, list_cb cb,
> return SD_RES_SUCCESS;
> }
>
> +/*
> + * A bucket contains two vdi: one (vdi_id) stores 'struct kv_onode' by hash
> + * algorithm and another one (data_vid) stores data of objects.
> + * The first vdi names "account/bucket" and the second vdi names
> + * "account/bucket/allocator".
> + *
> + * It manage space in data vdi by algorithm in oalloc.c.
> + *
> + * For example: bucket "fruit" with account 'coly' has two objects "banana"
> + * and "apple"
> + *
> + *
> + * --------------------- kv_onode -----------------------
> + * | |
> + * bucket vdi v v
> + * +-----------------+--+---------------------------+--------------------------+
> + * |name: coly/fruit |..|kv_onode_hdr (name: banana)|onode_extent: start, count|
> + * +-----------------+--+---------------------------+--------------------------+
> + * /
> + * /
> + * ------------
> + * /
> + * data_vid v
> + * +---------------------------+---+-----------------+
> + * |name: coly/fruit/allocator |...| data |
> + * +---------------------------+---+-----------------+
> + */
> +
> /* Object operations */
>
> /* 4 KB header of kv object index node */
> @@ -668,8 +723,6 @@ struct kv_onode_hdr {
> };
>
> struct onode_extent {
> - uint32_t vdi;
> - uint32_t pad;
> uint64_t start;
> uint64_t count;
> };
> @@ -678,24 +731,71 @@ struct kv_onode {
> struct kv_onode_hdr hdr;
> union {
> uint8_t data[SD_DATA_OBJ_SIZE - sizeof(struct kv_onode_hdr)];
> - struct onode_extent *o_extent;
> + struct onode_extent o_extent[0];
> };
> };
>
> +typedef void (*list_object_cb)(struct http_request *req, const char *bucket,
> + const char *object, void *opaque);
> +
> +struct list_objects_arg {
> + struct http_request *req;
> + void *opaque;
> + const char *bucket;
> + list_object_cb cb;
> + uint32_t object_counter;
> +};
> +
> +static void list_objects_cb(void *data, enum btree_node_type type, void *arg)
> +{
> + struct sd_extent *ext;
> + struct list_objects_arg *loarg = arg;
> + struct kv_onode *onode = NULL;
> + uint64_t oid;
> + int ret;
> +
> + if (type == BTREE_EXT) {
> + ext = (struct sd_extent *)data;
> + if (!ext->vdi_id)
> + goto out;
> +
> + onode = xmalloc(SD_DATA_OBJ_SIZE);
> +
> + oid = vid_to_data_oid(ext->vdi_id, ext->idx);
> + ret = sd_read_object(oid, (char *)onode, SD_DATA_OBJ_SIZE, 0);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to read data object %lx", oid);
> + goto out;
> + }
> +
> + if (onode->hdr.name[0] == '\0')
> + goto out;
> + if (loarg->cb)
> + loarg->cb(loarg->req, loarg->bucket, onode->hdr.name,
> + loarg->opaque);
> + loarg->object_counter++;
> + }
> +out:
> + free(onode);
> +}
> +
> #define KV_ONODE_INLINE_SIZE (SD_DATA_OBJ_SIZE - sizeof(struct kv_onode_hdr))
>
> -static int kv_create_inlined_object(struct sd_inode *inode,
> - struct kv_onode *onode,
> - uint32_t vid, uint32_t idx,
> - bool overwrite)
> +static int kv_write_onode(struct sd_inode *inode, struct kv_onode *onode,
> + uint32_t vid, uint32_t idx, bool overwrite)
> {
> - uint64_t oid = vid_to_data_oid(vid, idx);
> + uint64_t oid = vid_to_data_oid(vid, idx), len;
> int ret;
>
> + if (onode->hdr.inlined)
> + len = onode->hdr.size;
> + else
> + len = sizeof(struct onode_extent) * onode->hdr.nr_extent;
> +
> if (overwrite) {
> sd_info("overwrite object %s", onode->hdr.name);
> ret = sd_write_object(oid, (char *)onode,
> - sizeof(onode->hdr) + onode->hdr.size,
> + sizeof(onode->hdr) + len,
> 0, false);
> if (ret != SD_RES_SUCCESS) {
> sd_err("failed to write object, %" PRIx64, oid);
> @@ -703,7 +803,7 @@ static int kv_create_inlined_object(struct sd_inode *inode,
> }
> } else {
> ret = sd_write_object(oid, (char *)onode,
> - sizeof(onode->hdr) + onode->hdr.size,
> + sizeof(onode->hdr) + len,
> 0, true);
> if (ret != SD_RES_SUCCESS) {
> sd_err("failed to create object, %" PRIx64, oid);
> @@ -722,13 +822,6 @@ out:
> return ret;
> }
>
> -static int kv_create_extented_object(struct sd_inode *inode,
> - struct kv_onode *onode,
> - uint32_t vid, uint32_t idx)
> -{
> - return SD_RES_SUCCESS;
> -}
> -
> /*
> * Create the object if the index isn't taken. Overwrite the object if it exists
> * Return SD_RES_OBJ_TAKEN if the index is taken by other object.
> @@ -765,48 +858,111 @@ static int do_kv_create_object(struct http_request *req,
> goto out;
> }
> }
> - if (onode->hdr.inlined)
> - ret = kv_create_inlined_object(inode, onode, vid, idx,
> - !!tmp_vid);
> - else
> - ret = kv_create_extented_object(inode, onode, vid, idx);
> +
> + ret = kv_write_onode(inode, onode, vid, idx, !!tmp_vid);
> + if (ret != SD_RES_SUCCESS)
> + sd_err("Failed to write onode");
> out:
> free(inode);
> return ret;
> }
>
> -int kv_create_object(struct http_request *req, const char *bucket,
> - const char *name)
> +int kv_create_object(struct http_request *req, const char *account,
> + const char *bucket, const char *name)
> {
> struct kv_onode *onode;
> - ssize_t size;
> + ssize_t size, total_size = 0;
> int ret;
> - uint64_t hval;
> - uint32_t vid;
> + uint64_t hval, start = 0, count, block, limit;
> + uint32_t vid, data_vid;
> struct timeval tv;
> + char vdi_name[SD_MAX_VDI_LEN];
> + char *data_buf = NULL;
>
> - ret = lookup_bucket(req, bucket, &vid);
> + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s", account, bucket);
> + ret = lookup_bucket(req, vdi_name, &vid);
> + if (ret < 0)
> + return ret;
> +
> + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator", account, bucket);
> + ret = lookup_bucket(req, vdi_name, &data_vid);
> if (ret < 0)
> return ret;
>
> onode = xzalloc(sizeof(*onode));
>
> + /* for inlined onode */
> + if (req->data_length <= KV_ONODE_INLINE_SIZE) {
> + onode->hdr.inlined = 1;
> + size = http_request_read(req, onode->data, sizeof(onode->data));
> + if (size < 0) {
> + sd_err("%s: bucket %s, object %s", sd_strerror(ret),
> + bucket, name);
> + http_response_header(req, INTERNAL_SERVER_ERROR);
> + ret = -1;
> + goto out;
> + }
> + total_size = size;
> + } else {
> + sd_debug("data_length: %lu, %lu", req->data_length,
> + SD_DATA_OBJ_SIZE);
> + count = (req->data_length + SD_DATA_OBJ_SIZE + 1) /
> + SD_DATA_OBJ_SIZE;
> + ret = oalloc_new_prepare(data_vid, &start, count);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to prepare allocation of %lu bytes!",
> + req->data_length);
> + ret = -1;
> + goto out;
> + }
> +
> + /* receive and write data at first, then write onode */
> + data_buf = xmalloc(SD_DATA_OBJ_SIZE);
> +
> + sd_debug("start: %lu, count: %lu", start, count);
> + for (block = start, limit = start + count;
> + block < limit; block++) {
> + sd_debug("block: %lu, limit: %lu", block, limit);
> + size = http_request_read(req, data_buf,
> + SD_DATA_OBJ_SIZE);
> + total_size += size;
> + ret = sd_write_object(vid_to_data_oid(data_vid, block),
> + data_buf, size, 0, true);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to write data object for %"
> + PRIx32" %s", data_vid, sd_strerror(ret));
> + goto out;
> + }
> + if (size < SD_DATA_OBJ_SIZE)
> + break;
> + }
> +
> + sd_debug("DATA_LENGTH: %lu, total size: %lu, last blocks: %lu",
> + req->data_length, total_size, start);
> +
> + sd_debug("finish start: %lu, count: %lu", start, count);
> + ret = oalloc_new_finish(data_vid, start, count);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to finish allocation of %lu bytes!",
> + req->data_length);
> + ret = -1;
> + goto out;
> + }
> +
> + onode->o_extent[0].start = start;
> + onode->o_extent[0].count = count;
> + onode->hdr.nr_extent = 1;
> + }
> +
> + /* after write data, we write onode now */
> +
> gettimeofday(&tv, NULL);
> pstrcpy(onode->hdr.name, sizeof(onode->hdr.name), name);
> onode->hdr.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
> onode->hdr.mtime = onode->hdr.ctime;
> + onode->hdr.size = total_size;
> + onode->hdr.data_vid = data_vid;
>
> - size = http_request_read(req, onode->data, sizeof(onode->data));
> - if (size < 0) {
> - sd_err("%s: bucket %s, object %s", sd_strerror(ret),
> - bucket, name);
> - http_response_header(req, INTERNAL_SERVER_ERROR);
> - return -1;
> - }
> -
> - onode->hdr.size = size;
> - if (size <= KV_ONODE_INLINE_SIZE)
> - onode->hdr.inlined = 1;
> hval = sd_hash(name, strlen(name));
> for (int i = 0; i < MAX_DATA_OBJS; i++) {
> uint32_t idx = (hval + i) % MAX_DATA_OBJS;
> @@ -815,30 +971,66 @@ int kv_create_object(struct http_request *req, const char *bucket,
> switch (ret) {
> case SD_RES_SUCCESS:
> http_response_header(req, CREATED);
> - free(onode);
> - return 0;
> + goto out;
> case SD_RES_OBJ_TAKEN:
> break;
> default:
> http_response_header(req, INTERNAL_SERVER_ERROR);
> - free(onode);
> - return -1;
> + goto out;
> }
> }
> -
> /* no free space to create a object */
> http_response_header(req, SERVICE_UNAVAILABLE);
> +out:
> free(onode);
> - return -1;
> + free(data_buf);
> + return ret;
> +}
> +
> +static int kv_read_extent_onode(struct http_request *req,
> + struct kv_onode *onode)
> +{
> + struct onode_extent *ext;
> + uint64_t oid, block, size, total_size, limit;
> + uint32_t i;
> + int ret;
> + char *data_buf = NULL;
> +
> + data_buf = xmalloc(SD_DATA_OBJ_SIZE);
> +
> + total_size = onode->hdr.size;
> + ext = onode->o_extent;
> + for (i = 0; i < onode->hdr.nr_extent; i++) {
> + limit = ext->count + ext->start;
> + for (block = ext->start; block < limit; block++) {
> + oid = vid_to_data_oid(onode->hdr.data_vid, block);
> + if (total_size < SD_DATA_OBJ_SIZE)
> + size = total_size;
> + else
> + size = SD_DATA_OBJ_SIZE;
> + ret = sd_read_object(oid, data_buf, size, 0);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to read oid %lx", oid);
> + goto out;
> + }
> + http_request_write(req, data_buf, size);
> + total_size -= size;
> + sd_debug("read extented block %lu, size %lu",
> + block, size);
> + }
> + }
> +out:
> + free(data_buf);
> + return ret;
> }
>
> static int do_kv_read_object(struct http_request *req, const char *obj_name,
> - struct kv_onode *obj, uint32_t vid, uint32_t idx)
> + struct kv_onode *onode, uint32_t vid, uint32_t idx)
> {
> uint64_t oid = vid_to_data_oid(vid, idx);
> int ret;
>
> - ret = sd_read_object(oid, (char *)obj, sizeof(*obj), 0);
> + ret = sd_read_object(oid, (char *)onode, sizeof(*onode), 0);
> switch (ret) {
> case SD_RES_SUCCESS:
> break;
> @@ -852,42 +1044,51 @@ static int do_kv_read_object(struct http_request *req, const char *obj_name,
> return -1;
> }
>
> - if (strcmp(obj->hdr.name, obj_name) == 0) {
> + if (strcmp(onode->hdr.name, obj_name) == 0) {
> http_response_header(req, OK);
> -
> - /* TODO: support multi parted object for large object */
> - http_request_write(req, obj->data, obj->hdr.size);
> + /* for inlined onode */
> + if (onode->hdr.inlined)
> + http_request_write(req, onode->data, onode->hdr.size);
> + else {
> + ret = kv_read_extent_onode(req, onode);
> + if (ret) {
> + sd_err("Failed to read extent onode");
> + return -1;
> + }
> + }
> }
>
> return 0;
> }
>
> -int kv_read_object(struct http_request *req, const char *bucket,
> - const char *object)
> +int kv_read_object(struct http_request *req, const char *account,
> + const char *bucket, const char *object)
> {
> - struct kv_onode *obj;
> + struct kv_onode *onode;
> int ret;
> uint64_t hval;
> uint32_t vid;
> + char vdi_name[SD_MAX_VDI_LEN];
>
> - ret = lookup_bucket(req, bucket, &vid);
> + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s", account, bucket);
> + ret = lookup_bucket(req, vdi_name, &vid);
> if (ret < 0)
> return ret;
>
> - obj = xzalloc(sizeof(*obj));
> + onode = xzalloc(sizeof(*onode));
>
> hval = sd_hash(object, strlen(object));
> for (int i = 0; i < MAX_DATA_OBJS; i++) {
> uint32_t idx = (hval + i) % MAX_DATA_OBJS;
>
> - do_kv_read_object(req, object, obj, vid, idx);
> + do_kv_read_object(req, object, onode, vid, idx);
> if (req->status != UNKNOWN) {
> - free(obj);
> + free(onode);
> return 0;
> }
> }
>
> - free(obj);
> + free(onode);
>
> http_response_header(req, NOT_FOUND);
> return -1;
> @@ -980,9 +1181,11 @@ int kv_update_object(struct http_request *req, const char *bucket,
Is there an HTTP API that maps to this function? Or how do you think we can make
use of update semantics?
I think updating an existing object should be excluded from this patch set, since
there is currently no caller of it.
It should be in a dedicated patch set that includes more information, such as how
it works for the user interface and why we need it, etc.
Thanks
Yuan
More information about the sheepdog
mailing list