[sheepdog] [PATCH] http: add basic extent framework to support large user object
Robin Dong
robin.k.dong at gmail.com
Fri Nov 29 06:48:40 CET 2013
Reviewed-by: Robin Dong <sanbai at taobao.com>
2013/11/27 Liu Yuan <namei.unix at gmail.com>
> To support objects larger than 4M, we have to introduce an extent-like
> structure to map the data indexes to data holders (sheepdog objects). So
> basically, we use one object index node (onode) to hold the metadata and
> extents.
>
> user object -> onode[metadata, extent1, ............., extentN]
>                                   |                       |
>                                   V                       |
>                 +-----------------------------------+     |
>                 | obj1 | obj2 | .............| objN |     |
>                 +-----------------------------------+     |
>                                                           V
>                                         +-----------------------------------+
>                                         | obj1` | obj2` |...........| objN` |
>                                         +-----------------------------------+
>
> For a user object smaller than 4M, we just inline its data into the onode:
>
> user object -> onode[metadata, user-data].
>
> kv_create_extented_object() is left for a later patch set.
>
> For the object-name-to-onode mapping, we reuse the old hash approach, that is:
>
> - hash(object_name) --> vdi[hash_value] -> onode
>
> Signed-off-by: Liu Yuan <namei.unix at gmail.com>
> ---
> include/internal_proto.h | 1 +
> sheep/http/kv.c | 183
> ++++++++++++++++++++++++++++------------------
> 2 files changed, 113 insertions(+), 71 deletions(-)
>
> diff --git a/include/internal_proto.h b/include/internal_proto.h
> index e5e0f05..70a7b5d 100644
> --- a/include/internal_proto.h
> +++ b/include/internal_proto.h
> @@ -124,6 +124,7 @@
> #define SD_RES_AGAIN 0x8F /* Ask to try again */
> #define SD_RES_STALE_OBJ 0x90 /* Object may be stale */
> #define SD_RES_CLUSTER_ERROR 0x91 /* Cluster driver error */
> +#define SD_RES_OBJ_TAKEN 0x92 /* Object ID is taken up */
>
> enum sd_status {
> SD_STATUS_OK = 1,
> diff --git a/sheep/http/kv.c b/sheep/http/kv.c
> index d30a6a1..c04e629 100644
> --- a/sheep/http/kv.c
> +++ b/sheep/http/kv.c
> @@ -165,43 +165,100 @@ int kv_list_buckets(struct http_request *req,
>
> /* Object operations */
>
> -/* 4 KB header of kv object */
> -struct kv_object_hdr {
> +/* 4 KB header of kv object index node */
> +struct kv_onode_hdr {
> union {
> struct {
> char name[SD_MAX_OBJECT_NAME];
> + /* a hash value for etag */
> + uint8_t sha1[round_up(SHA1_DIGEST_SIZE, 8)];
> uint64_t size;
> uint64_t ctime;
> uint64_t mtime;
> -
> - /* the index of the multi parted object */
> - uint64_t segment;
> -
> - /* a hash value for etag */
> - uint8_t sha1[round_up(SHA1_DIGEST_SIZE, 8)];
> + uint32_t data_vid;
> + uint32_t nr_extent;
> + uint8_t inlined;
> + uint8_t pad[5];
> };
>
> uint8_t __pad[BLOCK_SIZE];
> };
> };
>
> -struct kv_object {
> - struct kv_object_hdr hdr;
> - uint8_t data[SD_DATA_OBJ_SIZE - sizeof(struct kv_object_hdr)];
> +struct onode_extent {
> + uint32_t vdi;
> + uint32_t pad;
> + uint64_t start;
> + uint64_t count;
> };
>
> +struct kv_onode {
> + struct kv_onode_hdr hdr;
> + union {
> + uint8_t data[SD_DATA_OBJ_SIZE - sizeof(struct
> kv_onode_hdr)];
> + struct onode_extent *o_extent;
> + };
> +};
> +
> +#define KV_ONODE_INLINE_SIZE (SD_DATA_OBJ_SIZE - sizeof(struct
> kv_onode_hdr))
> +
> +static int kv_create_inlined_object(struct sd_inode *inode,
> + struct kv_onode *onode,
> + uint32_t vid, uint32_t idx,
> + bool overwrite)
> +{
> + uint64_t oid = vid_to_data_oid(vid, idx);
> + int ret;
> +
> + if (overwrite) {
> + sd_info("overwrite object %s", onode->hdr.name);
> + ret = write_object(oid, (char *)onode,
> + sizeof(onode->hdr) + onode->hdr.size,
> + 0, false);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("failed to write object, %" PRIx64, oid);
> + goto out;
> + }
> + } else {
> + ret = write_object(oid, (char *)onode,
> + sizeof(onode->hdr) + onode->hdr.size,
> + 0, true);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("failed to create object, %" PRIx64, oid);
> + goto out;
> + }
> + INODE_SET_VID(inode, idx, vid);
> + ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx,
> + vid, vid, 0, false, false);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("failed to update inode, %" PRIx64,
> + vid_to_vdi_oid(vid));
> + goto out;
> + }
> + }
> +out:
> + return ret;
> +}
> +
> +static int kv_create_extented_object(struct sd_inode *inode,
> + struct kv_onode *onode,
> + uint32_t vid, uint32_t idx)
> +{
> + return SD_RES_SUCCESS;
> +}
> +
> /*
> * Create the object if the index isn't taken. Overwrite the object if it
> exists
> - * Return 0 if the index is taken by other object.
> + * Return SD_RES_OBJ_TAKEN if the index is taken by other object.
> */
> -static int do_kv_create_object(struct http_request *req, const char
> *obj_name,
> - struct kv_object *obj, uint32_t vid,
> - uint32_t idx)
> +static int do_kv_create_object(struct http_request *req,
> + struct kv_onode *onode,
> + uint32_t vid, uint32_t idx)
> {
> + struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
> uint64_t oid = vid_to_data_oid(vid, idx);
> + struct kv_onode_hdr hdr;
> uint32_t tmp_vid;
> - struct kv_object_hdr hdr;
> - struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
> int ret;
>
> ret = read_object(vid_to_vdi_oid(vid), (char *)inode,
> @@ -209,60 +266,37 @@ static int do_kv_create_object(struct http_request
> *req, const char *obj_name,
> if (ret != SD_RES_SUCCESS) {
> sd_err("failed to read inode, %" PRIx64,
> vid_to_vdi_oid(vid));
> - goto err;
> + goto out;
> }
> tmp_vid = INODE_GET_VID(inode, idx);
> if (tmp_vid) {
> ret = read_object(oid, (char *)&hdr, sizeof(hdr), 0);
> if (ret != SD_RES_SUCCESS) {
> sd_err("failed to read object, %" PRIx64, oid);
> - goto err;
> + goto out;
> }
>
> if (hdr.name[0] != '\0' &&
> - strcmp(hdr.name, obj->hdr.name) != 0){
> + strcmp(hdr.name, onode->hdr.name) != 0) {
> sd_debug("index %d is already used", idx);
> + ret = SD_RES_OBJ_TAKEN;
> goto out;
> }
> - sd_info("overwrite object %s", obj_name);
> - ret = write_object(oid, (char *)obj,
> - sizeof(obj->hdr) + obj->hdr.size,
> - 0, false);
> - if (ret != SD_RES_SUCCESS) {
> - sd_err("failed to write object, %" PRIx64, oid);
> - goto err;
> - }
> - } else {
> - ret = write_object(oid, (char *)obj,
> - sizeof(obj->hdr) + obj->hdr.size,
> - 0, true);
> - if (ret != SD_RES_SUCCESS) {
> - sd_err("failed to create object, %" PRIx64, oid);
> - goto err;
> - }
> - INODE_SET_VID(inode, idx, vid);
> - ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx,
> - vid, vid, 0, false, false);
> - if (ret != SD_RES_SUCCESS) {
> - sd_err("failed to update inode, %" PRIx64,
> - vid_to_vdi_oid(vid));
> - goto err;
> - }
> }
> - http_response_header(req, CREATED);
> + if (onode->hdr.inlined)
> + ret = kv_create_inlined_object(inode, onode, vid, idx,
> + !!tmp_vid);
> + else
> + ret = kv_create_extented_object(inode, onode, vid, idx);
> out:
> free(inode);
> - return 0;
> -err:
> - http_response_header(req, INTERNAL_SERVER_ERROR);
> - free(inode);
> - return -1;
> + return ret;
> }
>
> int kv_create_object(struct http_request *req, const char *bucket,
> - const char *object)
> + const char *name)
> {
> - struct kv_object *obj;
> + struct kv_onode *onode;
> ssize_t size;
> int ret;
> uint64_t hval;
> @@ -273,44 +307,51 @@ int kv_create_object(struct http_request *req, const
> char *bucket,
> if (ret < 0)
> return ret;
>
> - obj = xzalloc(sizeof(*obj));
> + onode = xzalloc(sizeof(*onode));
>
> gettimeofday(&tv, NULL);
> - pstrcpy(obj->hdr.name, sizeof(obj->hdr.name), object);
> - obj->hdr.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
> - obj->hdr.mtime = obj->hdr.ctime;
> + pstrcpy(onode->hdr.name, sizeof(onode->hdr.name), name);
> + onode->hdr.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
> + onode->hdr.mtime = onode->hdr.ctime;
>
> - /* TODO: support multi parted object for large object */
> - size = http_request_read(req, obj->data, sizeof(obj->data));
> + size = http_request_read(req, onode->data, sizeof(onode->data));
> if (size < 0) {
> sd_err("%s: bucket %s, object %s", sd_strerror(ret),
> - bucket, object);
> + bucket, name);
> http_response_header(req, INTERNAL_SERVER_ERROR);
> return -1;
> }
>
> - obj->hdr.size = size;
> -
> - hval = sd_hash(object, strlen(object));
> + onode->hdr.size = size;
> + if (size <= KV_ONODE_INLINE_SIZE)
> + onode->hdr.inlined = 1;
> + hval = sd_hash(name, strlen(name));
> for (int i = 0; i < MAX_DATA_OBJS; i++) {
> uint32_t idx = (hval + i) % MAX_DATA_OBJS;
>
> - do_kv_create_object(req, object, obj, vid, idx);
> - if (req->status != UNKNOWN) {
> - free(obj);
> + ret = do_kv_create_object(req, onode, vid, idx);
> + switch (ret) {
> + case SD_RES_SUCCESS:
> + http_response_header(req, CREATED);
> + free(onode);
> return 0;
> + case SD_RES_OBJ_TAKEN:
> + break;
> + default:
> + http_response_header(req, INTERNAL_SERVER_ERROR);
> + free(onode);
> + return -1;
> }
> }
>
> - free(obj);
> -
> /* no free space to create a object */
> http_response_header(req, SERVICE_UNAVAILABLE);
> + free(onode);
> return -1;
> }
>
> static int do_kv_read_object(struct http_request *req, const char
> *obj_name,
> - struct kv_object *obj, uint32_t vid, uint32_t
> idx)
> + struct kv_onode *obj, uint32_t vid, uint32_t
> idx)
> {
> uint64_t oid = vid_to_data_oid(vid, idx);
> int ret;
> @@ -342,7 +383,7 @@ static int do_kv_read_object(struct http_request *req,
> const char *obj_name,
> int kv_read_object(struct http_request *req, const char *bucket,
> const char *object)
> {
> - struct kv_object *obj;
> + struct kv_onode *obj;
> int ret;
> uint64_t hval;
> uint32_t vid;
> @@ -371,7 +412,7 @@ int kv_read_object(struct http_request *req, const
> char *bucket,
> }
>
> static int do_kv_update_object(struct http_request *req, const char
> *obj_name,
> - struct kv_object *obj, uint32_t vid,
> + struct kv_onode *obj, uint32_t vid,
> uint32_t idx, size_t size)
> {
> uint64_t oid = vid_to_data_oid(vid, idx);
> @@ -415,7 +456,7 @@ static int do_kv_update_object(struct http_request
> *req, const char *obj_name,
> int kv_update_object(struct http_request *req, const char *bucket,
> const char *object)
> {
> - struct kv_object *obj;
> + struct kv_onode *obj;
> int ret;
> uint64_t hval;
> uint32_t vid;
> --
> 1.7.9.5
>
> --
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog
>
--
--
Best Regards
Robin Dong
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.wpkg.org/pipermail/sheepdog/attachments/20131129/c38a27a4/attachment-0004.html>
More information about the sheepdog
mailing list