[sheepdog] [PATCH] http: add basic extent framework to support large user object

Robin Dong robin.k.dong at gmail.com
Fri Nov 29 06:48:40 CET 2013


Reviewed-by: Robin Dong <sanbai at taobao.com>


2013/11/27 Liu Yuan <namei.unix at gmail.com>

> To support objects larger than 4M, we have to introduce an extent-like
> structure to map the data indexes to data holders (sheepdog objects).
> So basically, we use one object index node (onode) to hold the metadata
> and extents.
>
> user object -> onode[metadata, extent1, ............., extentN]
>                                  |                        |
>                                  V                        |
>                 +-----------------------------------+     |
>                 | obj1 | obj2 | .............| objN |     |
>                 +-----------------------------------+     |
>                                                           V
>                                  +-----------------------------------+
>                                  | obj1` | obj2` |...........| objN` |
>                                  +-----------------------------------+
>
> For a user object smaller than 4M, we just inline its data into the onode:
>
> user object -> onode[metadata, user-data].
>
> kv_create_extented_object() is left for a later patch set.
>
> For the object-name-to-onode mapping, we make use of the old hash approach,
> that is:
>
> - hash(object_name) --> vdi[hash_value] -> onode
>
> Signed-off-by: Liu Yuan <namei.unix at gmail.com>
> ---
>  include/internal_proto.h |    1 +
>  sheep/http/kv.c          |  183
> ++++++++++++++++++++++++++++------------------
>  2 files changed, 113 insertions(+), 71 deletions(-)
>
> diff --git a/include/internal_proto.h b/include/internal_proto.h
> index e5e0f05..70a7b5d 100644
> --- a/include/internal_proto.h
> +++ b/include/internal_proto.h
> @@ -124,6 +124,7 @@
>  #define SD_RES_AGAIN            0x8F /* Ask to try again */
>  #define SD_RES_STALE_OBJ        0x90 /* Object may be stale */
>  #define SD_RES_CLUSTER_ERROR    0x91 /* Cluster driver error */
> +#define SD_RES_OBJ_TAKEN        0x92 /* Object ID is taken up */
>
>  enum sd_status {
>         SD_STATUS_OK = 1,
> diff --git a/sheep/http/kv.c b/sheep/http/kv.c
> index d30a6a1..c04e629 100644
> --- a/sheep/http/kv.c
> +++ b/sheep/http/kv.c
> @@ -165,43 +165,100 @@ int kv_list_buckets(struct http_request *req,
>
>  /* Object operations */
>
> -/* 4 KB header of kv object */
> -struct kv_object_hdr {
> +/* 4 KB header of kv object index node */
> +struct kv_onode_hdr {
>         union {
>                 struct {
>                         char name[SD_MAX_OBJECT_NAME];
> +                       /* a hash value for etag */
> +                       uint8_t sha1[round_up(SHA1_DIGEST_SIZE, 8)];
>                         uint64_t size;
>                         uint64_t ctime;
>                         uint64_t mtime;
> -
> -                       /* the index of the multi parted object */
> -                       uint64_t segment;
> -
> -                       /* a hash value for etag */
> -                       uint8_t sha1[round_up(SHA1_DIGEST_SIZE, 8)];
> +                       uint32_t data_vid;
> +                       uint32_t nr_extent;
> +                       uint8_t inlined;
> +                       uint8_t pad[5];
>                 };
>
>                 uint8_t __pad[BLOCK_SIZE];
>         };
>  };
>
> -struct kv_object {
> -       struct kv_object_hdr hdr;
> -       uint8_t data[SD_DATA_OBJ_SIZE - sizeof(struct kv_object_hdr)];
> +struct onode_extent {
> +       uint32_t vdi;
> +       uint32_t pad;
> +       uint64_t start;
> +       uint64_t count;
>  };
>
> +struct kv_onode {
> +       struct kv_onode_hdr hdr;
> +       union {
> +               uint8_t data[SD_DATA_OBJ_SIZE - sizeof(struct
> kv_onode_hdr)];
> +               struct onode_extent *o_extent;
> +       };
> +};
> +
> +#define KV_ONODE_INLINE_SIZE (SD_DATA_OBJ_SIZE - sizeof(struct
> kv_onode_hdr))
> +
> +static int kv_create_inlined_object(struct sd_inode *inode,
> +                                   struct kv_onode *onode,
> +                                   uint32_t vid, uint32_t idx,
> +                                   bool overwrite)
> +{
> +       uint64_t oid = vid_to_data_oid(vid, idx);
> +       int ret;
> +
> +       if (overwrite) {
> +               sd_info("overwrite object %s", onode->hdr.name);
> +               ret = write_object(oid, (char *)onode,
> +                                  sizeof(onode->hdr) + onode->hdr.size,
> +                                  0, false);
> +               if (ret != SD_RES_SUCCESS) {
> +                       sd_err("failed to write object, %" PRIx64, oid);
> +                       goto out;
> +               }
> +       } else {
> +               ret = write_object(oid, (char *)onode,
> +                                  sizeof(onode->hdr) + onode->hdr.size,
> +                                  0, true);
> +               if (ret != SD_RES_SUCCESS) {
> +                       sd_err("failed to create object, %" PRIx64, oid);
> +                       goto out;
> +               }
> +               INODE_SET_VID(inode, idx, vid);
> +               ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx,
> +                                        vid, vid, 0, false, false);
> +               if (ret != SD_RES_SUCCESS) {
> +                       sd_err("failed to update inode, %" PRIx64,
> +                              vid_to_vdi_oid(vid));
> +                       goto out;
> +               }
> +       }
> +out:
> +       return ret;
> +}
> +
> +static int kv_create_extented_object(struct sd_inode *inode,
> +                                    struct kv_onode *onode,
> +                                    uint32_t vid, uint32_t idx)
> +{
> +       return SD_RES_SUCCESS;
> +}
> +
>  /*
>   * Create the object if the index isn't taken. Overwrite the object if it
> exists
> - * Return 0 if the index is taken by other object.
> + * Return SD_RES_OBJ_TAKEN if the index is taken by other object.
>   */
> -static int do_kv_create_object(struct http_request *req, const char
> *obj_name,
> -                              struct kv_object *obj, uint32_t vid,
> -                              uint32_t idx)
> +static int do_kv_create_object(struct http_request *req,
> +                              struct kv_onode *onode,
> +                              uint32_t vid, uint32_t idx)
>  {
> +       struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
>         uint64_t oid = vid_to_data_oid(vid, idx);
> +       struct kv_onode_hdr hdr;
>         uint32_t tmp_vid;
> -       struct kv_object_hdr hdr;
> -       struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
>         int ret;
>
>         ret = read_object(vid_to_vdi_oid(vid), (char *)inode,
> @@ -209,60 +266,37 @@ static int do_kv_create_object(struct http_request
> *req, const char *obj_name,
>         if (ret != SD_RES_SUCCESS) {
>                 sd_err("failed to read inode, %" PRIx64,
>                        vid_to_vdi_oid(vid));
> -               goto err;
> +               goto out;
>         }
>         tmp_vid = INODE_GET_VID(inode, idx);
>         if (tmp_vid) {
>                 ret = read_object(oid, (char *)&hdr, sizeof(hdr), 0);
>                 if (ret != SD_RES_SUCCESS) {
>                         sd_err("failed to read object, %" PRIx64, oid);
> -                       goto err;
> +                       goto out;
>                 }
>
>                 if (hdr.name[0] != '\0' &&
> -                   strcmp(hdr.name, obj->hdr.name) != 0){
> +                   strcmp(hdr.name, onode->hdr.name) != 0) {
>                         sd_debug("index %d is already used", idx);
> +                       ret = SD_RES_OBJ_TAKEN;
>                         goto out;
>                 }
> -               sd_info("overwrite object %s", obj_name);
> -               ret = write_object(oid, (char *)obj,
> -                                  sizeof(obj->hdr) + obj->hdr.size,
> -                                  0, false);
> -               if (ret != SD_RES_SUCCESS) {
> -                       sd_err("failed to write object, %" PRIx64, oid);
> -                       goto err;
> -               }
> -       } else {
> -               ret = write_object(oid, (char *)obj,
> -                                  sizeof(obj->hdr) + obj->hdr.size,
> -                                  0, true);
> -               if (ret != SD_RES_SUCCESS) {
> -                       sd_err("failed to create object, %" PRIx64, oid);
> -                       goto err;
> -               }
> -               INODE_SET_VID(inode, idx, vid);
> -               ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx,
> -                                        vid, vid, 0, false, false);
> -               if (ret != SD_RES_SUCCESS) {
> -                       sd_err("failed to update inode, %" PRIx64,
> -                              vid_to_vdi_oid(vid));
> -                       goto err;
> -               }
>         }
> -       http_response_header(req, CREATED);
> +       if (onode->hdr.inlined)
> +               ret = kv_create_inlined_object(inode, onode, vid, idx,
> +                                              !!tmp_vid);
> +       else
> +               ret = kv_create_extented_object(inode, onode, vid, idx);
>  out:
>         free(inode);
> -       return 0;
> -err:
> -       http_response_header(req, INTERNAL_SERVER_ERROR);
> -       free(inode);
> -       return -1;
> +       return ret;
>  }
>
>  int kv_create_object(struct http_request *req, const char *bucket,
> -                    const char *object)
> +                    const char *name)
>  {
> -       struct kv_object *obj;
> +       struct kv_onode *onode;
>         ssize_t size;
>         int ret;
>         uint64_t hval;
> @@ -273,44 +307,51 @@ int kv_create_object(struct http_request *req, const
> char *bucket,
>         if (ret < 0)
>                 return ret;
>
> -       obj = xzalloc(sizeof(*obj));
> +       onode = xzalloc(sizeof(*onode));
>
>         gettimeofday(&tv, NULL);
> -       pstrcpy(obj->hdr.name, sizeof(obj->hdr.name), object);
> -       obj->hdr.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
> -       obj->hdr.mtime = obj->hdr.ctime;
> +       pstrcpy(onode->hdr.name, sizeof(onode->hdr.name), name);
> +       onode->hdr.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
> +       onode->hdr.mtime = onode->hdr.ctime;
>
> -       /* TODO: support multi parted object for large object */
> -       size = http_request_read(req, obj->data, sizeof(obj->data));
> +       size = http_request_read(req, onode->data, sizeof(onode->data));
>         if (size < 0) {
>                 sd_err("%s: bucket %s, object %s", sd_strerror(ret),
> -                      bucket, object);
> +                      bucket, name);
>                 http_response_header(req, INTERNAL_SERVER_ERROR);
>                 return -1;
>         }
>
> -       obj->hdr.size = size;
> -
> -       hval = sd_hash(object, strlen(object));
> +       onode->hdr.size = size;
> +       if (size <= KV_ONODE_INLINE_SIZE)
> +               onode->hdr.inlined = 1;
> +       hval = sd_hash(name, strlen(name));
>         for (int i = 0; i < MAX_DATA_OBJS; i++) {
>                 uint32_t idx = (hval + i) % MAX_DATA_OBJS;
>
> -               do_kv_create_object(req, object, obj, vid, idx);
> -               if (req->status != UNKNOWN) {
> -                       free(obj);
> +               ret = do_kv_create_object(req, onode, vid, idx);
> +               switch (ret) {
> +               case SD_RES_SUCCESS:
> +                       http_response_header(req, CREATED);
> +                       free(onode);
>                         return 0;
> +               case SD_RES_OBJ_TAKEN:
> +                       break;
> +               default:
> +                       http_response_header(req, INTERNAL_SERVER_ERROR);
> +                       free(onode);
> +                       return -1;
>                 }
>         }
>
> -       free(obj);
> -
>         /* no free space to create a object */
>         http_response_header(req, SERVICE_UNAVAILABLE);
> +       free(onode);
>         return -1;
>  }
>
>  static int do_kv_read_object(struct http_request *req, const char
> *obj_name,
> -                            struct kv_object *obj, uint32_t vid, uint32_t
> idx)
> +                            struct kv_onode *obj, uint32_t vid, uint32_t
> idx)
>  {
>         uint64_t oid = vid_to_data_oid(vid, idx);
>         int ret;
> @@ -342,7 +383,7 @@ static int do_kv_read_object(struct http_request *req,
> const char *obj_name,
>  int kv_read_object(struct http_request *req, const char *bucket,
>                    const char *object)
>  {
> -       struct kv_object *obj;
> +       struct kv_onode *obj;
>         int ret;
>         uint64_t hval;
>         uint32_t vid;
> @@ -371,7 +412,7 @@ int kv_read_object(struct http_request *req, const
> char *bucket,
>  }
>
>  static int do_kv_update_object(struct http_request *req, const char
> *obj_name,
> -                              struct kv_object *obj, uint32_t vid,
> +                              struct kv_onode *obj, uint32_t vid,
>                                uint32_t idx, size_t size)
>  {
>         uint64_t oid = vid_to_data_oid(vid, idx);
> @@ -415,7 +456,7 @@ static int do_kv_update_object(struct http_request
> *req, const char *obj_name,
>  int kv_update_object(struct http_request *req, const char *bucket,
>                      const char *object)
>  {
> -       struct kv_object *obj;
> +       struct kv_onode *obj;
>         int ret;
>         uint64_t hval;
>         uint32_t vid;
> --
> 1.7.9.5
>
> --
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog
>



-- 
--
Best Regard
Robin Dong
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.wpkg.org/pipermail/sheepdog/attachments/20131129/c38a27a4/attachment-0004.html>


More information about the sheepdog mailing list