<div dir="ltr">Reviewed-by: Robin Dong <<a href="mailto:sanbai@taobao.com">sanbai@taobao.com</a>></div><div class="gmail_extra"><br><br><div class="gmail_quote">2013/11/27 Liu Yuan <span dir="ltr"><<a href="mailto:namei.unix@gmail.com" target="_blank">namei.unix@gmail.com</a>></span><br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">To support objects larger than 4M, we have to introduce an extent-like structure<br>
to map the data indexes to data holders (sheepdog objects). So basically, we use<br>
one object index node (onode) to hold the metadata and extents.<br>
<br>
user object -> onode[metadata, extent1, ............., extentN]<br>
                                 |                        |<br>
                                 V                        |<br>
                +-----------------------------------+     |<br>
                | obj1 | obj2 | .............| objN |     |<br>
                +-----------------------------------+     |<br>
                                                          V<br>
                                 +-----------------------------------+<br>
                                 | obj1` | obj2` |...........| objN` |<br>
                                 +-----------------------------------+<br>
<br>
For a user object smaller than 4M, we just inline it into the onode:<br>
<br>
user object -> onode[metadata, user-data].<br>
<br>
kv_create_extented_object() is left for a later patch set.<br>
<br>
For the object-name-to-onode mapping, we make use of the old hash approach, that is:<br>
<br>
- hash(object_name) -> vdi[hash_value] -> onode<br>
<br>
Signed-off-by: Liu Yuan <<a href="mailto:namei.unix@gmail.com">namei.unix@gmail.com</a>><br>
---<br>
 include/internal_proto.h |    1 +<br>
 sheep/http/kv.c          |  183 ++++++++++++++++++++++++++++------------------<br>
 2 files changed, 113 insertions(+), 71 deletions(-)<br>
<br>
diff --git a/include/internal_proto.h b/include/internal_proto.h<br>
index e5e0f05..70a7b5d 100644<br>
--- a/include/internal_proto.h<br>
+++ b/include/internal_proto.h<br>
@@ -124,6 +124,7 @@<br>
 #define SD_RES_AGAIN            0x8F /* Ask to try again */<br>
 #define SD_RES_STALE_OBJ        0x90 /* Object may be stale */<br>
 #define SD_RES_CLUSTER_ERROR    0x91 /* Cluster driver error */<br>
+#define SD_RES_OBJ_TAKEN        0x92 /* Object ID is taken up */<br>
<br>
 enum sd_status {<br>
        SD_STATUS_OK = 1,<br>
diff --git a/sheep/http/kv.c b/sheep/http/kv.c<br>
index d30a6a1..c04e629 100644<br>
--- a/sheep/http/kv.c<br>
+++ b/sheep/http/kv.c<br>
@@ -165,43 +165,100 @@ int kv_list_buckets(struct http_request *req,<br>
<br>
 /* Object operations */<br>
<br>
-/* 4 KB header of kv object */<br>
-struct kv_object_hdr {<br>
+/* 4 KB header of kv object index node */<br>
+struct kv_onode_hdr {<br>
        union {<br>
                struct {<br>
                        char name[SD_MAX_OBJECT_NAME];<br>
+                       /* a hash value for etag */<br>
+                       uint8_t sha1[round_up(SHA1_DIGEST_SIZE, 8)];<br>
                        uint64_t size;<br>
                        uint64_t ctime;<br>
                        uint64_t mtime;<br>
-<br>
-                       /* the index of the multi parted object */<br>
-                       uint64_t segment;<br>
-<br>
-                       /* a hash value for etag */<br>
-                       uint8_t sha1[round_up(SHA1_DIGEST_SIZE, 8)];<br>
+                       uint32_t data_vid;<br>
+                       uint32_t nr_extent;<br>
+                       uint8_t inlined;<br>
+                       uint8_t pad[5];<br>
                };<br>
<br>
                uint8_t __pad[BLOCK_SIZE];<br>
        };<br>
 };<br>
<br>
-struct kv_object {<br>
-       struct kv_object_hdr hdr;<br>
-       uint8_t data[SD_DATA_OBJ_SIZE - sizeof(struct kv_object_hdr)];<br>
+struct onode_extent {<br>
+       uint32_t vdi;<br>
+       uint32_t pad;<br>
+       uint64_t start;<br>
+       uint64_t count;<br>
 };<br>
<br>
+struct kv_onode {<br>
+       struct kv_onode_hdr hdr;<br>
+       union {<br>
+               uint8_t data[SD_DATA_OBJ_SIZE - sizeof(struct kv_onode_hdr)];<br>
+               struct onode_extent *o_extent;<br>
+       };<br>
+};<br>
+<br>
+#define KV_ONODE_INLINE_SIZE (SD_DATA_OBJ_SIZE - sizeof(struct kv_onode_hdr))<br>
+<br>
+static int kv_create_inlined_object(struct sd_inode *inode,<br>
+                                   struct kv_onode *onode,<br>
+                                   uint32_t vid, uint32_t idx,<br>
+                                   bool overwrite)<br>
+{<br>
+       uint64_t oid = vid_to_data_oid(vid, idx);<br>
+       int ret;<br>
+<br>
+       if (overwrite) {<br>
+               sd_info("overwrite object %s", onode-><a href="http://hdr.name" target="_blank">hdr.name</a>);<br>
+               ret = write_object(oid, (char *)onode,<br>
+                                  sizeof(onode->hdr) + onode->hdr.size,<br>
+                                  0, false);<br>
+               if (ret != SD_RES_SUCCESS) {<br>
+                       sd_err("failed to write object, %" PRIx64, oid);<br>
+                       goto out;<br>
+               }<br>
+       } else {<br>
+               ret = write_object(oid, (char *)onode,<br>
+                                  sizeof(onode->hdr) + onode->hdr.size,<br>
+                                  0, true);<br>
+               if (ret != SD_RES_SUCCESS) {<br>
+                       sd_err("failed to create object, %" PRIx64, oid);<br>
+                       goto out;<br>
+               }<br>
+               INODE_SET_VID(inode, idx, vid);<br>
+               ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx,<br>
+                                        vid, vid, 0, false, false);<br>
+               if (ret != SD_RES_SUCCESS) {<br>
+                       sd_err("failed to update inode, %" PRIx64,<br>
+                              vid_to_vdi_oid(vid));<br>
+                       goto out;<br>
+               }<br>
+       }<br>
+out:<br>
+       return ret;<br>
+}<br>
+<br>
+static int kv_create_extented_object(struct sd_inode *inode,<br>
+                                    struct kv_onode *onode,<br>
+                                    uint32_t vid, uint32_t idx)<br>
+{<br>
+       return SD_RES_SUCCESS;<br>
+}<br>
+<br>
 /*<br>
  * Create the object if the index isn't taken. Overwrite the object if it exists<br>
- * Return 0 if the index is taken by other object.<br>
+ * Return SD_RES_OBJ_TAKEN if the index is taken by other object.<br>
  */<br>
-static int do_kv_create_object(struct http_request *req, const char *obj_name,<br>
-                              struct kv_object *obj, uint32_t vid,<br>
-                              uint32_t idx)<br>
+static int do_kv_create_object(struct http_request *req,<br>
+                              struct kv_onode *onode,<br>
+                              uint32_t vid, uint32_t idx)<br>
 {<br>
+       struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));<br>
        uint64_t oid = vid_to_data_oid(vid, idx);<br>
+       struct kv_onode_hdr hdr;<br>
        uint32_t tmp_vid;<br>
-       struct kv_object_hdr hdr;<br>
-       struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));<br>
        int ret;<br>
<br>
        ret = read_object(vid_to_vdi_oid(vid), (char *)inode,<br>
@@ -209,60 +266,37 @@ static int do_kv_create_object(struct http_request *req, const char *obj_name,<br>
        if (ret != SD_RES_SUCCESS) {<br>
                sd_err("failed to read inode, %" PRIx64,<br>
                       vid_to_vdi_oid(vid));<br>
-               goto err;<br>
+               goto out;<br>
        }<br>
        tmp_vid = INODE_GET_VID(inode, idx);<br>
        if (tmp_vid) {<br>
                ret = read_object(oid, (char *)&hdr, sizeof(hdr), 0);<br>
                if (ret != SD_RES_SUCCESS) {<br>
                        sd_err("failed to read object, %" PRIx64, oid);<br>
-                       goto err;<br>
+                       goto out;<br>
                }<br>
<br>
                if (<a href="http://hdr.name" target="_blank">hdr.name</a>[0] != '\0' &&<br>
-                   strcmp(<a href="http://hdr.name" target="_blank">hdr.name</a>, obj-><a href="http://hdr.name" target="_blank">hdr.name</a>) != 0){<br>
+                   strcmp(<a href="http://hdr.name" target="_blank">hdr.name</a>, onode-><a href="http://hdr.name" target="_blank">hdr.name</a>) != 0) {<br>
                        sd_debug("index %d is already used", idx);<br>
+                       ret = SD_RES_OBJ_TAKEN;<br>
                        goto out;<br>
                }<br>
-               sd_info("overwrite object %s", obj_name);<br>
-               ret = write_object(oid, (char *)obj,<br>
-                                  sizeof(obj->hdr) + obj->hdr.size,<br>
-                                  0, false);<br>
-               if (ret != SD_RES_SUCCESS) {<br>
-                       sd_err("failed to write object, %" PRIx64, oid);<br>
-                       goto err;<br>
-               }<br>
-       } else {<br>
-               ret = write_object(oid, (char *)obj,<br>
-                                  sizeof(obj->hdr) + obj->hdr.size,<br>
-                                  0, true);<br>
-               if (ret != SD_RES_SUCCESS) {<br>
-                       sd_err("failed to create object, %" PRIx64, oid);<br>
-                       goto err;<br>
-               }<br>
-               INODE_SET_VID(inode, idx, vid);<br>
-               ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx,<br>
-                                        vid, vid, 0, false, false);<br>
-               if (ret != SD_RES_SUCCESS) {<br>
-                       sd_err("failed to update inode, %" PRIx64,<br>
-                              vid_to_vdi_oid(vid));<br>
-                       goto err;<br>
-               }<br>
        }<br>
-       http_response_header(req, CREATED);<br>
+       if (onode->hdr.inlined)<br>
+               ret = kv_create_inlined_object(inode, onode, vid, idx,<br>
+                                              !!tmp_vid);<br>
+       else<br>
+               ret = kv_create_extented_object(inode, onode, vid, idx);<br>
 out:<br>
        free(inode);<br>
-       return 0;<br>
-err:<br>
-       http_response_header(req, INTERNAL_SERVER_ERROR);<br>
-       free(inode);<br>
-       return -1;<br>
+       return ret;<br>
 }<br>
<br>
 int kv_create_object(struct http_request *req, const char *bucket,<br>
-                    const char *object)<br>
+                    const char *name)<br>
 {<br>
-       struct kv_object *obj;<br>
+       struct kv_onode *onode;<br>
        ssize_t size;<br>
        int ret;<br>
        uint64_t hval;<br>
@@ -273,44 +307,51 @@ int kv_create_object(struct http_request *req, const char *bucket,<br>
        if (ret < 0)<br>
                return ret;<br>
<br>
-       obj = xzalloc(sizeof(*obj));<br>
+       onode = xzalloc(sizeof(*onode));<br>
<br>
        gettimeofday(&tv, NULL);<br>
-       pstrcpy(obj-><a href="http://hdr.name" target="_blank">hdr.name</a>, sizeof(obj-><a href="http://hdr.name" target="_blank">hdr.name</a>), object);<br>
-       obj->hdr.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;<br>
-       obj->hdr.mtime = obj->hdr.ctime;<br>
+       pstrcpy(onode-><a href="http://hdr.name" target="_blank">hdr.name</a>, sizeof(onode-><a href="http://hdr.name" target="_blank">hdr.name</a>), name);<br>
+       onode->hdr.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;<br>
+       onode->hdr.mtime = onode->hdr.ctime;<br>
<br>
-       /* TODO: support multi parted object for large object */<br>
-       size = http_request_read(req, obj->data, sizeof(obj->data));<br>
+       size = http_request_read(req, onode->data, sizeof(onode->data));<br>
        if (size < 0) {<br>
                sd_err("%s: bucket %s, object %s", sd_strerror(ret),<br>
-                      bucket, object);<br>
+                      bucket, name);<br>
                http_response_header(req, INTERNAL_SERVER_ERROR);<br>
                return -1;<br>
        }<br>
<br>
-       obj->hdr.size = size;<br>
-<br>
-       hval = sd_hash(object, strlen(object));<br>
+       onode->hdr.size = size;<br>
+       if (size <= KV_ONODE_INLINE_SIZE)<br>
+               onode->hdr.inlined = 1;<br>
+       hval = sd_hash(name, strlen(name));<br>
        for (int i = 0; i < MAX_DATA_OBJS; i++) {<br>
                uint32_t idx = (hval + i) % MAX_DATA_OBJS;<br>
<br>
-               do_kv_create_object(req, object, obj, vid, idx);<br>
-               if (req->status != UNKNOWN) {<br>
-                       free(obj);<br>
+               ret = do_kv_create_object(req, onode, vid, idx);<br>
+               switch (ret) {<br>
+               case SD_RES_SUCCESS:<br>
+                       http_response_header(req, CREATED);<br>
+                       free(onode);<br>
                        return 0;<br>
+               case SD_RES_OBJ_TAKEN:<br>
+                       break;<br>
+               default:<br>
+                       http_response_header(req, INTERNAL_SERVER_ERROR);<br>
+                       free(onode);<br>
+                       return -1;<br>
                }<br>
        }<br>
<br>
-       free(obj);<br>
-<br>
        /* no free space to create a object */<br>
        http_response_header(req, SERVICE_UNAVAILABLE);<br>
+       free(onode);<br>
        return -1;<br>
 }<br>
<br>
 static int do_kv_read_object(struct http_request *req, const char *obj_name,<br>
-                            struct kv_object *obj, uint32_t vid, uint32_t idx)<br>
+                            struct kv_onode *obj, uint32_t vid, uint32_t idx)<br>
 {<br>
        uint64_t oid = vid_to_data_oid(vid, idx);<br>
        int ret;<br>
@@ -342,7 +383,7 @@ static int do_kv_read_object(struct http_request *req, const char *obj_name,<br>
 int kv_read_object(struct http_request *req, const char *bucket,<br>
                   const char *object)<br>
 {<br>
-       struct kv_object *obj;<br>
+       struct kv_onode *obj;<br>
        int ret;<br>
        uint64_t hval;<br>
        uint32_t vid;<br>
@@ -371,7 +412,7 @@ int kv_read_object(struct http_request *req, const char *bucket,<br>
 }<br>
<br>
 static int do_kv_update_object(struct http_request *req, const char *obj_name,<br>
-                              struct kv_object *obj, uint32_t vid,<br>
+                              struct kv_onode *obj, uint32_t vid,<br>
                               uint32_t idx, size_t size)<br>
 {<br>
        uint64_t oid = vid_to_data_oid(vid, idx);<br>
@@ -415,7 +456,7 @@ static int do_kv_update_object(struct http_request *req, const char *obj_name,<br>
 int kv_update_object(struct http_request *req, const char *bucket,<br>
                     const char *object)<br>
 {<br>
-       struct kv_object *obj;<br>
+       struct kv_onode *obj;<br>
        int ret;<br>
        uint64_t hval;<br>
        uint32_t vid;<br>
<span class="HOEnZb"><font color="#888888">--<br>
1.7.9.5<br>
<br>
--<br>
sheepdog mailing list<br>
<a href="mailto:sheepdog@lists.wpkg.org">sheepdog@lists.wpkg.org</a><br>
<a href="http://lists.wpkg.org/mailman/listinfo/sheepdog" target="_blank">http://lists.wpkg.org/mailman/listinfo/sheepdog</a><br>
</font></span></blockquote></div><br><br clear="all"><div><br></div>-- <br>--<br>Best Regard<br>Robin Dong
</div>