[sheepdog] [PATCH] http: add basic extent framework to support large user object

Liu Yuan namei.unix at gmail.com
Wed Nov 27 10:10:20 CET 2013


To support objects larger than 4M, we have to introduce an extent-like structure
to map data indexes to data holders (sheepdog objects). So basically, we use
one object index node (onode) to hold the metadata and the extents.

user object -> onode[metadata, extent1, ............., extentN]
                                 |                        |
				 V                        |
		+-----------------------------------+     |
		| obj1 | obj2 | .............| objN |     |
		+-----------------------------------+     |
		                                          V
		                 +-----------------------------------+
		                 | obj1` | obj2` |...........| objN` |
		                 +-----------------------------------+

For user objects smaller than 4M, we just inline the data into the onode:

user object -> onode[metadata, user-data].

kv_create_extented_object() is left as a stub for a later patch set.

For the object-name-to-onode mapping, we reuse the old hash approach, that is:

- hash(object_name) --> vdi[hash_value] -> onode

Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
 include/internal_proto.h |    1 +
 sheep/http/kv.c          |  183 ++++++++++++++++++++++++++++------------------
 2 files changed, 113 insertions(+), 71 deletions(-)

diff --git a/include/internal_proto.h b/include/internal_proto.h
index e5e0f05..70a7b5d 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -124,6 +124,7 @@
 #define SD_RES_AGAIN            0x8F /* Ask to try again */
 #define SD_RES_STALE_OBJ        0x90 /* Object may be stale */
 #define SD_RES_CLUSTER_ERROR    0x91 /* Cluster driver error */
+#define SD_RES_OBJ_TAKEN        0x92 /* Object ID is taken up */
 
 enum sd_status {
 	SD_STATUS_OK = 1,
diff --git a/sheep/http/kv.c b/sheep/http/kv.c
index d30a6a1..c04e629 100644
--- a/sheep/http/kv.c
+++ b/sheep/http/kv.c
@@ -165,43 +165,100 @@ int kv_list_buckets(struct http_request *req,
 
 /* Object operations */
 
-/* 4 KB header of kv object */
-struct kv_object_hdr {
+/* 4 KB header of kv object index node */
+struct kv_onode_hdr {
 	union {
 		struct {
 			char name[SD_MAX_OBJECT_NAME];
+			/* a hash value for etag */
+			uint8_t sha1[round_up(SHA1_DIGEST_SIZE, 8)];
 			uint64_t size;
 			uint64_t ctime;
 			uint64_t mtime;
-
-			/* the index of the multi parted object */
-			uint64_t segment;
-
-			/* a hash value for etag */
-			uint8_t sha1[round_up(SHA1_DIGEST_SIZE, 8)];
+			uint32_t data_vid;
+			uint32_t nr_extent;
+			uint8_t inlined;
+			uint8_t pad[5];
 		};
 
 		uint8_t __pad[BLOCK_SIZE];
 	};
 };
 
-struct kv_object {
-	struct kv_object_hdr hdr;
-	uint8_t data[SD_DATA_OBJ_SIZE - sizeof(struct kv_object_hdr)];
+struct onode_extent {
+	uint32_t vdi;
+	uint32_t pad;
+	uint64_t start;
+	uint64_t count;
 };
 
+struct kv_onode {
+	struct kv_onode_hdr hdr;
+	union {
+		uint8_t data[SD_DATA_OBJ_SIZE - sizeof(struct kv_onode_hdr)];
+		struct onode_extent *o_extent;
+	};
+};
+
+#define KV_ONODE_INLINE_SIZE (SD_DATA_OBJ_SIZE - sizeof(struct kv_onode_hdr))
+
+static int kv_create_inlined_object(struct sd_inode *inode,
+				    struct kv_onode *onode,
+				    uint32_t vid, uint32_t idx,
+				    bool overwrite)
+{
+	uint64_t oid = vid_to_data_oid(vid, idx);
+	int ret;
+
+	if (overwrite) {
+		sd_info("overwrite object %s", onode->hdr.name);
+		ret = write_object(oid, (char *)onode,
+				   sizeof(onode->hdr) + onode->hdr.size,
+				   0, false);
+		if (ret != SD_RES_SUCCESS) {
+			sd_err("failed to write object, %" PRIx64, oid);
+			goto out;
+		}
+	} else {
+		ret = write_object(oid, (char *)onode,
+				   sizeof(onode->hdr) + onode->hdr.size,
+				   0, true);
+		if (ret != SD_RES_SUCCESS) {
+			sd_err("failed to create object, %" PRIx64, oid);
+			goto out;
+		}
+		INODE_SET_VID(inode, idx, vid);
+		ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx,
+					 vid, vid, 0, false, false);
+		if (ret != SD_RES_SUCCESS) {
+			sd_err("failed to update inode, %" PRIx64,
+			       vid_to_vdi_oid(vid));
+			goto out;
+		}
+	}
+out:
+	return ret;
+}
+
+static int kv_create_extented_object(struct sd_inode *inode,
+				     struct kv_onode *onode,
+				     uint32_t vid, uint32_t idx)
+{
+	return SD_RES_SUCCESS;
+}
+
 /*
  * Create the object if the index isn't taken. Overwrite the object if it exists
- * Return 0 if the index is taken by other object.
+ * Return SD_RES_OBJ_TAKEN if the index is taken by other object.
  */
-static int do_kv_create_object(struct http_request *req, const char *obj_name,
-			       struct kv_object *obj, uint32_t vid,
-			       uint32_t idx)
+static int do_kv_create_object(struct http_request *req,
+			       struct kv_onode *onode,
+			       uint32_t vid, uint32_t idx)
 {
+	struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
 	uint64_t oid = vid_to_data_oid(vid, idx);
+	struct kv_onode_hdr hdr;
 	uint32_t tmp_vid;
-	struct kv_object_hdr hdr;
-	struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
 	int ret;
 
 	ret = read_object(vid_to_vdi_oid(vid), (char *)inode,
@@ -209,60 +266,37 @@ static int do_kv_create_object(struct http_request *req, const char *obj_name,
 	if (ret != SD_RES_SUCCESS) {
 		sd_err("failed to read inode, %" PRIx64,
 		       vid_to_vdi_oid(vid));
-		goto err;
+		goto out;
 	}
 	tmp_vid = INODE_GET_VID(inode, idx);
 	if (tmp_vid) {
 		ret = read_object(oid, (char *)&hdr, sizeof(hdr), 0);
 		if (ret != SD_RES_SUCCESS) {
 			sd_err("failed to read object, %" PRIx64, oid);
-			goto err;
+			goto out;
 		}
 
 		if (hdr.name[0] != '\0' &&
-		    strcmp(hdr.name, obj->hdr.name) != 0){
+		    strcmp(hdr.name, onode->hdr.name) != 0) {
 			sd_debug("index %d is already used", idx);
+			ret = SD_RES_OBJ_TAKEN;
 			goto out;
 		}
-		sd_info("overwrite object %s", obj_name);
-		ret = write_object(oid, (char *)obj,
-				   sizeof(obj->hdr) + obj->hdr.size,
-				   0, false);
-		if (ret != SD_RES_SUCCESS) {
-			sd_err("failed to write object, %" PRIx64, oid);
-			goto err;
-		}
-	} else {
-		ret = write_object(oid, (char *)obj,
-				   sizeof(obj->hdr) + obj->hdr.size,
-				   0, true);
-		if (ret != SD_RES_SUCCESS) {
-			sd_err("failed to create object, %" PRIx64, oid);
-			goto err;
-		}
-		INODE_SET_VID(inode, idx, vid);
-		ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx,
-					 vid, vid, 0, false, false);
-		if (ret != SD_RES_SUCCESS) {
-			sd_err("failed to update inode, %" PRIx64,
-			       vid_to_vdi_oid(vid));
-			goto err;
-		}
 	}
-	http_response_header(req, CREATED);
+	if (onode->hdr.inlined)
+		ret = kv_create_inlined_object(inode, onode, vid, idx,
+					       !!tmp_vid);
+	else
+		ret = kv_create_extented_object(inode, onode, vid, idx);
 out:
 	free(inode);
-	return 0;
-err:
-	http_response_header(req, INTERNAL_SERVER_ERROR);
-	free(inode);
-	return -1;
+	return ret;
 }
 
 int kv_create_object(struct http_request *req, const char *bucket,
-		     const char *object)
+		     const char *name)
 {
-	struct kv_object *obj;
+	struct kv_onode *onode;
 	ssize_t size;
 	int ret;
 	uint64_t hval;
@@ -273,44 +307,51 @@ int kv_create_object(struct http_request *req, const char *bucket,
 	if (ret < 0)
 		return ret;
 
-	obj = xzalloc(sizeof(*obj));
+	onode = xzalloc(sizeof(*onode));
 
 	gettimeofday(&tv, NULL);
-	pstrcpy(obj->hdr.name, sizeof(obj->hdr.name), object);
-	obj->hdr.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
-	obj->hdr.mtime = obj->hdr.ctime;
+	pstrcpy(onode->hdr.name, sizeof(onode->hdr.name), name);
+	onode->hdr.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
+	onode->hdr.mtime = onode->hdr.ctime;
 
-	/* TODO: support multi parted object for large object */
-	size = http_request_read(req, obj->data, sizeof(obj->data));
+	size = http_request_read(req, onode->data, sizeof(onode->data));
 	if (size < 0) {
 		sd_err("%s: bucket %s, object %s", sd_strerror(ret),
-		       bucket, object);
+		       bucket, name);
 		http_response_header(req, INTERNAL_SERVER_ERROR);
 		return -1;
 	}
 
-	obj->hdr.size = size;
-
-	hval = sd_hash(object, strlen(object));
+	onode->hdr.size = size;
+	if (size <= KV_ONODE_INLINE_SIZE)
+		onode->hdr.inlined = 1;
+	hval = sd_hash(name, strlen(name));
 	for (int i = 0; i < MAX_DATA_OBJS; i++) {
 		uint32_t idx = (hval + i) % MAX_DATA_OBJS;
 
-		do_kv_create_object(req, object, obj, vid, idx);
-		if (req->status != UNKNOWN) {
-			free(obj);
+		ret = do_kv_create_object(req, onode, vid, idx);
+		switch (ret) {
+		case SD_RES_SUCCESS:
+			http_response_header(req, CREATED);
+			free(onode);
 			return 0;
+		case SD_RES_OBJ_TAKEN:
+			break;
+		default:
+			http_response_header(req, INTERNAL_SERVER_ERROR);
+			free(onode);
+			return -1;
 		}
 	}
 
-	free(obj);
-
 	/* no free space to create a object */
 	http_response_header(req, SERVICE_UNAVAILABLE);
+	free(onode);
 	return -1;
 }
 
 static int do_kv_read_object(struct http_request *req, const char *obj_name,
-			     struct kv_object *obj, uint32_t vid, uint32_t idx)
+			     struct kv_onode *obj, uint32_t vid, uint32_t idx)
 {
 	uint64_t oid = vid_to_data_oid(vid, idx);
 	int ret;
@@ -342,7 +383,7 @@ static int do_kv_read_object(struct http_request *req, const char *obj_name,
 int kv_read_object(struct http_request *req, const char *bucket,
 		   const char *object)
 {
-	struct kv_object *obj;
+	struct kv_onode *obj;
 	int ret;
 	uint64_t hval;
 	uint32_t vid;
@@ -371,7 +412,7 @@ int kv_read_object(struct http_request *req, const char *bucket,
 }
 
 static int do_kv_update_object(struct http_request *req, const char *obj_name,
-			       struct kv_object *obj, uint32_t vid,
+			       struct kv_onode *obj, uint32_t vid,
 			       uint32_t idx, size_t size)
 {
 	uint64_t oid = vid_to_data_oid(vid, idx);
@@ -415,7 +456,7 @@ static int do_kv_update_object(struct http_request *req, const char *obj_name,
 int kv_update_object(struct http_request *req, const char *bucket,
 		     const char *object)
 {
-	struct kv_object *obj;
+	struct kv_onode *obj;
 	int ret;
 	uint64_t hval;
 	uint32_t vid;
-- 
1.7.9.5




More information about the sheepdog mailing list