[sheepdog] [PATCH v1 2/2] sheep/http: add support for big object which is larger than SD_DATA_OBJ_SIZE

Liu Yuan namei.unix at gmail.com
Sat Dec 7 08:09:50 CET 2013


On Fri, Dec 06, 2013 at 05:04:19PM +0800, Robin Dong wrote:
> From: Robin Dong <sanbai at taobao.com>
> 
> Using hyper volume and extent structure of onode to store large number of
> objects size of which exceed SD_DATA_OBJ_SIZE.
> 
> Signed-off-by: Robin Dong <sanbai at taobao.com>
> ---
>  sheep/http/kv.c    | 422 ++++++++++++++++++++++++++++++++++++++++-------------
>  sheep/http/kv.h    |  15 +-
>  sheep/http/s3.c    |   8 +-
>  sheep/http/swift.c |  30 +---
>  4 files changed, 335 insertions(+), 140 deletions(-)
> 
> diff --git a/sheep/http/kv.c b/sheep/http/kv.c
> index 55a7e24..68f0759 100644
> --- a/sheep/http/kv.c
> +++ b/sheep/http/kv.c
> @@ -21,6 +21,7 @@ struct bucket_inode {
>  	uint64_t obj_count;
>  	uint64_t bytes_used;
>  	uint32_t vdi_id;		/* kv_onode stores in this vdi */
> +	uint32_t data_vid;		/* data of objects store in this vdi */

It seems that this data_vid isn't used.

>  	uint32_t pad;
>  	uint64_t reserved[SD_MAX_BUCKET_NAME/sizeof(uint64_t) - 3];
>  };
> @@ -172,13 +173,13 @@ int kv_create_account(const char *account)
>  	return kv_create_hyper_volume(account, &vdi_id);
>  }
>  
> -typedef void (*list_cb)(struct http_request *req, const char *bucket,
> +typedef void (*list_bucket_cb)(struct http_request *req, const char *bucket,
>  			void *opaque);
>  
>  struct list_buckets_arg {
>  	struct http_request *req;
>  	const char *account;
> -	list_cb cb;
> +	list_bucket_cb cb;
>  	uint32_t bucket_counter;
>  };
>  
> @@ -360,7 +361,16 @@ static int delete_bucket(struct sd_inode *account_inode, uint64_t idx,
>  		bnode->vdi_id = 0;
>  		snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s",
>  			 account_inode->name, bucket);
> -
> +		/* delete vdi which store kv_onode */
> +		ret = kv_delete_vdi(vdi_name);
> +		if (ret != SD_RES_SUCCESS) {
> +			sd_err("Failed to delete vdi %s", vdi_name);
> +			ret = -1;
> +			goto out;
> +		}
> +		/* delete vdi which store object data */
> +		snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator",
> +			 account_inode->name, bucket);
>  		ret = kv_delete_vdi(vdi_name);
>  		if (ret != SD_RES_SUCCESS) {
>  			sd_err("Failed to delete vdi %s", vdi_name);
> @@ -462,12 +472,29 @@ static int add_bucket(struct sd_inode *account_inode, uint64_t idx,
>  		bnode->bytes_used = 0;
>  		snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s",
>  			 account_inode->name, bucket);
> +		/* create vdi to store kv_onode */
>  		ret = kv_create_hyper_volume(vdi_name, &(bnode->vdi_id));
>  		if (ret != SD_RES_SUCCESS) {
>  			sd_err("Failed to create hyper volume %d", ret);
>  			ret = -1;
>  			goto out;
>  		}
> +		snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator",
> +			 account_inode->name, bucket);
> +		/* create vdi to store objects */
> +		ret = kv_create_hyper_volume(vdi_name, &(bnode->data_vid));
> +		if (ret != SD_RES_SUCCESS) {
> +			sd_err("Failed to create hyper volume %d", ret);
> +			ret = -1;
> +			goto out;
> +		}
> +		ret = oalloc_init(bnode->data_vid);
> +		if (ret != SD_RES_SUCCESS) {
> +			sd_err("Failed to init allocator on %x",
> +			       bnode->data_vid);
> +			ret = -1;
> +			goto out;
> +		}
>  		sd_debug("create hyper volume %s success", vdi_name);
>  		break;
>  	}
> @@ -629,7 +656,7 @@ int kv_delete_bucket(const char *account, const char *bucket)
>  	return SD_RES_SUCCESS;
>  }
>  
> -int kv_list_buckets(struct http_request *req, list_cb cb, void *opaque)
> +int kv_list_buckets(struct http_request *req, list_bucket_cb cb, void *opaque)
>  {
>  	struct sd_inode account_inode;
>  	const char *account = (const char *)opaque;
> @@ -658,6 +685,34 @@ int kv_list_buckets(struct http_request *req, list_cb cb, void *opaque)
>  	return SD_RES_SUCCESS;
>  }
>  
> +/*
> + * A bucket contains two vdi: one (vdi_id)  stores 'struct kv_onode' by hash
> + * algorithm and another one (data_vid) stores data of objects.
> + * The first vdi names "account/bucket" and the second vdi names
> + * "account/bucket/allocator".
> + *
> + * It manage space in data vdi by algorithm in oalloc.c.
> + *
> + * For example: bucket "fruit" with account 'coly' has two objects "banana"
> + *              and "apple"
> + *
> + *
> + *                       --------------------- kv_onode -----------------------
> + *                      |                                                      |
> + * bucket vdi           v                                                      v
> + * +-----------------+--+---------------------------+--------------------------+
> + * |name: coly/fruit |..|kv_onode_hdr (name: banana)|onode_extent: start, count|
> + * +-----------------+--+---------------------------+--------------------------+
> + *                                                                  /
> + *                                                                 /
> + *                                                     ------------
> + *                                                    /
> + *		     data_vid                        v
> + *                   +---------------------------+---+-----------------+
> + *                   |name: coly/fruit/allocator |...|       data      |
> + *                   +---------------------------+---+-----------------+
> + */
> +
>  /* Object operations */
>  
>  /* 4 KB header of kv object index node */
> @@ -691,24 +746,71 @@ struct kv_onode {
>  	struct kv_onode_hdr hdr;
>  	union {
>  		uint8_t data[SD_DATA_OBJ_SIZE - sizeof(struct kv_onode_hdr)];
> -		struct onode_extent *o_extent;
> +		struct onode_extent o_extent[0];
>  	};
>  };
>  
> +typedef void (*list_object_cb)(struct http_request *req, const char *bucket,
> +			       const char *object, void *opaque);
> +
> +struct list_objects_arg {
> +	struct http_request *req;
> +	void *opaque;
> +	const char *bucket;
> +	list_object_cb cb;
> +	uint32_t object_counter;
> +};
> +
> +static void list_objects_cb(void *data, enum btree_node_type type, void *arg)
> +{
> +	struct sd_extent *ext;
> +	struct list_objects_arg *loarg = arg;
> +	struct kv_onode *onode = NULL;
> +	uint64_t oid;
> +	int ret;
> +
> +	if (type == BTREE_EXT) {
> +		ext = (struct sd_extent *)data;
> +		if (!ext->vdi_id)
> +			goto out;
> +
> +		onode = xmalloc(SD_DATA_OBJ_SIZE);
> +
> +		oid = vid_to_data_oid(ext->vdi_id, ext->idx);
> +		ret = sd_read_object(oid, (char *)onode, SD_DATA_OBJ_SIZE, 0);
> +		if (ret != SD_RES_SUCCESS) {
> +			sd_err("Failed to read data object %lx", oid);
> +			goto out;
> +		}
> +
> +		if (onode->hdr.name[0] == '\0')
> +			goto out;
> +		if (loarg->cb)
> +			loarg->cb(loarg->req, loarg->bucket, onode->hdr.name,
> +				  loarg->opaque);
> +		loarg->object_counter++;
> +	}
> +out:
> +	free(onode);
> +}
> +
>  #define KV_ONODE_INLINE_SIZE (SD_DATA_OBJ_SIZE - sizeof(struct kv_onode_hdr))
>  
> -static int kv_create_inlined_object(struct sd_inode *inode,
> -				    struct kv_onode *onode,
> -				    uint32_t vid, uint32_t idx,
> -				    bool overwrite)
> +static int kv_write_onode(struct sd_inode *inode, struct kv_onode *onode,
> +			  uint32_t vid, uint32_t idx, bool overwrite)
>  {
> -	uint64_t oid = vid_to_data_oid(vid, idx);
> +	uint64_t oid = vid_to_data_oid(vid, idx), len;
>  	int ret;
>  
> +	if (onode->hdr.inlined)
> +		len = onode->hdr.size;
> +	else
> +		len = sizeof(struct onode_extent) * onode->hdr.nr_extent;
> +
>  	if (overwrite) {
>  		sd_info("overwrite object %s", onode->hdr.name);
>  		ret = sd_write_object(oid, (char *)onode,
> -				      sizeof(onode->hdr) + onode->hdr.size,
> +				      sizeof(onode->hdr) + len,
>  				      0, false);
>  		if (ret != SD_RES_SUCCESS) {
>  			sd_err("failed to write object, %" PRIx64, oid);
> @@ -716,7 +818,7 @@ static int kv_create_inlined_object(struct sd_inode *inode,
>  		}
>  	} else {
>  		ret = sd_write_object(oid, (char *)onode,
> -				      sizeof(onode->hdr) + onode->hdr.size,
> +				      sizeof(onode->hdr) + len,
>  				      0, true);
>  		if (ret != SD_RES_SUCCESS) {
>  			sd_err("failed to create object, %" PRIx64, oid);
> @@ -735,13 +837,6 @@ out:
>  	return ret;
>  }
>  
> -static int kv_create_extented_object(struct sd_inode *inode,
> -				     struct kv_onode *onode,
> -				     uint32_t vid, uint32_t idx)
> -{
> -	return SD_RES_SUCCESS;
> -}
> -
>  /*
>   * Create the object if the index isn't taken. Overwrite the object if it exists
>   * Return SD_RES_OBJ_TAKEN if the index is taken by other object.
> @@ -778,48 +873,112 @@ static int do_kv_create_object(struct http_request *req,
>  			goto out;
>  		}
>  	}
> -	if (onode->hdr.inlined)
> -		ret = kv_create_inlined_object(inode, onode, vid, idx,
> -					       !!tmp_vid);
> -	else
> -		ret = kv_create_extented_object(inode, onode, vid, idx);
> +
> +	ret = kv_write_onode(inode, onode, vid, idx, !!tmp_vid);
> +	if (ret != SD_RES_SUCCESS)
> +		sd_err("Failed to write onode");
>  out:
>  	free(inode);
>  	return ret;
>  }
>  
> -int kv_create_object(struct http_request *req, const char *bucket,
> -		     const char *name)
> +int kv_create_object(struct http_request *req, const char *account,
> +		     const char *bucket, const char *name)
>  {
>  	struct kv_onode *onode;
> -	ssize_t size;
> +	ssize_t size, total_size = 0;
>  	int ret;
> -	uint64_t hval;
> -	uint32_t vid;
> +	uint64_t hval, start = 0, count, block, limit;
> +	uint32_t vid, data_vid;
>  	struct timeval tv;
> +	char vdi_name[SD_MAX_VDI_LEN];
> +	char *data_buf = NULL;
>  
> -	ret = lookup_bucket(req, bucket, &vid);
> +	snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s", account, bucket);
> +	ret = lookup_bucket(req, vdi_name, &vid);
> +	if (ret < 0)
> +		return ret;
> +
> +	snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator", account, bucket);
> +	ret = lookup_bucket(req, vdi_name, &data_vid);
>  	if (ret < 0)
>  		return ret;
>  
>  	onode = xzalloc(sizeof(*onode));
>  
> +	/* for inlined onode */
> +	if (req->data_length <= KV_ONODE_INLINE_SIZE) {
> +		onode->hdr.inlined = 1;
> +		size = http_request_read(req, onode->data, sizeof(onode->data));
> +		if (size < 0) {
> +			sd_err("%s: bucket %s, object %s", sd_strerror(ret),
> +			       bucket, name);
> +			http_response_header(req, INTERNAL_SERVER_ERROR);
> +			ret = -1;
> +			goto out;
> +		}
> +		total_size = size;
> +	} else {
> +		sd_debug("data_length: %lu, %lu", req->data_length,
> +			 SD_DATA_OBJ_SIZE);
> +		count = (req->data_length + SD_DATA_OBJ_SIZE + 1) /
> +			SD_DATA_OBJ_SIZE;
> +		ret = oalloc_new_prepare(data_vid, &start, count);
> +		if (ret != SD_RES_SUCCESS) {
> +			sd_err("Failed to prepare allocation of %lu bytes!",
> +			       req->data_length);
> +			ret = -1;
> +			goto out;
> +		}
> +
> +		/* receive and write data at first, then write onode */
> +		data_buf = xmalloc(SD_DATA_OBJ_SIZE);
> +
> +		sd_debug("start: %lu, count: %lu", start, count);
> +		for (block = start, limit = start + count;
> +		     block < limit; block++) {
> +			sd_debug("block: %lu, limit: %lu", block, limit);
> +			size = http_request_read(req, data_buf,
> +						 SD_DATA_OBJ_SIZE);
> +			total_size += size;
> +			ret = sd_write_object(vid_to_data_oid(data_vid, block),
> +					      data_buf, size, 0, true);
> +			if (ret != SD_RES_SUCCESS) {
> +				sd_err("Failed to write data object for %"
> +				       PRIx32" %s", data_vid, sd_strerror(ret));
> +				goto out;
> +			}
> +			if (size < SD_DATA_OBJ_SIZE)
> +				break;
> +		}
> +
> +		sd_debug("DATA_LENGTH: %lu, total size: %lu, last blocks: %lu",
> +			 req->data_length, total_size, start);
> +
> +		sd_debug("finish start: %lu, count: %lu", start, count);
> +		ret = oalloc_new_finish(data_vid, start, count);
> +		if (ret != SD_RES_SUCCESS) {
> +			sd_err("Failed to finish allocation of %lu bytes!",
> +			       req->data_length);
> +			ret = -1;
> +			goto out;
> +		}
> +
> +		onode->o_extent[0].vdi = data_vid;

It seems that onode_extent.vdi can be removed because of onode->hdr.data_vid.

I also get a compile warning:

  CC     corosync.o
  CC     zookeeper.o
Built sheep
http/kv.c: In function ‘kv_delete_bucket’:
http/kv.c:422:6: warning: ‘buf’ may be used uninitialized in this function [-Wmaybe-uninitialized]
http/kv.c:319:8: note: ‘buf’ was declared here
  char *buf;
        ^
  CCLD   sheep

Thanks
Yuan



More information about the sheepdog mailing list