[sheepdog] [PATCH v1 1/3] sheep/http: add APPEND operation for PUT request

Liu Yuan namei.unix at gmail.com
Fri Mar 14 07:41:42 CET 2014


On Thu, Mar 13, 2014 at 06:45:34PM +0800, Robin Dong wrote:
> From: Robin Dong <sanbai at taobao.com>
> 
> Users could add http headers 'FLAG: append' for PUT http request to upload
> file or stdout to the tail of an existed object (a object which has been
> appended is just ONODE_INIT), and after all append-operations, it cound use
> http headers 'FLAG: eof' to set the object to ONODE_COMPLETE.
> 
> Signed-off-by: Robin Dong <sanbai at taobao.com>
> ---
>  sheep/http/http.c  |   7 ++
>  sheep/http/http.h  |  10 ++-
>  sheep/http/kv.c    | 203 ++++++++++++++++++++++++++++++++++++++++-------------
>  sheep/http/s3.c    |   2 +-
>  sheep/http/swift.c |  12 +++-
>  5 files changed, 180 insertions(+), 54 deletions(-)
> 
> diff --git a/sheep/http/http.c b/sheep/http/http.c
> index 3027153..382937f 100644
> --- a/sheep/http/http.c
> +++ b/sheep/http/http.c
> @@ -202,6 +202,13 @@ static int request_init_operation(struct http_request *req)
>  		if (!strcmp("true", p))
>  			req->force = true;
>  	}
> +	p = FCGX_GetParam("HTTP_FLAG", env);
> +	if (p && p[0] != '\0') {
> +		if (!strcmp("append", p))
> +			req->append = true;
> +		else if (!strcmp("eof", p))
> +			req->eof = true;
> +	}
>  
>  	req->status = UNKNOWN;
>  
> diff --git a/sheep/http/http.h b/sheep/http/http.h
> index 06dfd5d..149f49d 100644
> --- a/sheep/http/http.h
> +++ b/sheep/http/http.h
> @@ -50,6 +50,8 @@ struct http_request {
>  	uint64_t data_length;
>  	uint64_t offset;
>  	bool force;
> +	bool append;
> +	bool eof;
>  };
>  
>  struct http_driver {
> @@ -140,9 +142,15 @@ int kv_iterate_bucket(const char *account,
>  		      void (*cb)(const char *bucket, void *opaque),
>  		      void *opaque);
>  
> +/* creat_flags for kv_create_object */
> +#define ONODE_APPEND	1	/* append data to the tail of onode */
> +#define ONODE_EOF	2	/* all data has been uploaded */
> +
>  /* Object operations */
> +int kv_complete_object(struct http_request *req, const char *account,
> +		       const char *bucket, const char *object);
>  int kv_create_object(struct http_request *req, const char *account,
> -		     const char *bucket, const char *object);
> +		     const char *bucket, const char *object, int creat_flags);
>  int kv_read_object(struct http_request *req, const char *account,
>  		   const char *bucket, const char *object);
>  int kv_read_object_meta(struct http_request *req, const char *account,
> diff --git a/sheep/http/kv.c b/sheep/http/kv.c
> index ff3bfd8..07196ab 100644
> --- a/sheep/http/kv.c
> +++ b/sheep/http/kv.c
> @@ -54,6 +54,19 @@ struct onode_extent {
>  #define ONODE_INIT	1	/* created and allocated space, but no data */
>  #define ONODE_COMPLETE	2	/* data upload complete */
>  
> +
> +/*
> + * Add 'append' operation for creating object.
> + * Users could add http headers 'FLAG: append' to PUT file or stdout to
> + * the tail of an existed object (a object which has been appended is
> + * just ONODE_INIT), and after all append-operations, it cound use http headers
> + * 'FLAG: eof' to set the object to ONODE_COMPLETE.
> + */
> +static inline bool is_append(int creat_flags)
> +{
> +	return (creat_flags & ONODE_APPEND) > 0;
> +}
> +
>  #define ONODE_HDR_SIZE  BLOCK_SIZE
>  
>  struct kv_onode {
> @@ -672,7 +685,7 @@ static int onode_allocate_extents(struct kv_onode *onode,
>  {
>  	uint64_t start = 0, count;
>  	int ret;
> -	uint32_t data_vid = onode->data_vid;
> +	uint32_t data_vid = onode->data_vid, idx;
>  
>  	count = DIV_ROUND_UP(req->data_length, SD_DATA_OBJ_SIZE);
>  	sys->cdrv->lock(data_vid);
> @@ -693,9 +706,10 @@ static int onode_allocate_extents(struct kv_onode *onode,
>  		goto out;
>  	}
>  
> -	onode->o_extent[0].start = start;
> -	onode->o_extent[0].count = count;
> -	onode->nr_extent = 1;
> +	idx = onode->nr_extent;
> +	onode->o_extent[idx].start = start;
> +	onode->o_extent[idx].count = count;
> +	onode->nr_extent++;
>  out:
>  	return ret;
>  }
> @@ -704,7 +718,7 @@ static int onode_populate_extents(struct kv_onode *onode,
>  				  struct http_request *req)
>  {
>  	ssize_t size;
> -	uint64_t start = onode->o_extent[0].start;
> +	uint64_t start = onode->o_extent[onode->nr_extent - 1].start;
>  	uint64_t done = 0, total, offset;
>  	uint64_t write_buffer_size = MIN(kv_rw_buffer, req->data_length);
>  	int ret = SD_RES_SUCCESS;
> @@ -745,33 +759,39 @@ static uint64_t get_seconds(void)
>  	return seconds;
>  }
>  
> -static int onode_allocate_data(struct kv_onode *onode, struct http_request *req)
> +static int onode_allocate_data(struct kv_onode *onode, struct http_request *req,
> +			       int creat_flags)
>  {
>  	int ret = SD_RES_SUCCESS;
>  
> -	if (req->data_length <= KV_ONODE_INLINE_SIZE)
> -		onode->inlined = 1;
> +	/* only 'created' object could be inlined */
> +	if (req->data_length <= KV_ONODE_INLINE_SIZE && !is_append(creat_flags))
> +			onode->inlined = 1;
>  	else {
>  		ret = onode_allocate_extents(onode, req);
>  		if (ret != SD_RES_SUCCESS)
>  			goto out;
>  	}
>  
> -	onode->ctime = get_seconds();
> -	onode->size = req->data_length;
> +	if (!is_append(creat_flags))
> +		onode->ctime = get_seconds();
> +	onode->size += req->data_length;
>  out:
>  	return ret;
>  }
>  
> -static int onode_populate_data(struct kv_onode *onode, struct http_request *req)
> +static int onode_populate_data(struct kv_onode *onode, struct http_request *req,
> +			       int creat_flags)
>  {
>  	ssize_t size;
>  	int ret = SD_RES_SUCCESS;
>  
>  	onode->mtime = get_seconds();
> -	onode->flags = ONODE_COMPLETE;
> +	if (!is_append(creat_flags))
> +		onode->flags = ONODE_COMPLETE;
>  
> -	if (req->data_length <= KV_ONODE_INLINE_SIZE) {
> +	if (req->data_length <= KV_ONODE_INLINE_SIZE &&
> +	    !is_append(creat_flags)) {
>  		size = http_request_read(req, onode->data, sizeof(onode->data));
>  		if (size < 0 || req->data_length != size) {
>  			sd_err("Failed to read from web server for %s",
> @@ -881,16 +901,22 @@ out:
>  static int onode_free_data(struct kv_onode *onode)
>  {
>  	uint32_t data_vid = onode->data_vid;
> -	int ret = SD_RES_SUCCESS;
> +	int ret = SD_RES_SUCCESS, i;
>  
>  	/* it don't need to free data for inlined onode */
>  	if (!onode->inlined) {
>  		sys->cdrv->lock(data_vid);
> -		ret = oalloc_free(data_vid, onode->o_extent[0].start,
> -				  onode->o_extent[0].count);
> +		for (i = 0; i < onode->nr_extent; i++) {
> +			ret = oalloc_free(data_vid, onode->o_extent[i].start,
> +					  onode->o_extent[i].count);
> +			if (ret != SD_RES_SUCCESS)
> +				sd_err("failed to free start: %"PRIu64
> +				       ", count: %"PRIu64", for %s",
> +				       onode->o_extent[i].start,
> +				       onode->o_extent[i].count,
> +				       onode->name);
> +		}
>  		sys->cdrv->unlock(data_vid);
> -		if (ret != SD_RES_SUCCESS)
> -			sd_err("failed to free %s", onode->name);
>  	}
>  	return ret;
>  }
> @@ -1073,64 +1099,141 @@ static int onode_delete(struct kv_onode *onode)
>  /* Create onode and allocate space for it */
>  static int onode_allocate_space(struct http_request *req, const char *account,
>  				uint32_t bucket_vid, const char *bucket,
> -				const char *name, struct kv_onode *onode)
> +				const char *name, struct kv_onode *onode,
> +				int creat_flags)
>  {
>  	char vdi_name[SD_MAX_VDI_LEN];
>  	uint32_t data_vid;
> +	uint64_t len;
>  	int ret = SD_RES_SUCCESS;
> +	bool object_exists;
>  
>  	sys->cdrv->lock(bucket_vid);
>  	ret = onode_lookup_nolock(onode, bucket_vid, name);
> -	if (ret == SD_RES_SUCCESS) {
> -		/* if the exists onode has not been uploaded complete */
> -		if (onode->flags != ONODE_COMPLETE) {
> -			ret = SD_RES_INCOMPLETE;
> -			sd_err("The exists onode %s is incomplete", name);
> -			goto out;
> -		}
> -		/* For overwrite, we delete old object and then create */
> -		ret = onode_delete(onode);
> -		if (ret != SD_RES_SUCCESS) {
> -			sd_err("Failed to delete exists object %s", name);
> +
> +	if (is_append(creat_flags)) {
> +		if (ret == SD_RES_SUCCESS && onode->flags == ONODE_COMPLETE) {
> +			/* Not allowed "append" to a COMPLETED onode */
> +			sd_err("Not allowed to append data to a COMPLETED"
> +			       " onode %s", onode->name);

error messages are seen by users, so please make it more user-friendly:
"failed to append data to the object %s, which is marked COMPLETE"

>  			goto out;
>  		}
> -		ret = bnode_update(account, bucket, onode->size, false);
> -		if (ret != SD_RES_SUCCESS) {
> -			sd_err("Failed to update bnode for %s", name);
> +	} else {
> +		if (ret == SD_RES_SUCCESS) {
> +			/* if the exists onode has not been uploaded complete */

if (onode->flags != ONODE_COMPLETE) this line say it clearly what this situation
is, so  please no this meaningless comment because your code say it clearly.

> +			if (onode->flags != ONODE_COMPLETE) {
> +				ret = SD_RES_INCOMPLETE;
> +				sd_err("The exists onode %s is incomplete",
> +				       name);
> +				goto out;
> +			}
> +			/*
> +			 * For overwrite, we delete old object
> +			 * and then create
> +			 */
> +			ret = onode_delete(onode);
> +			if (ret != SD_RES_SUCCESS) {
> +				sd_err("Failed to delete exists object %s",
> +				       name);
> +				goto out;
> +			}
> +			ret = bnode_update(account, bucket, onode->size, false);
> +			if (ret != SD_RES_SUCCESS) {
> +				sd_err("Failed to update bnode for %s", name);
> +				goto out;
> +			}
> +		} else if (ret != SD_RES_NO_OBJ) {
> +			sd_err("Failed to lookup onode %s %s", name,
> +			       sd_strerror(ret));
>  			goto out;
>  		}
> -	} else if (ret != SD_RES_NO_OBJ) {
> -		sd_err("Failed to lookup onode %s %s", name, sd_strerror(ret));
> -		goto out;
>  	}
>  
> +	object_exists = (ret == SD_RES_SUCCESS);
> +
>  	snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator", account, bucket);
>  	ret = sd_lookup_vdi(vdi_name, &data_vid);
>  	if (ret != SD_RES_SUCCESS)
>  		goto out;
>  
> -	memset(onode, 0, sizeof(*onode));
> -	pstrcpy(onode->name, sizeof(onode->name), name);
> -	onode->data_vid = data_vid;
> -	onode->flags = ONODE_INIT;
> +	if (!object_exists || !is_append(creat_flags)) {
> +		memset(onode, 0, sizeof(*onode));
> +		pstrcpy(onode->name, sizeof(onode->name), name);
> +		onode->data_vid = data_vid;
> +		onode->flags = ONODE_INIT;
> +	}
>  
> -	ret = onode_allocate_data(onode, req);
> +	ret = onode_allocate_data(onode, req, creat_flags);
>  	if (ret != SD_RES_SUCCESS) {
>  		sd_err("failed to write data for %s", name);
>  		goto out;
>  	}
>  
> -	ret = onode_create(onode, bucket_vid);
> +	if (!object_exists || !is_append(creat_flags)) {
> +		ret = onode_create(onode, bucket_vid);
> +		if (ret != SD_RES_SUCCESS) {
> +			/* free last allocated o_extent */
> +			int last_ext = onode->nr_extent;
> +			sys->cdrv->lock(data_vid);
> +			oalloc_free(data_vid, onode->o_extent[last_ext].start,
> +				    onode->o_extent[last_ext].count);
> +			sys->cdrv->unlock(data_vid);
> +			sd_err("failed to create onode for %s", name);
> +			goto out;
> +		}
> +
> +		ret = bnode_update(account, bucket, req->data_length, true);
> +		if (ret != SD_RES_SUCCESS) {
> +			sd_err("failed to update bucket for %s", name);
> +			onode_delete(onode);
> +			goto out;
> +		}
> +	} else {
> +		/* update new appended o_extent[] */
> +		len = sizeof(struct onode_extent) * onode->nr_extent;
>

I think we can't allocate new extent for every append operation blindly. Suppose
we append 1M data to the object, which is now 3M, we should simply append the 1M
to the tail of the allocated extent instead of creating a new one.

I think you should explain the algorithm for extent allocation for append-capable
object.

For e.g, how much space one extent should expand? Since append means add more
data in the objects step by step, we can expect users will write more and more
data until 'eof' is issued. So I'd suggest a extent space stratege like:

initally 64M for first allocation, and then x2 for each later append with a 2G
as upper limit.

But I am considering a simpler approach:

add a new SIZE: xxx header to indicate the real size and only allocate a single
extent for it.

Thanks
Yuan



More information about the sheepdog mailing list