[sheepdog] [PATCH] sheep/http: fix parallely append-upload problem

Hitoshi Mitake mitake.hitoshi at lab.ntt.co.jp
Fri Sep 19 07:39:22 CEST 2014


At Fri, 19 Sep 2014 13:31:33 +0800,
Robin Dong wrote:
> 
> From: Robin Dong <sanbai at taobao.com>
> 
> The 'append' opreation in sheepdog contains:
> 
> 1. lock(data_vid)
> 2. read onode
> 3. allocate space and set o_extent.data_len to zero
> 4. write onode down
> 5. unlock(data_vid)
> 6. read uploading data and set o_extent.data_len in terms of req->data_length
> 7. write onode down
> 
> We unlock(data_vid) too early to protect the updating of 'o_extent.data_len'
> so one process may overwrite the space allocated by another process.
> To solve this problem, we move set operation of 'data_len' into lock region
> and modify the code to write data_vid.
> 
> Signed-off-by: Robin Dong <sanbai at taobao.com>
> ---
>  sheep/http/kv.c | 106 ++++++++++++++++++++++++++++++++++----------------------
>  sheep/md.c      |  13 ++++---

The change for md.c is already pushed to the master branch. Could you
remove it? You can push it to master directly because the change only
affects http components after removing the change for md.c.

Thanks,
Hitoshi

>  2 files changed, 72 insertions(+), 47 deletions(-)
> 
> diff --git a/sheep/http/kv.c b/sheep/http/kv.c
> index 04ee462..4611e05 100644
> --- a/sheep/http/kv.c
> +++ b/sheep/http/kv.c
> @@ -727,8 +727,11 @@ static int onode_allocate_extents(struct kv_onode *onode,
>  		 * if we can put whole request data into extra space of last
>  		 * o_extent, it don't need to allocate new extent.
>  		 */
> -		if (req->data_length <= reserv_len)
> +		if (req->data_length <= reserv_len) {
> +			onode->o_extent[idx - 1].data_len += req->data_length;
>  			goto out;
> +		} else
> +			onode->o_extent[idx - 1].data_len += reserv_len;
>  	}
>  	count = DIV_ROUND_UP((req->data_length - reserv_len), SD_DATA_OBJ_SIZE);
>  	sys->cdrv->lock(data_vid);
> @@ -751,52 +754,22 @@ static int onode_allocate_extents(struct kv_onode *onode,
>  
>  	onode->o_extent[idx].start = start;
>  	onode->o_extent[idx].count = count;
> -	onode->o_extent[idx].data_len = 0;
> +	onode->o_extent[idx].data_len = req->data_length - reserv_len;
>  	onode->nr_extent++;
>  out:
>  	return ret;
>  }
>  
> -static int onode_populate_extents(struct kv_onode *onode,
> -				  struct http_request *req)
> +static int do_vdi_write(struct http_request *req, uint32_t data_vid,
> +			uint64_t offset, uint64_t total, char *data_buf,
> +			bool create)
>  {
> -	struct onode_extent *ext;
> -	struct onode_extent *last_ext = onode->o_extent + onode->nr_extent - 1;
> -	ssize_t size;
> -	uint64_t done = 0, total, offset = 0, reserv_len;
> -	uint64_t write_buffer_size = MIN(kv_rw_buffer, req->data_length);
> +	uint64_t done = 0, size;
>  	int ret = SD_RES_SUCCESS;
> -	char *data_buf = NULL;
> -	uint32_t data_vid = onode->data_vid;
> -	bool create = true;
> -
> -	data_buf = xmalloc(write_buffer_size);
> -	if (last_ext->data_len == 0 && onode->nr_extent == 1) {
> -		offset = last_ext->start * SD_DATA_OBJ_SIZE +
> -			 last_ext->data_len;
> -		last_ext->data_len += req->data_length;
> -	} else if (last_ext->data_len > 0) {
> -		offset = last_ext->start * SD_DATA_OBJ_SIZE +
> -			 last_ext->data_len;
> -		last_ext->data_len += req->data_length;
> -		create = false;
> -	} else {
> -		ext = last_ext - 1;
> -		reserv_len = ext->count * SD_DATA_OBJ_SIZE - ext->data_len;
> -		offset = ext->start * SD_DATA_OBJ_SIZE + ext->data_len;
> -		ext->data_len += reserv_len;
> -		last_ext->data_len = req->data_length - reserv_len;
> -		/*
> -		 * if the previous oid has extra space, we don't need
> -		 * to use SD_OP_CREATE_AND_WRITE_OBJ on this oid.
> -		 */
> -		if (reserv_len > 0)
> -			create = false;
> -	}
>  
> -	total = req->data_length;
>  	while (done < total) {
> -		size = http_request_read(req, data_buf, write_buffer_size);
> +		size = http_request_read(req, data_buf,
> +					 MIN(kv_rw_buffer, total - done));
>  		if (size <= 0) {
>  			sd_err("Failed to read http request: %ld", size);
>  			ret = SD_RES_EIO;
> @@ -804,17 +777,66 @@ static int onode_populate_extents(struct kv_onode *onode,
>  		}
>  		ret = vdi_read_write(data_vid, data_buf, size, offset,
>  				     false, create);
> -		sd_debug("vdi_write size: %"PRIu64", offset: %"
> -			 PRIu64", ret:%d", size, offset, ret);
> +		sd_debug("vdi_write offset: %"PRIu64", size: %" PRIu64
> +			 ", for %" PRIx32 "ret: %d", offset, size,
> +			 data_vid, ret);
>  		if (ret != SD_RES_SUCCESS) {
> -			sd_err("Failed to write data object for %s, %s",
> -			       onode->name, sd_strerror(ret));
> +			sd_err("Failed to write data object for %" PRIx32
> +			       ", %s", data_vid, sd_strerror(ret));
>  			goto out;
>  		}
>  		done += size;
>  		offset += size;
>  	}
>  out:
> +	return ret;
> +}
> +
> +static int onode_populate_extents(struct kv_onode *onode,
> +				  struct http_request *req)
> +{
> +	struct onode_extent *ext;
> +	struct onode_extent *last_ext = onode->o_extent + onode->nr_extent - 1;
> +	uint64_t total, offset = 0, reserv_len;
> +	uint64_t write_buffer_size = MIN(kv_rw_buffer, req->data_length);
> +	int ret = SD_RES_SUCCESS;
> +	char *data_buf = NULL;
> +	uint32_t data_vid = onode->data_vid;
> +	bool create = true;
> +
> +	data_buf = xmalloc(write_buffer_size);
> +
> +	if (last_ext->data_len < req->data_length) {
> +		ext = last_ext - 1;
> +		reserv_len = (req->data_length - last_ext->data_len);
> +		offset = (ext->start + ext->count) * SD_DATA_OBJ_SIZE -
> +			 reserv_len;
> +		ret = do_vdi_write(req, data_vid, offset, reserv_len,
> +				   data_buf, false);
> +		if (ret != SD_RES_SUCCESS) {
> +			sd_err("Failed to do_vdi_write data_vid: %" PRIx32
> +			       ", offset: %" PRIx64 ", total: %" PRIx64
> +			       ", ret: %s", data_vid, offset, reserv_len,
> +			       sd_strerror(ret));
> +			goto out;
> +		}
> +		offset = last_ext->start * SD_DATA_OBJ_SIZE;
> +		total = last_ext->data_len;
> +	} else {
> +		reserv_len = (last_ext->data_len - req->data_length);
> +		offset = last_ext->start * SD_DATA_OBJ_SIZE + reserv_len;
> +		total = req->data_length;
> +		if (last_ext->data_len > req->data_length)
> +			create = false;
> +	}
> +
> +	ret = do_vdi_write(req, data_vid, offset, total, data_buf, create);
> +	if (ret != SD_RES_SUCCESS)
> +		sd_err("Failed to do_vdi_write data_vid: %" PRIx32
> +		       ", offset: %" PRIx64 ", total: %" PRIx64
> +		       ", ret: %s", data_vid, offset, total,
> +		       sd_strerror(ret));
> +out:
>  	free(data_buf);
>  	return ret;
>  }
> diff --git a/sheep/md.c b/sheep/md.c
> index 378d1f1..0313b95 100644
> --- a/sheep/md.c
> +++ b/sheep/md.c
> @@ -541,11 +541,14 @@ static void md_do_recover(struct work *work)
>  out:
>  	sd_rw_unlock(&md.lock);
>  
> -	if (disk)
> -		update_node_disks();
> -
> -	if (nr > 0)
> -		kick_recover();
> +	if (disk) {
> +		if (nr > 0) {
> +			update_node_disks();
> +			kick_recover();
> +		} else {
> +			leave_cluster();
> +		}
> +	}
>  
>  	free(mw);
>  }
> -- 
> 1.9.1
> 
> -- 
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog



More information about the sheepdog mailing list