[sheepdog] [PATCH v2] sheep/http: fix parallely append-upload problem

Robin Dong robin.k.dong at gmail.com
Fri Sep 19 08:30:50 CEST 2014


From: Robin Dong <sanbai at taobao.com>

The 'append' opreation in sheepdog contains:

1. lock(data_vid)
2. read onode
3. allocate space and set o_extent.data_len to zero
4. write onode down
5. unlock(data_vid)
6. read uploading data and set o_extent.data_len in terms of req->data_length
7. write onode down

We unlock(data_vid) too early to protect the updating of 'o_extent.data_len'
so one process may overwrite the space allocated by another process.
To solve this problem, we move set operation of 'data_len' into lock region
and modify the code to write data_vid.

Signed-off-by: Robin Dong <sanbai at taobao.com>
---
v2-->v1:
  1. remove faulty modification of md.c

 sheep/http/kv.c | 106 ++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 64 insertions(+), 42 deletions(-)

diff --git a/sheep/http/kv.c b/sheep/http/kv.c
index 04ee462..4611e05 100644
--- a/sheep/http/kv.c
+++ b/sheep/http/kv.c
@@ -727,8 +727,11 @@ static int onode_allocate_extents(struct kv_onode *onode,
 		 * if we can put whole request data into extra space of last
 		 * o_extent, it don't need to allocate new extent.
 		 */
-		if (req->data_length <= reserv_len)
+		if (req->data_length <= reserv_len) {
+			onode->o_extent[idx - 1].data_len += req->data_length;
 			goto out;
+		} else
+			onode->o_extent[idx - 1].data_len += reserv_len;
 	}
 	count = DIV_ROUND_UP((req->data_length - reserv_len), SD_DATA_OBJ_SIZE);
 	sys->cdrv->lock(data_vid);
@@ -751,52 +754,22 @@ static int onode_allocate_extents(struct kv_onode *onode,
 
 	onode->o_extent[idx].start = start;
 	onode->o_extent[idx].count = count;
-	onode->o_extent[idx].data_len = 0;
+	onode->o_extent[idx].data_len = req->data_length - reserv_len;
 	onode->nr_extent++;
 out:
 	return ret;
 }
 
-static int onode_populate_extents(struct kv_onode *onode,
-				  struct http_request *req)
+static int do_vdi_write(struct http_request *req, uint32_t data_vid,
+			uint64_t offset, uint64_t total, char *data_buf,
+			bool create)
 {
-	struct onode_extent *ext;
-	struct onode_extent *last_ext = onode->o_extent + onode->nr_extent - 1;
-	ssize_t size;
-	uint64_t done = 0, total, offset = 0, reserv_len;
-	uint64_t write_buffer_size = MIN(kv_rw_buffer, req->data_length);
+	uint64_t done = 0, size;
 	int ret = SD_RES_SUCCESS;
-	char *data_buf = NULL;
-	uint32_t data_vid = onode->data_vid;
-	bool create = true;
-
-	data_buf = xmalloc(write_buffer_size);
-	if (last_ext->data_len == 0 && onode->nr_extent == 1) {
-		offset = last_ext->start * SD_DATA_OBJ_SIZE +
-			 last_ext->data_len;
-		last_ext->data_len += req->data_length;
-	} else if (last_ext->data_len > 0) {
-		offset = last_ext->start * SD_DATA_OBJ_SIZE +
-			 last_ext->data_len;
-		last_ext->data_len += req->data_length;
-		create = false;
-	} else {
-		ext = last_ext - 1;
-		reserv_len = ext->count * SD_DATA_OBJ_SIZE - ext->data_len;
-		offset = ext->start * SD_DATA_OBJ_SIZE + ext->data_len;
-		ext->data_len += reserv_len;
-		last_ext->data_len = req->data_length - reserv_len;
-		/*
-		 * if the previous oid has extra space, we don't need
-		 * to use SD_OP_CREATE_AND_WRITE_OBJ on this oid.
-		 */
-		if (reserv_len > 0)
-			create = false;
-	}
 
-	total = req->data_length;
 	while (done < total) {
-		size = http_request_read(req, data_buf, write_buffer_size);
+		size = http_request_read(req, data_buf,
+					 MIN(kv_rw_buffer, total - done));
 		if (size <= 0) {
 			sd_err("Failed to read http request: %ld", size);
 			ret = SD_RES_EIO;
@@ -804,17 +777,66 @@ static int onode_populate_extents(struct kv_onode *onode,
 		}
 		ret = vdi_read_write(data_vid, data_buf, size, offset,
 				     false, create);
-		sd_debug("vdi_write size: %"PRIu64", offset: %"
-			 PRIu64", ret:%d", size, offset, ret);
+		sd_debug("vdi_write offset: %"PRIu64", size: %" PRIu64
+			 ", for %" PRIx32 "ret: %d", offset, size,
+			 data_vid, ret);
 		if (ret != SD_RES_SUCCESS) {
-			sd_err("Failed to write data object for %s, %s",
-			       onode->name, sd_strerror(ret));
+			sd_err("Failed to write data object for %" PRIx32
+			       ", %s", data_vid, sd_strerror(ret));
 			goto out;
 		}
 		done += size;
 		offset += size;
 	}
 out:
+	return ret;
+}
+
+static int onode_populate_extents(struct kv_onode *onode,
+				  struct http_request *req)
+{
+	struct onode_extent *ext;
+	struct onode_extent *last_ext = onode->o_extent + onode->nr_extent - 1;
+	uint64_t total, offset = 0, reserv_len;
+	uint64_t write_buffer_size = MIN(kv_rw_buffer, req->data_length);
+	int ret = SD_RES_SUCCESS;
+	char *data_buf = NULL;
+	uint32_t data_vid = onode->data_vid;
+	bool create = true;
+
+	data_buf = xmalloc(write_buffer_size);
+
+	if (last_ext->data_len < req->data_length) {
+		ext = last_ext - 1;
+		reserv_len = (req->data_length - last_ext->data_len);
+		offset = (ext->start + ext->count) * SD_DATA_OBJ_SIZE -
+			 reserv_len;
+		ret = do_vdi_write(req, data_vid, offset, reserv_len,
+				   data_buf, false);
+		if (ret != SD_RES_SUCCESS) {
+			sd_err("Failed to do_vdi_write data_vid: %" PRIx32
+			       ", offset: %" PRIx64 ", total: %" PRIx64
+			       ", ret: %s", data_vid, offset, reserv_len,
+			       sd_strerror(ret));
+			goto out;
+		}
+		offset = last_ext->start * SD_DATA_OBJ_SIZE;
+		total = last_ext->data_len;
+	} else {
+		reserv_len = (last_ext->data_len - req->data_length);
+		offset = last_ext->start * SD_DATA_OBJ_SIZE + reserv_len;
+		total = req->data_length;
+		if (last_ext->data_len > req->data_length)
+			create = false;
+	}
+
+	ret = do_vdi_write(req, data_vid, offset, total, data_buf, create);
+	if (ret != SD_RES_SUCCESS)
+		sd_err("Failed to do_vdi_write data_vid: %" PRIx32
+		       ", offset: %" PRIx64 ", total: %" PRIx64
+		       ", ret: %s", data_vid, offset, total,
+		       sd_strerror(ret));
+out:
 	free(data_buf);
 	return ret;
 }
-- 
1.9.1




More information about the sheepdog mailing list