[sheepdog] [PATCH 9/9] erasure: add unaligned write support

Liu Yuan namei.unix at gmail.com
Thu Sep 19 12:42:53 CEST 2013


An unaligned write is one where either 'offset' or 'offset + length' isn't
aligned to SD_EC_D_SIZE. In this case, we have to read the data stripes that
hold the unaligned part(s) before writing.
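
For illustration, here is a minimal standalone sketch of the alignment math
relied on below. The strip size and the request offsets are hypothetical
stand-ins, and round_down() is redefined locally so the snippet compiles on
its own; it is not the sheep code itself.

/* Illustration only: hypothetical 4 KiB strip standing in for SD_EC_D_SIZE */
#include <stdio.h>
#include <stdint.h>

#define STRIP_SIZE 4096ULL
#define round_down(x, a) (((x) / (a)) * (a))

int main(void)
{
	uint64_t off = 1000, len = 10000;   /* an unaligned request */
	uint64_t head = round_down(off, STRIP_SIZE);
	uint64_t tail = round_down(off + len, STRIP_SIZE);

	/* The head strip straddles the write start, so it must be read first */
	if (off % STRIP_SIZE)
		printf("read head strip at offset %llu\n",
		       (unsigned long long)head);

	/* The tail strip straddles the write end and is a different strip
	 * from the head, so it must be read first as well */
	if ((off + len) % STRIP_SIZE && tail - head > 0)
		printf("read tail strip at offset %llu\n",
		       (unsigned long long)tail);

	return 0;
}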

Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
 sheep/gateway.c |   66 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 63 insertions(+), 3 deletions(-)

diff --git a/sheep/gateway.c b/sheep/gateway.c
index 5cf26f8..968e7cb 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -47,13 +47,65 @@ static struct req_iter *prepare_replication_requests(struct request *req,
 	return reqs;
 }
 
+
+/*
+ * Make sure we don't overwrite the existing data for an unaligned write.
+ *
+ * If either the offset or the length of the request isn't aligned to
+ * SD_EC_D_SIZE, we have to read the unaligned blocks before writing. This
+ * write amplification slows down the write operation with extra read overhead.
+ */
+static void *init_erasure_buffer(struct request *req, int buf_len)
+{
+	char *buf = xzalloc(buf_len);
+	uint32_t len = req->rq.data_length;
+	uint64_t off = req->rq.obj.offset;
+	uint64_t oid = req->rq.obj.oid;
+	int opcode = req->rq.opcode;
+	struct sd_req hdr;
+	uint64_t head = round_down(off, SD_EC_D_SIZE);
+	uint64_t tail = round_down(off + len, SD_EC_D_SIZE);
+	int ret;
+
+	if (opcode != SD_OP_WRITE_OBJ)
+		goto out;
+
+	if (off % SD_EC_D_SIZE) {
+		/* Read head */
+		sd_init_req(&hdr, SD_OP_READ_OBJ);
+		hdr.obj.oid = oid;
+		hdr.data_length = SD_EC_D_SIZE;
+		hdr.obj.offset = head;
+		ret = exec_local_req(&hdr, buf);
+		if (ret != SD_RES_SUCCESS) {
+			free(buf);
+			return NULL;
+		}
+	}
+
+	if ((len + off) % SD_EC_D_SIZE && tail - head > 0) {
+		/* Read tail */
+		sd_init_req(&hdr, SD_OP_READ_OBJ);
+		hdr.obj.oid = oid;
+		hdr.data_length = SD_EC_D_SIZE;
+		hdr.obj.offset = tail;
+		ret = exec_local_req(&hdr, buf + tail - head);
+		if (ret != SD_RES_SUCCESS) {
+			free(buf);
+			return NULL;
+		}
+	}
+out:
+	memcpy(buf + off % SD_EC_D_SIZE, req->data, len);
+	return buf;
+}
+
 /*
  * We spread data strips of req along with its parity strips onto replica for
  * write operation. For read we only need to prepare data strip buffers.
  */
 static struct req_iter *prepare_erasure_requests(struct request *req, int *nr)
 {
-	void *data = req->data;
 	uint32_t len = req->rq.data_length;
 	uint64_t off = req->rq.obj.offset;
 	int opcode = req->rq.opcode;
@@ -88,8 +140,14 @@ static struct req_iter *prepare_erasure_requests(struct request *req, int *nr)
 	if (opcode != SD_OP_WRITE_OBJ && opcode != SD_OP_CREATE_AND_WRITE_OBJ)
 		goto out; /* Read and remove operation */
 
-	p = buf = xzalloc(SD_EC_D_SIZE * nr_stripe);
-	memcpy(buf + off % SD_EC_D_SIZE, data, len);
+	p = buf = init_erasure_buffer(req, SD_EC_D_SIZE * nr_stripe);
+	if (!buf) {
+		sd_err("failed to init erasure buffer %"PRIx64,
+		       req->rq.obj.oid);
+		free(reqs);
+		reqs = NULL;
+		goto out;
+	}
 	for (i = 0; i < nr_stripe; i++) {
 		const uint8_t *ds[SD_EC_D];
 		uint8_t *ps[SD_EC_P];
@@ -420,6 +478,8 @@ static int gateway_forward_request(struct request *req)
 	oid_to_nodes(oid, &req->vinfo->vroot, nr_copies, target_nodes);
 	forward_info_init(&fi, nr_copies);
 	reqs = prepare_requests(req, &nr_to_send);
+	if (!reqs)
+		return SD_RES_NETWORK_ERROR;
 
 	for (i = 0; i < nr_to_send; i++) {
 		struct sockfd *sfd;
-- 
1.7.9.5



