[sheepdog] [PATCH 8/9] erasure: add basic read/write code proper

Liu Yuan namei.unix at gmail.com
Thu Sep 19 12:42:52 CEST 2013


Instead of storing a full copy on every replica, erasure coding spreads the data
across all the replicas to achieve the same fault tolerance while reducing the
redundancy to a minimum.

In a simple test on my box, aligned 4k writes are up to 1.5x faster than with
replication, while read performance drops to 60%, compared with copies=3 (4:2
scheme).

/*
 * Stripe: data strips + parity strips, spread on all replica
 * DS: data strip
 * PS: parity strip
 * R: Replica
 *
 *  +--------------------stripe ----------------------+
 *  v                                                 v
 * +----+----------------------------------------------+
 * | ds | ds | ds | ds | ds | ... | ps | ps | ... | ps |
 * +----+----------------------------------------------+
 * | .. | .. | .. | .. | .. | ... | .. | .. | ... | .. |
 * +----+----+----+----+----+ ... +----+----+-----+----+
 *  R1    R2   R3   R4   R5   ...   Rn  Rn+1  Rn+2  Rn+3
 */

We use replicas to hold data and parity strips. Suppose we have a 4:2 scheme,
i.e. 4 data strips and 2 parity strips on 6 replicas, with strip size = 1k.
We then generate 2k of parity for each 4k write, and we call these 6k as a
whole a stripe. For writes, we spread the data horizontally across the
replicas, not vertically as replication does. So for reads, we have to assemble
the strips from all the data replicas; this is probably the reason why reads
get slower.

Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
 sheep/gateway.c     |  124 +++++++++++++++++++++++++++++++++++++++++++++++++--
 sheep/plain_store.c |   11 ++++-
 sheep/sheep_priv.h  |    1 +
 3 files changed, 130 insertions(+), 6 deletions(-)

diff --git a/sheep/gateway.c b/sheep/gateway.c
index 77fcb6a..5cf26f8 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -47,13 +47,124 @@ static struct req_iter *prepare_replication_requests(struct request *req,
 	return reqs;
 }
 
+/*
+ * We spread data strips of req along with its parity strips onto replica for
+ * write operation. For read we only need to prepare data strip buffers.
+ */
+static struct req_iter *prepare_erasure_requests(struct request *req, int *nr)
+{
+	void *data = req->data;
+	uint32_t len = req->rq.data_length;
+	uint64_t off = req->rq.obj.offset;
+	int opcode = req->rq.opcode;
+	int start = off / SD_EC_D_SIZE;
+	int end = DIV_ROUND_UP(off + len, SD_EC_D_SIZE), i, j;
+	int nr_stripe = end - start;
+	void *ctx = ec_init();
+	int nr_to_send = (opcode == SD_OP_READ_OBJ) ? SD_EC_D : SD_EC_DP;
+	struct req_iter *reqs = xzalloc(sizeof(*reqs) * nr_to_send);
+	char *p, *buf = NULL;
+
+	sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32,
+		 start, end, nr_to_send, off, len);
+
+	*nr = nr_to_send;
+	for (i = 0; i < nr_to_send; i++) {
+		int l = SD_EC_STRIP_SIZE * nr_stripe;
+
+		reqs[i].buf = xmalloc(l);
+		reqs[i].dlen = l;
+		reqs[i].off = start * SD_EC_STRIP_SIZE;
+		switch (opcode) {
+		case SD_OP_CREATE_AND_WRITE_OBJ:
+		case SD_OP_WRITE_OBJ:
+			reqs[i].wlen = l;
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (opcode != SD_OP_WRITE_OBJ && opcode != SD_OP_CREATE_AND_WRITE_OBJ)
+		goto out; /* Read and remove operation */
+
+	p = buf = xzalloc(SD_EC_D_SIZE * nr_stripe);
+	memcpy(buf + off % SD_EC_D_SIZE, data, len);
+	for (i = 0; i < nr_stripe; i++) {
+		const uint8_t *ds[SD_EC_D];
+		uint8_t *ps[SD_EC_P];
+
+		for (j = 0; j < SD_EC_D; j++)
+			ds[j] = reqs[j].buf + SD_EC_STRIP_SIZE * i;
+
+		for (j = 0; j < SD_EC_P; j++)
+			ps[j] = reqs[SD_EC_D + j].buf + SD_EC_STRIP_SIZE * i;
+
+		for (j = 0; j < SD_EC_D; j++)
+			memcpy((uint8_t *)ds[j], p + j * SD_EC_STRIP_SIZE,
+			       SD_EC_STRIP_SIZE);
+		ec_encode(ctx, ds, ps);
+		p += SD_EC_D_SIZE;
+	}
+out:
+	ec_destroy(ctx);
+	free(buf);
+
+	return reqs;
+}
+
+bool is_erasure_object(uint64_t oid)
+{
+	return !is_vdi_obj(oid) && get_vdi_copy_policy(oid_to_vid(oid)) != 0;
+}
+
+/* Prepare request iterator and buffer for each replica */
 static struct req_iter *prepare_requests(struct request *req, int *nr)
 {
-	return prepare_replication_requests(req, nr);
+	uint64_t oid = req->rq.obj.oid;
+
+	if (is_erasure_object(oid))
+		return prepare_erasure_requests(req, nr);
+	else
+		return prepare_replication_requests(req, nr);
 }
 
-static void finish_requests(struct req_iter *reqs)
+static void finish_requests(struct request *req, struct req_iter *reqs,
+			    int nr_to_send)
 {
+	uint64_t oid = req->rq.obj.oid;
+	uint32_t len = req->rq.data_length;
+	uint64_t off = req->rq.obj.offset;
+	int opcode = req->rq.opcode;
+	int start = off / SD_EC_D_SIZE;
+	int end = DIV_ROUND_UP(off + len, SD_EC_D_SIZE), i, j;
+	int nr_stripe = end - start;
+
+	if (!is_erasure_object(oid))
+		goto out;
+
+	sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32,
+		 start, end, nr_to_send, off, len);
+
+	/* We need to assemble the data strips into the req buffer for read */
+	if (opcode == SD_OP_READ_OBJ) {
+		char *p, *buf = xmalloc(SD_EC_D_SIZE * nr_stripe);
+
+		p = buf;
+		for (i = 0; i < nr_stripe; i++) {
+			for (j = 0; j < nr_to_send; j++) {
+				memcpy(p, reqs[j].buf + SD_EC_STRIP_SIZE * i,
+				       SD_EC_STRIP_SIZE);
+				p += SD_EC_STRIP_SIZE;
+			}
+		}
+		memcpy(req->data, buf + off % SD_EC_D_SIZE, len);
+		req->rp.data_length = req->rq.data_length;
+		free(buf);
+	}
+	for (i = 0; i < nr_to_send; i++)
+		free(reqs[i].buf);
+out:
 	free(reqs);
 }
 
@@ -343,16 +454,21 @@ static int gateway_forward_request(struct request *req)
 			err_ret = ret;
 	}
 
-	finish_requests(reqs);
+	finish_requests(req, reqs, nr_to_send);
 	return err_ret;
 }
 
 int gateway_read_obj(struct request *req)
 {
+	uint64_t oid = req->rq.obj.oid;
+
 	if (!bypass_object_cache(req))
 		return object_cache_handle_request(req);
 
-	return gateway_replication_read(req);
+	if (is_erasure_object(oid))
+		return gateway_forward_request(req);
+	else
+		return gateway_replication_read(req);
 }
 
 int gateway_write_obj(struct request *req)
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index b363402..6c7839b 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -293,6 +293,13 @@ int prealloc(int fd, uint32_t size)
 	return 0;
 }
 
+static size_t get_object_size(uint64_t oid)
+{
+	if (is_erasure_object(oid))
+		return SD_EC_OBJECT_SIZE;
+	return get_objsize(oid);
+}
+
 int default_create_and_write(uint64_t oid, const struct siocb *iocb)
 {
 	char path[PATH_MAX], tmp_path[PATH_MAX];
@@ -331,8 +338,8 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
 		return err_to_sderr(path, oid, errno);
 	}
 
-	if (iocb->offset != 0 || iocb->length != get_objsize(oid)) {
-		ret = prealloc(fd, get_objsize(oid));
+	if (iocb->offset != 0 || iocb->length != get_object_size(oid)) {
+		ret = prealloc(fd, get_object_size(oid));
 		if (ret < 0) {
 			ret = err_to_sderr(path, oid, errno);
 			goto out;
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index fab25b5..d1d2209 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -401,6 +401,7 @@ int gateway_read_obj(struct request *req);
 int gateway_write_obj(struct request *req);
 int gateway_create_and_write_obj(struct request *req);
 int gateway_remove_obj(struct request *req);
+bool is_erasure_object(uint64_t oid);
 
 /* backend store */
 int peer_read_obj(struct request *req);
-- 
1.7.9.5




More information about the sheepdog mailing list