[sheepdog] [PATCH v2 08/11] erasure: add basic read/write code proper
Liu Yuan
namei.unix at gmail.com
Thu Sep 26 09:25:45 CEST 2013
Instead of storing whole copies on each replica, erasure coding spreads the data
across all the replicas, achieving the same fault tolerance while reducing the
redundancy to a minimum.
In a simple test on my box, aligned 4k writes are up to 1.5x faster than replication,
while reads are 1.15x faster, compared with copies=3 (4:2 scheme)
For a 6-node cluster with a 1000Mb/s NIC, I got the following results:
replication (3 copies): write 36.5 MB/s, read 71.8 MB/s
erasure code (4:2) : write 46.6 MB/s, read 82.9 MB/s
/*
* Stripe: data strips + parity strips, spread on all replica
* DS: data strip
* PS: parity strip
* R: Replica
*
* +--------------------stripe ----------------------+
* v v
* +----+----------------------------------------------+
* | ds | ds | ds | ds | ds | ... | ps | ps | ... | ps |
* +----+----------------------------------------------+
* | .. | .. | .. | .. | .. | ... | .. | .. | ... | .. |
* +----+----+----+----+----+ ... +----+----+-----+----+
* R1 R2 R3 R4 R5 ... Rn Rn+1 Rn+2 Rn+3
*/
We use the replicas to hold the data and parity strips. Suppose we have a
4:2 scheme — 4 data strips and 2 parity strips on 6 replicas, with a strip size of 1k —
then we generate 2k of parity for each 4k write; we call this 6k unit as a whole a
stripe. For writes, we spread the data horizontally, not vertically as replication
does. So for reads, we have to assemble the stripe from all the data
replicas.
The downsides of erasure coding are:
1. for recovery, we have to recover 0.5x more data
2. if any replica fails, reads must wait for its recovery.
3. it needs at least 6 (4+2) nodes to work.
Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
sheep/gateway.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++++--
sheep/plain_store.c | 11 ++++-
sheep/sheep_priv.h | 1 +
3 files changed, 130 insertions(+), 6 deletions(-)
diff --git a/sheep/gateway.c b/sheep/gateway.c
index 55d18fe..3684140 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -47,13 +47,124 @@ static struct req_iter *prepare_replication_requests(struct request *req,
return reqs;
}
+/*
+ * We spread the data strips of req along with their parity strips onto the
+ * replicas for a write operation. For read we only need to prepare the data
+ * strip buffers.
+ *
+ * Returns an array of *nr per-replica request iterators; the caller releases
+ * the buffers and the array via finish_requests().
+ */
+static struct req_iter *prepare_erasure_requests(struct request *req, int *nr)
+{
+ void *data = req->data;
+ uint32_t len = req->rq.data_length;
+ uint64_t off = req->rq.obj.offset;
+ int opcode = req->rq.opcode;
+ /* First and one-past-last stripe index touched by [off, off + len) */
+ int start = off / SD_EC_D_SIZE;
+ int end = DIV_ROUND_UP(off + len, SD_EC_D_SIZE), i, j;
+ int nr_stripe = end - start;
+ struct fec *ctx = ec_init();
+ /* Reads touch only the data strips; writes also send parity strips */
+ int nr_to_send = (opcode == SD_OP_READ_OBJ) ? SD_EC_D : SD_EC_DP;
+ struct req_iter *reqs = xzalloc(sizeof(*reqs) * nr_to_send);
+ char *p, *buf = NULL;
+
+ sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32,
+ start, end, nr_to_send, off, len);
+
+ /* One strip buffer per replica, covering every touched stripe */
+ *nr = nr_to_send;
+ for (i = 0; i < nr_to_send; i++) {
+ int l = SD_EC_STRIP_SIZE * nr_stripe;
+
+ reqs[i].buf = xmalloc(l);
+ reqs[i].dlen = l;
+ reqs[i].off = start * SD_EC_STRIP_SIZE;
+ switch (opcode) {
+ case SD_OP_CREATE_AND_WRITE_OBJ:
+ case SD_OP_WRITE_OBJ:
+ reqs[i].wlen = l;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (opcode != SD_OP_WRITE_OBJ && opcode != SD_OP_CREATE_AND_WRITE_OBJ)
+ goto out; /* Read and remove operation */
+
+ /*
+ * Stage the (possibly unaligned) user data into a zeroed, stripe-aligned
+ * scratch buffer, then slice each stripe into data strips and encode the
+ * parity strips with the fec context.
+ */
+ p = buf = xzalloc(SD_EC_D_SIZE * nr_stripe);
+ memcpy(buf + off % SD_EC_D_SIZE, data, len);
+ for (i = 0; i < nr_stripe; i++) {
+ const uint8_t *ds[SD_EC_D];
+ uint8_t *ps[SD_EC_P];
+
+ for (j = 0; j < SD_EC_D; j++)
+ ds[j] = reqs[j].buf + SD_EC_STRIP_SIZE * i;
+
+ for (j = 0; j < SD_EC_P; j++)
+ ps[j] = reqs[SD_EC_D + j].buf + SD_EC_STRIP_SIZE * i;
+
+ /* Copy this stripe's data into the per-replica strip buffers */
+ for (j = 0; j < SD_EC_D; j++)
+ memcpy((uint8_t *)ds[j], p + j * SD_EC_STRIP_SIZE,
+ SD_EC_STRIP_SIZE);
+ ec_encode(ctx, ds, ps);
+ p += SD_EC_D_SIZE;
+ }
+out:
+ ec_destroy(ctx);
+ free(buf); /* free(NULL) is a no-op on the read/remove path */
+
+ return reqs;
+}
+
+/*
+ * An object is erasure coded when it is not a vdi object and its vdi's copy
+ * policy is non-zero.  NOTE(review): assumes copy_policy == 0 means plain
+ * replication — confirm against the vdi creation path.
+ */
+bool is_erasure_object(uint64_t oid)
+{
+ return !is_vdi_obj(oid) && get_vdi_copy_policy(oid_to_vid(oid)) != 0;
+}
+
+/*
+ * Prepare a request iterator and buffer for each replica, dispatching to the
+ * erasure or replication variant based on the object's copy policy.
+ */
static struct req_iter *prepare_requests(struct request *req, int *nr)
{
- return prepare_replication_requests(req, nr);
+ uint64_t oid = req->rq.obj.oid;
+
+ if (is_erasure_object(oid))
+ return prepare_erasure_requests(req, nr);
+ else
+ return prepare_replication_requests(req, nr);
}
-static void finish_requests(struct req_iter *reqs)
+/*
+ * Tear down the iterators built by prepare_requests().  For an erasure-coded
+ * read, first re-assemble the user's data from the per-replica strip buffers
+ * into req->data; then free the per-replica buffers and the array itself.
+ */
+static void finish_requests(struct request *req, struct req_iter *reqs,
+ int nr_to_send)
{
+ uint64_t oid = req->rq.obj.oid;
+ uint32_t len = req->rq.data_length;
+ uint64_t off = req->rq.obj.offset;
+ int opcode = req->rq.opcode;
+ /* Same stripe-range computation as prepare_erasure_requests() */
+ int start = off / SD_EC_D_SIZE;
+ int end = DIV_ROUND_UP(off + len, SD_EC_D_SIZE), i, j;
+ int nr_stripe = end - start;
+
+ /* Replication requests have no per-replica buffers to release */
+ if (!is_erasure_object(oid))
+ goto out;
+
+ sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32,
+ start, end, nr_to_send, off, len);
+
+ /* We need to assemble the data strips into the req buffer for read */
+ if (opcode == SD_OP_READ_OBJ) {
+ char *p, *buf = xmalloc(SD_EC_D_SIZE * nr_stripe);
+
+ /* Interleave the strip buffers back into stripe order */
+ p = buf;
+ for (i = 0; i < nr_stripe; i++) {
+ for (j = 0; j < nr_to_send; j++) {
+ memcpy(p, reqs[j].buf + SD_EC_STRIP_SIZE * i,
+ SD_EC_STRIP_SIZE);
+ p += SD_EC_STRIP_SIZE;
+ }
+ }
+ /* Copy out only the unaligned window the caller asked for */
+ memcpy(req->data, buf + off % SD_EC_D_SIZE, len);
+ req->rp.data_length = req->rq.data_length;
+ free(buf);
+ }
+ for (i = 0; i < nr_to_send; i++)
+ free(reqs[i].buf);
+out:
free(reqs);
}
@@ -335,16 +446,21 @@ static int gateway_forward_request(struct request *req)
err_ret = ret;
}
- finish_requests(reqs);
+ finish_requests(req, reqs, nr_to_send);
return err_ret;
}
int gateway_read_obj(struct request *req)
{
+ uint64_t oid = req->rq.obj.oid;
+
if (!bypass_object_cache(req))
return object_cache_handle_request(req);
- return gateway_replication_read(req);
+ /*
+ * Erasure-coded reads must gather data strips from several nodes, so
+ * they take the forwarding path instead of the single-copy read path.
+ */
+ if (is_erasure_object(oid))
+ return gateway_forward_request(req);
+ else
+ return gateway_replication_read(req);
}
int gateway_write_obj(struct request *req)
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index b363402..af9ecbf 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -293,6 +293,13 @@ int prealloc(int fd, uint32_t size)
return 0;
}
+/*
+ * On-disk size to preallocate for an object in the backend store.
+ * NOTE(review): SD_EC_OBJECT_SIZE is presumably the per-node strip total for
+ * an erasure-coded object (smaller than the logical object) — confirm its
+ * definition.
+ */
+static size_t get_store_objsize(uint64_t oid)
+{
+ if (is_erasure_object(oid))
+ return SD_EC_OBJECT_SIZE;
+ return get_objsize(oid);
+}
+
int default_create_and_write(uint64_t oid, const struct siocb *iocb)
{
char path[PATH_MAX], tmp_path[PATH_MAX];
@@ -331,8 +338,8 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
return err_to_sderr(path, oid, errno);
}
- if (iocb->offset != 0 || iocb->length != get_objsize(oid)) {
- ret = prealloc(fd, get_objsize(oid));
+ if (iocb->offset != 0 || iocb->length != get_store_objsize(oid)) {
+ ret = prealloc(fd, get_store_objsize(oid));
if (ret < 0) {
ret = err_to_sderr(path, oid, errno);
goto out;
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index fab25b5..d1d2209 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -401,6 +401,7 @@ int gateway_read_obj(struct request *req);
int gateway_write_obj(struct request *req);
int gateway_create_and_write_obj(struct request *req);
int gateway_remove_obj(struct request *req);
+bool is_erasure_object(uint64_t oid);
/* backend store */
int peer_read_obj(struct request *req);
--
1.7.9.5
More information about the sheepdog
mailing list