[sheepdog] [PATCH 1/4] erasure: make ec_helpers more generic
Liu Yuan
namei.unix at gmail.com
Thu Oct 17 14:17:53 CEST 2013
This allows us to pass the data and parity strip counts dynamically, which
makes different erasure coding schemes possible.
At most we support 7 parity strips, which means we can withstand up to 7 node
failures at the same time.
Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
include/fec.h | 41 +++++-----
lib/fec.c | 230 ++++++++++++++++++++++++++++--------------------------
sheep/gateway.c | 61 ++++++++-------
sheep/recovery.c | 9 ++-
4 files changed, 179 insertions(+), 162 deletions(-)
diff --git a/include/fec.h b/include/fec.h
index ff79edb..55e0bdb 100644
--- a/include/fec.h
+++ b/include/fec.h
@@ -61,16 +61,16 @@
struct fec {
unsigned long magic;
- unsigned short k, n; /* parameters of the code */
+ unsigned short d, dp; /* parameters of the code */
uint8_t *enc_matrix;
};
void init_fec(void);
/*
- * param k the number of blocks required to reconstruct
- * param m the total number of blocks created
+ * param d the number of blocks required to reconstruct
+ * param dp the total number of blocks created
*/
-struct fec *fec_new(unsigned short k, unsigned short m);
+struct fec *fec_new(unsigned short d, unsigned short dp);
void fec_free(struct fec *p);
/*
@@ -104,7 +104,6 @@ void fec_decode(const struct fec *code,
#define SD_EC_D 4 /* No. of data strips */
#define SD_EC_P 2 /* No. of parity strips */
#define SD_EC_DP (SD_EC_D + SD_EC_P)
-#define SD_EC_STRIP_SIZE (256)
/*
* SD_EC_D_SIZE <= 1K is the safe value to run VM after some experimentations.
@@ -115,10 +114,9 @@ void fec_decode(const struct fec *code,
* failed (grub got screwed) and 1K is probably the biggest value if we want
* VM to run on erasure coded volume.
*/
-#define SD_EC_D_SIZE (SD_EC_STRIP_SIZE * SD_EC_D)
+#define SD_EC_DATA_STRIPE_SIZE (1024) /* 1K */
#define SD_EC_OBJECT_SIZE (SD_DATA_OBJ_SIZE / SD_EC_D)
-#define SD_EC_STRIPE (SD_EC_STRIP_SIZE * SD_EC_DP)
-#define SD_EC_NR_STRIPE_PER_OBJECT (SD_EC_OBJECT_SIZE / SD_EC_STRIP_SIZE)
+#define SD_EC_NR_STRIPE_PER_OBJECT (SD_DATA_OBJ_SIZE / SD_EC_DATA_STRIPE_SIZE)
/*
* Stripe: data strips + parity strips, spread on all replica
@@ -127,19 +125,19 @@ void fec_decode(const struct fec *code,
* R: Replica
*
* +--------------------stripe ----------------------+
- * v v
- * +----+----------------------------------------------+
+ * v data stripe parity stripe v
+ * +----+----+----+----+----+-----+----+----+-----+----+
* | ds | ds | ds | ds | ds | ... | ps | ps | ... | ps |
- * +----+----------------------------------------------+
+ * +----+----+----+----+----+-----+----+----+-----+----+
* | .. | .. | .. | .. | .. | ... | .. | .. | ... | .. |
* +----+----+----+----+----+ ... +----+----+-----+----+
* R1 R2 R3 R4 R5 ... Rn Rn+1 Rn+2 Rn+3
*/
/* Return the erasure code context to encode|decode */
-static inline struct fec *ec_init(void)
+static inline struct fec *ec_init(int d, int dp)
{
- return fec_new(SD_EC_D, SD_EC_DP);
+ return fec_new(d, dp);
}
/*
@@ -148,13 +146,16 @@ static inline struct fec *ec_init(void)
* @ds: data strips to generate parity strips
* @ps: parity strips to return
*/
-static inline void ec_encode(struct fec *ctx, const uint8_t *ds[SD_EC_D],
- uint8_t *ps[SD_EC_P])
+static inline void ec_encode(struct fec *ctx, const uint8_t *ds[],
+ uint8_t *ps[])
{
- int total = SD_EC_D + SD_EC_P;
- const int pidx[SD_EC_P] = { total - 2, total - 1 };
+ int p = ctx->dp - ctx->d;
+ int pidx[p];
- fec_encode(ctx, ds, ps, pidx, SD_EC_P, SD_EC_STRIP_SIZE);
+ for (int i = 0; i < p; i++)
+ pidx[i] = ctx->d + i;
+
+ fec_encode(ctx, ds, ps, pidx, p, SD_EC_DATA_STRIPE_SIZE / ctx->d);
}
/*
@@ -166,8 +167,8 @@ static inline void ec_encode(struct fec *ctx, const uint8_t *ds[SD_EC_D],
* @output: the lost ds or ps to return
* @idx: index of output which is lost
*/
-void ec_decode(struct fec *ctx, const uint8_t *input[SD_EC_D],
- const int inidx[SD_EC_D],
+void ec_decode(struct fec *ctx, const uint8_t *input[],
+ const int inidx[],
uint8_t output[], int idx);
/* Destroy the erasure code context */
diff --git a/lib/fec.c b/lib/fec.c
index 6deafe4..05dffa0 100644
--- a/lib/fec.c
+++ b/lib/fec.c
@@ -251,18 +251,18 @@ static void _addmul1(register uint8_t *dst,
GF_ADDMULC(*dst, *src);
}
-/* computes C = AB where A is n*k, B is k*m, C is n*m */
-static void _matmul(uint8_t *a, uint8_t *b, uint8_t *c, unsigned n, unsigned k,
+/* computes C = AB where A is dp*d, B is d*m, C is dp*m */
+static void _matmul(uint8_t *a, uint8_t *b, uint8_t *c, unsigned dp, unsigned d,
unsigned m)
{
unsigned row, col, i;
- for (row = 0; row < n; row++) {
+ for (row = 0; row < dp; row++) {
for (col = 0; col < m; col++) {
- uint8_t *pa = &a[row * k];
+ uint8_t *pa = &a[row * d];
uint8_t *pb = &b[col];
uint8_t acc = 0;
- for (i = 0; i < k; i++, pa++, pb += m)
+ for (i = 0; i < d; i++, pa++, pb += m)
acc ^= gf_mul(*pa, *pb);
c[row * m + col] = acc;
}
@@ -271,43 +271,43 @@ static void _matmul(uint8_t *a, uint8_t *b, uint8_t *c, unsigned n, unsigned k,
/*
* _invert_mat() takes a matrix and produces its inverse
- * k is the size of the matrix.
+ * d is the size of the matrix.
* (Gauss-Jordan, adapted from Numerical Recipes in C)
* Return non-zero if singular.
*/
-static void _invert_mat(uint8_t *src, unsigned k)
+static void _invert_mat(uint8_t *src, unsigned d)
{
uint8_t c, *p;
unsigned irow = 0;
unsigned icol = 0;
unsigned row, col, i, ix;
- unsigned *indxc = (unsigned *)xmalloc(k * sizeof(unsigned));
- unsigned *indxr = (unsigned *)xmalloc(k * sizeof(unsigned));
- unsigned *ipiv = (unsigned *)xmalloc(k * sizeof(unsigned));
- uint8_t *id_row = NEW_GF_MATRIX(1, k);
+ unsigned *indxc = (unsigned *)xmalloc(d * sizeof(unsigned));
+ unsigned *indxr = (unsigned *)xmalloc(d * sizeof(unsigned));
+ unsigned *ipiv = (unsigned *)xmalloc(d * sizeof(unsigned));
+ uint8_t *id_row = NEW_GF_MATRIX(1, d);
- memset(id_row, '\0', k * sizeof(uint8_t));
+ memset(id_row, '\0', d * sizeof(uint8_t));
/* ipiv marks elements already used as pivots. */
- for (i = 0; i < k; i++)
+ for (i = 0; i < d; i++)
ipiv[i] = 0;
- for (col = 0; col < k; col++) {
+ for (col = 0; col < d; col++) {
uint8_t *pivot_row;
/*
* Zeroing column 'col', look for a non-zero element.
* First try on the diagonal, if it fails, look elsewhere.
*/
- if (ipiv[col] != 1 && src[col * k + col] != 0) {
+ if (ipiv[col] != 1 && src[col * d + col] != 0) {
irow = col;
icol = col;
goto found_piv;
}
- for (row = 0; row < k; row++) {
+ for (row = 0; row < d; row++) {
if (ipiv[row] != 1) {
- for (ix = 0; ix < k; ix++) {
+ for (ix = 0; ix < d; ix++) {
if (ipiv[ix] == 0) {
- if (src[row * k + ix] != 0) {
+ if (src[row * d + ix] != 0) {
irow = row;
icol = ix;
goto found_piv;
@@ -325,11 +325,11 @@ found_piv:
* optimizing.
*/
if (irow != icol)
- for (ix = 0; ix < k; ix++)
- SWAP(src[irow*k + ix], src[icol*k + ix]);
+ for (ix = 0; ix < d; ix++)
+ SWAP(src[irow*d + ix], src[icol*d + ix]);
indxr[col] = irow;
indxc[col] = icol;
- pivot_row = &src[icol * k];
+ pivot_row = &src[icol * d];
c = pivot_row[icol];
assert(c != 0);
if (c != 1) { /* otherwhise this is a NOP */
@@ -339,7 +339,7 @@ found_piv:
*/
c = inverse[c];
pivot_row[icol] = 1;
- for (ix = 0; ix < k; ix++)
+ for (ix = 0; ix < d; ix++)
pivot_row[ix] = gf_mul(c, pivot_row[ix]);
}
/*
@@ -350,22 +350,22 @@ found_piv:
* we can optimize the addmul).
*/
id_row[icol] = 1;
- if (memcmp(pivot_row, id_row, k * sizeof(uint8_t)) != 0) {
- for (p = src, ix = 0; ix < k; ix++, p += k) {
+ if (memcmp(pivot_row, id_row, d * sizeof(uint8_t)) != 0) {
+ for (p = src, ix = 0; ix < d; ix++, p += d) {
if (ix != icol) {
c = p[icol];
p[icol] = 0;
- addmul(p, pivot_row, c, k);
+ addmul(p, pivot_row, c, d);
}
}
}
id_row[icol] = 0;
} /* done all columns */
- for (col = k; col > 0; col--)
+ for (col = d; col > 0; col--)
if (indxr[col-1] != indxc[col-1])
- for (row = 0; row < k; row++)
- SWAP(src[row * k + indxr[col-1]],
- src[row * k + indxc[col-1]]);
+ for (row = 0; row < d; row++)
+ SWAP(src[row * d + indxr[col-1]],
+ src[row * d + indxc[col-1]]);
}
/*
@@ -379,51 +379,51 @@ found_piv:
* p = coefficients of the matrix (p_i)
* q = values of the polynomial (known)
*/
-static void _invert_vdm(uint8_t *src, unsigned k)
+static void _invert_vdm(uint8_t *src, unsigned d)
{
unsigned i, j, row, col;
uint8_t *b, *c, *p;
uint8_t t, xx;
- if (k == 1) /* degenerate case, matrix must be p^0 = 1 */
+ if (d == 1) /* degenerate case, matrix must be p^0 = 1 */
return;
/*
- * c holds the coefficient of P(x) = Prod (x - p_i), i=0..k-1
+ * c holds the coefficient of P(x) = Prod (x - p_i), i=0..d-1
* b holds the coefficient for the matrix inversion
*/
- c = NEW_GF_MATRIX(1, k);
- b = NEW_GF_MATRIX(1, k);
- p = NEW_GF_MATRIX(1, k);
+ c = NEW_GF_MATRIX(1, d);
+ b = NEW_GF_MATRIX(1, d);
+ p = NEW_GF_MATRIX(1, d);
- for (j = 1, i = 0; i < k; i++, j += k) {
+ for (j = 1, i = 0; i < d; i++, j += d) {
c[i] = 0;
p[i] = src[j]; /* p[i] */
}
/*
- * construct coeffs. recursively. We know c[k] = 1 (implicit)
+ * construct coeffs. recursively. We know c[d] = 1 (implicit)
* and start P_0 = x - p_0, then at each stage multiply by
* x - p_i generating P_i = x P_{i-1} - p_i P_{i-1}
- * After k steps we are done.
+ * After d steps we are done.
*/
- c[k - 1] = p[0]; /* really -p(0), but x = -x in GF(2^m) */
- for (i = 1; i < k; i++) {
+ c[d - 1] = p[0]; /* really -p(0), but x = -x in GF(2^m) */
+ for (i = 1; i < d; i++) {
uint8_t p_i = p[i]; /* see above comment */
- for (j = k - 1 - (i - 1); j < k - 1; j++)
+ for (j = d - 1 - (i - 1); j < d - 1; j++)
c[j] ^= gf_mul(p_i, c[j + 1]);
- c[k - 1] ^= p_i;
+ c[d - 1] ^= p_i;
}
- for (row = 0; row < k; row++) {
+ for (row = 0; row < d; row++) {
/* synthetic division etc. */
xx = p[row];
t = 1;
- b[k - 1] = 1; /* this is in fact c[k] */
- for (i = k - 1; i > 0; i--) {
+ b[d - 1] = 1; /* this is in fact c[d] */
+ for (i = d - 1; i > 0; i--) {
b[i-1] = c[i] ^ gf_mul(xx, b[i]);
t = gf_mul(xx, t) ^ b[i-1];
}
- for (col = 0; col < k; col++)
- src[col * k + row] = gf_mul(inverse[t], b[col]);
+ for (col = 0; col < d; col++)
+ src[col * d + row] = gf_mul(inverse[t], b[col]);
}
free(c);
free(b);
@@ -447,13 +447,13 @@ void init_fec(void)
void fec_free(struct fec *p)
{
- assert(p != NULL && p->magic == (((FEC_MAGIC ^ p->k) ^ p->n) ^
+ assert(p != NULL && p->magic == (((FEC_MAGIC ^ p->d) ^ p->dp) ^
(unsigned long) (p->enc_matrix)));
free(p->enc_matrix);
free(p);
}
-struct fec *fec_new(unsigned short k, unsigned short n)
+struct fec *fec_new(unsigned short d, unsigned short dp)
{
unsigned row, col;
uint8_t *p, *tmp_m;
@@ -461,32 +461,32 @@ struct fec *fec_new(unsigned short k, unsigned short n)
struct fec *retval;
retval = (struct fec *)xmalloc(sizeof(struct fec));
- retval->k = k;
- retval->n = n;
- retval->enc_matrix = NEW_GF_MATRIX(n, k);
- retval->magic = ((FEC_MAGIC^k)^n)^(unsigned long)(retval->enc_matrix);
- tmp_m = NEW_GF_MATRIX(n, k);
+ retval->d = d;
+ retval->dp = dp;
+ retval->enc_matrix = NEW_GF_MATRIX(dp, d);
+ retval->magic = ((FEC_MAGIC^d)^dp)^(unsigned long)(retval->enc_matrix);
+ tmp_m = NEW_GF_MATRIX(dp, d);
/*
* fill the matrix with powers of field elements, starting from 0.
* The first row is special, cannot be computed with exp. table.
*/
tmp_m[0] = 1;
- for (col = 1; col < k; col++)
+ for (col = 1; col < d; col++)
tmp_m[col] = 0;
- for (p = tmp_m + k, row = 0; row < n - 1; row++, p += k)
- for (col = 0; col < k; col++)
+ for (p = tmp_m + d, row = 0; row < dp - 1; row++, p += d)
+ for (col = 0; col < d; col++)
p[col] = gf_exp[modnn(row * col)];
/*
* quick code to build systematic matrix: invert the top
- * k*k vandermonde matrix, multiply right the bottom n-k rows
+ * d*d vandermonde matrix, multiply right the bottom dp-d rows
* by the inverse, and construct the identity matrix at the top.
*/
- _invert_vdm(tmp_m, k); /* much faster than _invert_mat */
- _matmul(tmp_m + k * k, tmp_m, retval->enc_matrix + k * k, n - k, k, k);
+ _invert_vdm(tmp_m, d); /* much faster than _invert_mat */
+ _matmul(tmp_m + d * d, tmp_m, retval->enc_matrix + d * d, dp - d, d, d);
/* the upper matrix is I so do not bother with a slow multiply */
- memset(retval->enc_matrix, '\0', k * k * sizeof(uint8_t));
- for (p = retval->enc_matrix, col = 0; col < k; col++, p += k + 1)
+ memset(retval->enc_matrix, '\0', d * d * sizeof(uint8_t));
+ for (p = retval->enc_matrix, col = 0; col < d; col++, p += d + 1)
*p = 1;
free(tmp_m);
@@ -508,19 +508,19 @@ void fec_encode(const struct fec *code,
size_t num_block_nums, size_t sz)
{
unsigned char i, j;
- size_t k;
+ size_t d;
unsigned fecnum;
const uint8_t *p;
- for (k = 0; k < sz; k += STRIDE) {
- size_t stride = ((sz-k) < STRIDE) ? (sz-k) : STRIDE;
+ for (d = 0; d < sz; d += STRIDE) {
+ size_t stride = ((sz-d) < STRIDE) ? (sz-d) : STRIDE;
for (i = 0; i < num_block_nums; i++) {
fecnum = block_nums[i];
- assert(fecnum >= code->k);
- memset(fecs[i]+k, 0, stride);
- p = &(code->enc_matrix[fecnum * code->k]);
- for (j = 0; j < code->k; j++)
- addmul(fecs[i]+k, src[j]+k, p[j], stride);
+ assert(fecnum >= code->d);
+ memset(fecs[i]+d, 0, stride);
+ p = &(code->enc_matrix[fecnum * code->d]);
+ for (j = 0; j < code->d; j++)
+ addmul(fecs[i]+d, src[j]+d, p[j], stride);
}
}
}
@@ -528,24 +528,24 @@ void fec_encode(const struct fec *code,
/*
* Build decode matrix into some memory space.
*
- * @param matrix a space allocated for a k by k matrix
+ * @param matrix a space allocated for a d by d matrix
*/
static void
build_decode_matrix_into_space(const struct fec *const code,
const int *const idx,
- const unsigned k, uint8_t *const matrix)
+ const unsigned d, uint8_t *const matrix)
{
unsigned char i;
uint8_t *p;
- for (i = 0, p = matrix; i < k; i++, p += k) {
- if (idx[i] < k) {
- memset(p, 0, k);
+ for (i = 0, p = matrix; i < d; i++, p += d) {
+ if (idx[i] < d) {
+ memset(p, 0, d);
p[i] = 1;
} else {
- memcpy(p, &(code->enc_matrix[idx[i] * code->k]), k);
+ memcpy(p, &(code->enc_matrix[idx[i] * code->d]), d);
}
}
- _invert_mat(matrix, k);
+ _invert_mat(matrix, d);
}
void fec_decode(const struct fec *code,
@@ -553,25 +553,25 @@ void fec_decode(const struct fec *code,
uint8_t *const *const outpkts,
const int *const idx, size_t sz)
{
- uint8_t m_dec[code->k * code->k];
+ uint8_t m_dec[code->d * code->d];
unsigned char outix = 0;
unsigned char row = 0;
unsigned char col = 0;
- assert(code->k * code->k < 8 * 1024 * 1024);
- build_decode_matrix_into_space(code, idx, code->k, m_dec);
+ assert(code->d * code->d < 8 * 1024 * 1024);
+ build_decode_matrix_into_space(code, idx, code->d, m_dec);
- for (row = 0; row < code->k; row++) {
+ for (row = 0; row < code->d; row++) {
/*
* If the block whose number is i is present, then it is
* required to be in the i'th element.
*/
- assert((idx[row] >= code->k) || (idx[row] == row));
- if (idx[row] >= code->k) {
+ assert((idx[row] >= code->d) || (idx[row] == row));
+ if (idx[row] >= code->d) {
memset(outpkts[outix], 0, sz);
- for (col = 0; col < code->k; col++)
+ for (col = 0; col < code->d; col++)
addmul(outpkts[outix], inpkts[col],
- m_dec[row * code->k + col], sz);
+ m_dec[row * code->d + col], sz);
outix++;
}
}
@@ -584,19 +584,20 @@ void fec_decode(const struct fec *code,
*
* Return out and outidx as fec_decode requested.
*/
-static inline void decode_prepare(const uint8_t *dp[], const uint8_t *out[],
+static inline void decode_prepare(struct fec *ctx, const uint8_t *dp[],
+ const uint8_t *out[],
int outidx[])
{
int i, p = 0;
- for (i = SD_EC_D; i < SD_EC_DP; i++) {
+ for (i = ctx->d; i < ctx->dp; i++) {
if (dp[i]) {
p = i;
break;
}
}
- for (i = 0; i < SD_EC_D; i++) {
+ for (i = 0; i < ctx->d; i++) {
if (dp[i]) {
out[i] = dp[i];
outidx[i] = i;
@@ -608,9 +609,9 @@ static inline void decode_prepare(const uint8_t *dp[], const uint8_t *out[],
}
}
-static inline bool data_is_missing(const uint8_t *dp[])
+static inline bool data_is_missing(const uint8_t *dp[], int d)
{
- for (int i = 0; i < SD_EC_D; i++)
+ for (int i = 0; i < d; i++)
if (!dp[i])
return true;
return false;
@@ -625,39 +626,50 @@ static inline bool data_is_missing(const uint8_t *dp[])
* @output: the lost ds or ps to return
* @idx: index of output which is lost
*/
-void ec_decode(struct fec *ctx, const uint8_t *input[SD_EC_D],
- const int inidx[SD_EC_D],
+void ec_decode(struct fec *ctx, const uint8_t *input[], const int inidx[],
uint8_t output[], int idx)
{
- const uint8_t *dp[SD_EC_DP] = { NULL };
- const uint8_t *oin[SD_EC_D] = { NULL };
- int oidx[SD_EC_D] = { 0 }, i;
- uint8_t m0[SD_EC_STRIP_SIZE], m1[SD_EC_STRIP_SIZE],
- p0[SD_EC_STRIP_SIZE], p1[SD_EC_STRIP_SIZE];
- uint8_t *missing[SD_EC_P] = { m0, m1 };
- uint8_t *p[SD_EC_P] = { p0, p1 };
-
- for (i = 0; i < SD_EC_D; i++)
+ int edp = ctx->dp, ep = ctx->dp - ctx->d, ed = ctx->d;
+ const uint8_t *dp[edp];
+ const uint8_t *oin[ed];
+ int oidx[ed], i;
+ int strip_size = SD_EC_DATA_STRIPE_SIZE / ed;
+ uint8_t m0[strip_size], m1[strip_size], m2[strip_size], m3[strip_size],
+ m4[strip_size], m5[strip_size], m6[strip_size], m7[strip_size],
+ p0[strip_size], p1[strip_size], p2[strip_size], p3[strip_size],
+ p4[strip_size], p5[strip_size], p6[strip_size], p7[strip_size];
+#define SD_EC_MAX_PARITY 8
+ uint8_t *missing[SD_EC_MAX_PARITY] = { m0, m1, m2, m3, m4, m5, m6, m7 };
+ uint8_t *p[SD_EC_MAX_PARITY] = { p0, p1, p2, p3, p4, p5, p6, p7 };
+
+ for (i = 0; i < edp; i++)
+ dp[i] = NULL;
+ for (i = 0; i < ed; i++)
+ oin[i] = NULL;
+ for (i = 0; i < ed; i++)
+ oidx[i] = 0;
+
+ for (i = 0; i < ed; i++)
dp[inidx[i]] = input[i];
- decode_prepare(dp, oin, oidx);
+ decode_prepare(ctx, dp, oin, oidx);
/* Fill the data strip if missing */
- if (data_is_missing(dp)) {
+ if (data_is_missing(dp, ed)) {
int m = 0;
- fec_decode(ctx, oin, missing, oidx, SD_EC_STRIP_SIZE);
- for (i = 0; i < SD_EC_D; i++)
+ fec_decode(ctx, oin, missing, oidx, strip_size);
+ for (i = 0; i < ed; i++)
if (!dp[i])
dp[i] = missing[m++];
}
- if (idx < SD_EC_D)
+ if (idx < ed)
goto out;
/* Fill the parity strip */
ec_encode(ctx, dp, p);
- for (i = 0; i < SD_EC_P; i++)
- dp[SD_EC_D + i] = p[i];
+ for (i = 0; i < ep; i++)
+ dp[ed + i] = p[i];
out:
- memcpy(output, dp[idx], SD_EC_STRIP_SIZE);
+ memcpy(output, dp[idx], strip_size);
}
diff --git a/sheep/gateway.c b/sheep/gateway.c
index 08b9c1e..4d7e0e2 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -48,11 +48,12 @@ static struct req_iter *prepare_replication_requests(struct request *req,
}
/*
- * Make sure we don't overwrite the existing data for unaligned write
+ * Make sure we don't overwrite the existing data for misaligned write
*
- * If either offset or length of request isn't aligned to SD_EC_D_SIZE, we have
- * to read the unaligned blocks before write. This kind of write amplification
- * indeed slow down the write operation with extra read overhead.
+ * If either the offset or the length of the request isn't aligned to
+ * SD_EC_DATA_STRIPE_SIZE, we have to read the unaligned blocks before writing.
+ * This kind of write amplification slows down the write operation because of
+ * the extra read overhead.
*/
static void *init_erasure_buffer(struct request *req, int buf_len)
{
@@ -62,18 +63,18 @@ static void *init_erasure_buffer(struct request *req, int buf_len)
uint64_t oid = req->rq.obj.oid;
int opcode = req->rq.opcode;
struct sd_req hdr;
- uint64_t head = round_down(off, SD_EC_D_SIZE);
- uint64_t tail = round_down(off + len, SD_EC_D_SIZE);
+ uint64_t head = round_down(off, SD_EC_DATA_STRIPE_SIZE);
+ uint64_t tail = round_down(off + len, SD_EC_DATA_STRIPE_SIZE);
int ret;
if (opcode != SD_OP_WRITE_OBJ)
goto out;
- if (off % SD_EC_D_SIZE) {
+ if (off % SD_EC_DATA_STRIPE_SIZE) {
/* Read head */
sd_init_req(&hdr, SD_OP_READ_OBJ);
hdr.obj.oid = oid;
- hdr.data_length = SD_EC_D_SIZE;
+ hdr.data_length = SD_EC_DATA_STRIPE_SIZE;
hdr.obj.offset = head;
ret = exec_local_req(&hdr, buf);
if (ret != SD_RES_SUCCESS) {
@@ -82,11 +83,11 @@ static void *init_erasure_buffer(struct request *req, int buf_len)
}
}
- if ((len + off) % SD_EC_D_SIZE && tail - head > 0) {
+ if ((len + off) % SD_EC_DATA_STRIPE_SIZE && tail - head > 0) {
/* Read tail */
sd_init_req(&hdr, SD_OP_READ_OBJ);
hdr.obj.oid = oid;
- hdr.data_length = SD_EC_D_SIZE;
+ hdr.data_length = SD_EC_DATA_STRIPE_SIZE;
hdr.obj.offset = tail;
ret = exec_local_req(&hdr, buf + tail - head);
if (ret != SD_RES_SUCCESS) {
@@ -95,7 +96,7 @@ static void *init_erasure_buffer(struct request *req, int buf_len)
}
}
out:
- memcpy(buf + off % SD_EC_D_SIZE, req->data, len);
+ memcpy(buf + off % SD_EC_DATA_STRIPE_SIZE, req->data, len);
return buf;
}
@@ -108,11 +109,12 @@ static struct req_iter *prepare_erasure_requests(struct request *req, int *nr)
uint32_t len = req->rq.data_length;
uint64_t off = req->rq.obj.offset;
int opcode = req->rq.opcode;
- int start = off / SD_EC_D_SIZE;
- int end = DIV_ROUND_UP(off + len, SD_EC_D_SIZE), i, j;
+ int start = off / SD_EC_DATA_STRIPE_SIZE;
+ int end = DIV_ROUND_UP(off + len, SD_EC_DATA_STRIPE_SIZE), i, j;
int nr_stripe = end - start;
- struct fec *ctx = ec_init();
+ struct fec *ctx = ec_init(SD_EC_D, SD_EC_DP);
int nr_to_send = (opcode == SD_OP_READ_OBJ) ? SD_EC_D : SD_EC_DP;
+ int strip_size = SD_EC_DATA_STRIPE_SIZE / SD_EC_D;
struct req_iter *reqs = xzalloc(sizeof(*reqs) * nr_to_send);
char *p, *buf = NULL;
@@ -121,11 +123,11 @@ static struct req_iter *prepare_erasure_requests(struct request *req, int *nr)
*nr = nr_to_send;
for (i = 0; i < nr_to_send; i++) {
- int l = SD_EC_STRIP_SIZE * nr_stripe;
+ int l = strip_size * nr_stripe;
reqs[i].buf = xmalloc(l);
reqs[i].dlen = l;
- reqs[i].off = start * SD_EC_STRIP_SIZE;
+ reqs[i].off = start * strip_size;
switch (opcode) {
case SD_OP_CREATE_AND_WRITE_OBJ:
case SD_OP_WRITE_OBJ:
@@ -139,7 +141,7 @@ static struct req_iter *prepare_erasure_requests(struct request *req, int *nr)
if (opcode != SD_OP_WRITE_OBJ && opcode != SD_OP_CREATE_AND_WRITE_OBJ)
goto out; /* Read and remove operation */
- p = buf = init_erasure_buffer(req, SD_EC_D_SIZE * nr_stripe);
+ p = buf = init_erasure_buffer(req, SD_EC_DATA_STRIPE_SIZE * nr_stripe);
if (!buf) {
sd_err("failed to init erasure buffer %"PRIx64,
req->rq.obj.oid);
@@ -152,16 +154,16 @@ static struct req_iter *prepare_erasure_requests(struct request *req, int *nr)
uint8_t *ps[SD_EC_P];
for (j = 0; j < SD_EC_D; j++)
- ds[j] = reqs[j].buf + SD_EC_STRIP_SIZE * i;
+ ds[j] = reqs[j].buf + strip_size * i;
for (j = 0; j < SD_EC_P; j++)
- ps[j] = reqs[SD_EC_D + j].buf + SD_EC_STRIP_SIZE * i;
+ ps[j] = reqs[SD_EC_D + j].buf + strip_size * i;
for (j = 0; j < SD_EC_D; j++)
- memcpy((uint8_t *)ds[j], p + j * SD_EC_STRIP_SIZE,
- SD_EC_STRIP_SIZE);
+ memcpy((uint8_t *)ds[j], p + j * strip_size,
+ strip_size);
ec_encode(ctx, ds, ps);
- p += SD_EC_D_SIZE;
+ p += SD_EC_DATA_STRIPE_SIZE;
}
out:
ec_destroy(ctx);
@@ -212,8 +214,8 @@ static void finish_requests(struct request *req, struct req_iter *reqs,
uint32_t len = req->rq.data_length;
uint64_t off = req->rq.obj.offset;
int opcode = req->rq.opcode;
- int start = off / SD_EC_D_SIZE;
- int end = DIV_ROUND_UP(off + len, SD_EC_D_SIZE), i, j;
+ int start = off / SD_EC_DATA_STRIPE_SIZE;
+ int end = DIV_ROUND_UP(off + len, SD_EC_DATA_STRIPE_SIZE), i, j;
int nr_stripe = end - start;
if (!is_erasure_oid(oid))
@@ -224,17 +226,18 @@ static void finish_requests(struct request *req, struct req_iter *reqs,
/* We need to assemble the data strips into the req buffer for read */
if (opcode == SD_OP_READ_OBJ) {
- char *p, *buf = xmalloc(SD_EC_D_SIZE * nr_stripe);
+ char *p, *buf = xmalloc(SD_EC_DATA_STRIPE_SIZE * nr_stripe);
+ int strip_size = SD_EC_DATA_STRIPE_SIZE / SD_EC_D;
p = buf;
for (i = 0; i < nr_stripe; i++) {
for (j = 0; j < nr_to_send; j++) {
- memcpy(p, reqs[j].buf + SD_EC_STRIP_SIZE * i,
- SD_EC_STRIP_SIZE);
- p += SD_EC_STRIP_SIZE;
+ memcpy(p, reqs[j].buf + strip_size * i,
+ strip_size);
+ p += strip_size;
}
}
- memcpy(req->data, buf + off % SD_EC_D_SIZE, len);
+ memcpy(req->data, buf + off % SD_EC_DATA_STRIPE_SIZE, len);
req->rp.data_length = req->rq.data_length;
free(buf);
}
diff --git a/sheep/recovery.c b/sheep/recovery.c
index e63e3d6..037d2c3 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -392,7 +392,7 @@ static void *rebuild_erasure_object(uint64_t oid, uint8_t idx,
{
uint8_t *bufs[SD_EC_D] = { 0 };
int idxs[SD_EC_D], len = get_store_objsize(oid);
- struct fec *ctx = ec_init();
+ struct fec *ctx = ec_init(SD_EC_D, SD_EC_DP);
char *lost = xvalloc(len);
int i, j;
@@ -414,12 +414,13 @@ static void *rebuild_erasure_object(uint64_t oid, uint8_t idx,
/* Rebuild the lost replica */
for (i = 0; i < SD_EC_NR_STRIPE_PER_OBJECT; i++) {
const uint8_t *in[SD_EC_D];
- uint8_t out[SD_EC_STRIP_SIZE];
+ int strip_size = SD_EC_DATA_STRIPE_SIZE / SD_EC_D;
+ uint8_t out[strip_size];
for (j = 0; j < SD_EC_D; j++)
- in[j] = bufs[j] + SD_EC_STRIP_SIZE * i;
+ in[j] = bufs[j] + strip_size * i;
ec_decode(ctx, in, idxs, out, idx);
- memcpy(lost + SD_EC_STRIP_SIZE * i, out, SD_EC_STRIP_SIZE);
+ memcpy(lost + strip_size * i, out, strip_size);
}
out:
ec_destroy(ctx);
--
1.7.9.5
More information about the sheepdog
mailing list