[sheepdog] [PATCH v2 01/11] add forward error correction for erasure code

Thu Sep 26 09:25:38 CEST 2013

This is imported and based on zfec

Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
 include/Makefile.am |    2 +-
 include/fec.h       |  173 ++++++++++++++++
 include/util.h      |    2 +
 lib/Makefile.am     |    2 +-
 lib/fec.c           |  574 +++++++++++++++++++++++++++++++++++++++++++++++++++
 sheep/sheep.c       |    2 +
 sheep/sheep_priv.h  |    1 +
 7 files changed, 754 insertions(+), 2 deletions(-)
 create mode 100644 include/fec.h
 create mode 100644 lib/fec.c

diff --git a/include/Makefile.am b/include/Makefile.am
index 06e97a6..2c86984 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -3,4 +3,4 @@ MAINTAINERCLEANFILES    = Makefile.in config.h.in
 noinst_HEADERS          = bitops.h event.h logger.h sheepdog_proto.h util.h \
 			  list.h net.h sheep.h exits.h strbuf.h rbtree.h \
 			  sha1.h option.h internal_proto.h shepherd.h work.h \
-			  sockfd_cache.h compiler.h
+			  sockfd_cache.h compiler.h fec.h
diff --git a/include/fec.h b/include/fec.h
new file mode 100644
index 0000000..33ab565
--- /dev/null
+++ b/include/fec.h
@@ -0,0 +1,173 @@
+/*
+ * zfec -- fast forward error correction library
+ *
+ * Copyright (C) 2007-2008 Allmyds, Inc.
+ * Author: Zooko Wilcox-O'Hearn
+ *
+ * This file is part of zfec.
+ *
+ * See README.rst for licensing information.
+ */
+
+/*
+ * Much of this work is derived from the "fec" software by Luigi Rizzo, et
+ * al., the copyright notice and licence terms of which are included below
+ * for reference.
+ *
+ * fec.h -- forward error correction based on Vandermonde matrices
+ * 980614
+ * (C) 1997-98 Luigi Rizzo (luigi at iet.unipi.it)
+ *
+ * Portions derived from code by Phil Karn (karn at ka9q.ampr.org),
+ * Robert Morelos-Zaragoza (robert at spectra.eng.hawaii.edu) and Hari
+ * Thirumoorthy (harit at spectra.eng.hawaii.edu), Aug 1995
+ *
+ * Modifications by Dan Rubenstein (see Modifications.txt for
+ * their description.
+ * Modifications (C) 1998 Dan Rubenstein (drubenst at cs.umass.edu)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+ * OF SUCH DAMAGE.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+struct fec {
+	unsigned long magic;
+	unsigned short k, n;                     /* parameters of the code */
+	uint8_t *enc_matrix;
+};
+
+void init_fec(void);
+/*
+ * param k the number of blocks required to reconstruct
+ * param m the total number of blocks created
+ */
+struct fec *fec_new(unsigned short k, unsigned short m);
+void fec_free(struct fec *p);
+
+/*
+ * @param inpkts the "primary blocks" i.e. the chunks of the input data
+ * @param fecs buffers into which the secondary blocks will be written
+ * @param block_nums the numbers of the desired check blocks (the id >= k) which
+ * fec_encode() will produce and store into the buffers of the fecs parameter
+ * @param num_block_nums the length of the block_nums array
+ * @param sz size of a packet in bytes
+ */
+void fec_encode(const struct fec *code,
+		const uint8_t *const *const src,
+		uint8_t *const *const fecs,
+		const int *const block_nums,
+		size_t num_block_nums, size_t sz);
+
+/*
+ * @param inpkts an array of packets (size k); If a primary block, i, is present
+ * then it must be at index i. Secondary blocks can appear anywhere.
+ * @param outpkts an array of buffers into which the reconstructed output
+ * packets will be written (only packets which are not present in the inpkts
+ * input will be reconstructed and written to outpkts)
+ * @param index an array of the blocknums of the packets in inpkts
+ * @param sz size of a packet in bytes
+ */
+void fec_decode(const struct fec *code,
+		const uint8_t *const *const inpkts,
+		uint8_t *const *const outpkts,
+		const int *const index, size_t sz);
+
+#define SD_EC_D	4 /* No. of data strips */
+#define SD_EC_P 2 /* No. of parity strips */
+#define SD_EC_DP (SD_EC_D + SD_EC_P)
+#define SD_EC_STRIP_SIZE (256)
+
+/*
+ * SD_EC_D_SIZE <= 1K is the safe value to run VM after some experimentations.
+ *
+ * Though most OS's file system will operate on 4K block, some softwares like
+ * grub will operate on 512 bytes and Linux kernel itself will sometimes
+ * operate on 1K blocks. I have tried 4K alignement and centos6 installation
+ * failed (grub got screwed) and 1K is probably the biggest value if we want
+ * VM to run on erasure coded volume.
+ */
+#define SD_EC_D_SIZE (SD_EC_STRIP_SIZE * SD_EC_D)
+#define SD_EC_OBJECT_SIZE (SD_DATA_OBJ_SIZE / SD_EC_D)
+#define SD_EC_STRIPE (SD_EC_STRIP_SIZE * SD_EC_DP)
+#define STRIP_PER_OBJECT (SD_DATA_OBJ_SIZE / SD_EC_STRIP_SIZE)
+
+/*
+ * Stripe: data strips + parity strips, spread on all replica
+ * DS: data strip
+ * PS: parity strip
+ * R: Replica
+ *
+ *  +--------------------stripe ----------------------+
+ *  v                                                 v
+ * +----+----------------------------------------------+
+ * | ds | ds | ds | ds | ds | ... | ps | ps | ... | ps |
+ * +----+----------------------------------------------+
+ * | .. | .. | .. | .. | .. | ... | .. | .. | ... | .. |
+ * +----+----+----+----+----+ ... +----+----+-----+----+
+ *  R1    R2   R3   R4   R5   ...   Rn  Rn+1  Rn+2  Rn+3
+ */
+
+/* Return the erasure code context to encode|decode */
+static inline struct fec *ec_init(void)
+{
+	return fec_new(SD_EC_D, SD_EC_DP);
+}
+
+/*
+ * This function decodes the data strips and return the parity strips
+ *
+ * @ds: data strips to generate parity strips
+ * @ps: parity strips to return
+ */
+static inline void ec_encode(struct fec *ctx, const uint8_t *ds[SD_EC_D],
+			     uint8_t *ps[SD_EC_P])
+{
+	int total = SD_EC_D + SD_EC_P;
+	const int pidx[SD_EC_P] = { total - 2, total - 1 };
+
+	fec_encode(ctx, ds, ps, pidx, SD_EC_P, SD_EC_STRIP_SIZE);
+}
+
+/*
+ * This function takes input strips and return the lost strips
+ *
+ * @input: strips (either ds or ps) that are used to generate lost strips
+ * @output: the lost ds or ps to return
+ * @idx: indexes of each input strip in the whole stripe
+ */
+static inline void ec_decode(struct fec *ctx, const uint8_t *input[SD_EC_D],
+			     uint8_t *output[],
+			     const int idx[])
+{
+	fec_decode(ctx, input, output, idx, SD_EC_STRIP_SIZE);
+}
+
+/* Destroy the erasure code context */
+static inline void ec_destroy(struct fec *ctx)
+{
+	fec_free(ctx);
+}
diff --git a/include/util.h b/include/util.h
index fb59192..218553f 100644
--- a/include/util.h
+++ b/include/util.h
@@ -196,6 +196,8 @@ int atomic_create_and_write(const char *path, const char *buf, size_t len,
 #define assert(expr) ((void)0)
 #endif	/* NDEBUG */
 
+#define SWAP(a, b) { typeof(a) tmp; tmp = a; a = b; b = tmp; }
+
 /* urcu helpers */
 
 /* Boolean data type which can be accessed by multiple threads */
diff --git a/lib/Makefile.am b/lib/Makefile.am
index a066b3b..b6ac290 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -5,7 +5,7 @@ AM_CPPFLAGS                = -I$(top_builddir)/include -I$(top_srcdir)/include
 noinst_LIBRARIES	= libsheepdog.a
 
 libsheepdog_a_SOURCES	= event.c logger.c net.c util.c rbtree.c strbuf.c \
-			  sha1.c option.c work.c sockfd_cache.c
+			  sha1.c option.c work.c sockfd_cache.c fec.c
 
 if BUILD_SHA1_HW
 libsheepdog_a_SOURCES	+= sha1_ssse3.S
diff --git a/lib/fec.c b/lib/fec.c
new file mode 100644
index 0000000..e300ab0
--- /dev/null
+++ b/lib/fec.c
@@ -0,0 +1,574 @@
+/*
+ * zfec -- fast forward error correction
+ *
+ * Copyright (C) 2007-2010 Zooko Wilcox-O'Hearn
+ * Author: Zooko Wilcox-O'Hearn
+ *
+ * This file is part of zfec.
+ *
+ * Imported by Liu Yuan <namei.unix at gmail.com>
+ *
+ */
+
+/*
+ * This work is derived from the "fec" software by Luigi Rizzo, et al., the
+ * copyright notice and licence terms of which are included below for reference.
+ * fec.c -- forward error correction based on Vandermonde matrices 980624 (C)
+ * 1997-98 Luigi Rizzo (luigi at iet.unipi.it)
+ *
+ * Portions derived from code by Phil Karn (karn at ka9q.ampr.org),
+ * Robert Morelos-Zaragoza (robert at spectra.eng.hawaii.edu) and Hari
+ * Thirumoorthy (harit at spectra.eng.hawaii.edu), Aug 1995
+ *
+ * Modifications by Dan Rubenstein (see Modifications.txt for
+ * their description.
+ * Modifications (C) 1998 Dan Rubenstein (drubenst at cs.umass.edu)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+ * OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "fec.h"
+#include "util.h"
+
+/*
+ * Primitive polynomials - see Lin & Costello, Appendix A,
+ * and  Lee & Messerschmitt, p. 453.
+ */
+static const char *const Pp = "101110001";
+
+/*
+ * To speed up computations, we have tables for logarithm, exponent and
+ * inverse of a number.  We use a table for multiplication as well (it takes
+ * 64K, no big deal even on a PDA, especially because it can be
+ * pre-initialized an put into a ROM!), otherwhise we use a table of
+ * logarithms. In any case the macro gf_mul(x,y) takes care of
+ * multiplications.
+ */
+
+static uint8_t gf_exp[510];  /* idx->poly form conversion table    */
+static int gf_log[256]; /* Poly->idx form conversion table    */
+static uint8_t inverse[256]; /* inverse of field elem.               */
+/* inv[\alpha**i]=\alpha**(GF_SIZE-i-1) */
+
+/*
+ * modnn(x) computes x % GF_SIZE, where GF_SIZE is 2**GF_BITS - 1,
+ * without a slow divide.
+ */
+static uint8_t modnn(int x)
+{
+	while (x >= 255) {
+		x -= 255;
+		x = (x >> 8) + (x & 255);
+	}
+	return x;
+}
+
+/*
+ * gf_mul(x,y) multiplies two numbers.  It is much faster to use a
+ * multiplication table.
+ *
+ * USE_GF_MULC, GF_MULC0(c) and GF_ADDMULC(x) can be used when multiplying
+ * many numbers by the same constant. In this case the first call sets the
+ * constant, and others perform the multiplications.  A value related to the
+ * multiplication is held in a local variable declared with USE_GF_MULC . See
+ * usage in _addmul1().
+ */
+static uint8_t gf_mul_table[256][256];
+
+#define gf_mul(x, y) gf_mul_table[x][y]
+
+#define USE_GF_MULC register uint8_t *__gf_mulc_
+
+#define GF_MULC0(c) __gf_mulc_ = gf_mul_table[c]
+#define GF_ADDMULC(dst, x) dst ^= __gf_mulc_[x]
+
+/*
+ * Generate GF(2**m) from the irreducible polynomial p(X) in p[0]..p[m]
+ * Lookup tables:
+ *     idx->polynomial form		gf_exp[] contains j= \alpha^i;
+ *     polynomial form -> idx form	gf_log[ j = \alpha^i ] = i
+ * \alpha=x is the primitive element of GF(2^m)
+ *
+ * For efficiency, gf_exp[] has size 2*GF_SIZE, so that a simple
+ * multiplication of two numbers can be resolved without calling modnn
+ */
+static void _init_mul_table(void)
+{
+	int i, j;
+	for (i = 0; i < 256; i++)
+		for (j = 0; j < 256; j++)
+			gf_mul_table[i][j] = gf_exp[modnn(gf_log[i] +
+							  gf_log[j])];
+
+	for (j = 0; j < 256; j++)
+		gf_mul_table[0][j] = gf_mul_table[j][0] = 0;
+}
+
+#define NEW_GF_MATRIX(rows, cols) \
+	(uint8_t *)xmalloc(rows * cols)
+
+/* initialize the data structures used for computations in GF. */
+static void generate_gf(void)
+{
+	int i;
+	uint8_t mask;
+
+	mask = 1;                     /* x ** 0 = 1 */
+	gf_exp[8] = 0;          /* will be updated at the end of the 1st loop */
+	/*
+	 * first, generate the (polynomial representation of) powers of \alpha,
+	 * which are stored in gf_exp[i] = \alpha ** i .
+	 * At the same time build gf_log[gf_exp[i]] = i .
+	 * The first 8 powers are simply bits shifted to the left.
+	 */
+	for (i = 0; i < 8; i++, mask <<= 1) {
+		gf_exp[i] = mask;
+		gf_log[gf_exp[i]] = i;
+		/*
+		 * If Pp[i] == 1 then \alpha ** i occurs in poly-repr
+		 * gf_exp[8] = \alpha ** 8
+		 */
+		if (Pp[i] == '1')
+			gf_exp[8] ^= mask;
+	}
+	/*
+	 * now gf_exp[8] = \alpha ** 8 is complete, so can also
+	 * compute its inverse.
+	 */
+	gf_log[gf_exp[8]] = 8;
+	/*
+	 * Poly-repr of \alpha ** (i+1) is given by poly-repr of
+	 * \alpha ** i shifted left one-bit and accounting for any
+	 * \alpha ** 8 term that may occur when poly-repr of
+	 * \alpha ** i is shifted.
+	 */
+	mask = 1 << 7;
+	for (i = 9; i < 255; i++) {
+		if (gf_exp[i - 1] >= mask)
+			gf_exp[i] = gf_exp[8] ^ ((gf_exp[i - 1] ^ mask) << 1);
+		else
+			gf_exp[i] = gf_exp[i - 1] << 1;
+		gf_log[gf_exp[i]] = i;
+	}
+	/* log(0) is not defined, so use a special value */
+	gf_log[0] = 255;
+	/* set the extended gf_exp values for fast multiply */
+	for (i = 0; i < 255; i++)
+		gf_exp[i + 255] = gf_exp[i];
+
+	/*
+	 * again special cases. 0 has no inverse. This used to
+	 * be initialized to 255, but it should make no difference
+	 * since noone is supposed to read from here.
+	 */
+	inverse[0] = 0;
+	inverse[1] = 1;
+	for (i = 2; i <= 255; i++)
+		inverse[i] = gf_exp[255 - gf_log[i]];
+}
+
+/* Various linear algebra operations that i use often. */
+
+/*
+ * addmul() computes dst[] = dst[] + c * src[]
+ * This is used often, so better optimize it! Currently the loop is
+ * unrolled 16 times, a good value for 486 and pentium-class machines.
+ * The case c=0 is also optimized, whereas c=1 is not. These
+ * calls are unfrequent in my typical apps so I did not bother.
+ */
+#define addmul(dst, src, c, sz)                 \
+	if (c != 0)				\
+		_addmul1(dst, src, c, sz)
+
+#define UNROLL 16               /* 1, 4, 8, 16 */
+static void _addmul1(register uint8_t *dst,
+		     const register uint8_t *src,
+		     uint8_t c, size_t sz)
+{
+	USE_GF_MULC;
+	const uint8_t *lim = &dst[sz - UNROLL + 1];
+
+	GF_MULC0(c);
+
+#if (UNROLL > 1) /* unrolling by 8/16 is quite effective on the pentium */
+	for (; dst < lim; dst += UNROLL, src += UNROLL) {
+		GF_ADDMULC(dst[0], src[0]);
+		GF_ADDMULC(dst[1], src[1]);
+		GF_ADDMULC(dst[2], src[2]);
+		GF_ADDMULC(dst[3], src[3]);
+#if (UNROLL > 4)
+		GF_ADDMULC(dst[4], src[4]);
+		GF_ADDMULC(dst[5], src[5]);
+		GF_ADDMULC(dst[6], src[6]);
+		GF_ADDMULC(dst[7], src[7]);
+#endif
+#if (UNROLL > 8)
+		GF_ADDMULC(dst[8], src[8]);
+		GF_ADDMULC(dst[9], src[9]);
+		GF_ADDMULC(dst[10], src[10]);
+		GF_ADDMULC(dst[11], src[11]);
+		GF_ADDMULC(dst[12], src[12]);
+		GF_ADDMULC(dst[13], src[13]);
+		GF_ADDMULC(dst[14], src[14]);
+		GF_ADDMULC(dst[15], src[15]);
+#endif
+	}
+#endif
+	lim += UNROLL - 1;
+	for (; dst < lim; dst++, src++)       /* final components */
+		GF_ADDMULC(*dst, *src);
+}
+
+/* computes C = AB where A is n*k, B is k*m, C is n*m */
+static void _matmul(uint8_t *a, uint8_t *b, uint8_t *c, unsigned n, unsigned k,
+		    unsigned m)
+{
+	unsigned row, col, i;
+
+	for (row = 0; row < n; row++) {
+		for (col = 0; col < m; col++) {
+			uint8_t *pa = &a[row * k];
+			uint8_t *pb = &b[col];
+			uint8_t acc = 0;
+			for (i = 0; i < k; i++, pa++, pb += m)
+				acc ^= gf_mul(*pa, *pb);
+			c[row * m + col] = acc;
+		}
+	}
+}
+
+/*
+ * _invert_mat() takes a matrix and produces its inverse
+ * k is the size of the matrix.
+ * (Gauss-Jordan, adapted from Numerical Recipes in C)
+ * Return non-zero if singular.
+ */
+static void _invert_mat(uint8_t *src, unsigned k)
+{
+	uint8_t c, *p;
+	unsigned irow = 0;
+	unsigned icol = 0;
+	unsigned row, col, i, ix;
+
+	unsigned *indxc = (unsigned *)xmalloc(k * sizeof(unsigned));
+	unsigned *indxr = (unsigned *)xmalloc(k * sizeof(unsigned));
+	unsigned *ipiv = (unsigned *)xmalloc(k * sizeof(unsigned));
+	uint8_t *id_row = NEW_GF_MATRIX(1, k);
+
+	memset(id_row, '\0', k * sizeof(uint8_t));
+	/* ipiv marks elements already used as pivots. */
+	for (i = 0; i < k; i++)
+		ipiv[i] = 0;
+
+	for (col = 0; col < k; col++) {
+		uint8_t *pivot_row;
+		/*
+		 * Zeroing column 'col', look for a non-zero element.
+		 * First try on the diagonal, if it fails, look elsewhere.
+		 */
+		if (ipiv[col] != 1 && src[col * k + col] != 0) {
+			irow = col;
+			icol = col;
+			goto found_piv;
+		}
+		for (row = 0; row < k; row++) {
+			if (ipiv[row] != 1) {
+				for (ix = 0; ix < k; ix++) {
+					if (ipiv[ix] == 0) {
+						if (src[row * k + ix] != 0) {
+							irow = row;
+							icol = ix;
+							goto found_piv;
+						}
+					} else
+						assert(ipiv[ix] <= 1);
+				}
+			}
+		}
+found_piv:
+		++(ipiv[icol]);
+		/*
+		 * swap rows irow and icol, so afterwards the diagonal
+		 * element will be correct. Rarely done, not worth
+		 * optimizing.
+		 */
+		if (irow != icol)
+			for (ix = 0; ix < k; ix++)
+				SWAP(src[irow*k + ix], src[icol*k + ix]);
+		indxr[col] = irow;
+		indxc[col] = icol;
+		pivot_row = &src[icol * k];
+		c = pivot_row[icol];
+		assert(c != 0);
+		if (c != 1) {   /* otherwhise this is a NOP */
+			/*
+			 * this is done often , but optimizing is not so
+			 * fruitful, at least in the obvious ways (unrolling)
+			 */
+			c = inverse[c];
+			pivot_row[icol] = 1;
+			for (ix = 0; ix < k; ix++)
+				pivot_row[ix] = gf_mul(c, pivot_row[ix]);
+		}
+		/*
+		 * from all rows, remove multiples of the selected row
+		 * to zero the relevant entry (in fact, the entry is not zero
+		 * because we know it must be zero).
+		 * (Here, if we know that the pivot_row is the identity,
+		 * we can optimize the addmul).
+		 */
+		id_row[icol] = 1;
+		if (memcmp(pivot_row, id_row, k * sizeof(uint8_t)) != 0) {
+			for (p = src, ix = 0; ix < k; ix++, p += k) {
+				if (ix != icol) {
+					c = p[icol];
+					p[icol] = 0;
+					addmul(p, pivot_row, c, k);
+				}
+			}
+		}
+		id_row[icol] = 0;
+	}                           /* done all columns */
+	for (col = k; col > 0; col--)
+		if (indxr[col-1] != indxc[col-1])
+			for (row = 0; row < k; row++)
+				SWAP(src[row * k + indxr[col-1]],
+				     src[row * k + indxc[col-1]]);
+}
+
+/*
+ * fast code for inverting a vandermonde matrix.
+ *
+ * NOTE: It assumes that the matrix is not singular and _IS_ a vandermonde
+ * matrix. Only uses the second column of the matrix, containing the p_i's.
+ *
+ * Algorithm borrowed from "Numerical recipes in C" -- sec.2.8, but largely
+ * revised for my purposes.
+ * p = coefficients of the matrix (p_i)
+ * q = values of the polynomial (known)
+ */
+static void _invert_vdm(uint8_t *src, unsigned k)
+{
+	unsigned i, j, row, col;
+	uint8_t *b, *c, *p;
+	uint8_t t, xx;
+
+	if (k == 1)     /* degenerate case, matrix must be p^0 = 1 */
+		return;
+	/*
+	 * c holds the coefficient of P(x) = Prod (x - p_i), i=0..k-1
+	 * b holds the coefficient for the matrix inversion
+	 */
+	c = NEW_GF_MATRIX(1, k);
+	b = NEW_GF_MATRIX(1, k);
+	p = NEW_GF_MATRIX(1, k);
+
+	for (j = 1, i = 0; i < k; i++, j += k) {
+		c[i] = 0;
+		p[i] = src[j];            /* p[i] */
+	}
+	/*
+	 * construct coeffs. recursively. We know c[k] = 1 (implicit)
+	 * and start P_0 = x - p_0, then at each stage multiply by
+	 * x - p_i generating P_i = x P_{i-1} - p_i P_{i-1}
+	 * After k steps we are done.
+	 */
+	c[k - 1] = p[0];              /* really -p(0), but x = -x in GF(2^m) */
+	for (i = 1; i < k; i++) {
+		uint8_t p_i = p[i];            /* see above comment */
+		for (j = k - 1 - (i - 1); j < k - 1; j++)
+			c[j] ^= gf_mul(p_i, c[j + 1]);
+		c[k - 1] ^= p_i;
+	}
+
+	for (row = 0; row < k; row++) {
+		/* synthetic division etc. */
+		xx = p[row];
+		t = 1;
+		b[k - 1] = 1;             /* this is in fact c[k] */
+		for (i = k - 1; i > 0; i--) {
+			b[i-1] = c[i] ^ gf_mul(xx, b[i]);
+			t = gf_mul(xx, t) ^ b[i-1];
+		}
+		for (col = 0; col < k; col++)
+			src[col * k + row] = gf_mul(inverse[t], b[col]);
+	}
+	free(c);
+	free(b);
+	free(p);
+	return;
+}
+
+void init_fec(void)
+{
+	generate_gf();
+	_init_mul_table();
+}
+
+/*
+ * This section contains the proper FEC encoding/decoding routines.
+ * The encoding matrix is computed starting with a Vandermonde matrix,
+ * and then transforming it into a systematic matrix.
+ */
+
+#define FEC_MAGIC	0xFECC0DEC
+
+void fec_free(struct fec *p)
+{
+	assert(p != NULL && p->magic == (((FEC_MAGIC ^ p->k) ^ p->n) ^
+					 (unsigned long) (p->enc_matrix)));
+	free(p->enc_matrix);
+	free(p);
+}
+
+struct fec *fec_new(unsigned short k, unsigned short n)
+{
+	unsigned row, col;
+	uint8_t *p, *tmp_m;
+
+	struct fec *retval;
+
+		init_fec();
+
+	retval = (struct fec *)xmalloc(sizeof(struct fec));
+	retval->k = k;
+	retval->n = n;
+	retval->enc_matrix = NEW_GF_MATRIX(n, k);
+	retval->magic = ((FEC_MAGIC^k)^n)^(unsigned long)(retval->enc_matrix);
+	tmp_m = NEW_GF_MATRIX(n, k);
+	/*
+	 * fill the matrix with powers of field elements, starting from 0.
+	 * The first row is special, cannot be computed with exp. table.
+	 */
+	tmp_m[0] = 1;
+	for (col = 1; col < k; col++)
+		tmp_m[col] = 0;
+	for (p = tmp_m + k, row = 0; row < n - 1; row++, p += k)
+		for (col = 0; col < k; col++)
+			p[col] = gf_exp[modnn(row * col)];
+
+	/*
+	 * quick code to build systematic matrix: invert the top
+	 * k*k vandermonde matrix, multiply right the bottom n-k rows
+	 * by the inverse, and construct the identity matrix at the top.
+	 */
+	_invert_vdm(tmp_m, k);        /* much faster than _invert_mat */
+	_matmul(tmp_m + k * k, tmp_m, retval->enc_matrix + k * k, n - k, k, k);
+	/* the upper matrix is I so do not bother with a slow multiply */
+	memset(retval->enc_matrix, '\0', k * k * sizeof(uint8_t));
+	for (p = retval->enc_matrix, col = 0; col < k; col++, p += k + 1)
+		*p = 1;
+	free(tmp_m);
+
+	return retval;
+}
+
+/*
+ * To make sure that we stay within cache in the inner loops of fec_encode().
+ * (It would probably help to also do this for fec_decode().
+ */
+#ifndef STRIDE
+#define STRIDE 8192
+#endif
+
+void fec_encode(const struct fec *code,
+		const uint8_t *const *const src,
+		uint8_t *const *const fecs,
+		const int *const block_nums,
+		size_t num_block_nums, size_t sz)
+{
+	unsigned char i, j;
+	size_t k;
+	unsigned fecnum;
+	const uint8_t *p;
+
+	for (k = 0; k < sz; k += STRIDE) {
+		size_t stride = ((sz-k) < STRIDE) ? (sz-k) : STRIDE;
+		for (i = 0; i < num_block_nums; i++) {
+			fecnum = block_nums[i];
+			assert(fecnum >= code->k);
+			memset(fecs[i]+k, 0, stride);
+			p = &(code->enc_matrix[fecnum * code->k]);
+			for (j = 0; j < code->k; j++)
+				addmul(fecs[i]+k, src[j]+k, p[j], stride);
+		}
+	}
+}
+
+/*
+ * Build decode matrix into some memory space.
+ *
+ * @param matrix a space allocated for a k by k matrix
+ */
+static void
+build_decode_matrix_into_space(const struct fec *const code,
+			       const int *const idx,
+			       const unsigned k, uint8_t *const matrix)
+{
+	unsigned char i;
+	uint8_t *p;
+	for (i = 0, p = matrix; i < k; i++, p += k) {
+		if (idx[i] < k) {
+			memset(p, 0, k);
+			p[i] = 1;
+		} else {
+			memcpy(p, &(code->enc_matrix[idx[i] * code->k]), k);
+		}
+	}
+	_invert_mat(matrix, k);
+}
+
+void fec_decode(const struct fec *code,
+		const uint8_t *const *const inpkts,
+		uint8_t *const *const outpkts,
+		const int *const idx, size_t sz)
+{
+	uint8_t m_dec[code->k * code->k];
+	unsigned char outix = 0;
+	unsigned char row = 0;
+	unsigned char col = 0;
+
+	assert(code->k * code->k < 8 * 1024 * 1024);
+	build_decode_matrix_into_space(code, idx, code->k, m_dec);
+
+	for (row = 0; row < code->k; row++) {
+		/*
+		 * If the block whose number is i is present, then it is
+		 * required to be in the i'th element.
+		 */
+		assert((idx[row] >= code->k) || (idx[row] == row));
+		if (idx[row] >= code->k) {
+			memset(outpkts[outix], 0, sz);
+			for (col = 0; col < code->k; col++)
+				addmul(outpkts[outix], inpkts[col],
+				       m_dec[row * code->k + col], sz);
+			outix++;
+		}
+	}
+}
diff --git a/sheep/sheep.c b/sheep/sheep.c
index 32fcd62..c73d0f9 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -771,6 +771,8 @@ int main(int argc, char **argv)
 			exit(1);
 	}
 
+	init_fec();
+
 	/*
 	 * After this function, we are multi-threaded.
 	 *
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 588a61c..97a71a4 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -42,6 +42,7 @@
 #include "sha1.h"
 #include "config.h"
 #include "sockfd_cache.h"
+#include "fec.h"
 
  /*
   * Functions that update global info must be called in the main
-- 
1.7.9.5