[sheepdog] [PATCH v3 2/6] erasure: support user-defined redundancy

Liu Yuan namei.unix at gmail.com
Fri Oct 18 20:34:22 CEST 2013


usage:
$ dog vdi create -c x:y test 10g # create an erasure-coded vdi

x represents the number of data strips and y the number of parity strips.

x can be 2, 4, 8 or 16; 0 < y < 16 and x > y

Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
 dog/common.c               |    8 ++-
 dog/vdi.c                  |   68 +++++++++++++++----
 include/fec.h              |   33 +++++++--
 include/internal_proto.h   |    7 +-
 lib/fec.c                  |   15 +++--
 sheep/gateway.c            |   38 +++++++----
 sheep/ops.c                |    2 +-
 sheep/plain_store.c        |   30 +++++++--
 sheep/recovery.c           |   47 ++++++++-----
 tests/functional/022       |    4 +-
 tests/functional/029       |   16 +++--
 tests/functional/029.out   |  161 ++++++++++++++++++++++++++++++++++++++++----
 tests/functional/031       |    2 +-
 tests/functional/common.rc |    2 +-
 14 files changed, 349 insertions(+), 84 deletions(-)

diff --git a/dog/common.c b/dog/common.c
index a2fb945..028d367 100644
--- a/dog/common.c
+++ b/dog/common.c
@@ -334,7 +334,11 @@ size_t get_store_objsize(uint8_t copy_policy, uint64_t oid)
 {
 	if (is_vdi_obj(oid))
 		return SD_INODE_SIZE;
-	if (copy_policy != 0)
-		return SD_EC_OBJECT_SIZE;
+	if (copy_policy != 0) {
+		int d;
+
+		ec_policy_to_dp(copy_policy, &d, NULL);
+		return SD_DATA_OBJ_SIZE / d;
+	}
 	return get_objsize(oid);
 }
diff --git a/dog/vdi.c b/dog/vdi.c
index 7e0fc63..6ee97d4 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -28,10 +28,9 @@ static struct sd_option vdi_options[] = {
 	{'x', "exclusive", false, "write in an exclusive mode"},
 	{'d', "delete", false, "delete a key"},
 	{'w', "writeback", false, "use writeback mode"},
-	{'c', "copies", true, "specify the data redundancy (number of copies)"},
+	{'c', "copies", true, "specify the data redundancy level"},
 	{'F', "from", true, "create a differential backup from the snapshot"},
 	{'f', "force", false, "do operation forcibly"},
-	{'e', "erasure", false, "create erasure coded vdi"},
 	{ 0, NULL, false, NULL },
 };
 
@@ -893,7 +892,8 @@ static int vdi_object(int argc, char **argv)
 			exit(EXIT_FAILURE);
 		}
 
-		size = info.copy_policy ? SD_EC_OBJECT_SIZE : SD_DATA_OBJ_SIZE;
+		size = get_store_objsize(info.copy_policy,
+					 vid_to_data_oid(vid, 0));
 		parse_objs(vid_to_vdi_oid(vid), obj_info_filler, &oid_info,
 			   size);
 
@@ -1020,7 +1020,8 @@ static int vdi_track(int argc, char **argv)
 	}
 
 	parse_objs(vid_to_vdi_oid(vid), obj_info_filler, &oid_info,
-		   info.copy_policy ? SD_EC_OBJECT_SIZE : SD_DATA_OBJ_SIZE);
+		   get_store_objsize(info.copy_policy,
+				     vid_to_data_oid(vid, 0)));
 
 	if (!oid_info.success) {
 		sd_err("Failed to read the inode object 0x%" PRIx32, vid);
@@ -2121,7 +2122,7 @@ static struct subcommand vdi_cmd[] = {
 	{"check", "<vdiname>", "saph", "check and repair image's consistency",
 	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
 	 vdi_check, vdi_options},
-	{"create", "<vdiname> <size>", "Pcapherv", "create an image",
+	{"create", "<vdiname> <size>", "Pcaphrv", "create an image",
 	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
 	 vdi_create, vdi_options},
 	{"snapshot", "<vdiname>", "saphrv", "create a snapshot",
@@ -2175,10 +2176,47 @@ static struct subcommand vdi_cmd[] = {
 	{NULL,},
 };
 
+/*
+ * Parse "x" (replica count) or "x:y" (data:parity strips) from str.
+ * Return the total copy number, or 0 to indicate ill str.
+ */
+static uint8_t parse_copy(const char *str, uint8_t *copy_policy)
+{
+	char *n1, *n2;
+	long copy, parity; /* long: check strtol() results before narrowing */
+	char p[10];
+
+	/* Reject input that does not fit in p[] instead of overflowing it */
+	if (strlen(str) >= sizeof(p))
+		return 0;
+	strcpy(p, str);
+	n1 = strtok(p, ":");
+	n2 = strtok(NULL, ":");
+
+	/* n1 is NULL for "" or ":", and strtol(NULL) is undefined */
+	if (!n1 || !is_numeric(n1) || (n2 && !is_numeric(n2)))
+		return 0;
+	copy = strtol(n1, NULL, 10);
+	if (copy <= 0 || copy > SD_MAX_COPIES)
+		return 0;
+	if (!n2) {
+		*copy_policy = 0;
+		return copy;
+	}
+	if (copy != 2 && copy != 4 && copy != 8 && copy != 16)
+		return 0;
+
+	parity = strtol(n2, NULL, 10);
+	if (parity >= SD_EC_MAX_STRIP || parity >= copy || parity <= 0)
+		return 0;
+	/* High 4 bits hold data / 2 (16 needs 5 bits), low 4 bits parity */
+	*copy_policy = ((copy / 2) << 4) + parity;
+	return copy + parity;
+}
+
 static int vdi_parser(int ch, const char *opt)
 {
 	char *p;
-	int nr_copies;
 
 	switch (ch) {
 	case 'P':
@@ -2213,13 +2251,19 @@ static int vdi_parser(int ch, const char *opt)
 		vdi_cmd_data.writeback = true;
 		break;
 	case 'c':
-		nr_copies = strtol(opt, &p, 10);
-		if (opt == p || nr_copies < 0 || nr_copies > SD_MAX_COPIES) {
-			sd_err("Invalid copies number, must be "
-			       "an integer between 0 and %d", SD_MAX_COPIES);
+		vdi_cmd_data.nr_copies = parse_copy(opt,
+						    &vdi_cmd_data.copy_policy);
+		if (!vdi_cmd_data.nr_copies) {
+			sd_err("Invalid parameter %s\n"
+			       "To create replicated vdi, set -c x\n"
+			       "  x(1 to %d)   - number of replicated copies\n"
+			       "To create erasure coded vdi, set -c x:y\n"
+			       "  x(2,4,8,16)  - number of data strips\n"
+			       "  y(1 to 15)   - number of parity strips\n"
+			       "and meet the condition x > y",
+			       opt, SD_MAX_COPIES);
 			exit(EXIT_FAILURE);
 		}
-		vdi_cmd_data.nr_copies = nr_copies;
 		break;
 	case 'F':
 		vdi_cmd_data.from_snapshot_id = strtol(opt, &p, 10);
@@ -2232,8 +2276,6 @@ static int vdi_parser(int ch, const char *opt)
 	case 'f':
 		vdi_cmd_data.force = true;
 		break;
-	case 'e':
-		vdi_cmd_data.copy_policy = 1;
 	}
 
 	return 0;
diff --git a/include/fec.h b/include/fec.h
index 55e0bdb..c8b5d5c 100644
--- a/include/fec.h
+++ b/include/fec.h
@@ -1,3 +1,6 @@
+#ifndef __FEC_H__
+#define __FEC_H__
+
 /*
  * zfec -- fast forward error correction library
  *
@@ -59,6 +62,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include "util.h"
+
 struct fec {
 	unsigned long magic;
 	unsigned short d, dp;                     /* parameters of the code */
@@ -101,12 +106,8 @@ void fec_decode(const struct fec *code,
 		uint8_t *const *const outpkts,
 		const int *const index, size_t sz);
 
-#define SD_EC_D	4 /* No. of data strips */
-#define SD_EC_P 2 /* No. of parity strips */
-#define SD_EC_DP (SD_EC_D + SD_EC_P)
-
 /*
- * SD_EC_D_SIZE <= 1K is the safe value to run VM after some experimentations.
+ * data stripe <= 1K is the safe value to run VM after some experimentations.
  *
  * Though most OS's file system will operate on 4K block, some softwares like
  * grub will operate on 512 bytes and Linux kernel itself will sometimes
@@ -115,8 +116,26 @@ void fec_decode(const struct fec *code,
  * VM to run on erasure coded volume.
  */
 #define SD_EC_DATA_STRIPE_SIZE (1024) /* 1K */
-#define SD_EC_OBJECT_SIZE (SD_DATA_OBJ_SIZE / SD_EC_D)
 #define SD_EC_NR_STRIPE_PER_OBJECT (SD_DATA_OBJ_SIZE / SD_EC_DATA_STRIPE_SIZE)
+#define SD_EC_MAX_STRIP (16)
+
+/* Unpack policy (hi nibble: data / 2, lo nibble: parity); return d + p */
+static inline int ec_policy_to_dp(uint8_t policy, int *d, int *p)
+{
+	int ed = (policy >> 4) * 2; /* high 4 bits: data strips / 2 */
+	int ep = policy & 0x0f; /* low 4 bits: parity strips */
+
+	/* ed == 0 would make callers divide by zero when sizing strips */
+	if (unlikely(!ed || !ep))
+		panic("invalid policy %d", policy);
+
+	if (d)
+		*d = ed;
+	if (p)
+		*p = ep;
+
+	return ed + ep;
+}
 
 /*
  * Stripe: data strips + parity strips, spread on all replica
@@ -176,3 +195,5 @@ static inline void ec_destroy(struct fec *ctx)
 {
 	fec_free(ctx);
 }
+
+#endif
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 59c6e2a..8a23737 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -22,11 +22,16 @@
 
 #include "sheepdog_proto.h"
 #include "rbtree.h"
+#include "fec.h"
 
 #define SD_SHEEP_PROTO_VER 0x08
 
 #define SD_DEFAULT_COPIES 3
-#define SD_MAX_COPIES 8
+/*
+ * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and
+ * (SD_EC_MAX_STRIP - 1) for parity strips
+ */
+#define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1)
 
 /*
  * The max number of nodes sheep daemon can support is constrained by
diff --git a/lib/fec.c b/lib/fec.c
index 05dffa0..f978ae0 100644
--- a/lib/fec.c
+++ b/lib/fec.c
@@ -636,11 +636,18 @@ void ec_decode(struct fec *ctx, const uint8_t *input[], const int inidx[],
 	int strip_size = SD_EC_DATA_STRIPE_SIZE / ed;
 	uint8_t m0[strip_size], m1[strip_size], m2[strip_size], m3[strip_size],
 		m4[strip_size], m5[strip_size], m6[strip_size], m7[strip_size],
+		m8[strip_size], m9[strip_size], m10[strip_size],
+		m11[strip_size], m12[strip_size], m13[strip_size],
+		m14[strip_size], m15[strip_size],
 		p0[strip_size], p1[strip_size], p2[strip_size], p3[strip_size],
-		p4[strip_size], p5[strip_size], p6[strip_size], p7[strip_size];
-#define SD_EC_MAX_PARITY 8
-	uint8_t *missing[SD_EC_MAX_PARITY] = { m0, m1, m2, m3, m4, m5, m6, m7 };
-	uint8_t *p[SD_EC_MAX_PARITY] = { p0, p1, p2, p3, p4, p5, p6, p7 };
+		p4[strip_size], p5[strip_size], p6[strip_size], p7[strip_size],
+		p8[strip_size], p9[strip_size], p10[strip_size],
+		p11[strip_size], p12[strip_size], p13[strip_size],
+		p14[strip_size];
+	uint8_t *missing[SD_EC_MAX_STRIP] = { m0, m1, m2, m3, m4, m5, m6, m7,
+		m8, m9, m10, m11, m12, m13, m14, m15 };
+	uint8_t *p[SD_EC_MAX_STRIP - 1] = { p0, p1, p2, p3, p4, p5, p6, p7, p8,
+		p9, p10, p11, p12, p13, p14 };
 
 	for (i = 0; i < edp; i++)
 		dp[i] = NULL;
diff --git a/sheep/gateway.c b/sheep/gateway.c
index 4d7e0e2..e2e5a21 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -112,16 +112,23 @@ static struct req_iter *prepare_erasure_requests(struct request *req, int *nr)
 	int start = off / SD_EC_DATA_STRIPE_SIZE;
 	int end = DIV_ROUND_UP(off + len, SD_EC_DATA_STRIPE_SIZE), i, j;
 	int nr_stripe = end - start;
-	struct fec *ctx = ec_init(SD_EC_D, SD_EC_DP);
-	int nr_to_send = (opcode == SD_OP_READ_OBJ) ? SD_EC_D : SD_EC_DP;
-	int strip_size = SD_EC_DATA_STRIPE_SIZE / SD_EC_D;
-	struct req_iter *reqs = xzalloc(sizeof(*reqs) * nr_to_send);
+	struct fec *ctx;
+	int strip_size, nr_to_send;
+	struct req_iter *reqs;
 	char *p, *buf = NULL;
+	uint8_t policy = req->rq.obj.copy_policy ?:
+		get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid));
+	int ed = 0, ep = 0, edp;
+
+	edp = ec_policy_to_dp(policy, &ed, &ep);
+	ctx = ec_init(ed, edp);
+	*nr = nr_to_send = (opcode == SD_OP_READ_OBJ) ? ed : edp;
+	strip_size = SD_EC_DATA_STRIPE_SIZE / ed;
+	reqs = xzalloc(sizeof(*reqs) * nr_to_send);
 
 	sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32,
 		 start, end, nr_to_send, off, len);
 
-	*nr = nr_to_send;
 	for (i = 0; i < nr_to_send; i++) {
 		int l = strip_size * nr_stripe;
 
@@ -150,16 +157,16 @@ static struct req_iter *prepare_erasure_requests(struct request *req, int *nr)
 		goto out;
 	}
 	for (i = 0; i < nr_stripe; i++) {
-		const uint8_t *ds[SD_EC_D];
-		uint8_t *ps[SD_EC_P];
+		const uint8_t *ds[ed];
+		uint8_t *ps[ep];
 
-		for (j = 0; j < SD_EC_D; j++)
+		for (j = 0; j < ed; j++)
 			ds[j] = reqs[j].buf + strip_size * i;
 
-		for (j = 0; j < SD_EC_P; j++)
-			ps[j] = reqs[SD_EC_D + j].buf + strip_size * i;
+		for (j = 0; j < ep; j++)
+			ps[j] = reqs[ed + j].buf + strip_size * i;
 
-		for (j = 0; j < SD_EC_D; j++)
+		for (j = 0; j < ed; j++)
 			memcpy((uint8_t *)ds[j], p + j * strip_size,
 			       strip_size);
 		ec_encode(ctx, ds, ps);
@@ -218,7 +225,7 @@ static void finish_requests(struct request *req, struct req_iter *reqs,
 	int end = DIV_ROUND_UP(off + len, SD_EC_DATA_STRIPE_SIZE), i, j;
 	int nr_stripe = end - start;
 
-	if (!is_erasure_oid(oid))
+	if (!is_erasure_obj(oid, req->rq.obj.copy_policy))
 		goto out;
 
 	sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32,
@@ -227,7 +234,12 @@ static void finish_requests(struct request *req, struct req_iter *reqs,
 	/* We need to assemble the data strips into the req buffer for read */
 	if (opcode == SD_OP_READ_OBJ) {
 		char *p, *buf = xmalloc(SD_EC_DATA_STRIPE_SIZE * nr_stripe);
-		int strip_size = SD_EC_DATA_STRIPE_SIZE / SD_EC_D;
+		uint8_t policy = req->rq.obj.copy_policy ?:
+			get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid));
+		int ed = 0, strip_size;
+
+		ec_policy_to_dp(policy, &ed, NULL);
+		strip_size = SD_EC_DATA_STRIPE_SIZE / ed;
 
 		p = buf;
 		for (i = 0; i < nr_stripe; i++) {
diff --git a/sheep/ops.c b/sheep/ops.c
index a10f908..7f73ab1 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -90,7 +90,7 @@ static int cluster_new_vdi(struct request *req)
 	};
 
 	if (iocb.copy_policy)
-		iocb.nr_copies = SD_EC_DP;
+		iocb.nr_copies = ec_policy_to_dp(iocb.copy_policy, NULL, NULL);
 
 	if (hdr->data_length != SD_MAX_VDI_LEN)
 		return SD_RES_INVALID_PARMS;
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 7d8b5d0..fde7c9f 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -330,8 +330,12 @@ int prealloc(int fd, uint32_t size)
 
 size_t get_store_objsize(uint64_t oid)
 {
-	if (is_erasure_oid(oid))
-		return SD_EC_OBJECT_SIZE;
+	if (is_erasure_oid(oid)) {
+		uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid));
+		int d;
+		ec_policy_to_dp(policy, &d, NULL);
+		return SD_DATA_OBJ_SIZE / d;
+	}
 	return get_objsize(oid);
 }
 
@@ -342,6 +346,7 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
 	int ret, fd;
 	uint32_t len = iocb->length;
 	bool ec = is_erasure_obj(oid, iocb->copy_policy);
+	size_t obj_size;
 
 	sd_debug("%"PRIx64, oid);
 	get_obj_path(oid, path, sizeof(path));
@@ -375,7 +380,16 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
 		return err_to_sderr(path, oid, errno);
 	}
 
-	ret = prealloc(fd, ec ? SD_EC_OBJECT_SIZE : get_objsize(oid));
+	if (ec) {
+		uint8_t policy = iocb->copy_policy ?:
+			get_vdi_copy_policy(oid_to_vid(oid));
+		int d;
+		ec_policy_to_dp(policy, &d, NULL);
+		obj_size = SD_DATA_OBJ_SIZE / d;
+	} else
+		obj_size = get_objsize(oid);
+
+	ret = prealloc(fd, obj_size);
 	if (ret < 0) {
 		ret = err_to_sderr(path, oid, errno);
 		goto out;
@@ -447,7 +461,6 @@ static bool oid_stale(uint64_t oid)
 	const struct sd_vnode *v;
 	bool ret = true;
 	const struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
-
 	vinfo = get_vnode_info();
 
 	/*
@@ -455,8 +468,13 @@ static bool oid_stale(uint64_t oid)
 	 * know it is stale or not. In this case, we keep it stay in the working
 	 * directory in order to recover it when we get enough zones
 	 */
-	if (unlikely(vinfo->nr_zones < SD_EC_DP) && is_erasure_oid(oid))
-		return false;
+	if (is_erasure_oid(oid)) {
+		uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid));
+		int edp = ec_policy_to_dp(policy, NULL, NULL);
+
+		if (unlikely(vinfo->nr_zones < edp))
+			return false;
+	}
 
 	nr_copies = get_obj_copy_number(oid, vinfo->nr_zones);
 	oid_to_vnodes(oid, &vinfo->vroot, nr_copies, obj_vnodes);
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 037d2c3..790bca8 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -164,8 +164,10 @@ static void *read_erasure_object(uint64_t oid, uint8_t idx,
 	struct vnode_info *old = grab_vnode_info(rw->old_vinfo), *new_old;
 	uint32_t epoch = rw->epoch, tgt_epoch = rw->tgt_epoch;
 	const struct sd_node *node;
+	uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid));
+	int edp = ec_policy_to_dp(policy, NULL, NULL);
 again:
-	if (old->nr_zones < SD_EC_DP) {
+	if (unlikely(old->nr_zones < edp)) {
 		if (search_erasure_object(oid, idx, &old->nroot, rw,
 					  tgt_epoch, buf)
 		    == SD_RES_SUCCESS)
@@ -390,14 +392,23 @@ out:
 static void *rebuild_erasure_object(uint64_t oid, uint8_t idx,
 				    struct recovery_work *rw)
 {
-	uint8_t *bufs[SD_EC_D] = { 0 };
-	int idxs[SD_EC_D], len = get_store_objsize(oid);
-	struct fec *ctx = ec_init(SD_EC_D, SD_EC_DP);
+	int len = get_store_objsize(oid);
 	char *lost = xvalloc(len);
 	int i, j;
+	uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid));
+	int ed = 0, edp;
+	edp = ec_policy_to_dp(policy, &ed, NULL);
+	struct fec *ctx = ec_init(ed, edp);
+	uint8_t *bufs[ed];
+	int idxs[ed];
+
+	for (i = 0; i < ed; i++)
+		bufs[i] = NULL;
+	for (i = 0; i < ed; i++)
+		idxs[i] = 0;
 
 	/* Prepare replica */
-	for (i = 0, j = 0; i < SD_EC_DP && j < SD_EC_D; i++) {
+	for (i = 0, j = 0; i < edp && j < ed; i++) {
 		if (i == idx)
 			continue;
 		bufs[j] = read_erasure_object(oid, i, rw);
@@ -405,7 +416,7 @@ static void *rebuild_erasure_object(uint64_t oid, uint8_t idx,
 			continue;
 		idxs[j++] = i;
 	}
-	if (j != SD_EC_D) {
+	if (j != ed) {
 		free(lost);
 		lost = NULL;
 		goto out;
@@ -413,18 +424,18 @@ static void *rebuild_erasure_object(uint64_t oid, uint8_t idx,
 
 	/* Rebuild the lost replica */
 	for (i = 0; i < SD_EC_NR_STRIPE_PER_OBJECT; i++) {
-		const uint8_t *in[SD_EC_D];
-		int strip_size = SD_EC_DATA_STRIPE_SIZE / SD_EC_D;
+		const uint8_t *in[ed];
+		int strip_size = SD_EC_DATA_STRIPE_SIZE / ed;
 		uint8_t out[strip_size];
 
-		for (j = 0; j < SD_EC_D; j++)
+		for (j = 0; j < ed; j++)
 			in[j] = bufs[j] + strip_size * i;
 		ec_decode(ctx, in, idxs, out, idx);
 		memcpy(lost + strip_size * i, out, strip_size);
 	}
 out:
 	ec_destroy(ctx);
-	for (i = 0; i < SD_EC_D; i++)
+	for (i = 0; i < ed; i++)
 		free(bufs[i]);
 	return lost;
 }
@@ -432,10 +443,12 @@ out:
 static uint8_t local_node_copy_index(struct rb_root *vroot, uint64_t oid)
 {
 	const struct sd_node *target_nodes[SD_MAX_NODES];
+	uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid));
 	uint8_t idx;
+	int edp = ec_policy_to_dp(policy, NULL, NULL);
 
-	oid_to_nodes(oid, vroot, SD_EC_DP, target_nodes);
-	for (idx = 0; idx < SD_EC_DP; idx++)
+	oid_to_nodes(oid, vroot, edp, target_nodes);
+	for (idx = 0; idx < edp; idx++)
 		if (node_is_local(target_nodes[idx]))
 			return idx;
 	panic("can't get valid index for %"PRIx64, oid);
@@ -450,9 +463,9 @@ static uint8_t local_node_copy_index(struct rb_root *vroot, uint64_t oid)
  *    2.1 read enough other copies from their tracks in epoch history
  *    2.2 rebuild the lost object from the content of copies read at 2.1
  *
- * The subtle case is number for available zones is less than SD_EC_DP or the
- * requested index of lost object:
- *    1 we need to make sure nr_zones >= SD_EC_DP to avoid panic of
+ * The subtle case is when the number of available zones is less than the total
+ * copy number or the requested index of the lost object:
+ *    1 we need to make sure nr_zones >= total_copy_nr to avoid panic of
  *      oid_to_node(s) helpers.
  *    2 we have to search all the available zones when we can't get idx. Its
  *      okay to do a mad search when number of available zones is small
@@ -466,8 +479,10 @@ static int recover_erasure_object(struct recovery_obj_work *row)
 	void *buf = NULL;
 	uint8_t idx;
 	int ret = -1;
+	uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid));
+	int edp = ec_policy_to_dp(policy, NULL, NULL);
 
-	if (cur->nr_zones < SD_EC_DP)
+	if (cur->nr_zones < edp)
 		return -1;
 
 	idx = local_node_copy_index(&cur->vroot, oid);
diff --git a/tests/functional/022 b/tests/functional/022
index 22ad2a1..1325b8b 100755
--- a/tests/functional/022
+++ b/tests/functional/022
@@ -4,11 +4,11 @@
 
 . ./common
 
-for i in `seq 0 2`; do
+for i in `seq 0 5`; do
     _start_sheep $i "-g"
 done
 
-_wait_for_sheep "3"
+_wait_for_sheep 6
 
 _cluster_format -c 3
 
diff --git a/tests/functional/029 b/tests/functional/029
index 45e0a81..e0eb771 100755
--- a/tests/functional/029
+++ b/tests/functional/029
@@ -4,11 +4,11 @@
 
 . ./common
 
-for i in `seq 0 7`; do
+for i in `seq 0 22`; do
 	_start_sheep $i;
 done
 
-_wait_for_sheep 8
+_wait_for_sheep 23
 
 _cluster_format -c 3
 
@@ -16,16 +16,20 @@ for i in `seq 2 4`; do
 	$DOG vdi create test$i 20M -c $i -P
 done
 
-$DOG vdi create -e test5 20M -P
+$DOG vdi create -c 4:2 test5 20M -P
+$DOG vdi create -c 4:3 test6 20M -P
+$DOG vdi create -c 4:4 fail 20M -P
+$DOG vdi create -c 4:5 fail 20M -P
+$DOG vdi create -c 16:7 test7 20M -P
+$DOG vdi create -c 16:8 fail 20M -P
 $DOG vdi snapshot -s tag test2
 $DOG vdi clone -s 1 test2 clone -c 3
 
-
 for i in `seq 2 3`; do
 	_kill_sheep $i;
 done
 
-for i in `seq 8 9`; do
+for i in `seq 23 24`; do
 	_start_sheep $i;
 done
 
@@ -36,7 +40,7 @@ $DOG vdi delete -s 1 test2
 
 _vdi_list
 
-for i in `seq 2 5`; do
+for i in `seq 2 7`; do
 	$DOG vdi object test$i -i 1;
 done
 
diff --git a/tests/functional/029.out b/tests/functional/029.out
index 6e1456e..ea80196 100644
--- a/tests/functional/029.out
+++ b/tests/functional/029.out
@@ -1,47 +1,184 @@
 QA output created by 029
 using backend plain store
+Invalid parameter 4:4
+To create replicated vdi, set -c x
+  x(1 to 31)   - number of replicated copies
+To create erasure coded vdi, set -c x:y
+  x(2,4,8,16)  - number of data strips
+  y(1 to 15)   - number of parity strips
+and meet the condition x > y
+Invalid parameter 4:5
+To create replicated vdi, set -c x
+  x(1 to 31)   - number of replicated copies
+To create erasure coded vdi, set -c x:y
+  x(2,4,8,16)  - number of data strips
+  y(1 to 15)   - number of parity strips
+and meet the condition x > y
+There are not enough nodes(23) to hold the copies(24)
+Usage: dog vdi create [-P] [-c copies] [-a address] [-p port] [-h] [-r] [-v] <vdiname> <size>
+Options:
+  -P, --prealloc          preallocate all the data objects
+  -c, --copies            specify the data redundancy level
+  -a, --address           specify the daemon address (default: localhost)
+  -p, --port              specify the daemon port
+  -h, --help              display this help and exit
+  -r, --raw               raw output mode: omit headers, separate fields with
+                          single spaces and print all sizes in decimal bytes
+  -v, --verbose           print more information than default
   Name        Id    Size    Used  Shared    Creation time   VDI id  Copies  Tag
   test5        0   20 MB   20 MB  0.0 MB DATE   fd2c30     6              
   test4        0   20 MB   20 MB  0.0 MB DATE   fd2de3     4              
+  test7        0   20 MB   20 MB  0.0 MB DATE   fd2f96    23              
+  test6        0   20 MB   20 MB  0.0 MB DATE   fd3149     7              
   test3        0   20 MB   20 MB  0.0 MB DATE   fd3662     3              
   test2        0   20 MB  0.0 MB   20 MB DATE   fd3816     2              
-Looking for the object 0xfd381500000001 (the inode vid 0xfd3816 idx 1) with 8 nodes
+Looking for the object 0xfd381500000001 (the inode vid 0xfd3816 idx 1) with 23 nodes
 
-127.0.0.1:7000 has the object (should be 2 copies)
+127.0.0.1:7000 doesn't have the object
 127.0.0.1:7001 doesn't have the object
 127.0.0.1:7004 doesn't have the object
 127.0.0.1:7005 doesn't have the object
 127.0.0.1:7006 doesn't have the object
 127.0.0.1:7007 doesn't have the object
 127.0.0.1:7008 doesn't have the object
-127.0.0.1:7009 has the object (should be 2 copies)
-Looking for the object 0xfd366200000001 (the inode vid 0xfd3662 idx 1) with 8 nodes
+127.0.0.1:7009 doesn't have the object
+127.0.0.1:7010 doesn't have the object
+127.0.0.1:7011 doesn't have the object
+127.0.0.1:7012 doesn't have the object
+127.0.0.1:7013 doesn't have the object
+127.0.0.1:7014 doesn't have the object
+127.0.0.1:7015 doesn't have the object
+127.0.0.1:7016 has the object (should be 2 copies)
+127.0.0.1:7017 doesn't have the object
+127.0.0.1:7018 doesn't have the object
+127.0.0.1:7019 doesn't have the object
+127.0.0.1:7020 doesn't have the object
+127.0.0.1:7021 doesn't have the object
+127.0.0.1:7022 has the object (should be 2 copies)
+127.0.0.1:7023 doesn't have the object
+127.0.0.1:7024 doesn't have the object
+Looking for the object 0xfd366200000001 (the inode vid 0xfd3662 idx 1) with 23 nodes
 
 127.0.0.1:7000 doesn't have the object
 127.0.0.1:7001 has the object (should be 3 copies)
-127.0.0.1:7004 has the object (should be 3 copies)
+127.0.0.1:7004 doesn't have the object
 127.0.0.1:7005 doesn't have the object
 127.0.0.1:7006 doesn't have the object
 127.0.0.1:7007 has the object (should be 3 copies)
 127.0.0.1:7008 doesn't have the object
 127.0.0.1:7009 doesn't have the object
-Looking for the object 0xfd2de300000001 (the inode vid 0xfd2de3 idx 1) with 8 nodes
+127.0.0.1:7010 doesn't have the object
+127.0.0.1:7011 doesn't have the object
+127.0.0.1:7012 has the object (should be 3 copies)
+127.0.0.1:7013 doesn't have the object
+127.0.0.1:7014 doesn't have the object
+127.0.0.1:7015 doesn't have the object
+127.0.0.1:7016 doesn't have the object
+127.0.0.1:7017 doesn't have the object
+127.0.0.1:7018 doesn't have the object
+127.0.0.1:7019 doesn't have the object
+127.0.0.1:7020 doesn't have the object
+127.0.0.1:7021 doesn't have the object
+127.0.0.1:7022 doesn't have the object
+127.0.0.1:7023 doesn't have the object
+127.0.0.1:7024 doesn't have the object
+Looking for the object 0xfd2de300000001 (the inode vid 0xfd2de3 idx 1) with 23 nodes
 
 127.0.0.1:7000 doesn't have the object
 127.0.0.1:7001 has the object (should be 4 copies)
 127.0.0.1:7004 doesn't have the object
 127.0.0.1:7005 has the object (should be 4 copies)
 127.0.0.1:7006 doesn't have the object
-127.0.0.1:7007 has the object (should be 4 copies)
-127.0.0.1:7008 has the object (should be 4 copies)
+127.0.0.1:7007 doesn't have the object
+127.0.0.1:7008 doesn't have the object
 127.0.0.1:7009 doesn't have the object
-Looking for the object 0xfd2c3000000001 (the inode vid 0xfd2c30 idx 1) with 8 nodes
+127.0.0.1:7010 doesn't have the object
+127.0.0.1:7011 doesn't have the object
+127.0.0.1:7012 doesn't have the object
+127.0.0.1:7013 doesn't have the object
+127.0.0.1:7014 has the object (should be 4 copies)
+127.0.0.1:7015 doesn't have the object
+127.0.0.1:7016 doesn't have the object
+127.0.0.1:7017 doesn't have the object
+127.0.0.1:7018 doesn't have the object
+127.0.0.1:7019 has the object (should be 4 copies)
+127.0.0.1:7020 doesn't have the object
+127.0.0.1:7021 doesn't have the object
+127.0.0.1:7022 doesn't have the object
+127.0.0.1:7023 doesn't have the object
+127.0.0.1:7024 doesn't have the object
+Looking for the object 0xfd2c3000000001 (the inode vid 0xfd2c30 idx 1) with 23 nodes
 
 127.0.0.1:7000 has the object (should be 6 copies)
 127.0.0.1:7001 has the object (should be 6 copies)
-127.0.0.1:7004 has the object (should be 6 copies)
+127.0.0.1:7004 doesn't have the object
 127.0.0.1:7005 doesn't have the object
-127.0.0.1:7006 has the object (should be 6 copies)
-127.0.0.1:7007 has the object (should be 6 copies)
+127.0.0.1:7006 doesn't have the object
+127.0.0.1:7007 doesn't have the object
 127.0.0.1:7008 doesn't have the object
 127.0.0.1:7009 has the object (should be 6 copies)
+127.0.0.1:7010 doesn't have the object
+127.0.0.1:7011 has the object (should be 6 copies)
+127.0.0.1:7012 doesn't have the object
+127.0.0.1:7013 doesn't have the object
+127.0.0.1:7014 doesn't have the object
+127.0.0.1:7015 doesn't have the object
+127.0.0.1:7016 doesn't have the object
+127.0.0.1:7017 doesn't have the object
+127.0.0.1:7018 doesn't have the object
+127.0.0.1:7019 doesn't have the object
+127.0.0.1:7020 doesn't have the object
+127.0.0.1:7021 has the object (should be 6 copies)
+127.0.0.1:7022 doesn't have the object
+127.0.0.1:7023 has the object (should be 6 copies)
+127.0.0.1:7024 doesn't have the object
+Looking for the object 0xfd314900000001 (the inode vid 0xfd3149 idx 1) with 23 nodes
+
+127.0.0.1:7000 doesn't have the object
+127.0.0.1:7001 has the object (should be 7 copies)
+127.0.0.1:7004 doesn't have the object
+127.0.0.1:7005 doesn't have the object
+127.0.0.1:7006 doesn't have the object
+127.0.0.1:7007 doesn't have the object
+127.0.0.1:7008 has the object (should be 7 copies)
+127.0.0.1:7009 doesn't have the object
+127.0.0.1:7010 doesn't have the object
+127.0.0.1:7011 has the object (should be 7 copies)
+127.0.0.1:7012 has the object (should be 7 copies)
+127.0.0.1:7013 doesn't have the object
+127.0.0.1:7014 doesn't have the object
+127.0.0.1:7015 doesn't have the object
+127.0.0.1:7016 doesn't have the object
+127.0.0.1:7017 doesn't have the object
+127.0.0.1:7018 doesn't have the object
+127.0.0.1:7019 has the object (should be 7 copies)
+127.0.0.1:7020 doesn't have the object
+127.0.0.1:7021 doesn't have the object
+127.0.0.1:7022 has the object (should be 7 copies)
+127.0.0.1:7023 doesn't have the object
+127.0.0.1:7024 has the object (should be 7 copies)
+Looking for the object 0xfd2f9600000001 (the inode vid 0xfd2f96 idx 1) with 23 nodes
+
+127.0.0.1:7000 has the object (should be 23 copies)
+127.0.0.1:7001 has the object (should be 23 copies)
+127.0.0.1:7004 has the object (should be 23 copies)
+127.0.0.1:7005 has the object (should be 23 copies)
+127.0.0.1:7006 has the object (should be 23 copies)
+127.0.0.1:7007 has the object (should be 23 copies)
+127.0.0.1:7008 has the object (should be 23 copies)
+127.0.0.1:7009 has the object (should be 23 copies)
+127.0.0.1:7010 has the object (should be 23 copies)
+127.0.0.1:7011 has the object (should be 23 copies)
+127.0.0.1:7012 has the object (should be 23 copies)
+127.0.0.1:7013 has the object (should be 23 copies)
+127.0.0.1:7014 has the object (should be 23 copies)
+127.0.0.1:7015 has the object (should be 23 copies)
+127.0.0.1:7016 has the object (should be 23 copies)
+127.0.0.1:7017 has the object (should be 23 copies)
+127.0.0.1:7018 has the object (should be 23 copies)
+127.0.0.1:7019 has the object (should be 23 copies)
+127.0.0.1:7020 has the object (should be 23 copies)
+127.0.0.1:7021 has the object (should be 23 copies)
+127.0.0.1:7022 has the object (should be 23 copies)
+127.0.0.1:7023 has the object (should be 23 copies)
+127.0.0.1:7024 has the object (should be 23 copies)
diff --git a/tests/functional/031 b/tests/functional/031
index 56145ec..6af9f0a 100755
--- a/tests/functional/031
+++ b/tests/functional/031
@@ -13,7 +13,7 @@ for i in 1 2 3; do
     $DOG vdi create test$i ${i}00M -c $i
 done
 
-$DOG vdi create -e test4 400M
+$DOG vdi create -c 4:2 test4 400M
 
 _vdi_list
 
diff --git a/tests/functional/common.rc b/tests/functional/common.rc
index 24c1d99..eadedf7 100644
--- a/tests/functional/common.rc
+++ b/tests/functional/common.rc
@@ -513,7 +513,7 @@ _vdi_list()
 _vdi_create()
 {
 	if $EC; then
-		$DOG vdi create -e $*
+		$DOG vdi create -c 4:2 $*
 	else
 		$DOG vdi create $*
 	fi
-- 
1.7.9.5




More information about the sheepdog mailing list