[sheepdog] [PATCH v2 07/11] sheep: use copy_policy to control erasure vdi

Liu Yuan namei.unix at gmail.com
Thu Sep 26 09:25:44 CEST 2013


inode->copy_policy is a good place to control erasure coded vdis. I don't think
we need a uint16_t for it, so I shrink it to uint8_t and keep the freed byte as
reserved.

To create an erasure coded vdi:

$ dog vdi create -e test 10G # This will create an erasure coded vdi

For now we only use a fixed scheme (4 data and 2 parity strips) with '-e'. But
I plan to support '-e number', so that users can specify how many parity strips
they want and use a different erasure scheme for each vdi. E.g., we could have

-e 2 --> 4 : 2 (0.5 redundancy and can withstand 2 node failures)
-e 3 --> 8 : 3 (0.375 redundancy and can withstand 3 node failures)
-e 4 --> 8 : 4 (0.5 redundancy and can withstand 4 node failures)

Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
 dog/dog.h                |    2 +-
 dog/farm/farm.c          |   11 ++++++++---
 dog/vdi.c                |   24 +++++++++++++++---------
 include/sheepdog_proto.h |    5 ++++-
 sheep/gateway.c          |    2 ++
 sheep/group.c            |    3 ++-
 sheep/ops.c              |    9 +++++++--
 sheep/plain_store.c      |    2 +-
 sheep/sheep_priv.h       |    7 +++++--
 sheep/vdi.c              |   39 +++++++++++++++++++++++++++++++--------
 10 files changed, 76 insertions(+), 28 deletions(-)

diff --git a/dog/dog.h b/dog/dog.h
index 8c54c10..aac0bba 100644
--- a/dog/dog.h
+++ b/dog/dog.h
@@ -79,7 +79,7 @@ void confirm(const char *message);
 void work_queue_wait(struct work_queue *q);
 int do_vdi_create(const char *vdiname, int64_t vdi_size,
 		  uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
-		  int nr_copies);
+		  int nr_copies, uint8_t copy_policy);
 int do_vdi_check(const struct sd_inode *inode);
 void show_progress(uint64_t done, uint64_t total, bool raw);
 
diff --git a/dog/farm/farm.c b/dog/farm/farm.c
index 0204d1a..0e3a8eb 100644
--- a/dog/farm/farm.c
+++ b/dog/farm/farm.c
@@ -28,6 +28,7 @@ struct vdi_entry {
 	uint32_t vdi_id;
 	uint32_t snap_id;
 	uint8_t  nr_copies;
+	uint8_t copy_policy;
 	struct rb_node rb;
 };
 static struct rb_root last_vdi_tree = RB_ROOT;
@@ -56,7 +57,7 @@ static struct vdi_entry *find_vdi(const char *name)
 
 static struct vdi_entry *new_vdi(const char *name, uint64_t vdi_size,
 				 uint32_t vdi_id, uint32_t snap_id,
-				 uint8_t nr_copies)
+				 uint8_t nr_copies, uint8_t copy_policy)
 {
 	struct vdi_entry *vdi;
 	vdi = xmalloc(sizeof(struct vdi_entry));
@@ -65,6 +66,7 @@ static struct vdi_entry *new_vdi(const char *name, uint64_t vdi_size,
 	vdi->vdi_id = vdi_id;
 	vdi->snap_id = snap_id;
 	vdi->nr_copies = nr_copies;
+	vdi->copy_policy = copy_policy;
 	return vdi;
 }
 
@@ -77,13 +79,15 @@ static void insert_vdi(struct sd_inode *new)
 			      new->vdi_size,
 			      new->vdi_id,
 			      new->snap_id,
-			      new->nr_copies);
+			      new->nr_copies,
+			      new->copy_policy);
 		rb_insert(&last_vdi_tree, vdi, rb, vdi_cmp);
 	} else if (vdi->snap_id < new->snap_id) {
 		vdi->vdi_size = new->vdi_size;
 		vdi->vdi_id = new->vdi_id;
 		vdi->snap_id = new->snap_id;
 		vdi->nr_copies = new->nr_copies;
+		vdi->copy_policy = new->copy_policy;
 	}
 }
 
@@ -95,7 +99,8 @@ static int create_active_vdis(void)
 		if (do_vdi_create(vdi->name,
 				  vdi->vdi_size,
 				  vdi->vdi_id, &new_vid,
-				  false, vdi->nr_copies) < 0)
+				  false, vdi->nr_copies,
+				  vdi->copy_policy) < 0)
 			return -1;
 	}
 	return 0;
diff --git a/dog/vdi.c b/dog/vdi.c
index a465e6a..9004b08 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -30,6 +30,7 @@ static struct sd_option vdi_options[] = {
 	{'c', "copies", true, "specify the data redundancy (number of copies)"},
 	{'F', "from", true, "create a differential backup from the snapshot"},
 	{'f', "force", false, "do operation forcibly"},
+	{'e', "erasure", false, "create erasure coded vdi"},
 	{ 0, NULL, false, NULL },
 };
 
@@ -45,6 +46,7 @@ static struct vdi_cmd_data {
 	int from_snapshot_id;
 	char from_snapshot_tag[SD_MAX_VDI_TAG_LEN];
 	bool force;
+	uint8_t copy_policy;
 } vdi_cmd_data = { ~0, };
 
 struct get_vdi_info {
@@ -464,8 +466,8 @@ static int read_vdi_obj(const char *vdiname, int snapid, const char *tag,
 }
 
 int do_vdi_create(const char *vdiname, int64_t vdi_size,
-			 uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
-			 int nr_copies)
+		  uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
+		  int nr_copies, uint8_t copy_policy)
 {
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
@@ -483,6 +485,7 @@ int do_vdi_create(const char *vdiname, int64_t vdi_size,
 	hdr.vdi.snapid = snapshot ? 1 : 0;
 	hdr.vdi.vdi_size = vdi_size;
 	hdr.vdi.copies = nr_copies;
+	hdr.vdi.copy_policy = copy_policy;
 
 	ret = dog_exec_req(&sd_nid, &hdr, buf);
 	if (ret < 0)
@@ -528,7 +531,7 @@ static int vdi_create(int argc, char **argv)
 	}
 
 	ret = do_vdi_create(vdiname, size, 0, &vid, false,
-			    vdi_cmd_data.nr_copies);
+			    vdi_cmd_data.nr_copies, vdi_cmd_data.copy_policy);
 	if (ret != EXIT_SUCCESS || !vdi_cmd_data.prealloc)
 		goto out;
 
@@ -603,7 +606,7 @@ static int vdi_snapshot(int argc, char **argv)
 		return EXIT_FAILURE;
 
 	ret = do_vdi_create(vdiname, inode->vdi_size, vid, NULL, true,
-			    inode->nr_copies);
+			    inode->nr_copies, inode->copy_policy);
 
 	if (ret == EXIT_SUCCESS && verbose) {
 		if (raw_output)
@@ -647,7 +650,7 @@ static int vdi_clone(int argc, char **argv)
 		goto out;
 
 	ret = do_vdi_create(dst_vdi, inode->vdi_size, base_vid, &new_vid, false,
-			    vdi_cmd_data.nr_copies);
+			    vdi_cmd_data.nr_copies, inode->copy_policy);
 	if (ret != EXIT_SUCCESS || !vdi_cmd_data.prealloc)
 		goto out;
 
@@ -827,7 +830,7 @@ static int vdi_rollback(int argc, char **argv)
 	}
 
 	ret = do_vdi_create(vdiname, inode->vdi_size, base_vid, &new_vid,
-			     false, vdi_cmd_data.nr_copies);
+			     false, vdi_cmd_data.nr_copies, inode->copy_policy);
 
 	if (ret == EXIT_SUCCESS && verbose) {
 		if (raw_output)
@@ -1817,7 +1820,7 @@ static uint32_t do_restore(const char *vdiname, int snapid, const char *tag)
 		goto out;
 
 	ret = do_vdi_create(vdiname, inode->vdi_size, inode->vdi_id, &vid,
-			    false, inode->nr_copies);
+			    false, inode->nr_copies, inode->copy_policy);
 	if (ret != EXIT_SUCCESS) {
 		sd_err("Failed to read VDI");
 		goto out;
@@ -1913,7 +1916,8 @@ out:
 		/* recreate the current vdi object */
 		recovery_ret = do_vdi_create(vdiname, current_inode->vdi_size,
 					     current_inode->parent_vdi_id, NULL,
-					     true, current_inode->nr_copies);
+					     true, current_inode->nr_copies,
+					     current_inode->copy_policy);
 		if (recovery_ret != EXIT_SUCCESS) {
 			sd_err("failed to resume the current vdi");
 			ret = recovery_ret;
@@ -2090,7 +2094,7 @@ static struct subcommand vdi_cmd[] = {
 	{"check", "<vdiname>", "saph", "check and repair image's consistency",
 	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
 	 vdi_check, vdi_options},
-	{"create", "<vdiname> <size>", "Pcaphrv", "create an image",
+	{"create", "<vdiname> <size>", "Pcapherv", "create an image",
 	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
 	 vdi_create, vdi_options},
 	{"snapshot", "<vdiname>", "saphrv", "create a snapshot",
@@ -2201,6 +2205,8 @@ static int vdi_parser(int ch, const char *opt)
 	case 'f':
 		vdi_cmd_data.force = true;
 		break;
+	case 'e':
+		vdi_cmd_data.copy_policy = 1;
 	}
 
 	return 0;
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index ddc710f..3dd6936 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -134,6 +134,7 @@ struct sd_req {
 			uint32_t	base_vdi_id;
 			uint32_t	copies;
 			uint32_t	snapid;
+			uint8_t		copy_policy;
 		} vdi;
 
 		/* sheepdog-internal */
@@ -149,6 +150,7 @@ struct sd_req {
 			uint32_t	copies;
 			uint8_t		set_bitmap; /* 0 means false */
 						    /* others mean true */
+			uint8_t		copy_policy;
 		} vdi_state;
 
 		uint32_t		__pad[8];
@@ -203,7 +205,8 @@ struct sd_inode {
 	uint64_t vm_clock_nsec;
 	uint64_t vdi_size;
 	uint64_t vm_state_size;
-	uint16_t copy_policy;
+	uint8_t  copy_policy;
+	uint8_t  reserved;
 	uint8_t  nr_copies;
 	uint8_t  block_size_shift;
 	uint32_t snap_id;
diff --git a/sheep/gateway.c b/sheep/gateway.c
index f2747e7..55d18fe 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -35,6 +35,8 @@ static struct req_iter *prepare_replication_requests(struct request *req,
 	uint64_t off = req->rq.obj.offset;
 	struct req_iter *reqs = xzalloc(sizeof(*reqs) * nr_copies);
 
+	sd_debug("%"PRIx64, req->rq.obj.oid);
+
 	*nr = nr_copies;
 	for (int i = 0; i < nr_copies; i++) {
 		reqs[i].buf = data;
diff --git a/sheep/group.c b/sheep/group.c
index 5e90fd5..16f1532 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -473,7 +473,8 @@ static int get_vdis_from(struct sd_node *node)
 	count = rsp->data_length / sizeof(*vs);
 	for (i = 0; i < count; i++) {
 		atomic_set_bit(vs[i].vid, sys->vdi_inuse);
-		add_vdi_state(vs[i].vid, vs[i].nr_copies, vs[i].snapshot);
+		add_vdi_state(vs[i].vid, vs[i].nr_copies, vs[i].snapshot,
+			      vs[i].copy_policy);
 	}
 out:
 	free(vs);
diff --git a/sheep/ops.c b/sheep/ops.c
index 7fdb351..5206faf 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -77,11 +77,15 @@ static int cluster_new_vdi(struct request *req)
 		.size = hdr->vdi.vdi_size,
 		.base_vid = hdr->vdi.base_vdi_id,
 		.create_snapshot = !!hdr->vdi.snapid,
+		.copy_policy = hdr->vdi.copy_policy,
 		.nr_copies = hdr->vdi.copies ? hdr->vdi.copies :
 				sys->cinfo.nr_copies,
 		.time = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000,
 	};
 
+	if (iocb.copy_policy)
+		iocb.nr_copies = SD_EC_DP;
+
 	if (hdr->data_length != SD_MAX_VDI_LEN)
 		return SD_RES_INVALID_PARMS;
 
@@ -592,12 +596,13 @@ static int cluster_notify_vdi_add(const struct sd_req *req, struct sd_rsp *rsp,
 		/* make the previous working vdi a snapshot */
 		add_vdi_state(req->vdi_state.old_vid,
 			      get_vdi_copy_number(req->vdi_state.old_vid),
-			      true);
+			      true, 0);
 
 	if (req->vdi_state.set_bitmap)
 		atomic_set_bit(req->vdi_state.new_vid, sys->vdi_inuse);
 
-	add_vdi_state(req->vdi_state.new_vid, req->vdi_state.copies, false);
+	add_vdi_state(req->vdi_state.new_vid, req->vdi_state.copies, false,
+		      req->vdi_state.copy_policy);
 
 	return SD_RES_SUCCESS;
 }
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index dc5e3d9..b363402 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -195,7 +195,7 @@ static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch)
 	}
 
 	add_vdi_state(oid_to_vid(oid), inode->nr_copies,
-		      vdi_is_snapshot(inode));
+		      vdi_is_snapshot(inode), inode->copy_policy);
 	atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse);
 
 	ret = SD_RES_SUCCESS;
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 97a71a4..fab25b5 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -174,6 +174,7 @@ struct vdi_iocb {
 	uint32_t base_vid;
 	uint32_t snapid;
 	bool create_snapshot;
+	uint8_t copy_policy;
 	int nr_copies;
 	uint64_t time;
 };
@@ -190,7 +191,8 @@ struct vdi_state {
 	uint32_t vid;
 	uint8_t nr_copies;
 	uint8_t snapshot;
-	uint16_t _pad;
+	uint8_t copy_policy;
+	uint8_t _pad;
 };
 
 struct store_driver {
@@ -278,9 +280,10 @@ int lock_base_dir(const char *d);
 int fill_vdi_state_list(void *data);
 bool oid_is_readonly(uint64_t oid);
 int get_vdi_copy_number(uint32_t vid);
+int get_vdi_copy_policy(uint32_t vid);
 int get_obj_copy_number(uint64_t oid, int nr_zones);
 int get_req_copy_number(struct request *req);
-int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot);
+int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot, uint8_t);
 int vdi_exist(uint32_t vid);
 int vdi_create(const struct vdi_iocb *iocb, uint32_t *new_vid);
 int vdi_snapshot(const struct vdi_iocb *iocb, uint32_t *new_vid);
diff --git a/sheep/vdi.c b/sheep/vdi.c
index e46e3e7..97e52ea 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -15,6 +15,7 @@ struct vdi_state_entry {
 	uint32_t vid;
 	unsigned int nr_copies;
 	bool snapshot;
+	uint8_t copy_policy;
 	struct rb_node node;
 };
 
@@ -83,6 +84,20 @@ int get_vdi_copy_number(uint32_t vid)
 	return entry->nr_copies;
 }
 
+int get_vdi_copy_policy(uint32_t vid)
+{
+	struct vdi_state_entry *entry;
+
+	sd_read_lock(&vdi_state_lock);
+	entry = vdi_state_search(&vdi_state_root, vid);
+	sd_unlock(&vdi_state_lock);
+
+	if (!entry)
+		panic("copy policy for %" PRIx32 " not found", vid);
+
+	return entry->copy_policy;
+}
+
 int get_obj_copy_number(uint64_t oid, int nr_zones)
 {
 	return min(get_vdi_copy_number(oid_to_vid(oid)), nr_zones);
@@ -100,7 +115,7 @@ int get_req_copy_number(struct request *req)
 	return nr_copies;
 }
 
-int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot)
+int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot, uint8_t cp)
 {
 	struct vdi_state_entry *entry, *old;
 
@@ -108,8 +123,9 @@ int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot)
 	entry->vid = vid;
 	entry->nr_copies = nr_copies;
 	entry->snapshot = snapshot;
+	entry->copy_policy = cp;
 
-	sd_debug("%" PRIx32 ", %d", vid, nr_copies);
+	sd_debug("%" PRIx32 ", %d, %d", vid, nr_copies, cp);
 
 	sd_write_lock(&vdi_state_lock);
 	old = vdi_state_insert(&vdi_state_root, entry);
@@ -118,6 +134,7 @@ int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot)
 		entry = old;
 		entry->nr_copies = nr_copies;
 		entry->snapshot = snapshot;
+		entry->copy_policy = cp;
 	}
 
 	sd_unlock(&vdi_state_lock);
@@ -137,6 +154,7 @@ int fill_vdi_state_list(void *data)
 		vs->vid = entry->vid;
 		vs->nr_copies = entry->nr_copies;
 		vs->snapshot = entry->snapshot;
+		vs->copy_policy = entry->copy_policy;
 		vs++;
 		nr++;
 	}
@@ -185,7 +203,7 @@ static struct sd_inode *alloc_inode(const struct vdi_iocb *iocb,
 	new->vdi_id = new_vid;
 	new->create_time = iocb->time;
 	new->vdi_size = iocb->size;
-	new->copy_policy = 0;
+	new->copy_policy = iocb->copy_policy;
 	new->nr_copies = iocb->nr_copies;
 	new->block_size_shift = find_next_bit(&block_size, BITS_PER_LONG, 0);
 	new->snap_id = new_snapid;
@@ -215,8 +233,9 @@ static int create_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
 	int ret;
 
 	sd_debug("%s: size %" PRIu64 ", new_vid %" PRIx32 ", copies %d, "
-		 "snapid %" PRIu32, iocb->name, iocb->size, new_vid,
-		 iocb->nr_copies, new_snapid);
+		 "snapid %" PRIu32 " copy policy %"PRIu8, iocb->name,
+		 iocb->size, new_vid, iocb->nr_copies, new_snapid,
+		 new->copy_policy);
 
 	ret = write_object(vid_to_vdi_oid(new_vid), (char *)new, sizeof(*new),
 			   0, true);
@@ -579,7 +598,8 @@ int vdi_lookup(const struct vdi_iocb *iocb, struct vdi_info *info)
 	return fill_vdi_info(left, right, iocb, info);
 }
 
-static int notify_vdi_add(uint32_t vdi_id, uint32_t nr_copies, uint32_t old_vid)
+static int notify_vdi_add(uint32_t vdi_id, uint32_t nr_copies, uint32_t old_vid,
+			  uint8_t copy_policy)
 {
 	int ret = SD_RES_SUCCESS;
 	struct sd_req hdr;
@@ -589,6 +609,7 @@ static int notify_vdi_add(uint32_t vdi_id, uint32_t nr_copies, uint32_t old_vid)
 	hdr.vdi_state.new_vid = vdi_id;
 	hdr.vdi_state.copies = nr_copies;
 	hdr.vdi_state.set_bitmap = false;
+	hdr.vdi_state.copy_policy = copy_policy;
 
 	ret = exec_local_req(&hdr, NULL);
 	if (ret != SD_RES_SUCCESS)
@@ -643,7 +664,8 @@ int vdi_create(const struct vdi_iocb *iocb, uint32_t *new_vid)
 	if (info.snapid == 0)
 		info.snapid = 1;
 	*new_vid = info.free_bit;
-	ret = notify_vdi_add(*new_vid, iocb->nr_copies, info.vid);
+	ret = notify_vdi_add(*new_vid, iocb->nr_copies, info.vid,
+			     iocb->copy_policy);
 	if (ret != SD_RES_SUCCESS)
 		return ret;
 
@@ -682,7 +704,8 @@ int vdi_snapshot(const struct vdi_iocb *iocb, uint32_t *new_vid)
 
 	assert(info.snapid > 0);
 	*new_vid = info.free_bit;
-	ret = notify_vdi_add(*new_vid, iocb->nr_copies, info.vid);
+	ret = notify_vdi_add(*new_vid, iocb->nr_copies, info.vid,
+			     iocb->copy_policy);
 	if (ret != SD_RES_SUCCESS)
 		return ret;
 
-- 
1.7.9.5




More information about the sheepdog mailing list