[sheepdog] [PATCH 1/3] sheep, dog: support deepcopy snapshot

Hitoshi Mitake mitake.hitoshi at lab.ntt.co.jp
Mon Jan 19 02:51:43 CET 2015


Sheepdog currently supports deep-copy cloning of VDIs (with the --no-share
option). This patch implements the same deep-copy feature for
snapshots. It is useful for fine-grained GC of VIDs, which is important
for long-lived, periodically snapshotted VDIs.

This patch introduces two important changes:
1. cut_relation field of vdi request
It is used to cut the relation between parent and child
VDIs. Without it, VID GC cannot work well for deep-copied snapshots.

2. new rule of vdi_lookup()
Some VDI deletion scenarios cause the lookup of a VID by VDI
name to fail (an example can be found in the new test case 100). The new
rule solves this by searching the whole VID range, from 0 to SD_NR_VDIS.

Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 dog/dog.h                |   3 +-
 dog/farm/farm.c          |   2 +-
 dog/vdi.c                | 104 +++++++++++++++++++++++++++++++++++++++--------
 include/sheepdog_proto.h |   2 +
 sheep/ops.c              |   1 +
 sheep/sheep_priv.h       |   1 +
 sheep/vdi.c              |  27 +++++++++++-
 7 files changed, 119 insertions(+), 21 deletions(-)

diff --git a/dog/dog.h b/dog/dog.h
index bcf0e6e..37355e5 100644
--- a/dog/dog.h
+++ b/dog/dog.h
@@ -88,7 +88,8 @@ void work_queue_wait(struct work_queue *q);
 int do_vdi_create(const char *vdiname, int64_t vdi_size,
 		  uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
 		  uint8_t nr_copies, uint8_t copy_policy,
-		  uint8_t store_policy, uint8_t block_size_shift);
+		  uint8_t store_policy, uint8_t block_size_shift,
+		  bool cut_relation);
 int do_vdi_check(const struct sd_inode *inode);
 void show_progress(uint64_t done, uint64_t total, bool raw);
 size_t get_store_objsize(uint8_t copy_policy, uint8_t block_size_shift,
diff --git a/dog/farm/farm.c b/dog/farm/farm.c
index 55bc274..e2b07ad 100644
--- a/dog/farm/farm.c
+++ b/dog/farm/farm.c
@@ -134,7 +134,7 @@ static int create_active_vdis(void)
 				  false, vdi->nr_copies,
 				  vdi->copy_policy,
 				  vdi->store_policy,
-				  vdi->block_size_shift) < 0)
+				  vdi->block_size_shift, false) < 0)
 			return -1;
 	}
 	return 0;
diff --git a/dog/vdi.c b/dog/vdi.c
index 6b6465a..379321d 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -405,7 +405,8 @@ int read_vdi_obj(const char *vdiname, int snapid, const char *tag,
 int do_vdi_create(const char *vdiname, int64_t vdi_size,
 		  uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
 		  uint8_t nr_copies, uint8_t copy_policy,
-		  uint8_t store_policy, uint8_t block_size_shift)
+		  uint8_t store_policy, uint8_t block_size_shift,
+		  bool cut_relation)
 {
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
@@ -426,6 +427,7 @@ int do_vdi_create(const char *vdiname, int64_t vdi_size,
 	hdr.vdi.copy_policy = copy_policy;
 	hdr.vdi.store_policy = store_policy;
 	hdr.vdi.block_size_shift = block_size_shift;
+	hdr.vdi.cut_relation = cut_relation ? 1 : 0;
 
 	ret = dog_exec_req(&sd_nid, &hdr, buf);
 	if (ret < 0)
@@ -511,7 +513,7 @@ static int vdi_create(int argc, char **argv)
 	ret = do_vdi_create(vdiname, size, 0, &vid, false,
 			    vdi_cmd_data.nr_copies, vdi_cmd_data.copy_policy,
 			    vdi_cmd_data.store_policy,
-			    vdi_cmd_data.block_size_shift);
+			    vdi_cmd_data.block_size_shift, false);
 	if (ret != EXIT_SUCCESS || !vdi_cmd_data.prealloc)
 		goto out;
 
@@ -606,14 +608,16 @@ static int vdi_snapshot(int argc, char **argv)
 	const char *vdiname = argv[optind++];
 	uint32_t vid, new_vid;
 	int ret;
-	char buf[SD_INODE_HEADER_SIZE];
-	struct sd_inode *inode = (struct sd_inode *)buf;
+	struct sd_inode *inode, *new_inode;
 	struct sd_req hdr;
 	struct vdi_state *vs = NULL;
 	int vs_count = 0;
 	struct node_id owners[SD_MAX_COPIES];
 	int nr_owners = 0, nr_issued_prevent_inode_update = 0;
 	bool fail_if_snapshot = false;
+	uint32_t object_size;
+	char *data_obj_buf = NULL;
+	uint32_t idx, max_idx;
 
 	if (vdi_cmd_data.snapshot_id != 0) {
 		sd_err("Please specify a non-integer value for "
@@ -635,9 +639,11 @@ static int vdi_snapshot(int argc, char **argv)
 		return EXIT_FAILURE;
 	}
 
+	inode = xzalloc(sizeof(*inode));
+
 	if (fail_if_snapshot) {
-		ret = dog_read_object(vid_to_vdi_oid(vid), inode,
-				      SD_INODE_HEADER_SIZE, 0, true);
+		ret = dog_read_object(vid_to_vdi_oid(vid), inode, sizeof(*inode), 0,
+				      true);
 		if (ret != EXIT_SUCCESS)
 			return ret;
 
@@ -649,8 +655,7 @@ static int vdi_snapshot(int argc, char **argv)
 			return EXIT_FAILURE;
 		}
 	} else {
-		ret = read_vdi_obj(vdiname, 0, "", &vid, inode,
-				   SD_INODE_HEADER_SIZE);
+		ret = read_vdi_obj(vdiname, 0, "", &vid, inode, sizeof(*inode));
 		if (ret != EXIT_SUCCESS)
 			return ret;
 	}
@@ -708,16 +713,80 @@ static int vdi_snapshot(int argc, char **argv)
 	if (ret != SD_RES_SUCCESS)
 		goto out;
 
-	ret = do_vdi_create(vdiname, inode->vdi_size, vid, &new_vid, true,
+	ret = do_vdi_create(vdiname, inode->vdi_size,
+			    vid, &new_vid, true,
 			    inode->nr_copies, inode->copy_policy,
-			    inode->store_policy, inode->block_size_shift);
+			    inode->store_policy, inode->block_size_shift,
+			    vdi_cmd_data.no_share);
+	if (!vdi_cmd_data.no_share) {
+		if (ret == EXIT_SUCCESS)
+			goto print_result;
+		else
+			goto out;
+	}
 
-	if (ret == EXIT_SUCCESS && verbose) {
+	new_inode = xmalloc(sizeof(*inode));
+	ret = read_vdi_obj(vdiname, 0, "", &new_vid, new_inode,
+			   SD_INODE_HEADER_SIZE);
+	if (ret != EXIT_SUCCESS)
+		goto out;
+
+	/*
+	 * Clients (QEMU, tgtd) cannot find the new working VDI because
+	 * COW requests are prevented by SD_OP_PREVENT_INODE_UPDATE.
+	 * So we don't have to worry about that clients see working VDI with
+	 * inconsistent data_vdi_id.
+	 */
+	object_size = (UINT32_C(1) << inode->block_size_shift);
+	data_obj_buf = xzalloc(object_size);
+	max_idx = count_data_objs(inode);
+
+	for (idx = 0; idx < max_idx; idx++) {
+		uint32_t vdi_id;
+		uint64_t oid;
+
+		vdi_show_progress(idx * object_size, inode->vdi_size);
+
+		vdi_id = sd_inode_get_vid(inode, idx);
+		if (!vdi_id)
+			continue;
+
+		oid = vid_to_data_oid(vdi_id, idx);
+		ret = dog_read_object(oid, data_obj_buf, object_size, 0,
+				      true);
+		if (ret) {
+			ret = EXIT_FAILURE;
+			goto out;
+		}
+
+		oid = vid_to_data_oid(new_vid, idx);
+		ret = dog_write_object(oid, 0, data_obj_buf, object_size, 0, 0,
+				       inode->nr_copies,
+				       inode->copy_policy, true, true);
+		if (ret != SD_RES_SUCCESS) {
+			ret = EXIT_FAILURE;
+			goto out;
+		}
+
+		sd_inode_set_vid(new_inode, idx, new_vid);
+		ret = sd_inode_write_vid(new_inode, idx, new_vid, new_vid, 0,
+					 false, true);
+		if (ret) {
+			ret = EXIT_FAILURE;
+			goto out;
+		}
+	}
+
+	vdi_show_progress(idx * object_size, inode->vdi_size);
+
+print_result:
+	if (verbose) {
 		if (raw_output)
 			printf("%x %x\n", new_vid, vid);
 		else
 			printf("new VID of original VDI: %x,"
-			       " VDI ID of newly created snapshot: %x\n", new_vid, vid);
+			       " VDI ID of newly created snapshot:"
+			       " %x\n", new_vid, vid);
 	}
 
 out:
@@ -769,7 +838,7 @@ static int vdi_clone(int argc, char **argv)
 	object_size = (UINT32_C(1) << inode->block_size_shift);
 	ret = do_vdi_create(dst_vdi, inode->vdi_size, base_vid, &new_vid, false,
 			    inode->nr_copies, inode->copy_policy,
-			    inode->store_policy, inode->block_size_shift);
+			    inode->store_policy, inode->block_size_shift, false);
 	if (ret != EXIT_SUCCESS ||
 			(!vdi_cmd_data.prealloc && !vdi_cmd_data.no_share))
 		goto out;
@@ -1027,7 +1096,7 @@ static int vdi_rollback(int argc, char **argv)
 
 	ret = do_vdi_create(vdiname, inode->vdi_size, base_vid, &new_vid,
 			     false, vdi_cmd_data.nr_copies, inode->copy_policy,
-			     inode->store_policy, inode->block_size_shift);
+			    inode->store_policy, inode->block_size_shift, false);
 
 	if (ret == EXIT_SUCCESS && verbose) {
 		if (raw_output)
@@ -2404,7 +2473,7 @@ static uint32_t do_restore(const char *vdiname, int snapid, const char *tag)
 
 	ret = do_vdi_create(vdiname, inode->vdi_size, inode->vdi_id, &vid,
 			    false, inode->nr_copies, inode->copy_policy,
-			    inode->store_policy, inode->block_size_shift);
+			    inode->store_policy, inode->block_size_shift, false);
 	if (ret != EXIT_SUCCESS) {
 		sd_err("Failed to read VDI");
 		goto out;
@@ -2514,7 +2583,8 @@ out:
 					     true, current_inode->nr_copies,
 					     current_inode->copy_policy,
 					     current_inode->store_policy,
-					     current_inode->block_size_shift);
+					     current_inode->block_size_shift,
+					     false);
 		if (recovery_ret != EXIT_SUCCESS) {
 			sd_err("failed to resume the current vdi");
 			ret = recovery_ret;
@@ -3048,7 +3118,7 @@ static struct subcommand vdi_cmd[] = {
 	{"create", "<vdiname> <size>", "PycaphrvzT", "create an image",
 	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
 	 vdi_create, vdi_options},
-	{"snapshot", "<vdiname>", "saphrvT", "create a snapshot",
+	{"snapshot", "<vdiname>", "saphrvTn", "create a snapshot",
 	 NULL, CMD_NEED_ARG,
 	 vdi_snapshot, vdi_options},
 	{"clone", "<src vdi> <dst vdi>", "sPnaphrvT", "clone an image",
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 3910bd5..c5d4dde 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -166,6 +166,8 @@ struct sd_req {
 			uint8_t		block_size_shift;
 			uint32_t	snapid;
 			uint32_t        type;
+			uint8_t		cut_relation;
+			uint8_t		__pad[3];
 		} vdi;
 
 		/* sheepdog-internal */
diff --git a/sheep/ops.c b/sheep/ops.c
index dad03a6..27c80bd 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -95,6 +95,7 @@ static int cluster_new_vdi(struct request *req)
 		.nr_copies = hdr->vdi.copies,
 		.block_size_shift = hdr->vdi.block_size_shift,
 		.time = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000,
+		.cut_relation = !!hdr->vdi.cut_relation,
 	};
 
 	/* Client doesn't specify redundancy scheme (copy = 0) */
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 46d0ba9..a867874 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -223,6 +223,7 @@ struct vdi_iocb {
 	uint8_t nr_copies;
 	uint8_t block_size_shift;
 	uint64_t time;
+	bool cut_relation;
 };
 
 /* This structure is used to get information from sheepdog. */
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 8114fb5..bb7fa6a 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -1404,7 +1404,10 @@ static int fill_vdi_info_range(uint32_t left, uint32_t right,
 		ret = SD_RES_NO_MEM;
 		goto out;
 	}
-	for (i = right - 1; i >= left; i--) {
+	for (i = right - 1; i && i >= left; i--) {
+		if (!test_bit(i, sys->vdi_inuse))
+			continue;
+
 		ret = sd_read_object(vid_to_vdi_oid(i), (char *)inode,
 				     SD_INODE_HEADER_SIZE, 0);
 		if (ret != SD_RES_SUCCESS)
@@ -1474,6 +1477,25 @@ int vdi_lookup(const struct vdi_iocb *iocb, struct vdi_info *info)
 	sd_debug("%s left %lx right %lx, %x", iocb->name, left, right, ret);
 	switch (ret) {
 	case SD_RES_NO_VDI:
+		/*
+		 * handle a case like this:
+		 * dog vdi create A
+		 * dog vdi snapshot A --no-share
+		 * dog vdi delete A -s 1
+		 * try to find vdi A
+		 *
+		 * In this case, the above get_vdi_bitmap_range() cannot find
+		 * bitmap range for vdi A, because original VID is already
+		 * freed (newly working vdi A shares nothing with the snapshot
+		 * because it is created with --no-share).
+		 *
+		 * Of course, the below fill_vdi_info() can take long time. But
+		 * another case (SD_RES_SUCCESS) can also have similar cost.
+		 *
+		 * TODO: for checking before creation, the below fill_vdi_info()
+		 * isn't required. It must be eliminated.
+		 */
+		return fill_vdi_info(0, SD_NR_VDIS, iocb, info);
 	case SD_RES_FULL_VDI:
 		return ret;
 	case SD_RES_SUCCESS:
@@ -1591,7 +1613,8 @@ int vdi_snapshot(const struct vdi_iocb *iocb, uint32_t *new_vid)
 
 	assert(info.snapid > 0);
 	*new_vid = info.free_bit;
-	ret = notify_vdi_add(*new_vid, iocb->nr_copies, info.vid,
+	ret = notify_vdi_add(*new_vid, iocb->nr_copies,
+			     iocb->cut_relation ? 0 : info.vid,
 			     iocb->copy_policy, iocb->block_size_shift);
 	if (ret != SD_RES_SUCCESS)
 		return ret;
-- 
1.9.1




More information about the sheepdog mailing list