[sheepdog] [PATCH v2 3/4] sheep, dog: fast deep copy for snapshot

Hitoshi Mitake mitake.hitoshi at lab.ntt.co.jp
Mon Jan 19 11:28:53 CET 2015


dog vdi snapshot <vdi> --no-share has a bottleneck: the dog
process. This patch adds a new option --fast-deep-copy to dog vdi
snapshot, which avoids the bottleneck.

If the option is passed to the dog command, the actual copying of data
objects is done by sheep processes. So the dog process isn't a
bottleneck in this case.

Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 dog/vdi.c                | 149 +++++++++++++++++++++++++++++---------
 include/internal_proto.h |   1 +
 include/sheepdog_proto.h |   5 +-
 sheep/ops.c              |  16 +++++
 sheep/sheep_priv.h       |   1 +
 sheep/vdi.c              | 184 +++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 322 insertions(+), 34 deletions(-)

v2:
 - correct detection of deep copy completion

diff --git a/dog/vdi.c b/dog/vdi.c
index 379321d..dee4b92 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -40,6 +40,8 @@ static struct sd_option vdi_options[] = {
 	 "                          neither comparing nor repairing"},
 	{'z', "block_size_shift", true, "specify the bit shift num for"
 			       " data object size"},
+	{'D', "fast-deep-copy", false, "fast deep copy for"
+	 " snapshot with --no-share"},
 	{ 0, NULL, false, NULL },
 };
 
@@ -61,6 +63,7 @@ static struct vdi_cmd_data {
 	uint64_t oid;
 	bool no_share;
 	bool exist;
+	bool fast_deep_copy;
 } vdi_cmd_data = { ~0, };
 
 struct get_vdi_info {
@@ -603,6 +606,36 @@ fail:
 	return NULL;
 }
 
+struct req_fast_deep_copy {
+	struct work work;
+
+	struct sd_node *node;
+	uint32_t src, dst;
+};
+
+static void req_fast_deep_copy_work(struct work *work)
+{
+	struct req_fast_deep_copy *w =
+		container_of(work, struct req_fast_deep_copy, work);
+	struct sd_req hdr;
+	int ret;
+
+	sd_init_req(&hdr, SD_OP_FAST_DEEP_COPY);
+	hdr.fast_deep_copy.src_vid = w->src;
+	hdr.fast_deep_copy.dst_vid = w->dst;
+	ret = dog_exec_req(&w->node->nid, &hdr, NULL);
+	if (ret < 0)
+		sd_err("deep copy failed");
+	/* TODO: error handling */
+}
+
+static void req_fast_deep_copy_done(struct work *work)
+{
+	struct req_fast_deep_copy *w =
+		container_of(work, struct req_fast_deep_copy, work);
+	free(w);
+}
+
 static int vdi_snapshot(int argc, char **argv)
 {
 	const char *vdiname = argv[optind++];
@@ -727,7 +760,7 @@ static int vdi_snapshot(int argc, char **argv)
 
 	new_inode = xmalloc(sizeof(*inode));
 	ret = read_vdi_obj(vdiname, 0, "", &new_vid, new_inode,
-			   SD_INODE_HEADER_SIZE);
+			   sizeof(*inode));
 	if (ret != EXIT_SUCCESS)
 		goto out;
 
@@ -737,47 +770,94 @@ static int vdi_snapshot(int argc, char **argv)
 	 * So we don't have to worry about that clients see working VDI with
 	 * inconsistent data_vdi_id.
 	 */
-	object_size = (UINT32_C(1) << inode->block_size_shift);
-	data_obj_buf = xzalloc(object_size);
-	max_idx = count_data_objs(inode);
+	if (vdi_cmd_data.fast_deep_copy) {
+		struct work_queue *q;
+		struct sd_node *n;
 
-	for (idx = 0; idx < max_idx; idx++) {
-		uint32_t vdi_id;
-		uint64_t oid;
+		q = create_work_queue("deep copy", WQ_DYNAMIC);
 
-		vdi_show_progress(idx * object_size, inode->vdi_size);
+		rb_for_each_entry(n, &sd_nroot, rb) {
+			struct req_fast_deep_copy *w;
 
-		vdi_id = sd_inode_get_vid(inode, idx);
-		if (!vdi_id)
-			continue;
+			w = xzalloc(sizeof(*w));
 
-		oid = vid_to_data_oid(vdi_id, idx);
-		ret = dog_read_object(oid, data_obj_buf, object_size, 0,
-				      true);
-		if (ret) {
-			ret = EXIT_FAILURE;
-			goto out;
+			w->src = vid;
+			w->dst = new_vid;
+			w->node = n;
+
+			w->work.fn = req_fast_deep_copy_work;
+			w->work.done = req_fast_deep_copy_done;
+
+			queue_work(q, &w->work);
 		}
 
-		oid = vid_to_data_oid(new_vid, idx);
-		ret = dog_write_object(oid, 0, data_obj_buf, object_size, 0, 0,
-				       inode->nr_copies,
-				       inode->copy_policy, true, true);
-		if (ret != SD_RES_SUCCESS) {
-			ret = EXIT_FAILURE;
-			goto out;
+		work_queue_wait(q);
+
+		/* fast deep copy completed */
+
+		for (int new_idx = 0; new_idx < SD_INODE_DATA_INDEX;
+		     new_idx++) {
+			if (inode->data_vdi_id[new_idx])
+				new_inode->data_vdi_id[new_idx] = new_vid;
 		}
 
-		sd_inode_set_vid(new_inode, idx, new_vid);
-		ret = sd_inode_write_vid(new_inode, idx, new_vid, new_vid, 0,
-					 false, true);
-		if (ret) {
-			ret = EXIT_FAILURE;
+		ret = dog_write_object(vid_to_vdi_oid(new_vid), 0,
+				       new_inode->data_vdi_id,
+				       SD_INODE_DATA_INDEX *
+				       sizeof(new_inode->data_vdi_id[0]),
+				       offsetof(struct sd_inode,
+						data_vdi_id[0]),
+				       0, new_inode->nr_copies,
+				       new_inode->copy_policy,
+				       false, true);
+		if (ret != SD_RES_SUCCESS) {
+			sd_err("updating inode failed");
 			goto out;
 		}
-	}
+	} else {
+		object_size = (UINT32_C(1) << inode->block_size_shift);
+		data_obj_buf = xzalloc(object_size);
+		max_idx = count_data_objs(inode);
 
-	vdi_show_progress(idx * object_size, inode->vdi_size);
+		for (idx = 0; idx < max_idx; idx++) {
+			uint32_t vdi_id;
+			uint64_t oid;
+
+			vdi_show_progress(idx * object_size, inode->vdi_size);
+
+			vdi_id = sd_inode_get_vid(inode, idx);
+			if (!vdi_id)
+				continue;
+
+			oid = vid_to_data_oid(vdi_id, idx);
+			ret = dog_read_object(oid, data_obj_buf, object_size, 0,
+					      true);
+			if (ret) {
+				ret = EXIT_FAILURE;
+				goto out;
+			}
+
+			oid = vid_to_data_oid(new_vid, idx);
+			ret = dog_write_object(oid, 0, data_obj_buf,
+					       object_size, 0, 0,
+					       inode->nr_copies,
+					       inode->copy_policy, true, true);
+			if (ret != SD_RES_SUCCESS) {
+				ret = EXIT_FAILURE;
+				goto out;
+			}
+
+			sd_inode_set_vid(new_inode, idx, new_vid);
+			ret = sd_inode_write_vid(new_inode, idx, new_vid, new_vid,
+						 0, false, true);
+			if (ret) {
+				ret = EXIT_FAILURE;
+				goto out;
+			}
+		}
+
+		vdi_show_progress(idx * object_size, inode->vdi_size);
+	}
 
 print_result:
 	if (verbose) {
@@ -3118,8 +3198,8 @@ static struct subcommand vdi_cmd[] = {
 	{"create", "<vdiname> <size>", "PycaphrvzT", "create an image",
 	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
 	 vdi_create, vdi_options},
-	{"snapshot", "<vdiname>", "saphrvTn", "create a snapshot",
-	 NULL, CMD_NEED_ARG,
+	{"snapshot", "<vdiname>", "saphrvTnD", "create a snapshot",
+	 NULL, CMD_NEED_ARG|CMD_NEED_NODELIST,
 	 vdi_snapshot, vdi_options},
 	{"clone", "<src vdi> <dst vdi>", "sPnaphrvT", "clone an image",
 	 NULL, CMD_NEED_ARG,
@@ -3275,6 +3355,9 @@ static int vdi_parser(int ch, const char *opt)
 		}
 		vdi_cmd_data.block_size_shift = block_size_shift;
 		break;
+	case 'D':
+		vdi_cmd_data.fast_deep_copy = true;
+		break;
 	}
 
 	return 0;
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 225cc28..6b24b5c 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -115,6 +115,7 @@
 #define SD_OP_SET_RECOVERY      0xCB
 #define SD_OP_SET_VNODES 0xCC
 #define SD_OP_GET_VNODES 0xCD
+#define SD_OP_FAST_DEEP_COPY 0xCE
 
 /* internal flags for hdr.flags, must be above 0x80 */
 #define SD_FLAG_CMD_RECOVERY 0x0080
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index c5d4dde..bd6a768 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -210,7 +210,10 @@ struct sd_req {
 			uint32_t        vid;
 			uint32_t        validate;
 		} inode_coherence;
-
+		struct {
+			uint32_t        src_vid;
+			uint32_t        dst_vid;
+		} fast_deep_copy;
 
 		uint32_t		__pad[8];
 	};
diff --git a/sheep/ops.c b/sheep/ops.c
index 27c80bd..376f9c4 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -1487,6 +1487,16 @@ static int cluster_inode_coherence(const struct sd_req *req,
 			       !!req->inode_coherence.validate, &sender->nid);
 }
 
+static int local_fast_deep_copy(struct request *req)
+{
+	sd_debug("fast deep copy, source VID: %"PRIx32", destination VID: %"
+		 PRIx32, req->rq.fast_deep_copy.src_vid,
+		 req->rq.fast_deep_copy.dst_vid);
+
+	return fast_deep_copy(req->vinfo, req->rq.fast_deep_copy.src_vid,
+			      req->rq.fast_deep_copy.dst_vid);
+}
+
 static int local_get_recovery(struct request *req)
 {
 	struct recovery_throttling rthrottling;
@@ -1955,6 +1965,12 @@ static struct sd_op_template sd_ops[] = {
 		.process_main = local_set_vnodes,
 	},
 
+	[SD_OP_FAST_DEEP_COPY] = {
+		.name = "FAST_DEEP_COPY",
+		.type = SD_OP_TYPE_LOCAL,
+		.process_work = local_fast_deep_copy,
+	},
+
 	/* gateway I/O operations */
 	[SD_OP_CREATE_AND_WRITE_OBJ] = {
 		.name = "CREATE_AND_WRITE_OBJ",
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index a867874..3399a36 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -366,6 +366,7 @@ int inode_coherence_update(uint32_t vid, bool validate,
 			   const struct node_id *sender);
 void remove_node_from_participants(const struct node_id *left);
 void run_vid_gc(uint32_t vid);
+int fast_deep_copy(struct vnode_info *vinfo, uint32_t src, uint32_t dst);
 
 extern int ec_max_data_strip;
 
diff --git a/sheep/vdi.c b/sheep/vdi.c
index bb7fa6a..2889df6 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -2178,3 +2178,187 @@ out:
 	sd_rw_unlock(&vdi_state_lock);
 
 }
+
+struct fast_deep_copy_work {
+	struct work work;
+
+	uint32_t src, dst;
+	int nr_copies, block_size_shift;
+	struct vnode_info *vinfo;
+	int epoch;
+
+	refcnt_t refcnt;
+	eventfd_t finish_fd;
+};
+
+struct copy_single_object_work {
+	struct work work;
+
+	uint64_t src, new;
+	int block_size_shift;
+	int epoch;
+
+	struct fast_deep_copy_work *dcw;
+};
+
+static void copy_single_object_worker(struct work *work)
+{
+	struct copy_single_object_work *w =
+		container_of(work, struct copy_single_object_work, work);
+	char *obj;
+	int obj_size, ret;
+	struct siocb iocb = { 0 };
+
+	sd_debug("copying from %"PRIx64 " to %"PRIx64, w->src, w->new);
+
+	obj_size = 1 << w->block_size_shift;
+	obj = xzalloc(obj_size);
+
+	ret = sd_read_object(w->src, obj, obj_size, 0);
+	if (ret != SD_RES_SUCCESS) {
+		sd_err("failed to read source object: %"PRIx64, w->src);
+		goto out;
+	}
+
+	iocb.epoch = w->epoch;
+	iocb.length = obj_size;
+	iocb.offset = 0;
+	iocb.buf = obj;
+
+	sd_debug("writing new obj: %"PRIx64, w->new);
+	ret = sd_store->create_and_write(w->new, &iocb);
+	if (ret != SD_RES_SUCCESS)
+		sd_err("failed to write object: %"PRIx64, w->new);
+
+out:
+	refcount_dec(&w->dcw->refcnt);
+	if (refcount_read(&w->dcw->refcnt) == 0)
+		eventfd_xwrite(w->dcw->finish_fd, 1);
+
+	free(obj);
+}
+
+static void copy_single_object_done(struct work *work)
+{
+	struct copy_single_object_work *w =
+		container_of(work, struct copy_single_object_work, work);
+	free(w);
+}
+
+static void fast_deep_copy_worker(struct work *work)
+{
+	struct fast_deep_copy_work *w =
+		container_of(work, struct fast_deep_copy_work, work);
+	uint32_t *src_data_vdi_id;
+	int ret;
+
+	src_data_vdi_id = xcalloc(SD_INODE_DATA_INDEX, sizeof(uint32_t));
+
+	ret = sd_read_object(vid_to_vdi_oid(w->src), (char *)src_data_vdi_id,
+			     SD_INODE_DATA_INDEX * sizeof(uint32_t),
+			     offsetof(struct sd_inode, data_vdi_id[0]));
+	if (ret != SD_RES_SUCCESS) {
+		sd_err("failed to read data_vdi_id of source VDI: %"PRIx32,
+		       w->src);
+		goto out;
+	}
+
+	for (int idx = 0; idx < SD_INODE_DATA_INDEX; idx++) {
+		/*
+		 * FIXME: need to calculate refcnt before actual queuing work.
+		 * cleaning is needed...
+		 */
+		uint64_t new_oid;
+		const struct sd_vnode *v;
+		const struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
+
+		if (!src_data_vdi_id[idx])
+			continue;
+
+		new_oid = vid_to_data_oid(w->dst, idx);
+		oid_to_vnodes(new_oid, &w->vinfo->vroot, w->nr_copies,
+			      obj_vnodes);
+		for (int i = 0; i < w->nr_copies; i++) {
+			v = obj_vnodes[i];
+			if (vnode_is_local(v))
+				refcount_inc(&w->refcnt);
+		}
+	}
+
+	sd_debug("a number of objects to copy: %d", refcount_read(&w->refcnt));
+
+	for (int idx = 0; idx < SD_INODE_DATA_INDEX; idx++) {
+		struct copy_single_object_work *single;
+		uint64_t new_oid;
+		const struct sd_vnode *v;
+		const struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
+
+		if (!src_data_vdi_id[idx])
+			continue;
+
+		new_oid = vid_to_data_oid(w->dst, idx);
+		oid_to_vnodes(new_oid, &w->vinfo->vroot, w->nr_copies,
+			      obj_vnodes);
+		for (int i = 0; i < w->nr_copies; i++) {
+			v = obj_vnodes[i];
+			if (vnode_is_local(v))
+				goto do_copy;
+
+		}
+		continue;
+
+do_copy:
+		single = xzalloc(sizeof(*single));
+
+		single->src = vid_to_data_oid(src_data_vdi_id[idx], idx);
+		single->new = new_oid;
+		single->block_size_shift = w->block_size_shift;
+		single->epoch = w->epoch;
+		single->dcw = w;
+
+		single->work.fn = copy_single_object_worker;
+		single->work.done = copy_single_object_done;
+
+		queue_work(sys->io_wqueue, &single->work);
+	}
+
+out:
+	free(src_data_vdi_id);
+}
+
+static void fast_deep_copy_done(struct work *work)
+{
+	struct fast_deep_copy_work *w =
+		container_of(work, struct fast_deep_copy_work, work);
+
+	sd_debug("fast deep copy finished (%"PRIx32" -> %"PRIx32")",
+		 w->src, w->dst);
+	put_vnode_info(w->vinfo);
+	free(w);
+}
+
+worker_fn int fast_deep_copy(struct vnode_info *vinfo,
+			     uint32_t src, uint32_t dst)
+{
+	struct fast_deep_copy_work *w;
+
+	w = xzalloc(sizeof(*w));
+
+	w->src = src;
+	w->dst = dst;
+	w->vinfo = grab_vnode_info(vinfo);
+	w->nr_copies = get_vdi_copy_number(src);
+	w->block_size_shift = get_vdi_block_size_shift(src);
+	w->epoch = get_latest_epoch();
+	w->finish_fd = eventfd(0, EFD_SEMAPHORE);
+
+	w->work.fn = fast_deep_copy_worker;
+	w->work.done = fast_deep_copy_done;
+
+	queue_work(sys->io_wqueue, &w->work);
+
+	eventfd_xread(w->finish_fd);
+	close(w->finish_fd);
+
+	return SD_RES_SUCCESS;
+}
-- 
1.9.1




More information about the sheepdog mailing list