[sheepdog] [PATCH v8 09/19] sheep, dog: prevent COW during snapshot creation

Thu May 15 17:22:36 CEST 2014

The current generational reference counting scheme for garbage
collection of snapshot objects requires mutual exclusion between COW
operations and snapshot creation, because both COW operations and
snapshot creation modify the generational reference status. This patch
adds operations for temporarily preventing and re-allowing COW.

This change introduces the limitation that the "dog vdi snapshot"
command and the QEMU process which uses the snapshotted VDI must connect
to the same sheep process. However, this limitation can be removed by a
facility for detecting "which QEMU (node) is interested in this VDI"
(such a feature is also useful for VDI locking and iSCSI multipath), so
it can be resolved in the near future.

Cc: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
Cc: Valerio Pachera <sirio81 at gmail.com>
Cc: Alessandro Bolgia <alessandro at extensys.it>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 dog/vdi.c                | 15 +++++++++++++++
 include/internal_proto.h |  2 ++
 sheep/gateway.c          | 14 --------------
 sheep/group.c            |  3 +++
 sheep/ops.c              | 50 ++++++++++++++++++++++++++++++++++++++++++++++++
 sheep/request.c          | 37 ++++++++++++++++++++++++++++++++---
 sheep/sheep_priv.h       | 23 ++++++++++++++++++++++
 7 files changed, 127 insertions(+), 17 deletions(-)

diff --git a/dog/vdi.c b/dog/vdi.c
index 4d7fd54..16630ac 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -495,6 +495,7 @@ static int vdi_snapshot(int argc, char **argv)
 	int ret;
 	char buf[SD_INODE_HEADER_SIZE];
 	struct sd_inode *inode = (struct sd_inode *)buf;
+	struct sd_req hdr;
 
 	if (vdi_cmd_data.snapshot_id != 0) {
 		sd_err("Please specify a non-integer value for "
@@ -511,6 +512,13 @@ static int vdi_snapshot(int argc, char **argv)
 		return EXIT_FAILURE;
 	}
 
+	sd_init_req(&hdr, SD_OP_PREVENT_COW);
+	ret = dog_exec_req(&sd_nid, &hdr, NULL);
+	if (ret < 0) {
+		sd_err("preventing COW failed");
+		return EXIT_FAILURE;
+	}
+
 	ret = dog_write_object(vid_to_vdi_oid(vid), 0,
 			       vdi_cmd_data.snapshot_tag,
 			       SD_MAX_VDI_TAG_LEN,
@@ -532,6 +540,13 @@ static int vdi_snapshot(int argc, char **argv)
 			       " VDI ID of newly created snapshot: %x\n", new_vid, vid);
 	}
 
+	sd_init_req(&hdr, SD_OP_ALLOW_COW);
+	ret = dog_exec_req(&sd_nid, &hdr, NULL);
+	if (ret < 0) {
+		sd_err("allowing COW failed");
+		return EXIT_FAILURE;
+	}
+
 	return ret;
 }
 
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 0fce57a..d59d788 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -103,6 +103,8 @@
 #define SD_OP_CLUSTER_INFO	0xBE
 #define SD_OP_DECREF_OBJ     0xBF
 #define SD_OP_DECREF_PEER    0xC0
+#define SD_OP_PREVENT_COW    0xC1
+#define SD_OP_ALLOW_COW      0xC2
 
 /* internal flags for hdr.flags, must be above 0x80 */
 #define SD_FLAG_CMD_RECOVERY 0x0080
diff --git a/sheep/gateway.c b/sheep/gateway.c
index c787ec1..8868bce 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -613,20 +613,6 @@ static int update_obj_refcnt(const struct sd_req *hdr, uint32_t *vids,
 			       false);
 }
 
-/*
- * return true if the request updates a data_vdi_id field of a vdi object
- *
- * XXX: we assume that VMs don't update the inode header and the data_vdi_id
- * field at the same time.
- */
-static bool is_data_vid_update(const struct sd_req *hdr)
-{
-	return is_vdi_obj(hdr->obj.oid) &&
-		data_vid_offset(0) <= hdr->obj.offset &&
-		hdr->obj.offset + hdr->data_length <=
-			data_vid_offset(SD_INODE_DATA_INDEX);
-}
-
 int gateway_read_obj(struct request *req)
 {
 	uint64_t oid = req->rq.obj.oid;
diff --git a/sheep/group.c b/sheep/group.c
index 4114dfb..aaf4a1f 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -1050,6 +1050,9 @@ int create_cluster(int port, int64_t zone, int nr_vnodes,
 	if (ret != 0)
 		return -1;
 
+	INIT_LIST_HEAD(&sys->prevented_cow_request_queue);
+	INIT_LIST_HEAD(&sys->pending_prevent_cow_request_queue);
+
 	return 0;
 }
 
diff --git a/sheep/ops.c b/sheep/ops.c
index 040c204..790e72d 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -1154,6 +1154,44 @@ out:
 	return ret;
 }
 
+static int local_prevent_cow(const struct sd_req *req, struct sd_rsp *rsp,
+			     void *data)
+{
+	/* FIXME: change type of process_main() */
+	struct request *rq = container_of(req, struct request, rq);
+
+	sd_debug("preventing COW request, ongoing COW requests: %d",
+		 sys->nr_ongoing_cow_request);
+
+	sys->prevent_cow = true;
+
+	if (sys->nr_ongoing_cow_request) {
+		list_add_tail(&rq->pending_prevent_cow_request_list,
+			      &sys->pending_prevent_cow_request_queue);
+		get_request(rq);
+	}
+
+	return SD_RES_SUCCESS;
+}
+
+static int local_allow_cow(const struct sd_req *req, struct sd_rsp *rsp,
+			   void *data)
+{
+	struct request *rq;
+
+	sd_debug("allowing COW request");
+
+	sys->prevent_cow = false;
+
+	list_for_each_entry(rq, &sys->prevented_cow_request_queue,
+			    prevented_cow_request_list) {
+		list_del(&rq->prevented_cow_request_list);
+		requeue_request(rq);
+	}
+
+	return SD_RES_SUCCESS;
+}
+
 static struct sd_op_template sd_ops[] = {
 
 	/* cluster operations */
@@ -1479,6 +1517,18 @@ static struct sd_op_template sd_ops[] = {
 	},
 #endif
 
+	[SD_OP_PREVENT_COW] = {
+		.name = "PREVENT_COW",
+		.type = SD_OP_TYPE_LOCAL,
+		.process_main = local_prevent_cow,
+	},
+
+	[SD_OP_ALLOW_COW] = {
+		.name = "ALLOW_COW",
+		.type = SD_OP_TYPE_LOCAL,
+		.process_main = local_allow_cow,
+	},
+
 	/* gateway I/O operations */
 	[SD_OP_CREATE_AND_WRITE_OBJ] = {
 		.name = "CREATE_AND_WRITE_OBJ",
diff --git a/sheep/request.c b/sheep/request.c
index 6972f10..f5150d5 100644
--- a/sheep/request.c
+++ b/sheep/request.c
@@ -13,8 +13,6 @@
 
 #include "sheep_priv.h"
 
-static void requeue_request(struct request *req);
-
 static void del_requeue_request(struct request *req)
 {
 	list_del(&req->request_list);
@@ -92,6 +90,24 @@ static void gateway_op_done(struct work *work)
 	struct request *req = container_of(work, struct request, work);
 	struct sd_req *hdr = &req->rq;
 
+	if (hdr->opcode == SD_OP_WRITE_OBJ && is_data_vid_update(hdr)) {
+		struct request *rq;
+
+		sys->nr_ongoing_cow_request--;
+		assert(0 <= sys->nr_ongoing_cow_request);
+		sd_debug("a number of ongoing cow request: %d",
+			 sys->nr_ongoing_cow_request);
+
+		if (!sys->nr_ongoing_cow_request) {
+			list_for_each_entry(rq,
+				    &sys->pending_prevent_cow_request_queue,
+				    pending_prevent_cow_request_list) {
+				list_del(&rq->pending_prevent_cow_request_list);
+				put_request(rq);
+			}
+		}
+	}
+
 	switch (req->rp.result) {
 	case SD_RES_OLD_NODE_VER:
 		if (req->rp.epoch > sys->cinfo.epoch) {
@@ -336,6 +352,20 @@ queue_work:
 		goto end_request;
 	}
 
+	if (req->rq.opcode == SD_OP_WRITE_OBJ && is_data_vid_update(&req->rq)) {
+		if (sys->prevent_cow) {
+			sd_debug("preventing COW");
+			list_add_tail(&req->prevented_cow_request_list,
+				      &sys->prevented_cow_request_queue);
+			return;
+		} else {
+			assert(0 <= sys->nr_ongoing_cow_request);
+			sys->nr_ongoing_cow_request++;
+			sd_debug("a number of ongoing cow request: %d",
+				 sys->nr_ongoing_cow_request);
+		}
+	}
+
 	req->work.fn = do_process_work;
 	req->work.done = gateway_op_done;
 	queue_work(sys->gateway_wqueue, &req->work);
@@ -497,7 +527,7 @@ done:
 	put_request(req);
 }
 
-static void requeue_request(struct request *req)
+void requeue_request(struct request *req)
 {
 	if (req->vinfo) {
 		put_vnode_info(req->vinfo);
@@ -836,6 +866,7 @@ static void tx_main(struct work *work)
 			 ci->conn.ipstr,
 			 ci->conn.port);
 	}
+
 	free_request(ci->tx_req);
 	ci->tx_req = NULL;
 
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 9c08b0c..52ac5d6 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -114,6 +114,9 @@ struct request {
 	struct work work;
 	enum REQUST_STATUS status;
 	bool stat; /* true if this request is during stat */
+
+	struct list_node prevented_cow_request_list;
+	struct list_node pending_prevent_cow_request_list;
 };
 
 struct system_info {
@@ -162,6 +165,11 @@ struct system_info {
 	/* upgrade data layout before starting service if necessary*/
 	bool upgrade;
 	struct sd_stat stat;
+
+	bool prevent_cow;
+	int nr_ongoing_cow_request;
+	struct list_head prevented_cow_request_queue;
+	struct list_head pending_prevent_cow_request_queue;
 };
 
 struct siocb {
@@ -399,6 +407,7 @@ void objlist_cache_remove(uint64_t oid);
 
 void put_request(struct request *req);
 void get_request(struct request *req);
+void requeue_request(struct request *req);
 
 int sheep_bnode_writer(uint64_t oid, void *mem, unsigned int len,
 		       uint64_t offset, uint32_t flags, int copies,
@@ -456,6 +465,20 @@ int gateway_decref_object(struct request *req);
 bool is_erasure_oid(uint64_t oid);
 uint8_t local_ec_index(struct vnode_info *vinfo, uint64_t oid);
 
+/*
+ * return true if the request updates a data_vdi_id field of a vdi object
+ *
+ * XXX: we assume that VMs don't update the inode header and the data_vdi_id
+ * field at the same time.
+ */
+static inline bool is_data_vid_update(const struct sd_req *hdr)
+{
+	return is_vdi_obj(hdr->obj.oid) &&
+		data_vid_offset(0) <= hdr->obj.offset &&
+		hdr->obj.offset + hdr->data_length <=
+			data_vid_offset(SD_INODE_DATA_INDEX);
+}
+
 /* object_cache */
 
 void object_cache_format(void);
-- 
1.9.1