[sheepdog] [PATCH] introduce SD_FLAG_CMD_DIRECT to bypass object cache

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Thu Sep 20 20:54:23 CEST 2012


Currently, collie commands also use the object cache if it is enabled.
This causes problems because sheep doesn't expect other nodes to also
create cache data for the same opened VDIs.  If we allow it, data
inconsistency problems happen when another node updates the cached
objects.

This patch forces collie commands to bypass the object cache to avoid
the problem, but the vdi read and write commands still use the object
cache because they are used to emulate VM I/Os.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 collie/collie.h          |    7 ++++---
 collie/common.c          |   15 ++++++++++-----
 collie/vdi.c             |   36 ++++++++++++++++++------------------
 include/sheepdog_proto.h |    1 +
 sheep/object_cache.c     |   21 +++++++++++++++++++++
 5 files changed, 54 insertions(+), 26 deletions(-)

diff --git a/collie/collie.h b/collie/collie.h
index ba5ebb4..dcf12e4 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -72,9 +72,10 @@ typedef void (*vdi_parser_func_t)(uint32_t vid, char *name, char *tag,
 				  struct sheepdog_inode *i, void *data);
 int parse_vdi(vdi_parser_func_t func, size_t size, void *data);
 int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
-		   uint64_t offset);
-int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data, unsigned int datalen,
-		    uint64_t offset, uint32_t flags, int copies, int create);
+		   uint64_t offset, bool direct);
+int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data,
+		    unsigned int datalen, uint64_t offset, uint32_t flags,
+		    int copies, int create, bool direct);
 int send_light_req(struct sd_req *hdr, const char *host, int port);
 int send_light_req_get_response(struct sd_req *hdr, const char *host, int port);
 
diff --git a/collie/common.c b/collie/common.c
index a3a77aa..fecdbe1 100644
--- a/collie/common.c
+++ b/collie/common.c
@@ -43,7 +43,7 @@ char *size_to_str(uint64_t _size, char *str, int str_size)
 }
 
 int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
-		   uint64_t offset)
+		   uint64_t offset, bool direct)
 {
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
@@ -61,6 +61,8 @@ int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
 
 	hdr.obj.oid = oid;
 	hdr.obj.offset = offset;
+	if (direct)
+		hdr.flags |= SD_FLAG_CMD_DIRECT;
 
 	ret = exec_req(fd, &hdr, data, &wlen, &rlen);
 	close(fd);
@@ -79,8 +81,9 @@ int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
 	return SD_RES_SUCCESS;
 }
 
-int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data, unsigned int datalen,
-		    uint64_t offset, uint32_t flags, int copies, int create)
+int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data,
+		    unsigned int datalen, uint64_t offset, uint32_t flags,
+		    int copies, int create, bool direct)
 {
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
@@ -102,6 +105,8 @@ int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data, unsigned int dat
 	hdr.flags = flags | SD_FLAG_CMD_WRITE;
 	if (cow_oid)
 		hdr.flags |= SD_FLAG_CMD_COW;
+	if (direct)
+		hdr.flags |= SD_FLAG_CMD_DIRECT;
 
 	hdr.obj.copies = copies;
 	hdr.obj.oid = oid;
@@ -166,7 +171,7 @@ int parse_vdi(vdi_parser_func_t func, size_t size, void *data)
 		oid = vid_to_vdi_oid(vc[nr].vid);
 
 		memset(&i, 0, sizeof(i));
-		ret = sd_read_object(oid, &i, SD_INODE_HEADER_SIZE, 0);
+		ret = sd_read_object(oid, &i, SD_INODE_HEADER_SIZE, 0, true);
 		if (ret != SD_RES_SUCCESS) {
 			fprintf(stderr, "Failed to read inode header\n");
 			continue;
@@ -182,7 +187,7 @@ int parse_vdi(vdi_parser_func_t func, size_t size, void *data)
 				rlen = size - SD_INODE_HEADER_SIZE;
 
 			ret = sd_read_object(oid, ((char *)&i) + SD_INODE_HEADER_SIZE,
-					     rlen, SD_INODE_HEADER_SIZE);
+					     rlen, SD_INODE_HEADER_SIZE, true);
 
 			if (ret != SD_RES_SUCCESS) {
 				fprintf(stderr, "Failed to read inode\n");
diff --git a/collie/vdi.c b/collie/vdi.c
index a80ce79..1b8ec4c 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -430,7 +430,7 @@ static int read_vdi_obj(char *vdiname, int snapid, const char *tag,
 		return EXIT_FAILURE;
 	}
 
-	ret = sd_read_object(vid_to_vdi_oid(vid), inode, size, 0);
+	ret = sd_read_object(vid_to_vdi_oid(vid), inode, size, 0, true);
 	if (ret != SD_RES_SUCCESS) {
 		if (snapid) {
 			fprintf(stderr, "Failed to read a snapshot %s:%d\n",
@@ -534,7 +534,7 @@ static int vdi_create(int argc, char **argv)
 		goto out;
 	}
 
-	ret = sd_read_object(vid_to_vdi_oid(vid), inode, sizeof(*inode), 0);
+	ret = sd_read_object(vid_to_vdi_oid(vid), inode, sizeof(*inode), 0, true);
 	if (ret != SD_RES_SUCCESS) {
 		fprintf(stderr, "Failed to read a newly created VDI object\n");
 		ret = EXIT_FAILURE;
@@ -546,7 +546,7 @@ static int vdi_create(int argc, char **argv)
 		oid = vid_to_data_oid(vid, idx);
 
 		ret = sd_write_object(oid, 0, buf, SD_DATA_OBJ_SIZE, 0, 0,
-				      inode->nr_copies, 1);
+				      inode->nr_copies, 1, true);
 		if (ret != SD_RES_SUCCESS) {
 			ret = EXIT_FAILURE;
 			goto out;
@@ -555,7 +555,7 @@ static int vdi_create(int argc, char **argv)
 		inode->data_vdi_id[idx] = vid;
 		ret = sd_write_object(vid_to_vdi_oid(vid), 0, &vid, sizeof(vid),
 				      SD_INODE_HEADER_SIZE + sizeof(vid) * idx, 0,
-				      inode->nr_copies, 0);
+				      inode->nr_copies, 0, true);
 		if (ret) {
 			ret = EXIT_FAILURE;
 			goto out;
@@ -590,7 +590,7 @@ static int vdi_snapshot(int argc, char **argv)
 		ret = sd_write_object(vid_to_vdi_oid(vid), 0, vdi_cmd_data.snapshot_tag,
 				      SD_MAX_VDI_TAG_LEN,
 				      offsetof(struct sheepdog_inode, tag),
-				      0, inode->nr_copies, 0);
+				      0, inode->nr_copies, 0, true);
 	}
 
 	return do_vdi_create(vdiname, inode->vdi_size, vid, NULL, 1,
@@ -650,7 +650,7 @@ static int vdi_clone(int argc, char **argv)
 	for (idx = 0; idx < max_idx; idx++) {
 		if (inode->data_vdi_id[idx]) {
 			oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
-			ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0);
+			ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0, true);
 			if (ret) {
 				ret = EXIT_FAILURE;
 				goto out;
@@ -660,7 +660,7 @@ static int vdi_clone(int argc, char **argv)
 
 		oid = vid_to_data_oid(new_vid, idx);
 		ret = sd_write_object(oid, 0, buf, SD_DATA_OBJ_SIZE, 0, 0,
-				      inode->nr_copies, 1);
+				      inode->nr_copies, 1, true);
 		if (ret != SD_RES_SUCCESS) {
 			ret = EXIT_FAILURE;
 			goto out;
@@ -668,7 +668,7 @@ static int vdi_clone(int argc, char **argv)
 
 		ret = sd_write_object(vid_to_vdi_oid(new_vid), 0, &new_vid, sizeof(new_vid),
 				      SD_INODE_HEADER_SIZE + sizeof(new_vid) * idx, 0,
-				      inode->nr_copies, 0);
+				      inode->nr_copies, 0, true);
 		if (ret) {
 			ret = EXIT_FAILURE;
 			goto out;
@@ -713,7 +713,7 @@ static int vdi_resize(int argc, char **argv)
 	inode->vdi_size = new_size;
 
 	ret = sd_write_object(vid_to_vdi_oid(vid), 0, inode, SD_INODE_HEADER_SIZE, 0,
-			      0, inode->nr_copies, 0);
+			      0, inode->nr_copies, 0, true);
 	if (ret != SD_RES_SUCCESS) {
 		fprintf(stderr, "Failed to update an inode header\n");
 		return EXIT_FAILURE;
@@ -1138,7 +1138,7 @@ static int vdi_getattr(int argc, char **argv)
 
 	oid = attr_oid;
 
-	ret = sd_read_object(oid, &vattr, SD_ATTR_OBJ_SIZE, 0);
+	ret = sd_read_object(oid, &vattr, SD_ATTR_OBJ_SIZE, 0, true);
 	if (ret != SD_RES_SUCCESS) {
 		fprintf(stderr, "Failed to read attribute oid: %s\n",
 			sd_strerror(ret));
@@ -1202,7 +1202,7 @@ static int vdi_read(int argc, char **argv)
 
 		if (inode->data_vdi_id[idx]) {
 			oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
-			ret = sd_read_object(oid, buf, len, offset);
+			ret = sd_read_object(oid, buf, len, offset, false);
 			if (ret != SD_RES_SUCCESS) {
 				fprintf(stderr, "Failed to read VDI\n");
 				ret = EXIT_FAILURE;
@@ -1323,7 +1323,7 @@ static int vdi_write(int argc, char **argv)
 		inode->data_vdi_id[idx] = inode->vdi_id;
 		oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
 		ret = sd_write_object(oid, old_oid, buf, len, offset, flags,
-				      inode->nr_copies, create);
+				      inode->nr_copies, create, false);
 		if (ret != SD_RES_SUCCESS) {
 			fprintf(stderr, "Failed to write VDI\n");
 			ret = EXIT_FAILURE;
@@ -1333,7 +1333,7 @@ static int vdi_write(int argc, char **argv)
 		if (create) {
 			ret = sd_write_object(vid_to_vdi_oid(vid), 0, &vid, sizeof(vid),
 					      SD_INODE_HEADER_SIZE + sizeof(vid) * idx,
-					      flags, inode->nr_copies, 0);
+					      flags, inode->nr_copies, 0, false);
 			if (ret) {
 				ret = EXIT_FAILURE;
 				goto out;
@@ -1583,7 +1583,7 @@ static int get_obj_backup(int idx, uint32_t from_vid, uint32_t to_vid,
 
 	if (to_vid) {
 		ret = sd_read_object(vid_to_data_oid(to_vid, idx), backup->data,
-				     SD_DATA_OBJ_SIZE, 0);
+				     SD_DATA_OBJ_SIZE, 0, true);
 		if (ret != SD_RES_SUCCESS) {
 			fprintf(stderr, "Failed to read object %"PRIx32", %d\n",
 				to_vid, idx);
@@ -1594,7 +1594,7 @@ static int get_obj_backup(int idx, uint32_t from_vid, uint32_t to_vid,
 
 	if (from_vid) {
 		ret = sd_read_object(vid_to_data_oid(from_vid, idx), from_data,
-				     SD_DATA_OBJ_SIZE, 0);
+				     SD_DATA_OBJ_SIZE, 0, true);
 		if (ret != SD_RES_SUCCESS) {
 			fprintf(stderr, "Failed to read object %"PRIx32", %d\n",
 				from_vid, idx);
@@ -1714,13 +1714,13 @@ static int restore_obj(struct obj_backup *backup, uint32_t vid,
 	/* send a copy-on-write request */
 	ret = sd_write_object(vid_to_data_oid(vid, backup->idx), parent_oid,
 			      backup->data, backup->length, backup->offset,
-			      0, parent_inode->nr_copies, 1);
+			      0, parent_inode->nr_copies, 1, true);
 	if (ret != SD_RES_SUCCESS)
 		return ret;
 
 	return sd_write_object(vid_to_vdi_oid(vid), 0, &vid, sizeof(vid),
 			       SD_INODE_HEADER_SIZE + sizeof(vid) * backup->idx,
-			       0, parent_inode->nr_copies, 0);
+			       0, parent_inode->nr_copies, 0, true);
 }
 
 static uint32_t do_restore(char *vdiname, int snapid, const char *tag)
@@ -1815,7 +1815,7 @@ static int vdi_restore(int argc, char **argv)
 		goto out;
 
 	ret = sd_read_object(vid_to_vdi_oid(current_inode->parent_vdi_id),
-			     parent_inode, SD_INODE_HEADER_SIZE, 0);
+			     parent_inode, SD_INODE_HEADER_SIZE, 0, true);
 	if (ret != SD_RES_SUCCESS) {
 		printf("error\n");
 		goto out;
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 75c6126..e887ca1 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -34,6 +34,7 @@
 #define SD_FLAG_CMD_WRITE    0x01
 #define SD_FLAG_CMD_COW      0x02
 #define SD_FLAG_CMD_CACHE    0x04
+#define SD_FLAG_CMD_DIRECT   0x08 /* don't use object cache */
 /* flags above 0x80 are sheepdog-internal */
 
 #define SD_RES_SUCCESS       0x00 /* Success */
diff --git a/sheep/object_cache.c b/sheep/object_cache.c
index 79c56cb..109b56e 100644
--- a/sheep/object_cache.c
+++ b/sheep/object_cache.c
@@ -940,6 +940,27 @@ int bypass_object_cache(struct request *req)
 {
 	uint64_t oid = req->rq.obj.oid;
 
+	if (req->rq.flags & SD_FLAG_CMD_DIRECT) {
+		uint32_t vid = oid_to_vid(oid);
+		struct object_cache *cache;
+
+		cache = find_object_cache(vid, 0);
+		if (!cache)
+			return 1;
+		if (req->rq.flags & SD_FLAG_CMD_WRITE) {
+			object_cache_flush_and_delete(cache);
+			return 1;
+		} else  {
+			/* For read request, we can read cache if any */
+			uint32_t idx = object_cache_oid_to_idx(oid);
+
+			if (object_cache_lookup(cache, idx, 0, false) == 0)
+				return 0;
+			else
+				return 1;
+		}
+	}
+
 	/*
 	 * For vmstate && vdi_attr object, we don't do caching
 	 */
-- 
1.7.2.5




More information about the sheepdog mailing list