[sheepdog] [PATCH] tests: add a test to test vdi snapshot functionality concurrently

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Thu Sep 20 01:40:35 CEST 2012


At Thu, 20 Sep 2012 01:52:41 +0900,
MORITA Kazutaka wrote:
> 
> At Wed, 19 Sep 2012 20:31:17 +0800,
> Liu Yuan wrote:
> > 
> > On 09/19/2012 07:29 PM, MORITA Kazutaka wrote:
> > > I'm still testing and trying to understand what is the problem, but
> > > looks working almost correctly, at least for now.  The only problem I
> > > encountered is that sheep pulled deleted vdi objects to the local
> > > cache directory and this is the reason why the master branch doesn't
> > > pass tests/044.  However, it is not a fatal problem, I think, because
> > > sheep just only caches the info that the object is deleted.
> > > 
> > > When object cache is enabled, collie accesses object with a
> > > writethrough mode - is it a really problem?  What is the scenario
> > > where data inconsistency actually happens?
> > 
> > Yes, I do meet the inconsistency problem. For e.g, on node A you issue collie
> > command to pull the VDI object to the local node. Then there is a gateway request
> > that is to modify the VDI from node B. This request won't access the cached object VDI
> > at all on node A. After this request, cached object VDI is older than the copies in
> > the backend and we'll read the older VDI on node A.
> 
> Well, I guess it's a good time to fix this kind of coherence problem
> of object cache.  I think the root cause here is that sheep doesn't
> release object cache even after the same VM is opened on another node.
> Does below patch solve this problem?  IIUC, this patch fixes the
> inconsistency problem even if collie/qemu uses a writeback mode.

On second thought, the patch doesn't prevent 'vdi list' and 'node
info' from reading the old data.

But I still wonder if we should rollback to the old cache semantics
since it looks a bit confusing to me.  The below patch introduces a
new flag SD_FLAG_CMD_CACHE to skip object cache.  With this patch, all
collie commands other than 'vdi read' will try to read objects
directly from the backing store.  Doesn't it work as you expect?


diff --git a/collie/collie.h b/collie/collie.h
index ba5ebb4..6c042f0 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -72,7 +72,7 @@ typedef void (*vdi_parser_func_t)(uint32_t vid, char *name, char *tag,
 				  struct sheepdog_inode *i, void *data);
 int parse_vdi(vdi_parser_func_t func, size_t size, void *data);
 int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
-		   uint64_t offset);
+		   uint64_t offset, bool direct);
 int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data, unsigned int datalen,
 		    uint64_t offset, uint32_t flags, int copies, int create);
 int send_light_req(struct sd_req *hdr, const char *host, int port);
diff --git a/collie/common.c b/collie/common.c
index a3a77aa..96b68b1 100644
--- a/collie/common.c
+++ b/collie/common.c
@@ -43,7 +43,7 @@ char *size_to_str(uint64_t _size, char *str, int str_size)
 }
 
 int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
-		   uint64_t offset)
+		   uint64_t offset, bool direct)
 {
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
@@ -61,6 +61,8 @@ int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
 
 	hdr.obj.oid = oid;
 	hdr.obj.offset = offset;
+	if (direct)
+		hdr.flags = SD_FLAG_CMD_DIRECT;
 
 	ret = exec_req(fd, &hdr, data, &wlen, &rlen);
 	close(fd);
@@ -166,7 +168,7 @@ int parse_vdi(vdi_parser_func_t func, size_t size, void *data)
 		oid = vid_to_vdi_oid(vc[nr].vid);
 
 		memset(&i, 0, sizeof(i));
-		ret = sd_read_object(oid, &i, SD_INODE_HEADER_SIZE, 0);
+		ret = sd_read_object(oid, &i, SD_INODE_HEADER_SIZE, 0, true);
 		if (ret != SD_RES_SUCCESS) {
 			fprintf(stderr, "Failed to read inode header\n");
 			continue;
@@ -182,7 +184,7 @@ int parse_vdi(vdi_parser_func_t func, size_t size, void *data)
 				rlen = size - SD_INODE_HEADER_SIZE;
 
 			ret = sd_read_object(oid, ((char *)&i) + SD_INODE_HEADER_SIZE,
-					     rlen, SD_INODE_HEADER_SIZE);
+					     rlen, SD_INODE_HEADER_SIZE, true);
 
 			if (ret != SD_RES_SUCCESS) {
 				fprintf(stderr, "Failed to read inode\n");
diff --git a/collie/vdi.c b/collie/vdi.c
index a80ce79..756691a 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -430,7 +430,7 @@ static int read_vdi_obj(char *vdiname, int snapid, const char *tag,
 		return EXIT_FAILURE;
 	}
 
-	ret = sd_read_object(vid_to_vdi_oid(vid), inode, size, 0);
+	ret = sd_read_object(vid_to_vdi_oid(vid), inode, size, 0, true);
 	if (ret != SD_RES_SUCCESS) {
 		if (snapid) {
 			fprintf(stderr, "Failed to read a snapshot %s:%d\n",
@@ -534,7 +534,7 @@ static int vdi_create(int argc, char **argv)
 		goto out;
 	}
 
-	ret = sd_read_object(vid_to_vdi_oid(vid), inode, sizeof(*inode), 0);
+	ret = sd_read_object(vid_to_vdi_oid(vid), inode, sizeof(*inode), 0, true);
 	if (ret != SD_RES_SUCCESS) {
 		fprintf(stderr, "Failed to read a newly created VDI object\n");
 		ret = EXIT_FAILURE;
@@ -650,7 +650,7 @@ static int vdi_clone(int argc, char **argv)
 	for (idx = 0; idx < max_idx; idx++) {
 		if (inode->data_vdi_id[idx]) {
 			oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
-			ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0);
+			ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0, true);
 			if (ret) {
 				ret = EXIT_FAILURE;
 				goto out;
@@ -1138,7 +1138,7 @@ static int vdi_getattr(int argc, char **argv)
 
 	oid = attr_oid;
 
-	ret = sd_read_object(oid, &vattr, SD_ATTR_OBJ_SIZE, 0);
+	ret = sd_read_object(oid, &vattr, SD_ATTR_OBJ_SIZE, 0, true);
 	if (ret != SD_RES_SUCCESS) {
 		fprintf(stderr, "Failed to read attribute oid: %s\n",
 			sd_strerror(ret));
@@ -1202,7 +1202,7 @@ static int vdi_read(int argc, char **argv)
 
 		if (inode->data_vdi_id[idx]) {
 			oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
-			ret = sd_read_object(oid, buf, len, offset);
+			ret = sd_read_object(oid, buf, len, offset, false);
 			if (ret != SD_RES_SUCCESS) {
 				fprintf(stderr, "Failed to read VDI\n");
 				ret = EXIT_FAILURE;
@@ -1583,7 +1583,7 @@ static int get_obj_backup(int idx, uint32_t from_vid, uint32_t to_vid,
 
 	if (to_vid) {
 		ret = sd_read_object(vid_to_data_oid(to_vid, idx), backup->data,
-				     SD_DATA_OBJ_SIZE, 0);
+				     SD_DATA_OBJ_SIZE, 0, true);
 		if (ret != SD_RES_SUCCESS) {
 			fprintf(stderr, "Failed to read object %"PRIx32", %d\n",
 				to_vid, idx);
@@ -1594,7 +1594,7 @@ static int get_obj_backup(int idx, uint32_t from_vid, uint32_t to_vid,
 
 	if (from_vid) {
 		ret = sd_read_object(vid_to_data_oid(from_vid, idx), from_data,
-				     SD_DATA_OBJ_SIZE, 0);
+				     SD_DATA_OBJ_SIZE, 0, true);
 		if (ret != SD_RES_SUCCESS) {
 			fprintf(stderr, "Failed to read object %"PRIx32", %d\n",
 				from_vid, idx);
@@ -1815,7 +1815,7 @@ static int vdi_restore(int argc, char **argv)
 		goto out;
 
 	ret = sd_read_object(vid_to_vdi_oid(current_inode->parent_vdi_id),
-			     parent_inode, SD_INODE_HEADER_SIZE, 0);
+			     parent_inode, SD_INODE_HEADER_SIZE, 0, true);
 	if (ret != SD_RES_SUCCESS) {
 		printf("error\n");
 		goto out;
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 75c6126..e887ca1 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -34,6 +34,7 @@
 #define SD_FLAG_CMD_WRITE    0x01
 #define SD_FLAG_CMD_COW      0x02
 #define SD_FLAG_CMD_CACHE    0x04
+#define SD_FLAG_CMD_DIRECT   0x08 /* don't use object cache */
 /* flags above 0x80 are sheepdog-internal */
 
 #define SD_RES_SUCCESS       0x00 /* Success */
diff --git a/sheep/object_cache.c b/sheep/object_cache.c
index 79c56cb..85da214 100644
--- a/sheep/object_cache.c
+++ b/sheep/object_cache.c
@@ -940,6 +940,26 @@ int bypass_object_cache(struct request *req)
 {
 	uint64_t oid = req->rq.obj.oid;
 
+	if (req->rq.flags & SD_FLAG_CMD_DIRECT) {
+		uint32_t vid = oid_to_vid(oid);
+		struct object_cache *cache;
+
+		cache = find_object_cache(vid, 0);
+		if (!cache)
+			return 1;
+		if (req->rq.flags & SD_FLAG_CMD_WRITE) {
+			object_cache_flush_and_delete(cache);
+			return 1;
+		} else  {
+			/* For read requet, we can read cache if any */
+			uint32_t idx = object_cache_oid_to_idx(oid);
+
+			if (object_cache_lookup(cache, idx, 0, false) == 0)
+				return 0;
+			else
+				return 1;
+		}
+	}
 	/*
 	 * For vmstate && vdi_attr object, we don't do caching
 	 */




More information about the sheepdog mailing list