[sheepdog] [PATCH] dog: vdi checking for lost inode objects

Hitoshi Mitake mitake.hitoshi at gmail.com
Wed Nov 20 14:54:02 CET 2013


Currently, "dog vdi check" does not work correctly when an inode object
is lost. This patch makes the subcommand handle that case correctly.

Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 dog/common.c             |  12 +++--
 dog/dog.h                |   2 +
 dog/vdi.c                | 114 +++++++++++++++++++++++++++++++++++++++++++++--
 tests/functional/077     |  15 +++++++
 tests/functional/077.out |   2 +
 5 files changed, 139 insertions(+), 6 deletions(-)

diff --git a/dog/common.c b/dog/common.c
index 59d38dc..649f303 100644
--- a/dog/common.c
+++ b/dog/common.c
@@ -55,8 +55,8 @@ char *strnumber(uint64_t size)
 	return strnumber_raw(size, raw_output);
 }
 
-int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
-		   uint64_t offset, bool direct)
+int do_sd_read_object(struct node_id *nid, uint64_t oid, void *data,
+		   unsigned int datalen, uint64_t offset, bool direct)
 {
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
@@ -70,7 +70,7 @@ int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
 	if (direct)
 		hdr.flags |= SD_FLAG_CMD_DIRECT;
 
-	ret = dog_exec_req(&sd_nid, &hdr, data);
+	ret = dog_exec_req(nid, &hdr, data);
 	if (ret < 0) {
 		sd_err("Failed to read object %" PRIx64, oid);
 		return SD_RES_EIO;
@@ -85,6 +85,12 @@ int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
 	return SD_RES_SUCCESS;
 }
 
+int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
+		   uint64_t offset, bool direct)
+{	/* do_sd_read_object() with nid fixed to the global sd_nid */
+	return do_sd_read_object(&sd_nid, oid, data, datalen, offset, direct);
+}
+
 int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data,
 		    unsigned int datalen, uint64_t offset, uint32_t flags,
 		    uint8_t copies, uint8_t copy_policy, bool create,
diff --git a/dog/dog.h b/dog/dog.h
index af3cefa..f46297b 100644
--- a/dog/dog.h
+++ b/dog/dog.h
@@ -71,6 +71,8 @@ typedef void (*vdi_parser_func_t)(uint32_t vid, const char *name,
 				  uint32_t flags,
 				  const struct sd_inode *i, void *data);
 int parse_vdi(vdi_parser_func_t func, size_t size, void *data);
+int do_sd_read_object(struct node_id *nid, uint64_t oid, void *data,
+		      unsigned int datalen, uint64_t offset, bool direct);
 int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
 		   uint64_t offset, bool direct);
 int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data,
diff --git a/dog/vdi.c b/dog/vdi.c
index bfee11f..909de35 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -532,6 +532,114 @@ static int read_vdi_obj(const char *vdiname, int snapid, const char *tag,
 	return EXIT_SUCCESS;
 }
 
+/*
+ * Read 'size' bytes of a VDI's inode object into 'inode' from a node whose
+ * copy has the digest shared by a strict majority of the existing copies, and
+ * store the VDI id in '*pvid' if non-NULL.  Returns EXIT_SUCCESS/EXIT_FAILURE.
+ * FIXME: we should handle a case of inode->nr_copies != SD_DEFAULT_COPIES
+ */
+static int read_majority_vdi_obj(const char *vdiname, int snapid,
+				 const char *tag, uint32_t *pvid,
+				 struct sd_inode *inode, size_t size)
+{
+	int ret;
+	uint32_t vid;
+	uint64_t oid;
+	const struct sd_vnode *vnodes[SD_MAX_COPIES];
+	struct {
+		bool found;
+		uint8_t digest[20];
+	} results[SD_DEFAULT_COPIES];
+	int count = 0, nr_live_copies = 0, nr_votes = 0, majority_idx = -1;
+
+	ret = find_vdi_name(vdiname, snapid, tag, &vid, 0);
+	if (ret < 0) {
+		sd_err("Failed to open VDI %s", vdiname);
+		return EXIT_FAILURE;
+	}
+
+	oid = vid_to_vdi_oid(vid);
+	oid_to_vnodes(oid, &sd_vroot, SD_DEFAULT_COPIES, vnodes);
+
+	for (int i = 0; i < SD_DEFAULT_COPIES; i++) {
+		struct sd_req hdr;
+		struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+
+		sd_init_req(&hdr, SD_OP_GET_HASH);
+		hdr.obj.oid = oid;
+		hdr.obj.tgt_epoch = sd_epoch;
+		ret = dog_exec_req(&vnodes[i]->node->nid, &hdr, NULL);
+		if (ret < 0)
+			exit(EXIT_SYSFAIL);
+
+		switch (rsp->result) {
+		case SD_RES_SUCCESS:
+			results[i].found = true;
+			memcpy(results[i].digest, rsp->hash.digest,
+			       sizeof(results[i].digest));
+			break;
+		case SD_RES_NO_OBJ:
+			results[i].found = false;
+			break;
+		default:
+			sd_err("failed to read %" PRIx64 " from %s, %s", oid,
+			       addr_to_str(vnodes[i]->node->nid.addr,
+					   vnodes[i]->node->nid.port),
+			       sd_strerror(rsp->result));
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	/* Boyer Moore MJRTY, pass 1: nominate the only possible majority */
+	for (int i = 0; i < SD_DEFAULT_COPIES; i++) {
+		if (!results[i].found)
+			continue;
+		nr_live_copies++;
+		if (!count)
+			majority_idx = i;
+		if (!memcmp(results[majority_idx].digest, results[i].digest,
+			    sizeof(results[majority_idx].digest)))
+			count++;
+		else
+			count--;
+	}
+	if (majority_idx == -1) {
+		sd_err("no inode object (%" PRIx64 ") found", oid);
+		return EXIT_FAILURE;
+	}
+	/* pass 2: 'count' is no vote tally, so count the nominee's real votes */
+	for (int i = 0; i < SD_DEFAULT_COPIES; i++) {
+		if (results[i].found &&
+		    !memcmp(results[majority_idx].digest, results[i].digest,
+			    sizeof(results[majority_idx].digest)))
+			nr_votes++;
+	}
+	if (nr_votes * 2 <= nr_live_copies) {
+		sd_err("no majority inode object (%" PRIx64 ") found", oid);
+		return EXIT_FAILURE;
+	}
+
+	ret = do_sd_read_object((struct node_id *)
+				&vnodes[majority_idx]->node->nid, oid, inode,
+				size, 0, true);
+	if (ret != SD_RES_SUCCESS) {
+		if (snapid) {
+			sd_err("Failed to read a snapshot %s:%d", vdiname,
+			       snapid);
+		} else if (tag && tag[0]) {
+			sd_err("Failed to read a snapshot %s:%s", vdiname, tag);
+		} else {
+			sd_err("Failed to read a vdi %s", vdiname);
+		}
+		return EXIT_FAILURE;
+	}
+
+	if (pvid)
+		*pvid = vid;
+
+	return EXIT_SUCCESS;
+}
+
 int do_vdi_create(const char *vdiname, int64_t vdi_size,
 		  uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
 		  uint8_t nr_copies, uint8_t copy_policy, uint8_t store_policy)
@@ -1880,9 +1988,9 @@ static int vdi_check(int argc, char **argv)
 	int ret;
 	struct sd_inode *inode = xmalloc(sizeof(*inode));
 
-	ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id,
-			   vdi_cmd_data.snapshot_tag, NULL, inode,
-			   SD_INODE_SIZE);
+	ret = read_majority_vdi_obj(vdiname, vdi_cmd_data.snapshot_id,
+				    vdi_cmd_data.snapshot_tag, NULL, inode,
+				    SD_INODE_SIZE);
 	if (ret != EXIT_SUCCESS) {
 		sd_err("FATAL: no inode objects");
 		return ret;
diff --git a/tests/functional/077 b/tests/functional/077
index f2c2211..7f14b8c 100755
--- a/tests/functional/077
+++ b/tests/functional/077
@@ -68,6 +68,20 @@ $DOG vdi read test 0 14
 
 $DOG cluster shutdown
 
+# single inode object lost
+
+rm $STORE/0/obj/807c2b2500000000
+
+for i in `seq 0 2`; do
+    _start_sheep $i
+done
+
+_wait_for_sheep 3
+
+$DOG vdi check test
+
+$DOG cluster shutdown
+
 # single object lost, single broken object. no majority
 
 rm $STORE/0/obj/007c2b2500000000
@@ -82,3 +96,4 @@ _wait_for_sheep 3
 $DOG vdi check test
 
 $DOG cluster shutdown
+
diff --git a/tests/functional/077.out b/tests/functional/077.out
index 5bddb9b..e76b1ee 100644
--- a/tests/functional/077.out
+++ b/tests/functional/077.out
@@ -12,5 +12,7 @@ fixed missing 7c2b2500000000
 fixed missing 7c2b2500000000
 finish check&repair test
 original data
+fixed missing 807c2b2500000000
+finish check&repair test
 no majority of 7c2b2500000000
 finish check&repair test
-- 
1.8.1.2




More information about the sheepdog mailing list