[sheepdog] [PATCH RFC] add a new flag of cluster SD_CLUSTER_FLAG_INODE_HASH_CHECK for checking inode object corruption

Hitoshi Mitake mitake.hitoshi at gmail.com
Wed Jan 29 16:20:35 CET 2014


From: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>

Current sheepdog cannot handle corruption of inode objects. For
example, if members like name or nr_copies of sd_inode are broken by
silent data corruption of disks, even initialization of sheep
processes fails, because sheep and dog themselves interpret the
content of inode objects.

To detect such corruption of inode objects, this patch adds a new
cluster flag, SD_CLUSTER_FLAG_INODE_HASH_CHECK. If the flag is
passed as an option of cluster format (dog cluster format -i), sheep
processes belonging to the cluster do the following:

- when the sheep updates an inode object, it stores the sha1 value of
  the object in an xattr (default_write())
- when the sheep reads an inode object, it calculates the sha1 value of
  the inode object. Then it compares the calculated value with the
  stored one. If these values differ, the read fails with an error
  (default_read()).

This checking mechanism prevents interpretation of corrupted inode
objects by sheep.

Of course, this checking adds significant overhead to COW
operations. Some users don't need it; it depends on their SLA. So the
checking is activated only on a cluster which is formatted with the -i
option.

Currently, default_read() simply returns an error when the checking
fails. Recovering corrupted objects is a remaining todo. But before
implementing it, I'd like to hear opinions of the community.

Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 dog/cluster.c            |  10 +++-
 include/internal_proto.h |   2 +
 sheep/plain_store.c      | 147 ++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 131 insertions(+), 28 deletions(-)

diff --git a/dog/cluster.c b/dog/cluster.c
index 9d60202..68d3787 100644
--- a/dog/cluster.c
+++ b/dog/cluster.c
@@ -23,6 +23,8 @@ static struct sd_option cluster_options[] = {
 	{'f', "force", false, "do not prompt for confirmation"},
 	{'t', "strict", false,
 	 "do not serve write request if number of nodes is not sufficient"},
+	{'i', "inode-hash-check", false,
+	 "format cluster with enabling inode hash check"},
 	{'s', "backend", false, "show backend store information"},
 	{ 0, NULL, false, NULL },
 };
@@ -33,6 +35,7 @@ static struct cluster_cmd_data {
 	bool force;
 	bool show_store;
 	bool strict;
+	bool inode_hash_check;
 	char name[STORE_LEN];
 } cluster_cmd_data;
 
@@ -122,6 +125,8 @@ static int cluster_format(int argc, char **argv)
 	hdr.flags |= SD_FLAG_CMD_WRITE;
 	if (cluster_cmd_data.strict)
 		hdr.cluster.flags |= SD_CLUSTER_FLAG_STRICT;
+	if (cluster_cmd_data.inode_hash_check)
+		hdr.cluster.flags |= SD_CLUSTER_FLAG_INODE_HASH_CHECK;
 
 	printf("using backend %s store\n", store_name);
 	ret = dog_exec_req(&sd_nid, &hdr, store_name);
@@ -557,7 +562,7 @@ static int cluster_check(int argc, char **argv)
 static struct subcommand cluster_cmd[] = {
 	{"info", NULL, "aprhs", "show cluster information",
 	 NULL, CMD_NEED_NODELIST, cluster_info, cluster_options},
-	{"format", NULL, "bctaph", "create a Sheepdog store",
+	{"format", NULL, "bctaphi", "create a Sheepdog store",
 	 NULL, CMD_NEED_NODELIST, cluster_format, cluster_options},
 	{"shutdown", NULL, "aph", "stop Sheepdog",
 	 NULL, 0, cluster_shutdown, cluster_options},
@@ -605,6 +610,9 @@ static int cluster_parser(int ch, const char *opt)
 	case 't':
 		cluster_cmd_data.strict = true;
 		break;
+	case 'i':
+		cluster_cmd_data.inode_hash_check = true;
+		break;
 	}
 
 	return 0;
diff --git a/include/internal_proto.h b/include/internal_proto.h
index ceb8f84..00a0d9d 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -127,6 +127,8 @@
 #define SD_RES_VDI_NOT_EMPTY    0x92 /* VDI is not empty */
 
 #define SD_CLUSTER_FLAG_STRICT  0x0001 /* Strict mode for write */
+/* check hash value of inode object when it is updated or read */
+#define SD_CLUSTER_FLAG_INODE_HASH_CHECK  0x0002
 
 enum sd_status {
 	SD_STATUS_OK = 1,
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 9a4871c..1593ba4 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -118,6 +118,33 @@ static int err_to_sderr(const char *path, uint64_t oid, int err)
 	}
 }
 
+#define SHA1NAME "user.obj.sha1"
+
+static int get_object_sha1(const char *path, uint8_t *sha1)
+{
+	if (getxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE)
+	    != SHA1_DIGEST_SIZE) {
+		if (errno == ENODATA)
+			sd_debug("sha1 is not cached yet, %s", path);
+		else
+			sd_err("fail to get xattr, %s", path);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int set_object_sha1(const char *path, const uint8_t *sha1)
+{
+	int ret;
+
+	ret = setxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE, 0);
+	if (ret < 0)
+		sd_err("fail to set sha1, %s", path);
+
+	return ret;
+}
+
 int default_write(uint64_t oid, const struct siocb *iocb)
 {
 	int flags = prepare_iocb(oid, iocb, false), fd,
@@ -154,6 +181,30 @@ int default_write(uint64_t oid, const struct siocb *iocb)
 		ret = err_to_sderr(path, oid, errno);
 		goto out;
 	}
+
+	if (sys->cinfo.flags & SD_CLUSTER_FLAG_INODE_HASH_CHECK
+	    && is_vdi_obj(oid)) {
+		uint8_t sha1[20];
+		unsigned char *inode_buf;
+
+		inode_buf = xzalloc(SD_INODE_SIZE);
+
+		ret = xread(fd, inode_buf, SD_INODE_SIZE);
+		if (ret != SD_INODE_SIZE) {
+			sd_err("reading inode object failed: %m");
+			ret = err_to_sderr(path, oid, errno);
+			goto hash_check_out;
+		}
+
+		get_buffer_sha1(inode_buf, SD_INODE_SIZE, sha1);
+		ret = set_object_sha1(path, sha1);
+		if (ret < 0)
+			ret = err_to_sderr(path, oid, errno);
+
+hash_check_out:
+		free(inode_buf);
+	}
+
 out:
 	close(fd);
 	return ret;
@@ -310,6 +361,62 @@ int default_read(uint64_t oid, const struct siocb *iocb)
 		ret = default_read_from_path(oid, path, iocb);
 	}
 
+	if (sys->cinfo.flags & SD_CLUSTER_FLAG_INODE_HASH_CHECK
+	    && is_vdi_obj(oid)) {
+		int fd = -1;
+		uint8_t sha1[20], stored_sha1[20];
+		unsigned char *inode_buf;
+
+		inode_buf = xzalloc(SD_INODE_SIZE);
+
+		fd = open(path, O_RDONLY);
+		if (fd < 0) {
+			sd_err("opening a path of inode object (%s) failed: %m",
+			       path);
+			ret = err_to_sderr(path, oid, errno);
+			goto hash_check_out;
+		}
+
+		ret = xread(fd, inode_buf, SD_INODE_SIZE);
+		if (ret != SD_INODE_SIZE) {
+			sd_err("reading inode object failed: %m");
+			ret = err_to_sderr(path, oid, errno);
+			goto hash_check_out;
+		}
+
+		get_buffer_sha1(inode_buf, SD_INODE_SIZE, sha1);
+		ret = set_object_sha1(path, sha1);
+		if (ret < 0) {
+			ret = err_to_sderr(path, oid, errno);
+			goto hash_check_out;
+		}
+
+		ret = get_object_sha1(path, stored_sha1);
+		if (ret < 0) {
+			ret = err_to_sderr(path, oid, errno);
+			goto hash_check_out;
+		}
+
+		if (memcmp(sha1, stored_sha1, SD_INODE_SIZE)) {
+			sd_err("stored sha1 value and caliculated sha1 value"
+			       " of %"PRIx64" donn't match, object seems"
+			       " to be broken", oid);
+			/*
+			 * return SD_RES_NO_OBJ and wait for "dog vdi check"
+			 * TODO: active recovery by sheep process
+			 */
+			ret = SD_RES_NO_OBJ;
+			goto hash_check_out;
+		}
+
+		ret = SD_RES_SUCCESS;
+
+hash_check_out:
+		free(inode_buf);
+		if (0 <= fd)
+			close(fd);
+	}
+
 	return ret;
 }
 
@@ -413,6 +520,19 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
 		ret = err_to_sderr(path, oid, errno);
 		goto out;
 	}
+
+	if (sys->cinfo.flags & SD_CLUSTER_FLAG_INODE_HASH_CHECK
+	    && is_vdi_obj(oid)) {
+		uint8_t sha1[20];
+
+		get_buffer_sha1(iocb->buf, iocb->length, sha1);
+		ret = set_object_sha1(path, sha1);
+		if (ret < 0) {
+			ret = err_to_sderr(path, oid, errno);
+			goto out;
+		}
+	}
+
 	ret = SD_RES_SUCCESS;
 	objlist_cache_insert(oid);
 out:
@@ -559,33 +679,6 @@ int default_remove_object(uint64_t oid)
 	return SD_RES_SUCCESS;
 }
 
-#define SHA1NAME "user.obj.sha1"
-
-static int get_object_sha1(const char *path, uint8_t *sha1)
-{
-	if (getxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE)
-	    != SHA1_DIGEST_SIZE) {
-		if (errno == ENODATA)
-			sd_debug("sha1 is not cached yet, %s", path);
-		else
-			sd_err("fail to get xattr, %s", path);
-		return -1;
-	}
-
-	return 0;
-}
-
-static int set_object_sha1(const char *path, const uint8_t *sha1)
-{
-	int ret;
-
-	ret = setxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE, 0);
-	if (ret < 0)
-		sd_err("fail to set sha1, %s", path);
-
-	return ret;
-}
-
 static int get_object_path(uint64_t oid, uint32_t epoch, char *path,
 			   size_t size)
 {
-- 
1.8.3.2




More information about the sheepdog mailing list