[sheepdog] [PATCH v3] add vdi check -e to quickly check which objects are lost.

Ruoyu liangry at ucweb.com
Wed May 28 11:48:38 CEST 2014


Sometimes we want to quickly check whether any of the vdi objects
or data objects have been lost due to an unexpected issue.

Although 'vdi check' can do this, it takes a lot of time because it
requires one client-server round trip per object. Moreover, it rarely
gets to trigger the automatic data repair, since the write path is
strongly consistent to begin with.

Therefore, the new option -e (--exist) only checks whether all the
objects belonging to the vdi exist. It is fast because it submits
each node's batched object ids in a single request. I think this is
sufficient for this situation.

Usage: dog vdi check -e <vdiname>

Example:
$ dog vdi check -e test
test is fine, no object is missing.

$ dog vdi check -e ucweb
[127.0.0.1:7001] oid 80b8071d00000000 is missing.
[127.0.0.1:7001] oid 00b8071d000000ee is missing.
ucweb lost 2 object(s).

Signed-off-by: Ruoyu <liangry at ucweb.com>
---
 dog/vdi.c                | 114 ++++++++++++++++++++++++++++++++++++++++++++++-
 include/internal_proto.h |   9 ++++
 include/sheep.h          |   6 +++
 include/sheepdog_proto.h |   1 +
 lib/net.c                |   2 +-
 sheep/ops.c              |  31 +++++++++++++
 6 files changed, 160 insertions(+), 3 deletions(-)

diff --git a/dog/vdi.c b/dog/vdi.c
index 866cb36..45fe6a9 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -21,6 +21,8 @@
 #include "sha1.h"
 #include "fec.h"
 
+struct rb_root oid_tree = RB_ROOT;
+
 static struct sd_option vdi_options[] = {
 	{'P', "prealloc", false, "preallocate all the data objects"},
 	{'n', "no-share", false, "share nothing with its parent"},
@@ -34,6 +36,7 @@ static struct sd_option vdi_options[] = {
 	{'f', "force", false, "do operation forcibly"},
 	{'y', "hyper", false, "create a hyper volume"},
 	{'o', "oid", true, "specify the object id of the tracking object"},
+	{'e', "exist", false, "only check whether objects exist, without repairing"},
 	{ 0, NULL, false, NULL },
 };
 
@@ -53,6 +56,7 @@ static struct vdi_cmd_data {
 	uint8_t store_policy;
 	uint64_t oid;
 	bool no_share;
+	bool exist;
 } vdi_cmd_data = { ~0, };
 
 struct get_vdi_info {
@@ -875,6 +879,106 @@ out:
 	return ret;
 }
 
+#define OIDS_INIT_LENGTH 1024
+
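+/*
+ * Look up the replica nodes of the given oid and append the oid to
+ * each node's batch in oid_tree, growing the array on demand.
+ */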
+static void save_oid(uint64_t oid, int copies)
+{
+	const struct sd_vnode *vnodes[SD_MAX_COPIES];
+	struct oid_entry *entry;
+
+	oid_to_vnodes(oid, &sd_vroot, copies, vnodes);
+	for (int i = 0; i < copies; i++) {
+		struct oid_entry key = {
+			.node = (struct sd_node *) vnodes[i]->node
+		};
+		entry = rb_search(&oid_tree, &key, rb, oid_entry_cmp);
+		if (!entry)
+			panic("rb_search() failure.");
+
+		if (entry->last >= entry->end) {
+			entry->end *= 2;
+			entry->oids = xrealloc(entry->oids,
+					sizeof(uint64_t) * entry->end);
+		}
+		entry->oids[entry->last] = oid;
+		entry->last++;
+	}
+}
+
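+/*
+ * Create one oid_entry per cluster node, then distribute the inode
+ * object and every allocated data object into the per-node batches.
+ */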
+static void build_oid_tree(const struct sd_inode *inode)
+{
+	uint32_t max_idx, vid;
+	uint64_t oid;
+	struct sd_node *node;
+	struct oid_entry *entry;
+	int copies = min((int)inode->nr_copies, sd_zones_nr);
+
+	rb_for_each_entry(node, &sd_nroot, rb) {
+		entry = xmalloc(sizeof(*entry));
+		entry->node = node;
+		entry->oids = xmalloc(sizeof(uint64_t) * OIDS_INIT_LENGTH);
+		entry->end  = OIDS_INIT_LENGTH;
+		entry->last = 0;
+		rb_insert(&oid_tree, entry, rb, oid_entry_cmp);
+	}
+
+	save_oid(vid_to_vdi_oid(inode->vdi_id), copies);
+	max_idx = count_data_objs(inode);
+	for (uint32_t idx = 0; idx < max_idx; idx++) {
+		vid = sd_inode_get_vid(inode, idx);
+		if (vid == 0)
+			continue;
+		oid = vid_to_data_oid(vid, idx);
+		save_oid(oid, copies);
+	}
+}
+
+static void destroy_oid_tree(void)
+{
+	struct oid_entry *entry;
+
+	rb_for_each_entry(entry, &oid_tree, rb)
+		free(entry->oids);
+	rb_destroy(&oid_tree, struct oid_entry, rb);
+}
+
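+/*
+ * Send each node its whole oid batch in a single SD_OP_OIDS_EXIST
+ * request; the reply carries only the oids missing on that node.
+ */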
+static int do_obj_check(const struct sd_inode *inode)
+{
+	int total = 0;
+	struct oid_entry *entry;
+	struct sd_req hdr;
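+	/* rsp aliases hdr: dog_exec_req() overwrites the header with the reply */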
+	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+
+	build_oid_tree(inode);
+
+	rb_for_each_entry(entry, &oid_tree, rb) {
+		sd_init_req(&hdr, SD_OP_OIDS_EXIST);
+		hdr.data_length = sizeof(uint64_t) * entry->last;
+		hdr.flags = SD_FLAG_CMD_WRITE | SD_FLAG_CMD_READ;
+		int ret = dog_exec_req(&entry->node->nid, &hdr, entry->oids);
+		if (ret < 0)
+			panic("dog_exec_req() failure.");
+
+		int n = rsp->data_length / sizeof(uint64_t);
+		total += n;
+		for (int i = 0; i < n; i++)
+			printf("[%s] oid %016"PRIx64" is missing.\n",
+					addr_to_str(entry->node->nid.addr,
+							entry->node->nid.port),
+					entry->oids[i]);
+	}
+
+	destroy_oid_tree();
+
+	if (total == 0) {
+		printf("%s is fine, no object is missing.\n", inode->name);
+		return EXIT_SUCCESS;
+	} else {
+		printf("%s lost %d object(s).\n", inode->name, total);
+		return EXIT_FAILURE;
+	}
+}
+
 static int do_track_object(uint64_t oid, uint8_t nr_copies)
 {
 	int i, j, ret;
@@ -1771,7 +1875,10 @@ static int vdi_check(int argc, char **argv)
 		goto out;
 	}
 
-	ret = do_vdi_check(inode);
+	if (vdi_cmd_data.exist)
+		ret = do_obj_check(inode);
+	else
+		ret = do_vdi_check(inode);
 out:
 	free(inode);
 	return ret;
@@ -2359,7 +2466,7 @@ static int vdi_cache(int argc, char **argv)
 }
 
 static struct subcommand vdi_cmd[] = {
-	{"check", "<vdiname>", "saph", "check and repair image's consistency",
+	{"check", "<vdiname>", "seaph", "check and repair image's consistency",
 	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
 	 vdi_check, vdi_options},
 	{"create", "<vdiname> <size>", "Pycaphrv", "create an image",
@@ -2491,6 +2598,9 @@ static int vdi_parser(int ch, const char *opt)
 			exit(EXIT_FAILURE);
 		}
 		break;
+	case 'e':
+		vdi_cmd_data.exist = true;
+		break;
 	}
 
 	return 0;
diff --git a/include/internal_proto.h b/include/internal_proto.h
index ada084f..9d3928e 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -100,6 +100,7 @@
 #define SD_OP_NFS_DELETE	0xBC
 #define SD_OP_EXIST	0xBD
 #define SD_OP_CLUSTER_INFO	0xBE
+#define SD_OP_OIDS_EXIST	0xBF
 #define SD_OP_ALTER_CLUSTER_COPY	0xC0
 #define SD_OP_ALTER_VDI_COPY	0xC1
 
@@ -175,6 +176,14 @@ struct sd_node {
 #endif
 };
 
+/* A per-node batch of object ids, kept in an rb-tree keyed by node. */
+struct oid_entry {
+	struct rb_node rb;
+	struct sd_node *node; /* key */
+	uint64_t *oids;       /* object id array */
+	int end;              /* idx to the end of the allocated oid array */
+	int last;             /* idx to the last element of the oid array */
+};
+
 /*
  * A joining sheep multicasts the local cluster info.  Then, the existing nodes
  * reply the latest cluster info which is unique among all of the nodes.
diff --git a/include/sheep.h b/include/sheep.h
index 785883e..ef8958c 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -199,6 +199,12 @@ static inline int node_cmp(const struct sd_node *node1,
 	return node_id_cmp(&node1->nid, &node2->nid);
 }
 
+static inline int oid_entry_cmp(const struct oid_entry *entry1,
+			   const struct oid_entry *entry2)
+{
+	return node_cmp(entry1->node, entry2->node);
+}
+
 static inline bool node_eq(const struct sd_node *a, const struct sd_node *b)
 {
 	return node_cmp(a, b) == 0;
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 9361bad..c6e21b4 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -45,6 +45,7 @@
 #define SD_FLAG_CMD_COW      0x02
 #define SD_FLAG_CMD_CACHE    0x04
 #define SD_FLAG_CMD_DIRECT   0x08 /* don't use object cache */
+#define SD_FLAG_CMD_READ     0x10 /* write request also expects reply data */
 /* flags above 0x80 are sheepdog-internal */
 
 #define SD_RES_SUCCESS       0x00 /* Success */
diff --git a/lib/net.c b/lib/net.c
index b32e022..c2d86cb 100644
--- a/lib/net.c
+++ b/lib/net.c
@@ -333,7 +333,7 @@ int exec_req(int sockfd, struct sd_req *hdr, void *data,
 
 	if (hdr->flags & SD_FLAG_CMD_WRITE) {
 		wlen = hdr->data_length;
-		rlen = 0;
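+		/* with SD_FLAG_CMD_READ set, the writer also expects reply data */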
+		rlen = (hdr->flags & SD_FLAG_CMD_READ) ? hdr->data_length : 0;
 	} else {
 		wlen = 0;
 		rlen = hdr->data_length;
diff --git a/sheep/ops.c b/sheep/ops.c
index 61eb37f..c54017e 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -1058,6 +1058,30 @@ static int local_oid_exist(struct request *req)
 	return SD_RES_NO_OBJ;
 }
 
+static int local_oids_exist(const struct sd_req *req, struct sd_rsp *rsp,
+			      void *data)
+{
+	struct request *r = container_of(req, struct request, rq);
+	uint64_t *oids = (uint64_t *) data;
+	uint8_t ec_index;
+	int i, j, n = req->data_length / sizeof(uint64_t);
+
+	for (i = 0, j = 0; i < n; i++) {
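+		/* an erasure oid with no locally stored index counts as missing */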
+		ec_index = local_ec_index(r->vinfo, oids[i]);
+		if (is_erasure_oid(oids[i]) && ec_index == SD_MAX_COPIES)
+			oids[j++] = oids[i];
+		else if (!sd_store->exist(oids[i], ec_index))
+			oids[j++] = oids[i];
+	}
+
+	if (j > 0) {
+		rsp->data_length = sizeof(uint64_t) * j;
+		return SD_RES_NO_OBJ;
+	}
+
+	return SD_RES_SUCCESS;
+}
+
 static int local_cluster_info(const struct sd_req *req, struct sd_rsp *rsp,
 			      void *data)
 {
@@ -1408,6 +1432,13 @@ static struct sd_op_template sd_ops[] = {
 		.process_work = local_oid_exist,
 	},
 
+	[SD_OP_OIDS_EXIST] =  {
+		.name = "OIDS_EXIST",
+		.type = SD_OP_TYPE_LOCAL,
+		.force = true,
+		.process_main = local_oids_exist,
+	},
+
 	[SD_OP_CLUSTER_INFO] = {
 		.name = "CLUSTER INFO",
 		.type = SD_OP_TYPE_LOCAL,
-- 
1.8.3.2