[sheepdog] [PATCH] dog: add command to check and repair cluster

MORITA Kazutaka morita.kazutaka at gmail.com
Mon Aug 19 10:56:19 CEST 2013


From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>

This patch adds a 'dog cluster check' command to check the consistency
of all objects.  While 'dog vdi check' is used when a VM stops
unexpectedly because of a gateway failure, 'dog cluster check' is
expected to be used for a full check of the cluster status, e.g., after
a total node failure happens or sheepdog hangs because of bugs.

With the current implementation, running 'dog cluster check' is the
same as running 'dog vdi check' against all the vdi images.  However,
this is usually very wasteful because vdis can share the same objects,
especially when there are many snapshots.  I'm thinking of improving
it to skip redundant checks.

I'm planning to add other check commands later:

 - 'dog cluster check reference' : fix reference counts for object reclaim
 - 'dog cluster check epoch'     : fix a broken epoch file

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 dog/cluster.c |   23 +++++++++++++++++++++++
 dog/dog.h     |    1 +
 dog/vdi.c     |   41 ++++++++++++++++++++++++-----------------
 3 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/dog/cluster.c b/dog/cluster.c
index 4c7654d..c539b2b 100644
--- a/dog/cluster.c
+++ b/dog/cluster.c
@@ -468,6 +468,27 @@ static int cluster_reweight(int argc, char **argv)
 	return EXIT_SUCCESS;
 }
 
+static void cluster_check_cb(uint32_t vid, const char *name, const char *tag,
+			     uint32_t snapid, uint32_t flags,
+			     const struct sd_inode *inode, void *data)
+{
+	if (vdi_is_snapshot(inode))
+		printf("fix snapshot %s (id: %d, tag: \"%s\")\n", name,
+		       snapid, tag);
+	else
+		printf("fix vdi %s\n", name);
+
+	do_vdi_check(inode);
+}
+
+static int cluster_check(int argc, char **argv)
+{
+	if (parse_vdi(cluster_check_cb, SD_INODE_SIZE, NULL) < 0)
+		return EXIT_SYSFAIL;
+
+	return EXIT_SUCCESS;
+}
+
 static struct subcommand cluster_cmd[] = {
 	{"info", NULL, "aprh", "show cluster information",
 	 NULL, CMD_NEED_NODELIST, cluster_info, cluster_options},
@@ -484,6 +505,8 @@ static struct subcommand cluster_cmd[] = {
 	 cluster_recover, cluster_options},
 	{"reweight", NULL, "aph", "reweight the cluster", NULL, 0,
 	 cluster_reweight, cluster_options},
+	{"check", NULL, "aph", "check and repair cluster", NULL,
+	 CMD_NEED_NODELIST, cluster_check, cluster_options},
 	{NULL,},
 };
 
diff --git a/dog/dog.h b/dog/dog.h
index 92dd4d2..897cf92 100644
--- a/dog/dog.h
+++ b/dog/dog.h
@@ -82,6 +82,7 @@ void work_queue_wait(struct work_queue *q);
 int do_vdi_create(const char *vdiname, int64_t vdi_size,
 		  uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
 		  int nr_copies);
+int do_vdi_check(const struct sd_inode *inode);
 void show_progress(uint64_t done, uint64_t total, bool raw);
 
 extern struct command vdi_command;
diff --git a/dog/vdi.c b/dog/vdi.c
index 370ad7b..6b8d677 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -1508,7 +1508,7 @@ static void vdi_hash_check_main(struct work *work)
 		free_vdi_check_info(info);
 }
 
-static void queue_vdi_check_work(struct sd_inode *inode, uint64_t oid,
+static void queue_vdi_check_work(const struct sd_inode *inode, uint64_t oid,
 				 uint64_t *done, struct work_queue *wq)
 {
 	struct vdi_check_info *info;
@@ -1533,22 +1533,13 @@ static void queue_vdi_check_work(struct sd_inode *inode, uint64_t oid,
 	}
 }
 
-static int vdi_check(int argc, char **argv)
+int do_vdi_check(const struct sd_inode *inode)
 {
-	const char *vdiname = argv[optind++];
-	int ret, max_idx;
+	int max_idx;
 	uint64_t done = 0, oid;
 	uint32_t vid;
-	struct sd_inode *inode = xmalloc(sizeof(*inode));
 	struct work_queue *wq;
 
-	ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id,
-			   vdi_cmd_data.snapshot_tag, &vid, inode,
-			   SD_INODE_SIZE);
-	if (ret != EXIT_SUCCESS) {
-		sd_err("FATAL: no inode objects");
-		goto out;
-	}
 	if (sd_nodes_nr < inode->nr_copies) {
 		sd_err("ABORT: Not enough active nodes for consistency-check");
 		return EXIT_FAILURE;
@@ -1556,14 +1547,14 @@ static int vdi_check(int argc, char **argv)
 
 	wq = create_work_queue("vdi check", WQ_DYNAMIC);
 
-	queue_vdi_check_work(inode, vid_to_vdi_oid(vid), NULL, wq);
+	queue_vdi_check_work(inode, vid_to_vdi_oid(inode->vdi_id), NULL, wq);
 
 	max_idx = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE);
 	vdi_show_progress(done, inode->vdi_size);
 	for (int idx = 0; idx < max_idx; idx++) {
 		vid = inode->data_vdi_id[idx];
 		if (vid) {
-			oid = vid_to_data_oid(vid, idx);
+			oid = vid_to_data_oid(inode->vdi_id, idx);
 			queue_vdi_check_work(inode, oid, &done, wq);
 		} else {
 			done += SD_DATA_OBJ_SIZE;
@@ -1573,10 +1564,26 @@ static int vdi_check(int argc, char **argv)
 
 	work_queue_wait(wq);
 
-	fprintf(stdout, "finish check&repair %s\n", vdiname);
+	fprintf(stdout, "finish check&repair %s\n", inode->name);
+
 	return EXIT_SUCCESS;
-out:
-	return ret;
+}
+
+static int vdi_check(int argc, char **argv)
+{
+	const char *vdiname = argv[optind++];
+	int ret;
+	struct sd_inode *inode = xmalloc(sizeof(*inode));
+
+	ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id,
+			   vdi_cmd_data.snapshot_tag, NULL, inode,
+			   SD_INODE_SIZE);
+	if (ret != EXIT_SUCCESS) {
+		sd_err("FATAL: no inode objects");
+		return ret;
+	}
+
+	return do_vdi_check(inode);
 }
 
 /* vdi backup format */
-- 
1.7.9.5




More information about the sheepdog mailing list