[sheepdog] [PATCH 1/4] dog: make consistency check optional

MORITA Kazutaka morita.kazutaka at gmail.com
Thu Aug 22 06:01:40 CEST 2013


From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>

This splits vdi check into a replica existence check and a replica
consistency check, and makes the consistency check optional.  This is
because:

 - The existence check can be done even while a VM is using the
   vdi.

 - The existence check doesn't need to calculate sha1 values, so it is
   much faster than the consistency check.

 - Existence check is more important because write requests are
   rejected when one of the replicas is missing.

 - In most cases, replica inconsistency doesn't cause a problem
   because the client should have received EIO when the consistency
   problem happened and the client shouldn't use the inconsistent area
   without updating the area again.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 dog/cluster.c |    9 +++++++--
 dog/dog.h     |    2 +-
 dog/vdi.c     |   45 +++++++++++++++++++++++++++++++--------------
 3 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/dog/cluster.c b/dog/cluster.c
index 3fd87bd..c73c500 100644
--- a/dog/cluster.c
+++ b/dog/cluster.c
@@ -20,6 +20,7 @@
 static struct sd_option cluster_options[] = {
 	{'b', "store", true, "specify backend store"},
 	{'c', "copies", true, "specify the default data redundancy (number of copies)"},
+	{'C', "consistency", false, "check replica consistency"},
 	{'f', "force", false, "do not prompt for confirmation"},
 
 	{ 0, NULL, false, NULL },
@@ -27,6 +28,7 @@ static struct sd_option cluster_options[] = {
 
 static struct cluster_cmd_data {
 	int copies;
+	bool check_consistency;
 	bool force;
 	char name[STORE_LEN];
 } cluster_cmd_data;
@@ -482,7 +484,7 @@ static void cluster_check_cb(uint32_t vid, const char *name, const char *tag,
 	else
 		printf("fix vdi %s\n", name);
 
-	do_vdi_check(inode);
+	do_vdi_check(inode, cluster_cmd_data.check_consistency);
 }
 
 static int cluster_check(int argc, char **argv)
@@ -509,7 +511,7 @@ static struct subcommand cluster_cmd[] = {
 	 cluster_recover, cluster_options},
 	{"reweight", NULL, "aph", "reweight the cluster", NULL, 0,
 	 cluster_reweight, cluster_options},
-	{"check", NULL, "aph", "check and repair cluster", NULL,
+	{"check", NULL, "aphC", "check and repair cluster", NULL,
 	 CMD_NEED_NODELIST, cluster_check, cluster_options},
 	{NULL,},
 };
@@ -536,6 +538,9 @@ static int cluster_parser(int ch, char *opt)
 		}
 		cluster_cmd_data.copies = copies;
 		break;
+	case 'C':
+		cluster_cmd_data.check_consistency = true;
+		break;
 	case 'f':
 		cluster_cmd_data.force = true;
 		break;
diff --git a/dog/dog.h b/dog/dog.h
index 897cf92..226599a 100644
--- a/dog/dog.h
+++ b/dog/dog.h
@@ -82,7 +82,7 @@ void work_queue_wait(struct work_queue *q);
 int do_vdi_create(const char *vdiname, int64_t vdi_size,
 		  uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
 		  int nr_copies);
-int do_vdi_check(const struct sd_inode *inode);
+int do_vdi_check(const struct sd_inode *inode, bool check_consistency);
 void show_progress(uint64_t done, uint64_t total, bool raw);
 
 extern struct command vdi_command;
diff --git a/dog/vdi.c b/dog/vdi.c
index 3eb03dd..fff6a6b 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -28,6 +28,7 @@ static struct sd_option vdi_options[] = {
 	{'d', "delete", false, "delete a key"},
 	{'w', "writeback", false, "use writeback mode"},
 	{'c', "copies", true, "specify the data redundancy (number of copies)"},
+	{'C', "consistency", false, "check replica consistency"},
 	{'F', "from", true, "create a differential backup from the snapshot"},
 	{'f', "force", false, "do operation forcibly"},
 	{ 0, NULL, false, NULL },
@@ -41,6 +42,7 @@ static struct vdi_cmd_data {
 	bool delete;
 	bool prealloc;
 	int nr_copies;
+	bool check_consistency;
 	bool writeback;
 	int from_snapshot_id;
 	char from_snapshot_tag[SD_MAX_VDI_TAG_LEN];
@@ -1402,6 +1404,7 @@ struct vdi_check_info {
 	uint64_t total;
 	uint64_t *done;
 	int refcnt;
+	bool check_consistency;
 	struct work_queue *wq;
 	struct vdi_check_work *base;
 	struct vdi_check_work vcw[0];
@@ -1444,7 +1447,7 @@ static void vdi_repair_main(struct work *work)
 		free_vdi_check_info(info);
 }
 
-static void vdi_hash_check_work(struct work *work)
+static void vdi_health_check_work(struct work *work)
 {
 	struct vdi_check_work *vcw = container_of(work, struct vdi_check_work,
 						  work);
@@ -1453,9 +1456,15 @@ static void vdi_hash_check_work(struct work *work)
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
 
-	sd_init_req(&hdr, SD_OP_GET_HASH);
-	hdr.obj.oid = info->oid;
-	hdr.obj.tgt_epoch = sd_epoch;
+	if (info->check_consistency) {
+		sd_init_req(&hdr, SD_OP_GET_HASH);
+		hdr.obj.oid = info->oid;
+		hdr.obj.tgt_epoch = sd_epoch;
+	} else {
+		sd_init_req(&hdr, SD_OP_READ_PEER);
+		hdr.obj.oid = info->oid;
+		hdr.epoch = sd_epoch;
+	}
 
 	ret = dog_exec_req(vcw->vnode->nid.addr, vcw->vnode->nid.port, &hdr,
 			      NULL);
@@ -1465,7 +1474,8 @@ static void vdi_hash_check_work(struct work *work)
 	switch (rsp->result) {
 	case SD_RES_SUCCESS:
 		vcw->object_found = true;
-		memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash));
+		if (info->check_consistency)
+			memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash));
 		uatomic_set(&info->base, vcw);
 		break;
 	case SD_RES_NO_OBJ:
@@ -1479,7 +1489,7 @@ static void vdi_hash_check_work(struct work *work)
 	}
 }
 
-static void vdi_hash_check_main(struct work *work)
+static void vdi_health_check_main(struct work *work)
 {
 	struct vdi_check_work *vcw = container_of(work, struct vdi_check_work,
 						  work);
@@ -1513,7 +1523,8 @@ static void vdi_hash_check_main(struct work *work)
 }
 
 static void queue_vdi_check_work(const struct sd_inode *inode, uint64_t oid,
-				 uint64_t *done, struct work_queue *wq)
+				 bool check_consistency, uint64_t *done,
+				 struct work_queue *wq)
 {
 	struct vdi_check_info *info;
 	const struct sd_vnode *tgt_vnodes[SD_MAX_COPIES];
@@ -1523,6 +1534,7 @@ static void queue_vdi_check_work(const struct sd_inode *inode, uint64_t oid,
 	info->oid = oid;
 	info->nr_copies = nr_copies;
 	info->total = inode->vdi_size;
+	info->check_consistency = check_consistency;
 	info->done = done;
 	info->wq = wq;
 
@@ -1530,14 +1542,14 @@ static void queue_vdi_check_work(const struct sd_inode *inode, uint64_t oid,
 	for (int i = 0; i < nr_copies; i++) {
 		info->vcw[i].info = info;
 		info->vcw[i].vnode = tgt_vnodes[i];
-		info->vcw[i].work.fn = vdi_hash_check_work;
-		info->vcw[i].work.done = vdi_hash_check_main;
+		info->vcw[i].work.fn = vdi_health_check_work;
+		info->vcw[i].work.done = vdi_health_check_main;
 		info->refcnt++;
 		queue_work(info->wq, &info->vcw[i].work);
 	}
 }
 
-int do_vdi_check(const struct sd_inode *inode)
+int do_vdi_check(const struct sd_inode *inode, bool check_consistency)
 {
 	int max_idx;
 	uint64_t done = 0, oid;
@@ -1551,7 +1563,8 @@ int do_vdi_check(const struct sd_inode *inode)
 
 	wq = create_work_queue("vdi check", WQ_DYNAMIC);
 
-	queue_vdi_check_work(inode, vid_to_vdi_oid(inode->vdi_id), NULL, wq);
+	queue_vdi_check_work(inode, vid_to_vdi_oid(inode->vdi_id),
+			     check_consistency, NULL, wq);
 
 	max_idx = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE);
 	vdi_show_progress(done, inode->vdi_size);
@@ -1559,7 +1572,8 @@ int do_vdi_check(const struct sd_inode *inode)
 		vid = inode->data_vdi_id[idx];
 		if (vid) {
 			oid = vid_to_data_oid(inode->vdi_id, idx);
-			queue_vdi_check_work(inode, oid, &done, wq);
+			queue_vdi_check_work(inode, oid, check_consistency,
+					     &done, wq);
 		} else {
 			done += SD_DATA_OBJ_SIZE;
 			vdi_show_progress(done, inode->vdi_size);
@@ -1587,7 +1601,7 @@ static int vdi_check(int argc, char **argv)
 		return ret;
 	}
 
-	return do_vdi_check(inode);
+	return do_vdi_check(inode, vdi_cmd_data.check_consistency);
 }
 
 /* vdi backup format */
@@ -2082,7 +2096,7 @@ static int vdi_cache(int argc, char **argv)
 }
 
 static struct subcommand vdi_cmd[] = {
-	{"check", "<vdiname>", "saph", "check and repair image's consistency",
+	{"check", "<vdiname>", "saphC", "check and repair image's consistency",
 	 NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
 	 vdi_check, vdi_options},
 	{"create", "<vdiname> <size>", "Pcaphrv", "create an image",
@@ -2185,6 +2199,9 @@ static int vdi_parser(int ch, char *opt)
 		}
 		vdi_cmd_data.nr_copies = nr_copies;
 		break;
+	case 'C':
+		vdi_cmd_data.check_consistency = true;
+		break;
 	case 'F':
 		vdi_cmd_data.from_snapshot_id = strtol(opt, &p, 10);
 		if (opt == p) {
-- 
1.7.9.5




More information about the sheepdog mailing list