[sheepdog] [PATCH] collie: enhance 'vdi check' to fix missing object

Liu Yuan namei.unix at gmail.com
Sun Jan 6 07:05:28 CET 2013


From: Liu Yuan <tailai.ly at taobao.com>

If an object is missing on a node and has not yet been recovered for any
reason, try our best to fix the missing object. Also add the inode object
to the list of objects that are checked.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 collie/vdi.c  |   71 ++++++++++++++++++++++++++++++++++-----------------------
 tests/048     |    2 +-
 tests/048.out |    3 ++-
 3 files changed, 45 insertions(+), 31 deletions(-)

diff --git a/collie/vdi.c b/collie/vdi.c
index 4e721fb..6ed7162 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -1379,7 +1379,7 @@ static void *read_object_from(const struct sd_vnode *vnode, uint64_t oid)
 	addr_to_str(name, sizeof(name), vnode->nid.addr, 0);
 	fd = connect_to(name, vnode->nid.port);
 	if (fd < 0) {
-		fprintf(stderr, "failed to connect to %s:%"PRIu32"\n",
+		fprintf(stderr, "FATAL: failed to connect to %s:%"PRIu32"\n",
 			name, vnode->nid.port);
 		exit(EXIT_FAILURE);
 	}
@@ -1395,24 +1395,28 @@ static void *read_object_from(const struct sd_vnode *vnode, uint64_t oid)
 	close(fd);
 
 	if (ret) {
-		fprintf(stderr, "Failed to execute request\n");
+		fprintf(stderr, "FATAL: failed to execute request\n");
 		exit(EXIT_FAILURE);
 	}
 
-	if (rsp->result != SD_RES_SUCCESS) {
-		fprintf(stderr, "Failed to read, %s\n",
+	switch (rsp->result)  {
+	case SD_RES_SUCCESS:
+		set_trimmed_sectors(buf, rsp->obj.offset, rsp->data_length,
+				    SD_DATA_OBJ_SIZE);
+		break;
+	case SD_RES_NO_OBJ:
+		free(buf);
+		return NULL;
+	default:
+		fprintf(stderr, "FATAL: failed to read, %s\n",
 			sd_strerror(rsp->result));
 		exit(EXIT_FAILURE);
 	}
-
-	set_trimmed_sectors(buf, rsp->obj.offset, rsp->data_length,
-			    SD_DATA_OBJ_SIZE);
-
 	return buf;
 }
 
 static void write_object_to(const struct sd_vnode *vnode, uint64_t oid,
-			    void *buf)
+			    void *buf, bool create)
 {
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
@@ -1422,28 +1426,30 @@ static void write_object_to(const struct sd_vnode *vnode, uint64_t oid,
 	addr_to_str(name, sizeof(name), vnode->nid.addr, 0);
 	fd = connect_to(name, vnode->nid.port);
 	if (fd < 0) {
-		fprintf(stderr, "failed to connect to %s:%"PRIu32"\n",
+		fprintf(stderr, "FATAL: failed to connect to %s:%"PRIu32"\n",
 			name, vnode->nid.port);
 		exit(EXIT_FAILURE);
 	}
 
-	sd_init_req(&hdr, SD_OP_WRITE_PEER);
+	if (create)
+		sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_PEER);
+	else
+		sd_init_req(&hdr, SD_OP_WRITE_PEER);
 	hdr.epoch = sd_epoch;
 	hdr.flags = SD_FLAG_CMD_WRITE;
 	hdr.data_length = SD_DATA_OBJ_SIZE;
-
 	hdr.obj.oid = oid;
 
 	ret = exec_req(fd, &hdr, buf);
 	close(fd);
 
 	if (ret) {
-		fprintf(stderr, "Failed to execute request\n");
+		fprintf(stderr, "FATAL: failed to execute request\n");
 		exit(EXIT_FAILURE);
 	}
 
 	if (rsp->result != SD_RES_SUCCESS) {
-		fprintf(stderr, "Failed to read, %s\n",
+		fprintf(stderr, "FATAL: failed to write, %s\n",
 			sd_strerror(rsp->result));
 		exit(EXIT_FAILURE);
 	}
@@ -1452,33 +1458,37 @@ static void write_object_to(const struct sd_vnode *vnode, uint64_t oid,
 /*
  * Fix consistency of the replica of oid.
  *
- * XXX: The fix is rather dumb, just read the first copy and write it
+ * XXX: The fix is rather dumb, just read the random copy and write it
  * to other replica.
  */
 static void do_check_repair(uint64_t oid, int nr_copies)
 {
 	const struct sd_vnode *tgt_vnodes[SD_MAX_COPIES];
-	void *buf, *buf_cmp;
-	int i;
+	void *buf = xmalloc(SD_DATA_OBJ_SIZE), *buf_cmp;
+	int i, ret;
+
+	ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0, true);
+	if (ret != SD_RES_SUCCESS) {
+		fprintf(stderr, "FATAL: read %"PRIx64" failed\n", oid);
+		exit(EXIT_FAILURE);
+	}
 
 	oid_to_vnodes(sd_vnodes, sd_vnodes_nr, oid, nr_copies, tgt_vnodes);
-	buf = read_object_from(tgt_vnodes[0], oid);
-	for (i = 1; i < nr_copies; i++) {
+	for (i = 0; i < nr_copies; i++) {
 		buf_cmp = read_object_from(tgt_vnodes[i], oid);
+		if (!buf_cmp) {
+			write_object_to(tgt_vnodes[i], oid, buf, true);
+			fprintf(stdout, "fixed missing %"PRIx64"\n", oid);
+			continue;
+		}
 		if (memcmp(buf, buf_cmp, SD_DATA_OBJ_SIZE)) {
-			free(buf_cmp);
-			goto fix_consistency;
+			write_object_to(tgt_vnodes[i], oid, buf, false);
+			fprintf(stdout, "fixed replica %"PRIx64"\n", oid);
 		}
 		free(buf_cmp);
 	}
 	free(buf);
 	return;
-
-fix_consistency:
-	for (i = 1; i < nr_copies; i++)
-		write_object_to(tgt_vnodes[i], oid, buf);
-	fprintf(stdout, "fix %"PRIx64" success\n", oid);
-	free(buf);
 }
 
 static int vdi_check(int argc, char **argv)
@@ -1490,11 +1500,14 @@ static int vdi_check(int argc, char **argv)
 	struct sheepdog_inode *inode = xmalloc(sizeof(*inode));
 
 	ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id,
-			   vdi_cmd_data.snapshot_tag, NULL, inode,
+			   vdi_cmd_data.snapshot_tag, &vid, inode,
 			   SD_INODE_SIZE);
-	if (ret != EXIT_SUCCESS)
+	if (ret != EXIT_SUCCESS) {
+		fprintf(stderr, "FATAL: no inode objects\n");
 		goto out;
+	}
 
+	do_check_repair(vid_to_vdi_oid(vid), inode->nr_copies);
 	total = inode->vdi_size;
 	while (done < total) {
 		vid = inode->data_vdi_id[idx];
diff --git a/tests/048 b/tests/048
index 34695cf..97ed45e 100755
--- a/tests/048
+++ b/tests/048
@@ -35,7 +35,7 @@ $COLLIE node list -p 7001
 
 for i in 3 4 0; do
 	_start_sheep $i
-	sleep 1
+	sleep 2
 done
 _wait_for_sheep 5
 _wait_for_sheep_recovery 0
diff --git a/tests/048.out b/tests/048.out
index a13ace3..7b71326 100644
--- a/tests/048.out
+++ b/tests/048.out
@@ -67,6 +67,7 @@ obj 7c2b2500000001 locations at epoch 5, copies = 3
   test         1  1.0 GB  4.0 MB  0.0 MB DATE   7c2b25     3              
 = test 1 1073741824 4194304 0 MASKED 7c2b25 3
 finish check&repair test
-fix 7c2b2500000001 success
+fixed replica 7c2b2500000001
+fixed replica 7c2b2500000001
 finish check&repair test
 
-- 
1.7.9.5




More information about the sheepdog mailing list