[sheepdog] [PATCH] dog: vdi checking for lost inode objects
Hitoshi Mitake
mitake.hitoshi at gmail.com
Wed Nov 20 14:54:02 CET 2013
The current "dog vdi check" does not work when an inode object has been
lost. This patch lets the subcommand handle that case correctly.
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
dog/common.c | 12 +++--
dog/dog.h | 2 +
dog/vdi.c | 114 +++++++++++++++++++++++++++++++++++++++++++++--
tests/functional/077 | 15 +++++++
tests/functional/077.out | 2 +
5 files changed, 139 insertions(+), 6 deletions(-)
diff --git a/dog/common.c b/dog/common.c
index 59d38dc..649f303 100644
--- a/dog/common.c
+++ b/dog/common.c
@@ -55,8 +55,8 @@ char *strnumber(uint64_t size)
return strnumber_raw(size, raw_output);
}
-int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
- uint64_t offset, bool direct)
+int do_sd_read_object(struct node_id *nid, uint64_t oid, void *data,
+ unsigned int datalen, uint64_t offset, bool direct)
{
struct sd_req hdr;
struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
@@ -70,7 +70,7 @@ int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
if (direct)
hdr.flags |= SD_FLAG_CMD_DIRECT;
- ret = dog_exec_req(&sd_nid, &hdr, data);
+ ret = dog_exec_req(nid, &hdr, data);
if (ret < 0) {
sd_err("Failed to read object %" PRIx64, oid);
return SD_RES_EIO;
@@ -85,6 +85,12 @@ int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
return SD_RES_SUCCESS;
}
+int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
+ uint64_t offset, bool direct)
+{
+ return do_sd_read_object(&sd_nid, oid, data, datalen, offset, direct);
+}
+
int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data,
unsigned int datalen, uint64_t offset, uint32_t flags,
uint8_t copies, uint8_t copy_policy, bool create,
diff --git a/dog/dog.h b/dog/dog.h
index af3cefa..f46297b 100644
--- a/dog/dog.h
+++ b/dog/dog.h
@@ -71,6 +71,8 @@ typedef void (*vdi_parser_func_t)(uint32_t vid, const char *name,
uint32_t flags,
const struct sd_inode *i, void *data);
int parse_vdi(vdi_parser_func_t func, size_t size, void *data);
+int do_sd_read_object(struct node_id *nid, uint64_t oid, void *data,
+ unsigned int datalen, uint64_t offset, bool direct);
int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
uint64_t offset, bool direct);
int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data,
diff --git a/dog/vdi.c b/dog/vdi.c
index bfee11f..909de35 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -532,6 +532,114 @@ static int read_vdi_obj(const char *vdiname, int snapid, const char *tag,
return EXIT_SUCCESS;
}
+/*
+ * Read a VDI's inode object from a replica whose digest is shared by a strict
+ * majority of the live replicas; exits the process on a failed digest query.
+ */
+static int read_majority_vdi_obj(const char *vdiname, int snapid,
+				 const char *tag, uint32_t *pvid,
+				 struct sd_inode *inode, size_t size)
+{
+	int ret;
+	uint32_t vid;
+	uint64_t oid;
+	const struct sd_vnode *vnodes[SD_MAX_COPIES];
+	/* FIXME: handle a case of inode->nr_copies != SD_DEFAULT_COPIES */
+	struct {
+		bool found;
+		uint8_t digest[20];
+	} results[SD_DEFAULT_COPIES];
+	int count = 0, nr_live_copies = 0, nr_votes = 0, majority_idx = -1;
+
+	ret = find_vdi_name(vdiname, snapid, tag, &vid, 0);
+	if (ret < 0) {
+		sd_err("Failed to open VDI %s", vdiname);
+		return EXIT_FAILURE;
+	}
+
+	oid = vid_to_vdi_oid(vid);
+	oid_to_vnodes(oid, &sd_vroot, SD_DEFAULT_COPIES, vnodes);
+
+	/* collect the digest of every replica that still exists */
+	for (int i = 0; i < SD_DEFAULT_COPIES; i++) {
+		struct sd_req hdr;
+		struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+
+		sd_init_req(&hdr, SD_OP_GET_HASH);
+		hdr.obj.oid = oid;
+		hdr.obj.tgt_epoch = sd_epoch;
+		ret = dog_exec_req(&vnodes[i]->node->nid, &hdr, NULL);
+		if (ret < 0)
+			exit(EXIT_SYSFAIL);
+		switch (rsp->result) {
+		case SD_RES_SUCCESS:
+			results[i].found = true;
+			memcpy(results[i].digest, rsp->hash.digest,
+			       sizeof(results[i].digest));
+			break;
+		case SD_RES_NO_OBJ:
+			results[i].found = false;
+			break;
+		default:
+			sd_err("failed to read %" PRIx64 " from %s, %s", oid,
+			       addr_to_str(vnodes[i]->node->nid.addr,
+					   vnodes[i]->node->nid.port),
+			       sd_strerror(rsp->result));
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	/* Boyer Moore MJRTY, pass 1: pick the only possible candidate */
+	for (int i = 0; i < SD_DEFAULT_COPIES; i++) {
+		if (!results[i].found)
+			continue;
+		nr_live_copies++;
+		if (!count)
+			majority_idx = i;
+		if (!memcmp(results[majority_idx].digest, results[i].digest,
+			    sizeof(results[majority_idx].digest)))
+			count++;
+		else
+			count--;
+	}
+
+	if (majority_idx == -1) {
+		sd_err("no inode object (%" PRIx64 ") found", oid);
+		return EXIT_FAILURE;
+	}
+
+	/* pass 2: digests A, B, C leave C as candidate, so count its votes */
+	for (int i = 0; i < SD_DEFAULT_COPIES; i++) {
+		if (results[i].found &&
+		    !memcmp(results[majority_idx].digest, results[i].digest,
+			    sizeof(results[majority_idx].digest)))
+			nr_votes++;
+	}
+	if (nr_votes * 2 <= nr_live_copies) {
+		sd_err("no majority inode object (%" PRIx64 ") found", oid);
+		return EXIT_FAILURE;
+	}
+
+	ret = do_sd_read_object((struct node_id *)
+				&vnodes[majority_idx]->node->nid, oid, inode,
+				size, 0, true);
+	if (ret != SD_RES_SUCCESS) {
+		if (snapid) {
+			sd_err("Failed to read a snapshot %s:%d", vdiname,
+			       snapid);
+		} else if (tag && tag[0]) {
+			sd_err("Failed to read a snapshot %s:%s", vdiname, tag);
+		} else {
+			sd_err("Failed to read a vdi %s", vdiname);
+		}
+		return EXIT_FAILURE;
+	}
+
+	if (pvid)
+		*pvid = vid;
+	return EXIT_SUCCESS;
+}
+
int do_vdi_create(const char *vdiname, int64_t vdi_size,
uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
uint8_t nr_copies, uint8_t copy_policy, uint8_t store_policy)
@@ -1880,9 +1988,9 @@ static int vdi_check(int argc, char **argv)
int ret;
struct sd_inode *inode = xmalloc(sizeof(*inode));
- ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id,
- vdi_cmd_data.snapshot_tag, NULL, inode,
- SD_INODE_SIZE);
+ ret = read_majority_vdi_obj(vdiname, vdi_cmd_data.snapshot_id,
+ vdi_cmd_data.snapshot_tag, NULL, inode,
+ SD_INODE_SIZE);
if (ret != EXIT_SUCCESS) {
sd_err("FATAL: no inode objects");
return ret;
diff --git a/tests/functional/077 b/tests/functional/077
index f2c2211..7f14b8c 100755
--- a/tests/functional/077
+++ b/tests/functional/077
@@ -68,6 +68,20 @@ $DOG vdi read test 0 14
$DOG cluster shutdown
+# single inode object lost
+
+rm $STORE/0/obj/807c2b2500000000
+
+for i in `seq 0 2`; do
+ _start_sheep $i
+done
+
+_wait_for_sheep 3
+
+$DOG vdi check test
+
+$DOG cluster shutdown
+
# single object lost, single broken object. no majority
rm $STORE/0/obj/007c2b2500000000
@@ -82,3 +96,4 @@ _wait_for_sheep 3
$DOG vdi check test
$DOG cluster shutdown
+
diff --git a/tests/functional/077.out b/tests/functional/077.out
index 5bddb9b..e76b1ee 100644
--- a/tests/functional/077.out
+++ b/tests/functional/077.out
@@ -12,5 +12,7 @@ fixed missing 7c2b2500000000
fixed missing 7c2b2500000000
finish check&repair test
original data
+fixed missing 807c2b2500000000
+finish check&repair test
no majority of 7c2b2500000000
finish check&repair test
--
1.8.1.2
More information about the sheepdog
mailing list