From: Yunkai Zhang <qiushu.zyk at taobao.com> Reading all of a vdi's objects from the cluster when checking them wastes a lot of network bandwidth. Instead, let's calculate the checksum of the objects in the backend and send only the checksum result to the collie client. I also think that repairing objects automatically is dangerous, as we don't know which replica is correct. In order to give the user a chance to check them if necessary, I add a new option: '-F (--force_repair)'. By default, this command only checks and does not repair (as the command name implies). After adding the '-F' flag, the help looks like: $ collie vdi check Usage: collie vdi check [-F] [-s snapshot] [-a address] [-p port] [-h] <vdiname> Options: -F, --force_repair force repair object's copies (dangerous) -s, --snapshot specify a snapshot id or tag name -a, --address specify the daemon address (default: localhost) -p, --port specify the daemon port -h, --help display this help and exit Here are some examples of executing this command: * Success: $ collie vdi check test.img CHECKING VDI:test.img ... PASSED * Failure (by default not repair): $ collie vdi check test.img CHECKING VDI:test.img ... Failed oid: 9c5e6800000001 >> copy[0], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7000 >> copy[1], sha1: 46dbc769de60a508faf134c6d51926741c0e38fa, from: 127.0.0.1:7001 >> copy[2], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7004 FAILED With the output shown above, the user can check all copies of this object and decide which one is correct (I plan to add a new option, '--oid', to 'collie vdi read' in another patch, so that the user can specify which copy of the object is to be exported: $ collie vdi read test.img --oid 9c5e6800000001 at 127.0.0.1:7001 > foo.img By testing foo.img, we can know which copy is correct). The user can force a repair by specifying the -F or --force_repair flag: * Force repair: $ collie vdi check -F test.img CHECKING VDI:test.img ... 
Failed oid: 9c5e6800000001 >> copy[0], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7000 >> copy[1], sha1: 46dbc769de60a508faf134c6d51926741c0e38fa, from: 127.0.0.1:7001 >> copy[2], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7004 >> force repairing ... >> copy this object from 127.0.0.1:7000 => 127.0.0.1:7001 >> copy this object from 127.0.0.1:7000 => 127.0.0.1:7004 >> repair finished FORCE REPAIRED Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com> --- collie/collie.c | 1 + collie/vdi.c | 175 ++++++++++++++++++++++++-------------------- include/internal_proto.h | 6 +- include/sheep.h | 16 ++++ sheep/farm/farm.h | 1 - sheep/farm/sha1_file.c | 15 ---- sheep/ops.c | 187 ++++++++++++++++++++++++++++++++++++++++------- sheep/sheep_priv.h | 2 + 8 files changed, 277 insertions(+), 126 deletions(-) diff --git a/collie/collie.c b/collie/collie.c index b112b73..fcd44f1 100644 --- a/collie/collie.c +++ b/collie/collie.c @@ -38,6 +38,7 @@ static const struct sd_option collie_options[] = { {'s', "snapshot", 1, "specify a snapshot id or tag name"}, {'x', "exclusive", 0, "write in an exclusive mode"}, {'d', "delete", 0, "delete a key"}, + {'F', "force_repair", 0, "force repair object's copies (dangerous)"}, /* cluster options */ {'b', "store", 1, "specify backend store"}, diff --git a/collie/vdi.c b/collie/vdi.c index e4f4f65..dbebc23 100644 --- a/collie/vdi.c +++ b/collie/vdi.c @@ -23,6 +23,7 @@ struct vdi_cmd_data { int exclusive; int delete; int prealloc; + int force_repair; } vdi_cmd_data = { ~0, }; struct get_vdi_info { @@ -1304,126 +1305,126 @@ out: return ret; } -static void *read_object_from(struct sd_vnode *vnode, uint64_t oid) +static void get_obj_checksum_from(struct sd_vnode *vnode, uint64_t oid, + unsigned char *sha1) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; - int fd, ret; - unsigned wlen = 0, rlen = SD_DATA_OBJ_SIZE; - char name[128]; - void *buf; + char host[128]; + int ret; - buf = 
malloc(SD_DATA_OBJ_SIZE); - if (!buf) { - fprintf(stderr, "Failed to allocate memory\n"); - exit(EXIT_SYSFAIL); - } + addr_to_str(host, sizeof(host), vnode->nid.addr, 0); - addr_to_str(name, sizeof(name), vnode->nid.addr, 0); - fd = connect_to(name, vnode->nid.port); - if (fd < 0) { - fprintf(stderr, "failed to connect to %s:%"PRIu32"\n", - name, vnode->nid.port); - exit(EXIT_FAILURE); - } - - sd_init_req(&hdr, SD_OP_READ_PEER); + sd_init_req(&hdr, SD_OP_CALC_CHKSUM_PEER); hdr.epoch = sd_epoch; - hdr.flags = 0; - hdr.data_length = rlen; - hdr.obj.oid = oid; - ret = exec_req(fd, &hdr, buf, &wlen, &rlen); - close(fd); - - if (ret) { - fprintf(stderr, "Failed to execute request\n"); - exit(EXIT_FAILURE); - } - - if (rsp->result != SD_RES_SUCCESS) { + ret = send_light_req(&hdr, host, vnode->nid.port); + if (ret || rsp->result != SD_RES_SUCCESS) { fprintf(stderr, "Failed to read, %s\n", sd_strerror(rsp->result)); exit(EXIT_FAILURE); } - return buf; + + memcpy(sha1, (unsigned char *)&rsp->__pad[0], SHA1_LEN); } -static void write_object_to(struct sd_vnode *vnode, uint64_t oid, void *buf) +static int do_repair(uint64_t oid, struct node_id *src, struct node_id *dest) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; + unsigned rlen, wlen; + char host[128]; int fd, ret; - unsigned wlen = SD_DATA_OBJ_SIZE, rlen = 0; - char name[128]; - addr_to_str(name, sizeof(name), vnode->nid.addr, 0); - fd = connect_to(name, vnode->nid.port); + addr_to_str(host, sizeof(host), dest->addr, 0); + + fd = connect_to(host, dest->port); if (fd < 0) { - fprintf(stderr, "failed to connect to %s:%"PRIu32"\n", - name, vnode->nid.port); - exit(EXIT_FAILURE); + fprintf(stderr, "Failed to connect\n"); + return SD_RES_EIO; } - sd_init_req(&hdr, SD_OP_WRITE_PEER); - hdr.epoch = sd_epoch; - hdr.flags = SD_FLAG_CMD_WRITE; - hdr.data_length = wlen; + sd_init_req(&hdr, SD_OP_REPAIR_OBJ_PEER); + rlen = 0; + wlen = sizeof(*src); + + hdr.epoch = sd_epoch; hdr.obj.oid = oid; + hdr.data_length = 
wlen; + hdr.flags = SD_FLAG_CMD_WRITE; - ret = exec_req(fd, &hdr, buf, &wlen, &rlen); + ret = exec_req(fd, &hdr, src, &wlen, &rlen); close(fd); - if (ret) { - fprintf(stderr, "Failed to execute request\n"); - exit(EXIT_FAILURE); + fprintf(stderr, "Failed to repair oid:%"PRIx64"\n", oid); + return SD_RES_EIO; } - if (rsp->result != SD_RES_SUCCESS) { - fprintf(stderr, "Failed to read, %s\n", - sd_strerror(rsp->result)); - exit(EXIT_FAILURE); + fprintf(stderr, "Failed to repair oid:%"PRIx64", %s\n", + oid, sd_strerror(rsp->result)); + return rsp->result; } + + return SD_RES_SUCCESS; } -/* - * Fix consistency of the replica of oid. - * - * XXX: The fix is rather dumb, just read the first copy and write it - * to other replica. - */ -static void do_check_repair(uint64_t oid, int nr_copies) +static int do_check_repair(uint64_t oid, int nr_copies) { struct sd_vnode *tgt_vnodes[nr_copies]; - void *buf, *buf_cmp; - int i; + unsigned char sha1[SD_MAX_COPIES][SHA1_LEN]; + char host[128]; + int i, j; oid_to_vnodes(sd_vnodes, sd_vnodes_nr, oid, nr_copies, tgt_vnodes); - buf = read_object_from(tgt_vnodes[0], oid); - for (i = 1; i < nr_copies; i++) { - buf_cmp = read_object_from(tgt_vnodes[i], oid); - if (memcmp(buf, buf_cmp, SD_DATA_OBJ_SIZE)) { - free(buf_cmp); - goto fix_consistency; + for (i = 0; i < nr_copies; i++) { + get_obj_checksum_from(tgt_vnodes[i], oid, sha1[i]); + } + + for (i = 0; i < nr_copies; i++) { + for (j = (i + 1); j < nr_copies; j++) { + if (memcmp(sha1[i], sha1[j], SHA1_LEN)) + goto diff; } - free(buf_cmp); } - free(buf); - return; + return 0; -fix_consistency: - for (i = 1; i < nr_copies; i++) - write_object_to(tgt_vnodes[i], oid, buf); - fprintf(stdout, "fix %"PRIx64" success\n", oid); - free(buf); +diff: + fprintf(stderr, "Failed oid: %"PRIx64"\n", oid); + for (i = 0; i < nr_copies; i++) { + addr_to_str(host, sizeof(host), tgt_vnodes[i]->nid.addr, 0); + fprintf(stderr, ">> copy[%d], sha1: %s, from: %s:%d\n", + i, sha1_to_hex(sha1[i]), host, 
tgt_vnodes[i]->nid.port); + } + + if (!vdi_cmd_data.force_repair) + return -1; + + /* + * Force repair the consistency of oid's replica + * + * FIXME: this fix is rather dumb, it just read the + * first copy and write it to other replica, + */ + fprintf(stderr, ">> force repairing ...\n"); + addr_to_str(host, sizeof(host), tgt_vnodes[0]->nid.addr, + tgt_vnodes[0]->nid.port); + for (i = 1; i< nr_copies; i++) { + char dest[128]; + addr_to_str(dest, sizeof(dest), tgt_vnodes[i]->nid.addr, + tgt_vnodes[i]->nid.port); + fprintf(stderr, ">> copy this object from %s => %s\n", host, dest); + do_repair(oid, &tgt_vnodes[0]->nid, &tgt_vnodes[i]->nid); + } + fprintf(stderr, ">> repair finished\n"); + return -1; } static int check_repair_vdi(uint32_t vid) { struct sheepdog_inode *inode; - int ret; + int ret, failed = 0; uint64_t total, done = 0, oid; uint32_t idx = 0, dvid; @@ -1432,22 +1433,29 @@ static int check_repair_vdi(uint32_t vid) fprintf(stderr, "Failed to allocate memory\n"); return EXIT_SYSFAIL; } + ret = sd_read_object(vid_to_vdi_oid(vid), inode, SD_INODE_SIZE, 0); if (ret != SD_RES_SUCCESS) { fprintf(stderr, "Failed to read an inode\n"); return EXIT_FAILURE; } + if (do_check_repair(vid_to_vdi_oid(vid), inode->nr_copies)) + failed = 1; + total = inode->vdi_size; while(done < total) { dvid = inode->data_vdi_id[idx]; if (dvid) { oid = vid_to_data_oid(dvid, idx); - do_check_repair(oid, inode->nr_copies); + if (do_check_repair(oid, inode->nr_copies)) + failed = 1; } done += SD_DATA_OBJ_SIZE; idx++; } + if (failed) + return EXIT_FAILURE; return EXIT_SUCCESS; } @@ -1466,18 +1474,22 @@ static int vdi_check(int argc, char **argv) goto out; } + printf("CHECKING VDI:%s ...\n", vdiname); ret = check_repair_vdi(vid); - if (ret != EXIT_SUCCESS) + if (ret != EXIT_SUCCESS) { + printf("%s\n", + vdi_cmd_data.force_repair?"FORCE REPAIRED":"FAILED"); goto out; + } - fprintf(stdout, "finish check&repair %s\n", vdiname); + printf("PASSED\n"); return EXIT_SUCCESS; out: return ret; } 
static struct subcommand vdi_cmd[] = { - {"check", "<vdiname>", "saph", "check and repair image's consistency", + {"check", "<vdiname>", "Fsaph", "check and repair image's consistency", NULL, SUBCMD_FLAG_NEED_NODELIST|SUBCMD_FLAG_NEED_THIRD_ARG, vdi_check}, {"create", "<vdiname> <size>", "Paph", "create an image", NULL, SUBCMD_FLAG_NEED_NODELIST|SUBCMD_FLAG_NEED_THIRD_ARG, vdi_create}, @@ -1533,6 +1545,9 @@ static int vdi_parser(int ch, char *opt) sizeof(vdi_cmd_data.snapshot_tag)); } break; + case 'F': + vdi_cmd_data.force_repair = 1; + break; case 'x': vdi_cmd_data.exclusive = 1; break; diff --git a/include/internal_proto.h b/include/internal_proto.h index 83d98f1..3d1654c 100644 --- a/include/internal_proto.h +++ b/include/internal_proto.h @@ -61,8 +61,10 @@ #define SD_OP_REMOVE_PEER 0xA6 #define SD_OP_SET_CACHE_SIZE 0xA7 #define SD_OP_ENABLE_RECOVER 0xA8 -#define SD_OP_DISABLE_RECOVER 0xA9 -#define SD_OP_INFO_RECOVER 0xAA +#define SD_OP_DISABLE_RECOVER 0xA9 +#define SD_OP_INFO_RECOVER 0xAA +#define SD_OP_CALC_CHKSUM_PEER 0xAB +#define SD_OP_REPAIR_OBJ_PEER 0xAC /* internal flags for hdr.flags, must be above 0x80 */ #define SD_FLAG_CMD_RECOVERY 0x0080 diff --git a/include/sheep.h b/include/sheep.h index 719d18f..54b6eb3 100644 --- a/include/sheep.h +++ b/include/sheep.h @@ -279,4 +279,20 @@ static inline int nodes_to_vnodes(struct sd_node *nodes, int nr, return nr_vnodes; } +static inline char *sha1_to_hex(const unsigned char *sha1) +{ + static char buffer[50]; + static const char hex[] = "0123456789abcdef"; + char *buf = buffer; + int i; + + for (i = 0; i < SHA1_LEN; i++) { + unsigned int val = *sha1++; + *buf++ = hex[val >> 4]; + *buf++ = hex[val & 0xf]; + } + buffer[2 * SHA1_LEN] = 0; + return buffer; +} + #endif diff --git a/sheep/farm/farm.h b/sheep/farm/farm.h index 27e65cd..d0b635a 100644 --- a/sheep/farm/farm.h +++ b/sheep/farm/farm.h @@ -53,7 +53,6 @@ extern char farm_obj_dir[PATH_MAX]; extern char *sha1_to_path(const unsigned char *sha1); extern int 
sha1_file_write(unsigned char *buf, unsigned len, unsigned char *outsha1); extern void *sha1_file_read(const unsigned char *sha1, struct sha1_file_hdr *hdr); -extern char *sha1_to_hex(const unsigned char *sha1); extern int get_sha1_hex(const char *hex, unsigned char *sha1); extern int sha1_file_try_delete(const unsigned char *sha1); diff --git a/sheep/farm/sha1_file.c b/sheep/farm/sha1_file.c index b0abc16..3cd4f40 100644 --- a/sheep/farm/sha1_file.c +++ b/sheep/farm/sha1_file.c @@ -257,18 +257,3 @@ int get_sha1_hex(const char *hex, unsigned char *sha1) } return 0; } - -char *sha1_to_hex(const unsigned char *sha1) -{ - static char buffer[50]; - static const char hex[] = "0123456789abcdef"; - char *buf = buffer; - int i; - - for (i = 0; i < SHA1_LEN; i++) { - unsigned int val = *sha1++; - *buf++ = hex[val >> 4]; - *buf++ = hex[val & 0xf]; - } - return buffer; -} diff --git a/sheep/ops.c b/sheep/ops.c index 8ca8748..4fdded8 100644 --- a/sheep/ops.c +++ b/sheep/ops.c @@ -26,6 +26,7 @@ #include "strbuf.h" #include "trace/trace.h" #include "util.h" +#include "sha1.h" enum sd_op_type { SD_OP_TYPE_CLUSTER = 1, /* cluster operations */ @@ -625,6 +626,51 @@ static int local_kill_node(const struct sd_req *req, struct sd_rsp *rsp, return SD_RES_SUCCESS; } +static inline int read_object_from(struct node_id *src, uint32_t epoch, + uint64_t oid, char *buf) +{ + struct sd_req hdr; + struct sd_rsp *rsp = (struct sd_rsp *)&hdr; + int fd, ret; + unsigned wlen, rlen; + char host[128]; + + addr_to_str(host, sizeof(host), src->addr, 0); + + fd = connect_to(host, src->port); + if (fd < 0) { + dprintf("failed to connect to %s:%"PRIu32"\n", + host, src->port); + return SD_RES_NETWORK_ERROR; + } + + rlen = SD_DATA_OBJ_SIZE; + if (is_vdi_obj(oid)) + rlen = SD_INODE_SIZE; + wlen = 0; + + sd_init_req(&hdr, SD_OP_READ_PEER); + hdr.epoch = epoch; + hdr.data_length = rlen; + + hdr.obj.oid = oid; + hdr.obj.offset = 0; + + ret = exec_req(fd, &hdr, buf, &wlen, &rlen); + close(fd); + if (ret) { + 
dprintf("Failed to execute request\n"); + return SD_RES_NETWORK_ERROR; + } + + if (rsp->result != SD_RES_SUCCESS) { + dprintf("Failed to read, %s\n", sd_strerror(rsp->result)); + return SD_RES_NETWORK_ERROR; + } + + return SD_RES_SUCCESS; +} + static int read_copy_from_replica(struct vnode_info *vnodes, uint32_t epoch, uint64_t oid, char *buf) { @@ -666,9 +712,6 @@ static int read_copy_from_replica(struct vnode_info *vnodes, uint32_t epoch, rounded_rand = random() % nr_copies; for (i = 0; i < nr_copies; i++) { - unsigned wlen, rlen; - int fd; - j = (i + rounded_rand) % nr_copies; /* bypass the local copy */ @@ -676,32 +719,9 @@ static int read_copy_from_replica(struct vnode_info *vnodes, uint32_t epoch, continue; v = obj_vnodes[j]; - addr_to_str(name, sizeof(name), v->nid.addr, 0); - - fd = connect_to(name, v->nid.port); - if (fd < 0) - continue; - rlen = SD_DATA_OBJ_SIZE; - wlen = 0; - - sd_init_req(&hdr, SD_OP_READ_PEER); - hdr.epoch = epoch; - hdr.data_length = rlen; - - hdr.obj.oid = oid; - hdr.obj.offset = 0; - - ret = exec_req(fd, &hdr, buf, &wlen, &rlen); - - close(fd); - - if (ret) { - dprintf("%x, %x\n", ret, rsp->result); - continue; - } - - if (rsp->result == SD_RES_SUCCESS) + ret = read_object_from(&v->nid, epoch, oid, buf); + if (ret == SD_RES_SUCCESS) break; } @@ -748,6 +768,60 @@ out: return ret; } +int peer_calc_obj_chksum(struct request *req) +{ + struct sd_req *hdr = &req->rq; + struct sd_rsp *rsp = &req->rp; + uint32_t epoch = hdr->epoch; + unsigned char sha1[SHA1_LEN]; + struct siocb iocb; + struct sha1_ctx ctx; + void *buf; + int ret; + + if (sys->gateway_only) + return SD_RES_NO_OBJ; + + hdr->data_length = SD_DATA_OBJ_SIZE; + if (is_vdi_obj(hdr->obj.oid)) + hdr->data_length = SD_INODE_SIZE; + + buf = valloc(hdr->data_length); + req->data = buf; + hdr->obj.offset = 0; + + if (sys->enable_write_cache && !req->local + && !bypass_object_cache(req)) { + ret = object_cache_handle_request(req); + if (ret != SD_RES_SUCCESS) + goto out; + + goto 
checksum; + } + + memset(&iocb, 0, sizeof(iocb)); + iocb.epoch = epoch; + iocb.flags = hdr->flags; + iocb.buf = req->data; + iocb.length = hdr->data_length; + iocb.offset = hdr->obj.offset; + ret = sd_store->read(hdr->obj.oid, &iocb); + if (ret != SD_RES_SUCCESS) + goto out; +checksum: + sha1_init(&ctx); + sha1_update(&ctx, buf, hdr->data_length); + sha1_final(&ctx, sha1); + memcpy(&rsp->__pad[0], sha1, SHA1_LEN); +out: + req->data = NULL; + req->data_length = 0; + hdr->data_length = 0; + rsp->data_length = 0; + free(buf); + return ret; +} + static int do_write_obj(struct siocb *iocb, struct sd_req *hdr, uint32_t epoch, void *data, int create) { @@ -777,6 +851,50 @@ static int do_write_obj(struct siocb *iocb, struct sd_req *hdr, uint32_t epoch, return ret; } +int peer_repair_obj(struct request *req) +{ + struct sd_req whdr, *hdr = &req->rq; + uint32_t epoch = hdr->epoch; + uint64_t oid = hdr->obj.oid; + struct node_id *src; + struct siocb iocb; + char host[128]; + int ret, len; + void *buf; + + if (sys->gateway_only) + return SD_RES_NO_OBJ; + + src = (struct node_id *)req->data; + addr_to_str(host, sizeof(host), src->addr, src->port); + + dprintf("oid:%"PRIx64", from:%s\n", oid, host); + + len = SD_DATA_OBJ_SIZE; + if (is_vdi_obj(hdr->obj.oid)) + len = SD_INODE_SIZE; + + buf = valloc(len); + if (!buf) { + eprintf("can not allocate memory\n"); + return SD_RES_NO_SPACE; + } + + ret = read_object_from(src, epoch, oid, buf); + if (ret != SD_RES_SUCCESS) + return ret; + + memset(&whdr, 0, sizeof(whdr)); + memset(&iocb, 0, sizeof(iocb)); + + whdr.data_length = len; + whdr.obj.offset = 0; + whdr.obj.oid = oid; + whdr.epoch = epoch; + iocb.epoch = epoch; + return do_write_obj(&iocb, &whdr, epoch, buf, 0); +} + int peer_write_obj(struct request *req) { struct sd_req *hdr = &req->rq; @@ -1084,11 +1202,24 @@ static struct sd_op_template sd_ops[] = { .type = SD_OP_TYPE_CLUSTER, .process_main = cluster_disable_recover, }, + [SD_OP_INFO_RECOVER] = { .name = "INFO_RECOVER", .type 
= SD_OP_TYPE_LOCAL, .process_main = local_info_recover, }, + + [SD_OP_CALC_CHKSUM_PEER] = { + .name = "CALC_CHKSUM_PEER", + .type = SD_OP_TYPE_PEER, + .process_work = peer_calc_obj_chksum, + }, + + [SD_OP_REPAIR_OBJ_PEER] = { + .name = "REPAIR_OBJ_PEER", + .type = SD_OP_TYPE_PEER, + .process_work = peer_repair_obj, + }, }; struct sd_op_template *get_sd_op(uint8_t opcode) diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h index 857cf87..35fc292 100644 --- a/sheep/sheep_priv.h +++ b/sheep/sheep_priv.h @@ -325,6 +325,8 @@ int peer_read_obj(struct request *req); int peer_write_obj(struct request *req); int peer_create_and_write_obj(struct request *req); int peer_remove_obj(struct request *req); +int peer_calc_obj_chksum(struct request *req); +int peer_repair_obj(struct request *req); /* object_cache */ -- 1.7.11.2 |