From: Yunkai Zhang <qiushu.zyk at taobao.com> V2: - use '-R, --repair' instead of '-F, --force_repair' - do not connect to target sheep directly - define a member name instead of using __pad. - update this commit log ---------------------------------------------- >8 Reading all of vdi's objects from cluster when checking them will lead to a lot of waste of network bandwidth, let's calculate the checksum of objects in backend and only send the checksum result to the collie client. And I think repairing object automatically is dangerous, as we don't know which replica is correct. In order to let user have a chance to check them if necessary, I add a new option: '-R, --repair'. By default, this command just does a check, not a repair (as the command name implies). After adding the '-R' flag, the help looks like: $ collie vdi check Usage: collie vdi check [-R] [-s snapshot] [-a address] [-p port] [-h] <vdiname> Options: -R, --repair force repair object's copies (dangerous) -s, --snapshot specify a snapshot id or tag name -a, --address specify the daemon address (default: localhost) -p, --port specify the daemon port -h, --help display this help and exit Let's show some examples when executing this command: * Success: $ collie vdi check test.img CHECKING VDI:test.img ... PASSED * Failure (by default not repair): $ collie vdi check test.img CHECKING VDI:test.img ... 
Failed oid: 9c5e6800000001 >> copy[0], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7000 >> copy[1], sha1: 46dbc769de60a508faf134c6d51926741c0e38fa, from: 127.0.0.1:7001 >> copy[2], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7004 FAILED With the output shown above, user can check all copies of this object and decide which one is correct (I plan to add a new option: '--oid' to 'collie vdi read' in another patch, so that user can specify which copy of object to be exported: $ collie vdi read test.img --oid 9c5e6800000001 at 127.0.0.1:7001 > foo.img By testing foo.img, we can know which copy is correct). User can do force repair by specifying the -R or --repair flag: * Force repair: $ collie vdi check -R test.img CHECKING VDI:test.img ... Failed oid: 9c5e6800000001 >> copy[0], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7000 >> copy[1], sha1: 46dbc769de60a508faf134c6d51926741c0e38fa, from: 127.0.0.1:7001 >> copy[2], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7004 >> repairing ... 
>> copy this object from 127.0.0.1:7000 => 127.0.0.1:7001 >> copy this object from 127.0.0.1:7000 => 127.0.0.1:7004 >> repair finished REPAIRED Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com> --- collie/common.c | 6 +- collie/vdi.c | 185 ++++++++++++++++++++++++++------------------- include/internal_proto.h | 22 +++++- include/sheep.h | 16 ++++ sheep/farm/farm.h | 1 - sheep/farm/sha1_file.c | 15 ---- sheep/gateway.c | 76 +++++++++++++++++++ sheep/ops.c | 191 ++++++++++++++++++++++++++++++++++++++++------- sheep/sheep_priv.h | 4 + 9 files changed, 390 insertions(+), 126 deletions(-) diff --git a/collie/common.c b/collie/common.c index f885c8c..83a2c3d 100644 --- a/collie/common.c +++ b/collie/common.c @@ -207,8 +207,7 @@ int send_light_req_get_response(struct sd_req *hdr, const char *host, int port) ret = exec_req(fd, hdr, NULL, &wlen, &rlen); close(fd); if (ret) { - fprintf(stderr, "failed to connect to %s:%d\n", - host, port); + dprintf("failed to connect to %s:%d\n", host, port); return -1; } @@ -229,8 +228,7 @@ int send_light_req(struct sd_req *hdr, const char *host, int port) return -1; if (ret != SD_RES_SUCCESS) { - fprintf(stderr, "Response's result: %s\n", - sd_strerror(ret)); + dprintf("Response's result: %s\n", sd_strerror(ret)); return -1; } diff --git a/collie/vdi.c b/collie/vdi.c index d27b5af..b479efd 100644 --- a/collie/vdi.c +++ b/collie/vdi.c @@ -23,6 +23,7 @@ static struct sd_option vdi_options[] = { {'x', "exclusive", 0, "write in an exclusive mode"}, {'d', "delete", 0, "delete a key"}, {'C', "cache", 0, "enable object cache"}, + {'R', "repair", 0, "force repair object's copies (dangerous)"}, { 0, NULL, 0, NULL }, }; @@ -35,6 +36,7 @@ struct vdi_cmd_data { int delete; int prealloc; int cache; + int repair; } vdi_cmd_data = { ~0, }; struct get_vdi_info { @@ -1320,126 +1322,143 @@ out: return ret; } -static void *read_object_from(struct sd_vnode *vnode, uint64_t oid) +static void get_obj_checksum_from(struct sd_vnode *vnode, uint64_t oid, + 
unsigned char *sha1) { struct sd_req hdr; - struct sd_rsp *rsp = (struct sd_rsp *)&hdr; + struct sd_checksum_rsp *rsp = (struct sd_checksum_rsp *)&hdr; + unsigned rlen, wlen; int fd, ret; - unsigned wlen = 0, rlen = SD_DATA_OBJ_SIZE; - char name[128]; - void *buf; - - buf = malloc(SD_DATA_OBJ_SIZE); - if (!buf) { - fprintf(stderr, "Failed to allocate memory\n"); - exit(EXIT_SYSFAIL); - } - addr_to_str(name, sizeof(name), vnode->nid.addr, 0); - fd = connect_to(name, vnode->nid.port); + fd = connect_to(sdhost, sdport); if (fd < 0) { - fprintf(stderr, "failed to connect to %s:%"PRIu32"\n", - name, vnode->nid.port); - exit(EXIT_FAILURE); + fprintf(stderr, "Failed to connect\n"); + return; } - sd_init_req(&hdr, SD_OP_READ_PEER); - hdr.epoch = sd_epoch; - hdr.flags = 0; - hdr.data_length = rlen; + sd_init_req(&hdr, SD_OP_CALC_CHKSUM); + rlen = 0; + wlen = sizeof(vnode->nid); + + hdr.epoch = sd_epoch; hdr.obj.oid = oid; + hdr.data_length = wlen; + hdr.flags = SD_FLAG_CMD_WRITE; - ret = exec_req(fd, &hdr, buf, &wlen, &rlen); + ret = exec_req(fd, &hdr, &vnode->nid, &wlen, &rlen); close(fd); - if (ret) { - fprintf(stderr, "Failed to execute request\n"); - exit(EXIT_FAILURE); + fprintf(stderr, "Failed to repair oid:%"PRIx64"\n", oid); + return; } - if (rsp->result != SD_RES_SUCCESS) { - fprintf(stderr, "Failed to read, %s\n", - sd_strerror(rsp->result)); - exit(EXIT_FAILURE); + fprintf(stderr, "Failed to repair oid:%"PRIx64", %s\n", + oid, sd_strerror(rsp->result)); + return; } - return buf; + + memcpy(sha1, rsp->sha1, SHA1_LEN); + return; } -static void write_object_to(struct sd_vnode *vnode, uint64_t oid, void *buf) +static int do_repair(uint64_t oid, struct node_id *src, struct node_id *dest) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; + unsigned rlen, wlen; + struct node_id nids[2]; int fd, ret; - unsigned wlen = SD_DATA_OBJ_SIZE, rlen = 0; - char name[128]; - addr_to_str(name, sizeof(name), vnode->nid.addr, 0); - fd = connect_to(name, 
vnode->nid.port); + fd = connect_to(sdhost, sdport); if (fd < 0) { - fprintf(stderr, "failed to connect to %s:%"PRIu32"\n", - name, vnode->nid.port); - exit(EXIT_FAILURE); + fprintf(stderr, "Failed to connect\n"); + return SD_RES_EIO; } - sd_init_req(&hdr, SD_OP_WRITE_PEER); + sd_init_req(&hdr, SD_OP_REPAIR_OBJ); + + rlen = 0; + wlen = sizeof(nids); + hdr.epoch = sd_epoch; - hdr.flags = SD_FLAG_CMD_WRITE; + hdr.obj.oid = oid; hdr.data_length = wlen; + hdr.flags = SD_FLAG_CMD_WRITE; - hdr.obj.oid = oid; + nids[0] = *src; + nids[1] = *dest; - ret = exec_req(fd, &hdr, buf, &wlen, &rlen); + ret = exec_req(fd, &hdr, nids, &wlen, &rlen); close(fd); - if (ret) { - fprintf(stderr, "Failed to execute request\n"); - exit(EXIT_FAILURE); + fprintf(stderr, "Failed to repair oid:%"PRIx64"\n", oid); + return SD_RES_EIO; } - if (rsp->result != SD_RES_SUCCESS) { - fprintf(stderr, "Failed to read, %s\n", - sd_strerror(rsp->result)); - exit(EXIT_FAILURE); + fprintf(stderr, "Failed to repair oid:%"PRIx64", %s\n", + oid, sd_strerror(rsp->result)); + return rsp->result; } + + return SD_RES_SUCCESS; } -/* - * Fix consistency of the replica of oid. - * - * XXX: The fix is rather dumb, just read the first copy and write it - * to other replica. 
- */ -static void do_check_repair(uint64_t oid, int nr_copies) +static int do_check_repair(uint64_t oid, int nr_copies) { struct sd_vnode *tgt_vnodes[nr_copies]; - void *buf, *buf_cmp; - int i; + unsigned char sha1[SD_MAX_COPIES][SHA1_LEN]; + char host[128]; + int i, j; oid_to_vnodes(sd_vnodes, sd_vnodes_nr, oid, nr_copies, tgt_vnodes); - buf = read_object_from(tgt_vnodes[0], oid); - for (i = 1; i < nr_copies; i++) { - buf_cmp = read_object_from(tgt_vnodes[i], oid); - if (memcmp(buf, buf_cmp, SD_DATA_OBJ_SIZE)) { - free(buf_cmp); - goto fix_consistency; + for (i = 0; i < nr_copies; i++) + get_obj_checksum_from(tgt_vnodes[i], oid, sha1[i]); + + for (i = 0; i < nr_copies; i++) { + for (j = (i + 1); j < nr_copies; j++) { + if (memcmp(sha1[i], sha1[j], SHA1_LEN)) + goto diff; } - free(buf_cmp); } - free(buf); - return; + return 0; -fix_consistency: - for (i = 1; i < nr_copies; i++) - write_object_to(tgt_vnodes[i], oid, buf); - fprintf(stdout, "fix %"PRIx64" success\n", oid); - free(buf); +diff: + fprintf(stderr, "Failed oid: %"PRIx64"\n", oid); + for (i = 0; i < nr_copies; i++) { + addr_to_str(host, sizeof(host), tgt_vnodes[i]->nid.addr, 0); + fprintf(stderr, ">> copy[%d], sha1: %s, from: %s:%d\n", + i, sha1_to_hex(sha1[i]), host, tgt_vnodes[i]->nid.port); + } + + if (!vdi_cmd_data.repair) + return -1; + + /* + * Force repair the consistency of oid's replica + * + * FIXME: this fix is rather dumb, it just read the + * first copy and write it to other replica, + */ + fprintf(stderr, ">> repairing ...\n"); + addr_to_str(host, sizeof(host), tgt_vnodes[0]->nid.addr, + tgt_vnodes[0]->nid.port); + for (i = 1; i < nr_copies; i++) { + char dest[128]; + addr_to_str(dest, sizeof(dest), tgt_vnodes[i]->nid.addr, + tgt_vnodes[i]->nid.port); + fprintf(stderr, ">> copy this object from %s => %s\n", + host, dest); + do_repair(oid, &tgt_vnodes[0]->nid, &tgt_vnodes[i]->nid); + } + fprintf(stderr, ">> repair finished\n"); + return -1; } static int check_repair_vdi(uint32_t vid) { struct 
sheepdog_inode *inode; - int ret; + int ret, failed = 0; uint64_t total, done = 0, oid; uint32_t idx = 0, dvid; @@ -1448,22 +1467,29 @@ static int check_repair_vdi(uint32_t vid) fprintf(stderr, "Failed to allocate memory\n"); return EXIT_SYSFAIL; } + ret = sd_read_object(vid_to_vdi_oid(vid), inode, SD_INODE_SIZE, 0); if (ret != SD_RES_SUCCESS) { fprintf(stderr, "Failed to read an inode\n"); return EXIT_FAILURE; } + if (do_check_repair(vid_to_vdi_oid(vid), inode->nr_copies)) + failed = 1; + total = inode->vdi_size; while(done < total) { dvid = inode->data_vdi_id[idx]; if (dvid) { oid = vid_to_data_oid(dvid, idx); - do_check_repair(oid, inode->nr_copies); + if (do_check_repair(oid, inode->nr_copies)) + failed = 1; } done += SD_DATA_OBJ_SIZE; idx++; } + if (failed) + return EXIT_FAILURE; return EXIT_SUCCESS; } @@ -1482,11 +1508,15 @@ static int vdi_check(int argc, char **argv) goto out; } + printf("CHECKING VDI:%s ...\n", vdiname); ret = check_repair_vdi(vid); - if (ret != EXIT_SUCCESS) + if (ret != EXIT_SUCCESS) { + printf("%s\n", + vdi_cmd_data.repair ? 
"REPAIRED" : "FAILED"); goto out; + } - fprintf(stdout, "finish check&repair %s\n", vdiname); + printf("PASSED\n"); return EXIT_SUCCESS; out: return ret; @@ -1519,7 +1549,7 @@ out: } static struct subcommand vdi_cmd[] = { - {"check", "<vdiname>", "saph", "check and repair image's consistency", + {"check", "<vdiname>", "Rsaph", "check and repair image's consistency", NULL, SUBCMD_FLAG_NEED_NODELIST|SUBCMD_FLAG_NEED_THIRD_ARG, vdi_check, vdi_options}, {"create", "<vdiname> <size>", "Paph", "create an image", @@ -1599,6 +1629,9 @@ static int vdi_parser(int ch, char *opt) case 'C': vdi_cmd_data.cache = 1; break; + case 'R': + vdi_cmd_data.repair = 1; + break; } return 0; diff --git a/include/internal_proto.h b/include/internal_proto.h index 83d98f1..22a050a 100644 --- a/include/internal_proto.h +++ b/include/internal_proto.h @@ -61,8 +61,12 @@ #define SD_OP_REMOVE_PEER 0xA6 #define SD_OP_SET_CACHE_SIZE 0xA7 #define SD_OP_ENABLE_RECOVER 0xA8 -#define SD_OP_DISABLE_RECOVER 0xA9 -#define SD_OP_INFO_RECOVER 0xAA +#define SD_OP_DISABLE_RECOVER 0xA9 +#define SD_OP_INFO_RECOVER 0xAA +#define SD_OP_CALC_CHKSUM 0xAB +#define SD_OP_CALC_CHKSUM_PEER 0xAC +#define SD_OP_REPAIR_OBJ 0xAD +#define SD_OP_REPAIR_OBJ_PEER 0xAE /* internal flags for hdr.flags, must be above 0x80 */ #define SD_FLAG_CMD_RECOVERY 0x0080 @@ -167,6 +171,20 @@ struct sd_node_rsp { uint64_t store_free; }; +struct sd_checksum_rsp { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint32_t result; + union { + uint8_t sha1[SHA1_LEN]; + uint32_t __pad[7]; + }; +}; + struct node_id { uint8_t addr[16]; uint16_t port; diff --git a/include/sheep.h b/include/sheep.h index 719d18f..54b6eb3 100644 --- a/include/sheep.h +++ b/include/sheep.h @@ -279,4 +279,20 @@ static inline int nodes_to_vnodes(struct sd_node *nodes, int nr, return nr_vnodes; } +static inline char *sha1_to_hex(const unsigned char *sha1) +{ + static char buffer[50]; + static const char 
hex[] = "0123456789abcdef"; + char *buf = buffer; + int i; + + for (i = 0; i < SHA1_LEN; i++) { + unsigned int val = *sha1++; + *buf++ = hex[val >> 4]; + *buf++ = hex[val & 0xf]; + } + buffer[2 * SHA1_LEN] = 0; + return buffer; +} + #endif diff --git a/sheep/farm/farm.h b/sheep/farm/farm.h index 27e65cd..d0b635a 100644 --- a/sheep/farm/farm.h +++ b/sheep/farm/farm.h @@ -53,7 +53,6 @@ extern char farm_obj_dir[PATH_MAX]; extern char *sha1_to_path(const unsigned char *sha1); extern int sha1_file_write(unsigned char *buf, unsigned len, unsigned char *outsha1); extern void *sha1_file_read(const unsigned char *sha1, struct sha1_file_hdr *hdr); -extern char *sha1_to_hex(const unsigned char *sha1); extern int get_sha1_hex(const char *hex, unsigned char *sha1); extern int sha1_file_try_delete(const unsigned char *sha1); diff --git a/sheep/farm/sha1_file.c b/sheep/farm/sha1_file.c index b0abc16..3cd4f40 100644 --- a/sheep/farm/sha1_file.c +++ b/sheep/farm/sha1_file.c @@ -257,18 +257,3 @@ int get_sha1_hex(const char *hex, unsigned char *sha1) } return 0; } - -char *sha1_to_hex(const unsigned char *sha1) -{ - static char buffer[50]; - static const char hex[] = "0123456789abcdef"; - char *buf = buffer; - int i; - - for (i = 0; i < SHA1_LEN; i++) { - unsigned int val = *sha1++; - *buf++ = hex[val >> 4]; - *buf++ = hex[val & 0xf]; - } - return buffer; -} diff --git a/sheep/gateway.c b/sheep/gateway.c index bdbd08c..82ef145 100644 --- a/sheep/gateway.c +++ b/sheep/gateway.c @@ -316,3 +316,79 @@ int gateway_remove_obj(struct request *req) { return gateway_forward_request(req); } + +int gateway_calc_obj_chksum(struct request *req) +{ + struct sd_req fwdhdr, *hdr = &req->rq; + struct sd_checksum_rsp *rsp; + struct node_id *nid = (struct node_id *)req->data; + char host[128]; + unsigned int rlen, wlen; + int fd, ret; + + addr_to_str(host, sizeof(host), nid->addr, 0); + + fd = connect_to(host, nid->port); + if (fd < 0) { + dprintf("Failed to connect\n"); + return SD_RES_EIO; + } + + 
rlen = 0; + wlen = 0; + + memcpy(&fwdhdr, hdr, sizeof(fwdhdr)); + fwdhdr.opcode = gateway_to_peer_opcode(hdr->opcode); + fwdhdr.flags = 0; + fwdhdr.data_length = 0; + + rsp = (struct sd_checksum_rsp *)&fwdhdr; + ret = exec_req(fd, &fwdhdr, NULL, &wlen, &rlen); + close(fd); + if (ret) + return SD_RES_EIO; + + if (rsp->result == SD_RES_SUCCESS) { + memcpy((struct sd_checksum_rsp *)&req->rp, rsp, sizeof(*rsp)); + return SD_RES_SUCCESS; + } + + return rsp->result; +} + +int gateway_repair_obj(struct request *req) +{ + struct sd_req fwdhdr, *hdr = &req->rq; + struct sd_rsp *rsp = (struct sd_rsp *)&hdr; + unsigned rlen, wlen; + struct node_id *src, *dest; + char host[128]; + int fd, ret; + + src = (struct node_id *)req->data; + dest = src + 1; + + addr_to_str(host, sizeof(host), dest->addr, 0); + + fd = connect_to(host, dest->port); + if (fd < 0) { + dprintf("Failed to connect\n"); + return SD_RES_EIO; + } + + rlen = 0; + wlen = sizeof(*src); + + memcpy(&fwdhdr, hdr, sizeof(fwdhdr)); + fwdhdr.opcode = gateway_to_peer_opcode(hdr->opcode); + fwdhdr.flags = 0; + fwdhdr.data_length = wlen; + fwdhdr.flags = SD_FLAG_CMD_WRITE; + + ret = exec_req(fd, &fwdhdr, src, &wlen, &rlen); + close(fd); + if (ret) + return SD_RES_EIO; + + return rsp->result; +} diff --git a/sheep/ops.c b/sheep/ops.c index 8ca8748..d0aac31 100644 --- a/sheep/ops.c +++ b/sheep/ops.c @@ -26,6 +26,7 @@ #include "strbuf.h" #include "trace/trace.h" #include "util.h" +#include "sha1.h" enum sd_op_type { SD_OP_TYPE_CLUSTER = 1, /* cluster operations */ @@ -625,6 +626,51 @@ static int local_kill_node(const struct sd_req *req, struct sd_rsp *rsp, return SD_RES_SUCCESS; } +static inline int read_object_from(struct node_id *src, uint32_t epoch, + uint64_t oid, char *buf) +{ + struct sd_req hdr; + struct sd_rsp *rsp = (struct sd_rsp *)&hdr; + int fd, ret; + unsigned wlen, rlen; + char host[128]; + + addr_to_str(host, sizeof(host), src->addr, 0); + + fd = connect_to(host, src->port); + if (fd < 0) { + dprintf("failed to 
connect to %s:%"PRIu32"\n", + host, src->port); + return SD_RES_NETWORK_ERROR; + } + + rlen = SD_DATA_OBJ_SIZE; + if (is_vdi_obj(oid)) + rlen = SD_INODE_SIZE; + wlen = 0; + + sd_init_req(&hdr, SD_OP_READ_PEER); + hdr.epoch = epoch; + hdr.data_length = rlen; + + hdr.obj.oid = oid; + hdr.obj.offset = 0; + + ret = exec_req(fd, &hdr, buf, &wlen, &rlen); + close(fd); + if (ret) { + dprintf("Failed to execute request\n"); + return SD_RES_NETWORK_ERROR; + } + + if (rsp->result != SD_RES_SUCCESS) { + dprintf("Failed to read, %s\n", sd_strerror(rsp->result)); + return SD_RES_NETWORK_ERROR; + } + + return SD_RES_SUCCESS; +} + static int read_copy_from_replica(struct vnode_info *vnodes, uint32_t epoch, uint64_t oid, char *buf) { @@ -666,9 +712,6 @@ static int read_copy_from_replica(struct vnode_info *vnodes, uint32_t epoch, rounded_rand = random() % nr_copies; for (i = 0; i < nr_copies; i++) { - unsigned wlen, rlen; - int fd; - j = (i + rounded_rand) % nr_copies; /* bypass the local copy */ @@ -676,32 +719,9 @@ static int read_copy_from_replica(struct vnode_info *vnodes, uint32_t epoch, continue; v = obj_vnodes[j]; - addr_to_str(name, sizeof(name), v->nid.addr, 0); - - fd = connect_to(name, v->nid.port); - if (fd < 0) - continue; - - rlen = SD_DATA_OBJ_SIZE; - wlen = 0; - sd_init_req(&hdr, SD_OP_READ_PEER); - hdr.epoch = epoch; - hdr.data_length = rlen; - - hdr.obj.oid = oid; - hdr.obj.offset = 0; - - ret = exec_req(fd, &hdr, buf, &wlen, &rlen); - - close(fd); - - if (ret) { - dprintf("%x, %x\n", ret, rsp->result); - continue; - } - - if (rsp->result == SD_RES_SUCCESS) + ret = read_object_from(&v->nid, epoch, oid, buf); + if (ret == SD_RES_SUCCESS) break; } @@ -748,6 +768,50 @@ out: return ret; } +int peer_calc_obj_chksum(struct request *req) +{ + struct sd_req *hdr = &req->rq; + struct sd_checksum_rsp *rsp = (struct sd_checksum_rsp *)&req->rp; + uint32_t epoch = hdr->epoch; + unsigned char sha1[SHA1_LEN]; + struct siocb iocb; + struct sha1_ctx ctx; + void *buf; + int ret; + 
+ if (sys->gateway_only) + return SD_RES_NO_OBJ; + + hdr->data_length = SD_DATA_OBJ_SIZE; + if (is_vdi_obj(hdr->obj.oid)) + hdr->data_length = SD_INODE_SIZE; + + buf = valloc(hdr->data_length); + hdr->obj.offset = 0; + + memset(&iocb, 0, sizeof(iocb)); + iocb.epoch = epoch; + iocb.flags = hdr->flags; + iocb.buf = buf; + iocb.length = hdr->data_length; + iocb.offset = hdr->obj.offset; + ret = sd_store->read(hdr->obj.oid, &iocb); + if (ret != SD_RES_SUCCESS) + goto out; + + sha1_init(&ctx); + sha1_update(&ctx, buf, hdr->data_length); + sha1_final(&ctx, sha1); + memcpy(&rsp->sha1, sha1, SHA1_LEN); +out: + req->data = NULL; + req->data_length = 0; + hdr->data_length = 0; + rsp->data_length = 0; + free(buf); + return ret; +} + static int do_write_obj(struct siocb *iocb, struct sd_req *hdr, uint32_t epoch, void *data, int create) { @@ -777,6 +841,50 @@ static int do_write_obj(struct siocb *iocb, struct sd_req *hdr, uint32_t epoch, return ret; } +int peer_repair_obj(struct request *req) +{ + struct sd_req whdr, *hdr = &req->rq; + uint32_t epoch = hdr->epoch; + uint64_t oid = hdr->obj.oid; + struct node_id *src; + struct siocb iocb; + char host[128]; + int ret, len; + void *buf; + + if (sys->gateway_only) + return SD_RES_NO_OBJ; + + src = (struct node_id *)req->data; + addr_to_str(host, sizeof(host), src->addr, src->port); + + dprintf("oid:%"PRIx64", from:%s\n", oid, host); + + len = SD_DATA_OBJ_SIZE; + if (is_vdi_obj(hdr->obj.oid)) + len = SD_INODE_SIZE; + + buf = valloc(len); + if (!buf) { + eprintf("can not allocate memory\n"); + return SD_RES_NO_SPACE; + } + + ret = read_object_from(src, epoch, oid, buf); + if (ret != SD_RES_SUCCESS) + return ret; + + memset(&whdr, 0, sizeof(whdr)); + memset(&iocb, 0, sizeof(iocb)); + + whdr.data_length = len; + whdr.obj.offset = 0; + whdr.obj.oid = oid; + whdr.epoch = epoch; + iocb.epoch = epoch; + return do_write_obj(&iocb, &whdr, epoch, buf, 1); +} + int peer_write_obj(struct request *req) { struct sd_req *hdr = &req->rq; @@ 
-1084,11 +1192,36 @@ static struct sd_op_template sd_ops[] = { .type = SD_OP_TYPE_CLUSTER, .process_main = cluster_disable_recover, }, + [SD_OP_INFO_RECOVER] = { .name = "INFO_RECOVER", .type = SD_OP_TYPE_LOCAL, .process_main = local_info_recover, }, + + [SD_OP_CALC_CHKSUM] = { + .name = "CALC_CHKSUM", + .type = SD_OP_TYPE_GATEWAY, + .process_work = gateway_calc_obj_chksum, + }, + + [SD_OP_CALC_CHKSUM_PEER] = { + .name = "CALC_CHKSUM_PEER", + .type = SD_OP_TYPE_PEER, + .process_work = peer_calc_obj_chksum, + }, + + [SD_OP_REPAIR_OBJ] = { + .name = "REPAIR_OBJ", + .type = SD_OP_TYPE_GATEWAY, + .process_work = gateway_repair_obj, + }, + + [SD_OP_REPAIR_OBJ_PEER] = { + .name = "REPAIR_OBJ_PEER", + .type = SD_OP_TYPE_PEER, + .process_work = peer_repair_obj, + }, }; struct sd_op_template *get_sd_op(uint8_t opcode) @@ -1174,6 +1307,8 @@ static int map_table[] = { [SD_OP_READ_OBJ] = SD_OP_READ_PEER, [SD_OP_WRITE_OBJ] = SD_OP_WRITE_PEER, [SD_OP_REMOVE_OBJ] = SD_OP_REMOVE_PEER, + [SD_OP_CALC_CHKSUM] = SD_OP_CALC_CHKSUM_PEER, + [SD_OP_REPAIR_OBJ] = SD_OP_REPAIR_OBJ_PEER, }; int gateway_to_peer_opcode(int opcode) diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h index 857cf87..48e076c 100644 --- a/sheep/sheep_priv.h +++ b/sheep/sheep_priv.h @@ -319,12 +319,16 @@ int gateway_read_obj(struct request *req); int gateway_write_obj(struct request *req); int gateway_create_and_write_obj(struct request *req); int gateway_remove_obj(struct request *req); +int gateway_calc_obj_chksum(struct request *req); +int gateway_repair_obj(struct request *req); /* backend store */ int peer_read_obj(struct request *req); int peer_write_obj(struct request *req); int peer_create_and_write_obj(struct request *req); int peer_remove_obj(struct request *req); +int peer_calc_obj_chksum(struct request *req); +int peer_repair_obj(struct request *req); /* object_cache */ -- 1.7.11.2 |