[sheepdog] [PATCH V3] collie: optimize 'collie vdi check' command
Yunkai Zhang
yunkai.me at gmail.com
Tue Aug 28 13:42:05 CEST 2012
From: Yunkai Zhang <qiushu.zyk at taobao.com>
V3:
- use the local sheep as a gateway, not just a proxy
- update commit log
V2:
- use '-R, --repair' instead of '-F, --force_repair'
- do not connect to the target sheep directly
- define a member name instead of using __pad.
- update this commit log
---------------------------------------------- >8
Reading all of a vdi's objects from the cluster when checking them wastes a lot
of network bandwidth. Let's calculate the checksum of each object in the backend
and only send the checksum result to the collie client.
Also, I think repairing objects automatically is dangerous, as we don't know which
replica is correct. To give the user a chance to inspect the replicas if
necessary, I add a new option: '-R, --repair'. By default, this command
only checks and does not repair (as the command name implies).
After adding the '-R' flag, the help looks like:
$ collie vdi check
Usage: collie vdi check [-R] [-s snapshot] [-a address] [-p port] [-h] <vdiname>
Options:
-R, --repair force repair object's copies (dangerous)
-s, --snapshot specify a snapshot id or tag name
-a, --address specify the daemon address (default: localhost)
-p, --port specify the daemon port
-h, --help display this help and exit
Here are some examples of running this command:
* Success:
$ collie vdi check test.img
CHECKING VDI:test.img ...
PASSED
* Failure (by default not repair):
$ collie vdi check test.img
CHECKING VDI:test.img ...
Failed oid: 9c5e6800000001
>> copy[0], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7000
>> copy[1], sha1: 46dbc769de60a508faf134c6d51926741c0e38fa, from: 127.0.0.1:7001
>> copy[2], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7004
FAILED
The user can force a repair by specifying the -R or --repair flag:
* Force repair:
$ collie vdi check -R test.img
CHECKING VDI:test.img ...
Failed oid: 9c5e6800000001
>> copy[0], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7000
>> copy[1], sha1: 46dbc769de60a508faf134c6d51926741c0e38fa, from: 127.0.0.1:7001
>> copy[2], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7004
>> Repaired successfully!
REPAIRED
Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
---
collie/common.c | 6 +-
collie/vdi.c | 184 ++++++++++++++++++++++++--------------------
include/internal_proto.h | 24 +++++-
include/sheep.h | 16 ++++
sheep/farm/farm.h | 1 -
sheep/farm/sha1_file.c | 15 ----
sheep/gateway.c | 116 ++++++++++++++++++++++++++++
sheep/ops.c | 193 +++++++++++++++++++++++++++++++++++++++--------
sheep/sheep_priv.h | 4 +
9 files changed, 424 insertions(+), 135 deletions(-)
diff --git a/collie/common.c b/collie/common.c
index ce8dcf7..dd54ae4 100644
--- a/collie/common.c
+++ b/collie/common.c
@@ -214,8 +214,7 @@ int send_light_req_get_response(struct sd_req *hdr, const char *host, int port)
ret = exec_req(fd, hdr, NULL, &wlen, &rlen);
close(fd);
if (ret) {
- fprintf(stderr, "failed to connect to %s:%d\n",
- host, port);
+ dprintf("failed to connect to %s:%d\n", host, port);
return -1;
}
@@ -236,8 +235,7 @@ int send_light_req(struct sd_req *hdr, const char *host, int port)
return -1;
if (ret != SD_RES_SUCCESS) {
- fprintf(stderr, "Response's result: %s\n",
- sd_strerror(ret));
+ dprintf("Response's result: %s\n", sd_strerror(ret));
return -1;
}
diff --git a/collie/vdi.c b/collie/vdi.c
index 7e2f5b0..e9e4962 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -24,6 +24,8 @@ static struct sd_option vdi_options[] = {
{'d', "delete", 0, "delete a key"},
{'C', "cache", 0, "enable object cache"},
{'c', "copies", 1, "specify the data redundancy (number of copies)"},
+ {'R', "repair", 0, "force repair object's copies"},
+
{ 0, NULL, 0, NULL },
};
@@ -36,6 +38,7 @@ struct vdi_cmd_data {
int prealloc;
int nr_copies;
int cache;
+ int repair;
} vdi_cmd_data = { ~0, };
struct get_vdi_info {
@@ -1324,126 +1327,129 @@ out:
return ret;
}
-static void *read_object_from(struct sd_vnode *vnode, uint64_t oid)
+static int get_object_checksum(uint64_t oid,
+ unsigned char sha1[SD_MAX_COPIES][SHA1_LEN],
+ struct node_id nids[SD_MAX_COPIES])
{
struct sd_req hdr;
struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
- int fd, ret;
- unsigned wlen = 0, rlen = SD_DATA_OBJ_SIZE;
- char name[128];
+ unsigned rlen, wlen;
+ int i, fd, ret, offset, nr_copies;
void *buf;
- buf = malloc(SD_DATA_OBJ_SIZE);
- if (!buf) {
- fprintf(stderr, "Failed to allocate memory\n");
- exit(EXIT_SYSFAIL);
- }
-
- addr_to_str(name, sizeof(name), vnode->nid.addr, 0);
- fd = connect_to(name, vnode->nid.port);
+ fd = connect_to(sdhost, sdport);
if (fd < 0) {
- fprintf(stderr, "failed to connect to %s:%"PRIu32"\n",
- name, vnode->nid.port);
- exit(EXIT_FAILURE);
+ fprintf(stderr, "Failed to connect\n");
+ return -1;
}
- sd_init_req(&hdr, SD_OP_READ_PEER);
- hdr.epoch = sd_epoch;
- hdr.flags = 0;
- hdr.data_length = rlen;
+ sd_init_req(&hdr, SD_OP_CALC_CHKSUM);
+
+ rlen = SD_MAX_COPIES * (SHA1_LEN + sizeof(*nids));
+ wlen = 0;
+ buf = xmalloc(rlen);
+ hdr.epoch = sd_epoch;
hdr.obj.oid = oid;
+ hdr.data_length = rlen;
+ hdr.flags = 0;
ret = exec_req(fd, &hdr, buf, &wlen, &rlen);
close(fd);
-
if (ret) {
- fprintf(stderr, "Failed to execute request\n");
- exit(EXIT_FAILURE);
+ fprintf(stderr, "Failed to get checksum, oid:%"PRIx64"\n", oid);
+ return -1;
}
-
if (rsp->result != SD_RES_SUCCESS) {
- fprintf(stderr, "Failed to read, %s\n",
- sd_strerror(rsp->result));
- exit(EXIT_FAILURE);
+ fprintf(stderr, "Failed to get checksum oid:%"PRIx64", %s\n",
+ oid, sd_strerror(rsp->result));
+ return -1;
}
- return buf;
+
+ offset = 0;
+ nr_copies = rsp->obj.copies;
+ for (i = 0; i < nr_copies; i++) {
+ memcpy(&sha1[i], (char *)buf + offset, SHA1_LEN);
+ memcpy(&nids[i], (char *)buf + offset + SHA1_LEN,
+ sizeof(*nids));
+ offset += SHA1_LEN + sizeof(*nids);
+ }
+
+ return nr_copies;
}
-static void write_object_to(struct sd_vnode *vnode, uint64_t oid, void *buf)
+static int do_repair(uint64_t oid)
{
struct sd_req hdr;
struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
- int fd, ret;
- unsigned wlen = SD_DATA_OBJ_SIZE, rlen = 0;
- char name[128];
+ int ret;
- addr_to_str(name, sizeof(name), vnode->nid.addr, 0);
- fd = connect_to(name, vnode->nid.port);
- if (fd < 0) {
- fprintf(stderr, "failed to connect to %s:%"PRIu32"\n",
- name, vnode->nid.port);
- exit(EXIT_FAILURE);
- }
+ sd_init_req(&hdr, SD_OP_REPAIR_OBJ);
- sd_init_req(&hdr, SD_OP_WRITE_PEER);
hdr.epoch = sd_epoch;
- hdr.flags = SD_FLAG_CMD_WRITE;
- hdr.data_length = wlen;
-
hdr.obj.oid = oid;
- ret = exec_req(fd, &hdr, buf, &wlen, &rlen);
- close(fd);
-
+ ret = send_light_req(&hdr, sdhost, sdport);
if (ret) {
- fprintf(stderr, "Failed to execute request\n");
- exit(EXIT_FAILURE);
+ fprintf(stderr, "Failed to repair oid:%"PRIx64"\n", oid);
+ return SD_RES_EIO;
}
-
if (rsp->result != SD_RES_SUCCESS) {
- fprintf(stderr, "Failed to read, %s\n",
- sd_strerror(rsp->result));
- exit(EXIT_FAILURE);
+ fprintf(stderr, "Failed to repair oid:%"PRIx64", %s\n",
+ oid, sd_strerror(rsp->result));
+ return rsp->result;
}
+
+ return SD_RES_SUCCESS;
}
-/*
- * Fix consistency of the replica of oid.
- *
- * XXX: The fix is rather dumb, just read the first copy and write it
- * to other replica.
- */
-static void do_check_repair(uint64_t oid, int nr_copies)
+static int do_check_repair(uint64_t oid)
{
- struct sd_vnode *tgt_vnodes[nr_copies];
- void *buf, *buf_cmp;
- int i;
-
- oid_to_vnodes(sd_vnodes, sd_vnodes_nr, oid, nr_copies, tgt_vnodes);
- buf = read_object_from(tgt_vnodes[0], oid);
- for (i = 1; i < nr_copies; i++) {
- buf_cmp = read_object_from(tgt_vnodes[i], oid);
- if (memcmp(buf, buf_cmp, SD_DATA_OBJ_SIZE)) {
- free(buf_cmp);
- goto fix_consistency;
+ unsigned char sha1[SD_MAX_COPIES][SHA1_LEN];
+ struct node_id nids[SD_MAX_COPIES];
+ int i, j, ret, nr_copies;
+ char host[128];
+
+ nr_copies = get_object_checksum(oid, sha1, nids);
+ for (i = 0; i < nr_copies; i++) {
+ for (j = (i + 1); j < nr_copies; j++) {
+ if (memcmp(sha1[i], sha1[j], SHA1_LEN))
+ goto diff;
}
- free(buf_cmp);
}
- free(buf);
- return;
+ return 0;
-fix_consistency:
- for (i = 1; i < nr_copies; i++)
- write_object_to(tgt_vnodes[i], oid, buf);
- fprintf(stdout, "fix %"PRIx64" success\n", oid);
- free(buf);
+diff:
+ fprintf(stderr, "Failed oid: %"PRIx64"\n", oid);
+ for (i = 0; i < nr_copies; i++) {
+ addr_to_str(host, sizeof(host), nids[i].addr, 0);
+ fprintf(stderr, ">> copy[%d], sha1: %s, from: %s:%d\n",
+ i, sha1_to_hex(sha1[i]), host, nids[i].port);
+ }
+
+ if (!vdi_cmd_data.repair)
+ return -1;
+
+ /*
+ * Force repair the consistency of oid's replica
+ *
+ * FIXME: this fix is rather dumb, it just read the
+ * first copy and write it to other replica,
+ */
+ ret = do_repair(oid);
+ if (ret == SD_RES_SUCCESS) {
+ fprintf(stderr, ">> Repaired successfully!\n");
+ return -1;
+ }
+
+ fprintf(stderr, ">> Failed to repair!\n");
+ return -1;
}
static int check_repair_vdi(uint32_t vid)
{
struct sheepdog_inode *inode;
- int ret;
+ int ret, failed = 0;
uint64_t total, done = 0, oid;
uint32_t idx = 0, dvid;
@@ -1452,22 +1458,29 @@ static int check_repair_vdi(uint32_t vid)
fprintf(stderr, "Failed to allocate memory\n");
return EXIT_SYSFAIL;
}
+
ret = sd_read_object(vid_to_vdi_oid(vid), inode, SD_INODE_SIZE, 0);
if (ret != SD_RES_SUCCESS) {
fprintf(stderr, "Failed to read an inode\n");
return EXIT_FAILURE;
}
+ if (do_check_repair(vid_to_vdi_oid(vid)))
+ failed = 1;
+
total = inode->vdi_size;
while(done < total) {
dvid = inode->data_vdi_id[idx];
if (dvid) {
oid = vid_to_data_oid(dvid, idx);
- do_check_repair(oid, inode->nr_copies);
+ if (do_check_repair(oid))
+ failed = 1;
}
done += SD_DATA_OBJ_SIZE;
idx++;
}
+ if (failed)
+ return EXIT_FAILURE;
return EXIT_SUCCESS;
}
@@ -1486,11 +1499,15 @@ static int vdi_check(int argc, char **argv)
goto out;
}
+ printf("CHECKING VDI:%s ...\n", vdiname);
ret = check_repair_vdi(vid);
- if (ret != EXIT_SUCCESS)
+ if (ret != EXIT_SUCCESS) {
+ printf("%s\n",
+ vdi_cmd_data.repair ? "REPAIRED" : "FAILED");
goto out;
+ }
- fprintf(stdout, "finish check&repair %s\n", vdiname);
+ printf("PASSED\n");
return EXIT_SUCCESS;
out:
return ret;
@@ -1523,7 +1540,7 @@ out:
}
static struct subcommand vdi_cmd[] = {
- {"check", "<vdiname>", "saph", "check and repair image's consistency",
+ {"check", "<vdiname>", "Rsaph", "check and repair image's consistency",
NULL, SUBCMD_FLAG_NEED_NODELIST|SUBCMD_FLAG_NEED_THIRD_ARG,
vdi_check, vdi_options},
{"create", "<vdiname> <size>", "Pcaph", "create an image",
@@ -1612,6 +1629,9 @@ static int vdi_parser(int ch, char *opt)
exit(EXIT_FAILURE);
}
vdi_cmd_data.nr_copies = nr_copies;
+ case 'R':
+ vdi_cmd_data.repair = 1;
+ break;
}
return 0;
diff --git a/include/internal_proto.h b/include/internal_proto.h
index c1d116a..96fee75 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -61,10 +61,14 @@
#define SD_OP_REMOVE_PEER 0xA6
#define SD_OP_SET_CACHE_SIZE 0xA7
#define SD_OP_ENABLE_RECOVER 0xA8
-#define SD_OP_DISABLE_RECOVER 0xA9
-#define SD_OP_INFO_RECOVER 0xAA
-#define SD_OP_GET_VDI_COPIES 0xAB
+#define SD_OP_DISABLE_RECOVER 0xA9
+#define SD_OP_INFO_RECOVER 0xAA
+#define SD_OP_GET_VDI_COPIES 0xAB
#define SD_OP_COMPLETE_RECOVERY 0xAC
+#define SD_OP_CALC_CHKSUM 0xAD
+#define SD_OP_CALC_CHKSUM_PEER 0xAE
+#define SD_OP_REPAIR_OBJ 0xAF
+#define SD_OP_REPAIR_OBJ_PEER 0xB0
/* internal flags for hdr.flags, must be above 0x80 */
#define SD_FLAG_CMD_RECOVERY 0x0080
@@ -169,6 +173,20 @@ struct sd_node_rsp {
uint64_t store_free;
};
+struct sd_checksum_rsp {
+ uint8_t proto_ver;
+ uint8_t opcode;
+ uint16_t flags;
+ uint32_t epoch;
+ uint32_t id;
+ uint32_t data_length;
+ uint32_t result;
+ union {
+ uint8_t sha1[SHA1_LEN];
+ uint32_t __pad[7];
+ };
+};
+
struct node_id {
uint8_t addr[16];
uint16_t port;
diff --git a/include/sheep.h b/include/sheep.h
index fbbab85..a97cb43 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -284,4 +284,20 @@ static inline int nodes_to_vnodes(struct sd_node *nodes, int nr,
return nr_vnodes;
}
+static inline char *sha1_to_hex(const unsigned char *sha1)
+{
+ static char buffer[50];
+ static const char hex[] = "0123456789abcdef";
+ char *buf = buffer;
+ int i;
+
+ for (i = 0; i < SHA1_LEN; i++) {
+ unsigned int val = *sha1++;
+ *buf++ = hex[val >> 4];
+ *buf++ = hex[val & 0xf];
+ }
+ buffer[2 * SHA1_LEN] = 0;
+ return buffer;
+}
+
#endif
diff --git a/sheep/farm/farm.h b/sheep/farm/farm.h
index 1a0081e..da79159 100644
--- a/sheep/farm/farm.h
+++ b/sheep/farm/farm.h
@@ -46,7 +46,6 @@ extern char farm_obj_dir[PATH_MAX];
extern char *sha1_to_path(const unsigned char *sha1);
extern int sha1_file_write(unsigned char *buf, unsigned len, unsigned char *);
extern void *sha1_file_read(const unsigned char *sha1, struct sha1_file_hdr *);
-extern char *sha1_to_hex(const unsigned char *sha1);
extern int get_sha1_hex(const char *hex, unsigned char *sha1);
extern int sha1_file_try_delete(const unsigned char *sha1);
diff --git a/sheep/farm/sha1_file.c b/sheep/farm/sha1_file.c
index b0abc16..3cd4f40 100644
--- a/sheep/farm/sha1_file.c
+++ b/sheep/farm/sha1_file.c
@@ -257,18 +257,3 @@ int get_sha1_hex(const char *hex, unsigned char *sha1)
}
return 0;
}
-
-char *sha1_to_hex(const unsigned char *sha1)
-{
- static char buffer[50];
- static const char hex[] = "0123456789abcdef";
- char *buf = buffer;
- int i;
-
- for (i = 0; i < SHA1_LEN; i++) {
- unsigned int val = *sha1++;
- *buf++ = hex[val >> 4];
- *buf++ = hex[val & 0xf];
- }
- return buffer;
-}
diff --git a/sheep/gateway.c b/sheep/gateway.c
index 41d712b..528c912 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -316,3 +316,119 @@ int gateway_remove_obj(struct request *req)
{
return gateway_forward_request(req);
}
+
+int gateway_calc_obj_chksum(struct request *req)
+{
+ struct sd_req fwdhdr, *hdr = &req->rq;
+ struct sd_checksum_rsp *rsp;
+ struct sd_vnode *vnodes[SD_MAX_COPIES];
+ struct node_id *nid;
+ char host[128];
+ unsigned int rlen, wlen;
+ uint64_t oid = hdr->obj.oid;
+ int fd, ret, nr_objs;
+ int i = 0, offset = 0;
+
+ nr_objs = get_obj_copy_number(oid, req->vinfo->nr_zones);
+
+ oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes,
+ oid, nr_objs, vnodes);
+
+loop:
+ nid = &vnodes[i]->nid;
+ addr_to_str(host, sizeof(host), nid->addr, 0);
+
+ fd = connect_to(host, nid->port);
+ if (fd < 0) {
+ dprintf("Failed to connect\n");
+ return SD_RES_EIO;
+ }
+
+ rlen = 0;
+ wlen = 0;
+
+ memcpy(&fwdhdr, hdr, sizeof(fwdhdr));
+ fwdhdr.opcode = gateway_to_peer_opcode(hdr->opcode);
+ fwdhdr.flags = 0;
+ fwdhdr.data_length = 0;
+
+ rsp = (struct sd_checksum_rsp *)&fwdhdr;
+ ret = exec_req(fd, &fwdhdr, NULL, &wlen, &rlen);
+ close(fd);
+ if (ret)
+ return SD_RES_EIO;
+
+ if (rsp->result == SD_RES_SUCCESS)
+ memcpy((char *)req->data + offset, rsp->sha1, SHA1_LEN);
+ else if (rsp->result == SD_RES_NO_OBJ)
+ memset((char *)req->data + offset, 0, SHA1_LEN);
+ else
+ return rsp->result;
+
+ memcpy((char *)req->data + offset + SHA1_LEN, nid, sizeof(*nid));
+ offset += SHA1_LEN + sizeof(*nid);
+
+ if (++i < nr_objs)
+ goto loop;
+
+ req->rp.obj.copies = nr_objs;
+ req->rp.data_length = offset;
+
+ return SD_RES_SUCCESS;
+}
+
+int gateway_repair_obj(struct request *req)
+{
+ struct sd_req fwdhdr, *hdr = &req->rq;
+ struct sd_rsp *rsp = (struct sd_rsp *)&fwdhdr;
+ struct node_id *src, *dest;
+ struct sd_vnode *vnodes[SD_MAX_COPIES];
+ char host[128], to[128];
+ uint64_t oid = hdr->obj.oid;
+ unsigned rlen, wlen;
+ int i, fd, ret, err_ret, nr_objs;
+
+ nr_objs = get_obj_copy_number(oid, req->vinfo->nr_zones);
+
+ oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes,
+ oid, nr_objs, vnodes);
+
+ i = 1;
+ src = &vnodes[0]->nid;
+ err_ret = SD_RES_SUCCESS;
+loop:
+ dest = &vnodes[i]->nid;
+ addr_to_str(host, sizeof(host), dest->addr, 0);
+
+ fd = connect_to(host, dest->port);
+ if (fd < 0) {
+ dprintf("Failed to connect\n");
+ return SD_RES_EIO;
+ }
+
+ rlen = 0;
+ wlen = sizeof(*src);
+
+ memcpy(&fwdhdr, hdr, sizeof(fwdhdr));
+ fwdhdr.opcode = gateway_to_peer_opcode(hdr->opcode);
+ fwdhdr.flags = 0;
+ fwdhdr.data_length = wlen;
+ fwdhdr.flags = SD_FLAG_CMD_WRITE;
+
+ ret = exec_req(fd, &fwdhdr, src, &wlen, &rlen);
+ close(fd);
+ if (ret)
+ return SD_RES_EIO;
+
+ if (rsp->result != SD_RES_SUCCESS) {
+ dprintf("oid:%"PRIx64", from:%s to:%s\n", oid,
+ addr_to_str(host, sizeof(host), src->addr, src->port),
+ addr_to_str(to, sizeof(to), dest->addr, dest->port));
+ err_ret = rsp->result;
+ }
+
+ if (++i < nr_objs)
+ goto loop;
+
+ return err_ret;
+}
diff --git a/sheep/ops.c b/sheep/ops.c
index ccb1c5e..3225c59 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -26,6 +26,7 @@
#include "strbuf.h"
#include "trace/trace.h"
#include "util.h"
+#include "sha1.h"
enum sd_op_type {
SD_OP_TYPE_CLUSTER = 1, /* cluster operations */
@@ -219,7 +220,6 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
int i, ret;
uint32_t latest_epoch;
uint64_t created_time;
- struct siocb iocb = { 0 };
struct store_driver *driver;
driver = find_store_driver(data);
@@ -228,7 +228,6 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
sd_store = driver;
latest_epoch = get_latest_epoch();
- iocb.epoch = latest_epoch;
ret = sd_store->format(data);
if (ret != SD_RES_SUCCESS)
@@ -688,6 +687,51 @@ static int local_kill_node(const struct sd_req *req, struct sd_rsp *rsp,
return SD_RES_SUCCESS;
}
+static inline int read_object_from(struct node_id *src, uint32_t epoch,
+ uint64_t oid, char *buf)
+{
+ struct sd_req hdr;
+ struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+ int fd, ret;
+ unsigned wlen, rlen;
+ char host[128];
+
+ addr_to_str(host, sizeof(host), src->addr, 0);
+
+ fd = connect_to(host, src->port);
+ if (fd < 0) {
+ dprintf("failed to connect to %s:%"PRIu32"\n",
+ host, src->port);
+ return SD_RES_NETWORK_ERROR;
+ }
+
+ rlen = SD_DATA_OBJ_SIZE;
+ if (is_vdi_obj(oid))
+ rlen = SD_INODE_SIZE;
+ wlen = 0;
+
+ sd_init_req(&hdr, SD_OP_READ_PEER);
+ hdr.epoch = epoch;
+ hdr.data_length = rlen;
+
+ hdr.obj.oid = oid;
+ hdr.obj.offset = 0;
+
+ ret = exec_req(fd, &hdr, buf, &wlen, &rlen);
+ close(fd);
+ if (ret) {
+ dprintf("Failed to execute request\n");
+ return SD_RES_NETWORK_ERROR;
+ }
+
+ if (rsp->result != SD_RES_SUCCESS) {
+ dprintf("Failed to read, %s\n", sd_strerror(rsp->result));
+ return rsp->result;
+ }
+
+ return SD_RES_SUCCESS;
+}
+
static int read_copy_from_replica(struct vnode_info *vnodes, uint32_t epoch,
uint64_t oid, char *buf)
{
@@ -729,9 +773,6 @@ static int read_copy_from_replica(struct vnode_info *vnodes, uint32_t epoch,
rounded_rand = random() % nr_copies;
for (i = 0; i < nr_copies; i++) {
- unsigned wlen, rlen;
- int fd;
-
j = (i + rounded_rand) % nr_copies;
/* bypass the local copy */
@@ -739,32 +780,9 @@ static int read_copy_from_replica(struct vnode_info *vnodes, uint32_t epoch,
continue;
v = obj_vnodes[j];
- addr_to_str(name, sizeof(name), v->nid.addr, 0);
-
- fd = connect_to(name, v->nid.port);
- if (fd < 0)
- continue;
- rlen = SD_DATA_OBJ_SIZE;
- wlen = 0;
-
- sd_init_req(&hdr, SD_OP_READ_PEER);
- hdr.epoch = epoch;
- hdr.data_length = rlen;
-
- hdr.obj.oid = oid;
- hdr.obj.offset = 0;
-
- ret = exec_req(fd, &hdr, buf, &wlen, &rlen);
-
- close(fd);
-
- if (ret) {
- dprintf("%x, %x\n", ret, rsp->result);
- continue;
- }
-
- if (rsp->result == SD_RES_SUCCESS)
+ ret = read_object_from(&v->nid, epoch, oid, buf);
+ if (ret == SD_RES_SUCCESS)
break;
}
@@ -815,6 +833,50 @@ out:
return ret;
}
+int peer_calc_obj_chksum(struct request *req)
+{
+ struct sd_req *hdr = &req->rq;
+ struct sd_checksum_rsp *rsp = (struct sd_checksum_rsp *)&req->rp;
+ uint32_t epoch = hdr->epoch;
+ unsigned char sha1[SHA1_LEN];
+ struct siocb iocb;
+ struct sha1_ctx ctx;
+ void *buf;
+ int ret;
+
+ if (sys->gateway_only)
+ return SD_RES_NO_OBJ;
+
+ hdr->data_length = SD_DATA_OBJ_SIZE;
+ if (is_vdi_obj(hdr->obj.oid))
+ hdr->data_length = SD_INODE_SIZE;
+
+ buf = valloc(hdr->data_length);
+ hdr->obj.offset = 0;
+
+ memset(&iocb, 0, sizeof(iocb));
+ iocb.epoch = epoch;
+ iocb.flags = hdr->flags;
+ iocb.buf = buf;
+ iocb.length = hdr->data_length;
+ iocb.offset = hdr->obj.offset;
+ ret = sd_store->read(hdr->obj.oid, &iocb);
+ if (ret != SD_RES_SUCCESS)
+ goto out;
+
+ sha1_init(&ctx);
+ sha1_update(&ctx, buf, hdr->data_length);
+ sha1_final(&ctx, sha1);
+ memcpy(&rsp->sha1, sha1, SHA1_LEN);
+out:
+ req->data = NULL;
+ req->data_length = 0;
+ hdr->data_length = 0;
+ rsp->data_length = 0;
+ free(buf);
+ return ret;
+}
+
static int do_write_obj(struct siocb *iocb, struct sd_req *hdr, uint32_t epoch,
void *data, int create)
{
@@ -844,6 +906,50 @@ static int do_write_obj(struct siocb *iocb, struct sd_req *hdr, uint32_t epoch,
return ret;
}
+int peer_repair_obj(struct request *req)
+{
+ struct sd_req whdr, *hdr = &req->rq;
+ uint32_t epoch = hdr->epoch;
+ uint64_t oid = hdr->obj.oid;
+ struct node_id *src;
+ struct siocb iocb;
+ char host[128];
+ int ret, len;
+ void *buf;
+
+ if (sys->gateway_only)
+ return SD_RES_NO_OBJ;
+
+ src = (struct node_id *)req->data;
+ addr_to_str(host, sizeof(host), src->addr, src->port);
+
+ dprintf("oid:%"PRIx64", from:%s\n", oid, host);
+
+ len = SD_DATA_OBJ_SIZE;
+ if (is_vdi_obj(hdr->obj.oid))
+ len = SD_INODE_SIZE;
+
+ buf = valloc(len);
+ if (!buf) {
+ eprintf("can not allocate memory\n");
+ return SD_RES_NO_SPACE;
+ }
+
+ ret = read_object_from(src, epoch, oid, buf);
+ if (ret != SD_RES_SUCCESS)
+ return ret;
+
+ memset(&whdr, 0, sizeof(whdr));
+ memset(&iocb, 0, sizeof(iocb));
+
+ whdr.data_length = len;
+ whdr.obj.offset = 0;
+ whdr.obj.oid = oid;
+ whdr.epoch = epoch;
+ iocb.epoch = epoch;
+ return do_write_obj(&iocb, &whdr, epoch, buf, 1);
+}
+
int peer_write_obj(struct request *req)
{
struct sd_req *hdr = &req->rq;
@@ -1165,11 +1271,36 @@ static struct sd_op_template sd_ops[] = {
.type = SD_OP_TYPE_CLUSTER,
.process_main = cluster_disable_recover,
},
+
[SD_OP_INFO_RECOVER] = {
.name = "INFO_RECOVER",
.type = SD_OP_TYPE_LOCAL,
.process_main = local_info_recover,
},
+
+ [SD_OP_CALC_CHKSUM] = {
+ .name = "CALC_CHKSUM",
+ .type = SD_OP_TYPE_GATEWAY,
+ .process_work = gateway_calc_obj_chksum,
+ },
+
+ [SD_OP_CALC_CHKSUM_PEER] = {
+ .name = "CALC_CHKSUM_PEER",
+ .type = SD_OP_TYPE_PEER,
+ .process_work = peer_calc_obj_chksum,
+ },
+
+ [SD_OP_REPAIR_OBJ] = {
+ .name = "REPAIR_OBJ",
+ .type = SD_OP_TYPE_GATEWAY,
+ .process_work = gateway_repair_obj,
+ },
+
+ [SD_OP_REPAIR_OBJ_PEER] = {
+ .name = "REPAIR_OBJ_PEER",
+ .type = SD_OP_TYPE_PEER,
+ .process_work = peer_repair_obj,
+ },
};
struct sd_op_template *get_sd_op(uint8_t opcode)
@@ -1255,6 +1386,8 @@ static int map_table[] = {
[SD_OP_READ_OBJ] = SD_OP_READ_PEER,
[SD_OP_WRITE_OBJ] = SD_OP_WRITE_PEER,
[SD_OP_REMOVE_OBJ] = SD_OP_REMOVE_PEER,
+ [SD_OP_CALC_CHKSUM] = SD_OP_CALC_CHKSUM_PEER,
+ [SD_OP_REPAIR_OBJ] = SD_OP_REPAIR_OBJ_PEER,
};
int gateway_to_peer_opcode(int opcode)
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 90006f6..1af67b0 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -348,12 +348,16 @@ int gateway_read_obj(struct request *req);
int gateway_write_obj(struct request *req);
int gateway_create_and_write_obj(struct request *req);
int gateway_remove_obj(struct request *req);
+int gateway_calc_obj_chksum(struct request *req);
+int gateway_repair_obj(struct request *req);
/* backend store */
int peer_read_obj(struct request *req);
int peer_write_obj(struct request *req);
int peer_create_and_write_obj(struct request *req);
int peer_remove_obj(struct request *req);
+int peer_calc_obj_chksum(struct request *req);
+int peer_repair_obj(struct request *req);
/* object_cache */
--
1.7.11.4
More information about the sheepdog
mailing list