Hi, Thanks for reporting the bug. Can you try the patch below against the current git head? > PS. another question: since I've used --copies=2, I expected to find > each sheepdog VM block replicated to 2 nodes, not 3, but > under /sheepdog/0/ I see that exactly the same number (with the same names) > of files were created on all 3 nodes - the only exception > is that /sheepdog/0/vdi/zopa was created only on 2 nodes. > Is that expected and what is the actual meaning of --copies=N? The meaning of --copies is just what you expected, and this patch will also fix the problem, I think. == >From 7a45f310bd6b81f0c655217f3f1dfc63fd68c634 Mon Sep 17 00:00:00 2001 From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp> Date: Sun, 27 Dec 2009 06:08:09 +0900 Subject: [PATCH] use ANAME_COPIES as a number of replication Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp> --- collie/net.c | 1 + collie/store.c | 27 ++++++++++++++++++++ collie/vdi.c | 60 +++++++++++++++++++++++++++++++++------------ include/net.h | 3 +- include/sheepdog_proto.h | 5 +++- lib/net.c | 36 +++++++++++++++++---------- shepherd/shepherd.c | 3 +- 7 files changed, 103 insertions(+), 32 deletions(-) diff --git a/collie/net.c b/collie/net.c index 5505613..be19bcc 100644 --- a/collie/net.c +++ b/collie/net.c @@ -65,6 +65,7 @@ static void queue_request(struct request *req) case SD_OP_SO_NEW_VDI: case SD_OP_SO_LOOKUP_VDI: case SD_OP_SO_READ_VDIS: + case SD_OP_SO_STAT: req->work.fn = so_queue_request; break; default: diff --git a/collie/store.c b/collie/store.c index 4e95469..fce71ff 100644 --- a/collie/store.c +++ b/collie/store.c @@ -606,6 +606,33 @@ void so_queue_request(struct work *work, int idx) case SD_OP_SO_READ_VDIS: ret = so_read_vdis(req); break; + case SD_OP_SO_STAT: + fd = open(path, O_RDONLY); + if (fd < 0) { + result = SD_RES_EIO; + goto out; + } + + rsp->oid = 0; + ret = fgetxattr(fd, ANAME_LAST_OID, &rsp->oid, + sizeof(rsp->oid)); + if (ret != sizeof(rsp->oid)) { + close(fd); + result 
= SD_RES_SYSTEM_ERROR; + goto out; + } + + rsp->copies = 0; + ret = fgetxattr(fd, ANAME_COPIES, &rsp->copies, + sizeof(rsp->copies)); + if (ret != sizeof(rsp->copies)) { + close(fd); + result = SD_RES_SYSTEM_ERROR; + goto out; + } + + result = SD_RES_SUCCESS; + break; } out: diff --git a/collie/vdi.c b/collie/vdi.c index 31567d0..290d919 100644 --- a/collie/vdi.c +++ b/collie/vdi.c @@ -84,11 +84,12 @@ int add_vdi(struct cluster_info *ci, char *name, int len, uint64_t size, uint64_t *added_oid, uint64_t base_oid, uint32_t tag) { struct sheepdog_node_list_entry entries[SD_MAX_NODES]; - int nr_nodes; + int nr_nodes, nr_reqs; uint64_t oid = 0; int ret; int copies; struct sd_so_req req; + struct sd_so_rsp *rsp = (struct sd_so_rsp *)&req; memset(&req, 0, sizeof(req)); @@ -97,22 +98,31 @@ int add_vdi(struct cluster_info *ci, char *name, int len, uint64_t size, dprintf("%s (%d) %" PRIu64 ", base: %" PRIu64 "\n", name, len, size, base_oid); - /* todo */ -/* copies = sb->default_nr_copies; */ - copies = 3; - if (copies > nr_nodes) - copies = nr_nodes; + req.opcode = SD_OP_SO_STAT; + ret = exec_reqs(entries, nr_nodes, ci->epoch, + SD_DIR_OID, (struct sd_req *)&req, NULL, 0, 0, + nr_nodes, 1); + if (ret < 0) + return rsp->result; + + copies = rsp->copies; + nr_reqs = copies; + if (nr_reqs > nr_nodes) + nr_reqs = nr_nodes; + + memset(&req, 0, sizeof(req)); req.opcode = SD_OP_SO_NEW_VDI; req.copies = copies; req.tag = tag; ret = exec_reqs(entries, nr_nodes, ci->epoch, - SD_DIR_OID, (struct sd_req *)&req, name, len, 0, copies); + SD_DIR_OID, (struct sd_req *)&req, name, len, 0, + nr_reqs, nr_reqs); /* todo: error handling */ - oid = ((struct sd_so_rsp *)&req)->oid; + oid = rsp->oid; *added_oid = oid; dprintf("%s (%d) %" PRIu64 ", base: %" PRIu64 "\n", name, len, size, @@ -134,7 +144,7 @@ int lookup_vdi(struct cluster_info *ci, int *current) { struct sheepdog_node_list_entry entries[SD_MAX_NODES]; - int nr_nodes; + int nr_nodes, nr_reqs; int ret, copies; struct sd_so_req req; 
struct sd_so_rsp *rsp = (struct sd_so_rsp *)&req; @@ -147,16 +157,30 @@ int lookup_vdi(struct cluster_info *ci, dprintf("looking for %s %zd\n", filename, strlen(filename)); - /* todo */ - copies = 3; - if (copies > nr_nodes) - copies = nr_nodes; + req.opcode = SD_OP_SO_STAT; + ret = exec_reqs(entries, nr_nodes, ci->epoch, + SD_DIR_OID, (struct sd_req *)&req, NULL, 0, 0, + nr_nodes, 1); + if (ret < 0) + return rsp->result; + + copies = rsp->copies; + nr_reqs = copies; + if (nr_reqs > nr_nodes) + nr_reqs = nr_nodes; + + memset(&req, 0, sizeof(req)); + copies = rsp->copies; + nr_reqs = copies; + if (nr_reqs > nr_nodes) + nr_reqs = nr_nodes; req.opcode = SD_OP_SO_LOOKUP_VDI; req.tag = tag; ret = exec_reqs(entries, nr_nodes, ci->epoch, - SD_DIR_OID, (struct sd_req *)&req, filename, strlen(filename), 0, copies); + SD_DIR_OID, (struct sd_req *)&req, filename, strlen(filename), 0, + nr_reqs, 1); *oid = rsp->oid; if (rsp->flags & SD_VDI_RSP_FLAG_CURRENT) @@ -186,7 +210,11 @@ int make_super_object(struct cluster_info *ci, struct sd_vdi_req *hdr) nr_nodes = build_node_list(&ci->node_list, entries); ret = exec_reqs(entries, nr_nodes, ci->epoch, - SD_DIR_OID, (struct sd_req *)&req, NULL, 0, 0, req.copies); + SD_DIR_OID, (struct sd_req *)&req, NULL, 0, 0, req.copies, + req.copies); - return ret; + if (ret < 0) + return SD_RES_EIO; + + return SD_RES_SUCCESS; } diff --git a/include/net.h b/include/net.h index b0e3df0..7bf0dbb 100644 --- a/include/net.h +++ b/include/net.h @@ -46,7 +46,8 @@ int read_object(struct sheepdog_node_list_entry *e, int exec_reqs(struct sheepdog_node_list_entry *e, int nodes, uint32_t node_version, uint64_t oid, struct sd_req *hdr, - char *data, unsigned int wdatalen, unsigned int rdatalen, int nr); + char *data, unsigned int wdatalen, unsigned int rdatalen, int nr, + int quorum); int create_listen_ports(int port, int (*callback)(int fd, void *), void *data); diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h index 4bfb4e5..9557cd8 100644 --- 
a/include/sheepdog_proto.h +++ b/include/sheepdog_proto.h @@ -48,6 +48,7 @@ #define SD_OP_SO_DEL_VDI 0x62 #define SD_OP_SO_LOOKUP_VDI 0x63 #define SD_OP_SO_READ_VDIS 0x64 +#define SD_OP_SO_STAT 0x65 #define SD_OP_STAT_SHEEP 0xB0 @@ -125,8 +126,10 @@ struct sd_so_rsp { uint32_t id; uint32_t data_length; uint32_t result; + uint32_t copies; + uint64_t ctime; uint64_t oid; - uint32_t opcode_specific[5]; + uint32_t opcode_specific[2]; }; struct sd_obj_req { diff --git a/lib/net.c b/lib/net.c index caf592f..5e26f46 100644 --- a/lib/net.c +++ b/lib/net.c @@ -433,16 +433,19 @@ int read_object(struct sheepdog_node_list_entry *e, /* TODO: clean up with the above functions */ int exec_reqs(struct sheepdog_node_list_entry *e, int nodes, uint32_t node_version, uint64_t oid, struct sd_req *hdr, - char *data, unsigned int wdatalen, unsigned int rdatalen, int nr) + char *data, unsigned int wdatalen, unsigned int rdatalen, int nr, + int quorum) { char name[128]; int i = 0, n, fd, ret; int success = 0; struct sd_req tmp; struct sd_rsp *rsp = (struct sd_rsp *)&tmp; + unsigned wlen, rlen; for (i = 0; i < nr; i++) { - unsigned wlen = wdatalen, rlen = rdatalen; + wlen = wdatalen; + rlen = rdatalen; n = obj_to_sheep(e, nodes, oid, i); @@ -453,8 +456,10 @@ int exec_reqs(struct sheepdog_node_list_entry *e, e[n].addr[15]); fd = connect_to(name, e[n].port); - if (fd < 0) + if (fd < 0) { + ((struct sd_rsp *) hdr)->result = SD_RES_EIO; return -1; + } hdr->epoch = node_version; if (wdatalen) { @@ -470,18 +475,23 @@ int exec_reqs(struct sheepdog_node_list_entry *e, close(fd); rsp = (struct sd_rsp *)&tmp; - if (rdatalen) { - if (!ret) { - if (rsp->result == SD_RES_SUCCESS) { - memcpy(hdr, rsp, sizeof(*rsp)); - return rlen; - } - } - } else - if (!ret) + + if (!ret) { + if (rsp->result == SD_RES_SUCCESS) success++; + } + + if (success >= quorum) + break; } + memcpy(hdr, rsp, sizeof(*rsp)); - return !success; + if (success < quorum) + return -1; + + if (rdatalen) + return rlen; + else + return 
wlen; } diff --git a/shepherd/shepherd.c b/shepherd/shepherd.c index 9654888..5bbf29d 100644 --- a/shepherd/shepherd.c +++ b/shepherd/shepherd.c @@ -398,7 +398,8 @@ int parse_vdi(vdi_parser_func_t func, void *data) req.opcode = SD_OP_SO_READ_VDIS; ret = exec_reqs(node_list_entries, nr_nodes, node_list_version, - SD_DIR_OID, (struct sd_req *)&req, buf, 0, DIR_BUF_LEN,nr_nodes); + SD_DIR_OID, (struct sd_req *)&req, buf, 0, DIR_BUF_LEN, + nr_nodes, 1); if (ret < 0) { ret = 1; -- 1.6.5 |