From: Liu Yuan <tailai.ly at taobao.com> Collie should retry the read on EAGAIN to fix the false-timeout case. This fixes the 035 failure on the collie side. Signed-off-by: Liu Yuan <tailai.ly at taobao.com> --- collie/cluster.c | 10 +++++----- collie/collie.c | 2 +- collie/collie.h | 1 + collie/common.c | 13 +++++++++---- collie/debug.c | 2 +- collie/node.c | 2 +- collie/vdi.c | 16 ++++++++-------- include/net.h | 1 + lib/net.c | 23 ++++++++++++++++++----- 9 files changed, 45 insertions(+), 25 deletions(-) diff --git a/collie/cluster.c b/collie/cluster.c index 917e190..10ced3b 100644 --- a/collie/cluster.c +++ b/collie/cluster.c @@ -53,7 +53,7 @@ static int list_store(void) sd_init_req(&hdr, SD_OP_GET_STORE_LIST); hdr.data_length = 512; - ret = exec_req(fd, &hdr, buf); + ret = collie_exec_req(fd, &hdr, buf); close(fd); if (ret) { @@ -98,7 +98,7 @@ static int cluster_format(int argc, char **argv) sd_init_req((struct sd_req *)&hdr, SD_OP_READ_VDIS); hdr.data_length = sizeof(vdi_inuse); - ret = exec_req(fd, (struct sd_req *)&hdr, &vdi_inuse); + ret = collie_exec_req(fd, (struct sd_req *)&hdr, &vdi_inuse); if (ret < 0) { fprintf(stderr, "Failed to read VDIs from %s:%d\n", sdhost, sdport); @@ -144,7 +144,7 @@ static int cluster_format(int argc, char **argv) hdr.flags |= SD_FLAG_CMD_WRITE; printf("using backend %s store\n", store_name); - ret = exec_req(fd, (struct sd_req *)&hdr, store_name); + ret = collie_exec_req(fd, (struct sd_req *)&hdr, store_name); close(fd); if (ret) { @@ -194,7 +194,7 @@ again: sd_init_req(&hdr, SD_OP_STAT_CLUSTER); hdr.data_length = log_length; - ret = exec_req(fd, &hdr, logs); + ret = collie_exec_req(fd, &hdr, logs); close(fd); if (ret != 0) @@ -311,7 +311,7 @@ static int list_snap(void) sd_init_req(&hdr, SD_OP_GET_SNAP_FILE); hdr.data_length = SD_DATA_OBJ_SIZE; - ret = exec_req(fd, &hdr, buf); + ret = collie_exec_req(fd, &hdr, buf); close(fd); if (ret) { diff --git a/collie/collie.c b/collie/collie.c index e739662..a533141 100644 --- 
a/collie/collie.c +++ b/collie/collie.c @@ -68,7 +68,7 @@ static int update_node_list(int max_nodes, uint32_t epoch) hdr.data_length = size; - ret = exec_req(fd, (struct sd_req *)&hdr, buf); + ret = collie_exec_req(fd, (struct sd_req *)&hdr, buf); if (ret) { ret = -1; goto out; diff --git a/collie/collie.h b/collie/collie.h index 1b00e68..7e9b6be 100644 --- a/collie/collie.h +++ b/collie/collie.h @@ -73,6 +73,7 @@ int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data, int copies, bool create, bool direct); int send_light_req(struct sd_req *hdr, const char *host, int port); int send_light_req_get_response(struct sd_req *hdr, const char *host, int port); +int collie_exec_req(int sockfd, struct sd_req *hdr, void *data); extern struct command vdi_command; extern struct command node_command; diff --git a/collie/common.c b/collie/common.c index 5343ffa..e709964 100644 --- a/collie/common.c +++ b/collie/common.c @@ -63,7 +63,7 @@ int sd_read_object(uint64_t oid, void *data, unsigned int datalen, if (direct) hdr.flags |= SD_FLAG_CMD_DIRECT; - ret = exec_req(fd, &hdr, data); + ret = collie_exec_req(fd, &hdr, data); close(fd); if (ret) { @@ -113,7 +113,7 @@ int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data, hdr.obj.cow_oid = cow_oid; hdr.obj.offset = offset; - ret = exec_req(fd, &hdr, data); + ret = collie_exec_req(fd, &hdr, data); close(fd); if (ret) { @@ -148,7 +148,7 @@ int parse_vdi(vdi_parser_func_t func, size_t size, void *data) sd_init_req(&req, SD_OP_READ_VDIS); req.data_length = sizeof(vdi_inuse); - ret = exec_req(fd, &req, &vdi_inuse); + ret = collie_exec_req(fd, &req, &vdi_inuse); if (ret < 0) { fprintf(stderr, "Failed to read VDIs from %s:%d\n", sdhost, sdport); @@ -206,7 +206,7 @@ int send_light_req_get_response(struct sd_req *hdr, const char *host, int port) if (fd < 0) return -1; - ret = exec_req(fd, hdr, NULL); + ret = collie_exec_req(fd, hdr, NULL); close(fd); if (ret) { fprintf(stderr, "failed to connect to %s:%d\n", @@ -238,3 +238,8 
@@ int send_light_req(struct sd_req *hdr, const char *host, int port) return 0; } + +int collie_exec_req(int sockfd, struct sd_req *hdr, void *data) +{ + return net_exec_req(sockfd, hdr, data, true); +} diff --git a/collie/debug.c b/collie/debug.c index 11d5934..708ba25 100644 --- a/collie/debug.c +++ b/collie/debug.c @@ -94,7 +94,7 @@ read_buffer: sd_init_req(&hdr, SD_OP_TRACE_READ_BUF); hdr.data_length = TRACE_BUF_LEN; - ret = exec_req(fd, &hdr, buf); + ret = collie_exec_req(fd, &hdr, buf); if (ret) { fprintf(stderr, "Failed to connect\n"); diff --git a/collie/node.c b/collie/node.c index f79927f..a87d095 100644 --- a/collie/node.c +++ b/collie/node.c @@ -161,7 +161,7 @@ static int node_cache(int argc, char **argv) hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = sizeof(cache_size); - ret = exec_req(fd, &hdr, (void *)&cache_size); + ret = collie_exec_req(fd, &hdr, (void *)&cache_size); close(fd); if (ret) { diff --git a/collie/vdi.c b/collie/vdi.c index 6ed7162..2547bd7 100644 --- a/collie/vdi.c +++ b/collie/vdi.c @@ -319,7 +319,7 @@ static void parse_objs(uint64_t oid, obj_parser_func_t func, void *data, unsigne hdr.obj.oid = oid; - ret = exec_req(fd, &hdr, buf); + ret = collie_exec_req(fd, &hdr, buf); close(fd); sprintf(name + strlen(name), ":%d", sd_nodes[i].nid.port); @@ -410,7 +410,7 @@ static int find_vdi_name(const char *vdiname, uint32_t snapid, const char *tag, hdr.flags = SD_FLAG_CMD_WRITE; hdr.vdi.snapid = snapid; - ret = exec_req(fd, &hdr, buf); + ret = collie_exec_req(fd, &hdr, buf); if (ret) { ret = -1; goto out; @@ -490,7 +490,7 @@ static int do_vdi_create(const char *vdiname, int64_t vdi_size, hdr.vdi.vdi_size = roundup(vdi_size, 512); hdr.vdi.copies = nr_copies; - ret = exec_req(fd, &hdr, buf); + ret = collie_exec_req(fd, &hdr, buf); close(fd); @@ -751,7 +751,7 @@ static int do_vdi_delete(const char *vdiname, int snap_id, const char *snap_tag) if (snap_tag) pstrcpy(data + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag); - ret = exec_req(fd, &hdr, 
data); + ret = collie_exec_req(fd, &hdr, data); close(fd); if (ret) { @@ -884,7 +884,7 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies) sd_init_req(&hdr, SD_OP_STAT_CLUSTER); hdr.data_length = log_length; - ret = exec_req(fd, &hdr, logs); + ret = collie_exec_req(fd, &hdr, logs); close(fd); if (ret != 0) @@ -1030,7 +1030,7 @@ static int find_vdi_attr_oid(const char *vdiname, const char *tag, uint32_t snap if (delete) hdr.flags |= SD_FLAG_CMD_DEL; - ret = exec_req(fd, &hdr, &vattr); + ret = collie_exec_req(fd, &hdr, &vattr); if (ret) { ret = SD_RES_EIO; goto out; @@ -1391,7 +1391,7 @@ static void *read_object_from(const struct sd_vnode *vnode, uint64_t oid) hdr.obj.oid = oid; - ret = exec_req(fd, &hdr, buf); + ret = collie_exec_req(fd, &hdr, buf); close(fd); if (ret) { @@ -1440,7 +1440,7 @@ static void write_object_to(const struct sd_vnode *vnode, uint64_t oid, hdr.data_length = SD_DATA_OBJ_SIZE; hdr.obj.oid = oid; - ret = exec_req(fd, &hdr, buf); + ret = collie_exec_req(fd, &hdr, buf); close(fd); if (ret) { diff --git a/include/net.h b/include/net.h index 787ee79..97e93d0 100644 --- a/include/net.h +++ b/include/net.h @@ -52,6 +52,7 @@ int tx(struct connection *conn, enum conn_state next_state); int connect_to(const char *name, int port); int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen); int exec_req(int sockfd, struct sd_req *hdr, void *data); +int net_exec_req(int sockfd, struct sd_req *hdr, void *data, bool retry_eagain); int create_listen_ports(const char *bindaddr, int port, int (*callback)(int fd, void *), void *data); int create_unix_domain_socket(const char *unix_path, diff --git a/lib/net.c b/lib/net.c index 365c1e1..76b6eab 100644 --- a/lib/net.c +++ b/lib/net.c @@ -268,7 +268,7 @@ success: return fd; } -int do_read(int sockfd, void *buf, int len) +static int net_do_read(int sockfd, void *buf, int len, bool retry_eagain) { int ret; reread: @@ -276,7 +276,10 @@ reread: if (ret < 0 || !ret) { if (errno == EINTR) goto 
reread; - eprintf("failed to read from socket: %d\n", ret); + if (retry_eagain && errno == EAGAIN) + goto reread; + + eprintf("failed to read from socket: %d, %d(%m)\n", ret, errno); return 1; } @@ -288,6 +291,11 @@ reread: return 0; } +int do_read(int sockfd, void *buf, int len) +{ + return net_do_read(sockfd, buf, len, false); +} + static void forward_iov(struct msghdr *msg, int len) { while (msg->msg_iov->iov_len <= len) { @@ -352,7 +360,7 @@ int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen) return ret; } -int exec_req(int sockfd, struct sd_req *hdr, void *data) +int net_exec_req(int sockfd, struct sd_req *hdr, void *data, bool retry_eagain) { int ret; struct sd_rsp *rsp = (struct sd_rsp *)hdr; @@ -369,7 +377,7 @@ int exec_req(int sockfd, struct sd_req *hdr, void *data) if (send_req(sockfd, hdr, data, wlen)) return 1; - ret = do_read(sockfd, rsp, sizeof(*rsp)); + ret = net_do_read(sockfd, rsp, sizeof(*rsp), retry_eagain); if (ret) { eprintf("failed to read a response\n"); return 1; @@ -379,7 +387,7 @@ int exec_req(int sockfd, struct sd_req *hdr, void *data) rlen = rsp->data_length; if (rlen) { - ret = do_read(sockfd, data, rlen); + ret = net_do_read(sockfd, data, rlen, retry_eagain); if (ret) { eprintf("failed to read the response data\n"); return 1; @@ -389,6 +397,11 @@ int exec_req(int sockfd, struct sd_req *hdr, void *data) return 0; } +int exec_req(int sockfd, struct sd_req *hdr, void *data) +{ + return net_exec_req(sockfd, hdr, data, false); +} + char *addr_to_str(char *str, int size, const uint8_t *addr, uint16_t port) { int af = AF_INET6; -- 1.7.9.5 |