From: Liu Yuan <tailai.ly at taobao.com> This fixes the hang problem demonstrated by refined 035 on sheep side. Signed-off-by: Liu Yuan <tailai.ly at taobao.com> --- include/net.h | 11 ++++++++++- lib/net.c | 24 ++++++++++++++++++++++-- sheep/gateway.c | 8 -------- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/include/net.h b/include/net.h index f795707..787ee79 100644 --- a/include/net.h +++ b/include/net.h @@ -6,6 +6,15 @@ #include "sheepdog_proto.h" +/* + * We observed that for a busy node, the response could be as long as 15s, so + * wait 30s would be a safe value. Even we are false timeouted, the gateway will + * retry the request and sockfd cache module will repair the false-closes. + */ +#define MAX_POLLTIME 30 /* seconds */ +#define POLL_TIMEOUT 5 /* seconds */ +#define MAX_RETRY_COUNT (MAX_POLLTIME / POLL_TIMEOUT) + enum conn_state { C_IO_HEADER = 0, C_IO_DATA_INIT, @@ -54,7 +63,7 @@ int set_nonblocking(int fd); int set_nodelay(int fd); int set_keepalive(int fd); int set_snd_timeout(int fd); -int set_timeout(int fd); +int set_rcv_timeout(int fd); int get_local_addr(uint8_t *bytes); bool inetaddr_is_valid(char *addr); diff --git a/lib/net.c b/lib/net.c index e6d8a56..365c1e1 100644 --- a/lib/net.c +++ b/lib/net.c @@ -238,6 +238,13 @@ int connect_to(const char *name, int port) break; } + ret = set_rcv_timeout(fd); + if (ret) { + eprintf("failed to set recv timeout: %m\n"); + close(fd); + break; + } + ret = connect(fd, res->ai_addr, res->ai_addrlen); if (ret) { eprintf("failed to connect to %s:%d: %m\n", @@ -436,18 +443,31 @@ int set_nonblocking(int fd) return ret; } -/* Send timeout for 5 second */ int set_snd_timeout(int fd) { struct timeval timeout; - timeout.tv_sec = 5; + timeout.tv_sec = POLL_TIMEOUT; timeout.tv_usec = 0; return setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout)); } +int set_rcv_timeout(int fd) +{ + struct timeval timeout; +/* + * We should wait longer for read than write because the target node 
might be + * busy doing IO + */ + timeout.tv_sec = MAX_POLLTIME; + timeout.tv_usec = 0; + + return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, + sizeof(timeout)); +} + int set_nodelay(int fd) { int ret, opt; diff --git a/sheep/gateway.c b/sheep/gateway.c index 44f9ee2..33fad42 100644 --- a/sheep/gateway.c +++ b/sheep/gateway.c @@ -149,14 +149,6 @@ static inline void pfd_info_init(struct write_info *wi, struct pfd_info *pi) */ static int wait_forward_request(struct write_info *wi, struct request *req) { -/* - * We observed that for a busy node, the response could be as long as 15s, so - * wait 30s would be a safe value. Even we are false timeouted, the gateway will - * retry the request and sockfd cache module will repair the false-closes. - */ -#define MAX_POLLTIME 30 /* seconds */ -#define POLL_TIMEOUT 5 /* seconds */ -#define MAX_RETRY_COUNT (MAX_POLLTIME / POLL_TIMEOUT) int nr_sent, err_ret = SD_RES_SUCCESS, ret, pollret, i, repeat = MAX_RETRY_COUNT; struct pfd_info pi; -- 1.7.9.5 |