send()/recv() could sleep for a long time if a network failure happens during network I/O, and this prevents the epoch number from being incremented because we assume that there are no outstanding I/O requests while updating node membership info. This patch fixes the problem. It is not a problem to set a small value for the timeout because I/Os are retried automatically even if a send/recv timeout has occurred. Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp> --- Hi Yibin, I think this patch is much simpler than using a timeout in poll(). What do you think? I also tried TCP keepalive, but it doesn't work in my environment for some reason. Thanks, Kazutaka include/net.h | 3 +++ lib/net.c | 27 +++++++++++++++++++++++++-- sheep/sdnet.c | 7 +++++++ 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/include/net.h b/include/net.h index 9e51fea..2d087e2 100644 --- a/include/net.h +++ b/include/net.h @@ -3,6 +3,8 @@ #include <sys/socket.h> +#define DEFAULT_SOCKET_TIMEOUT 5 /* seconds */ + enum conn_state { C_IO_HEADER = 0, C_IO_DATA_INIT, @@ -45,5 +47,6 @@ int create_listen_ports(int port, int (*callback)(int fd, void *), void *data); char *addr_to_str(char *str, int size, uint8_t *addr, uint16_t port); int set_nonblocking(int fd); int set_nodelay(int fd); +int set_timeout(int fd); #endif diff --git a/lib/net.c b/lib/net.c index d4a5d9b..3caba0f 100644 --- a/lib/net.c +++ b/lib/net.c @@ -242,7 +242,7 @@ int do_read(int sockfd, void *buf, int len) reread: ret = read(sockfd, buf, len); if (ret < 0 || !ret) { - if (errno == EINTR || errno == EAGAIN) + if (errno == EINTR) goto reread; fprintf(stderr, "failed to read from socket: %m\n"); return 1; @@ -275,7 +275,7 @@ static int do_write(int sockfd, struct msghdr *msg, int len) rewrite: ret = sendmsg(sockfd, msg, 0); if (ret < 0) { - if (errno == EINTR || errno == EAGAIN) + if (errno == EINTR) goto rewrite; fprintf(stderr, "failed to write to socket: %m\n"); return 1; @@ -397,3 +397,26 @@ int set_nodelay(int fd) ret = 
setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &opt, sizeof(opt)); return ret; } + +int set_timeout(int fd) +{ + int ret; + const struct timeval tv = { + .tv_sec = DEFAULT_SOCKET_TIMEOUT, + .tv_usec = 0, + }; + + ret = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)); + if (ret) { + eprintf("failed to set send timeout\n"); + return ret; + } + + ret = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + if (ret) { + eprintf("failed to set recv timeout\n"); + return ret; + } + + return 0; +} diff --git a/sheep/sdnet.c b/sheep/sdnet.c index 8455653..510fd4e 100644 --- a/sheep/sdnet.c +++ b/sheep/sdnet.c @@ -841,6 +841,13 @@ int get_sheep_fd(uint8_t *addr, uint16_t port, int node_idx, if (fd < 0) return -1; + ret = set_timeout(fd); + if (ret) { + eprintf("%m\n"); + close(fd); + return -1; + } + ret = set_nodelay(fd); if (ret) { eprintf("%m\n"); -- 1.7.2.5 |