It seems SO_SNDTIMEO and SO_RCVTIMEO are commonly used with blocking I/O; I'm not sure whether they work correctly in non-blocking I/O code. On Mon, Dec 12, 2011 at 7:14 PM, MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp> wrote: > send()/recv() could sleep for a long time if a network failure happens during > network I/Os, and this prevents the epoch number from being incremented because > we assume that there are no outstanding I/O requests while updating > node membership info. This patch fixes the problem. > > It is not a problem to set a small value for the timeout because I/Os are > retried automatically even if a send/recv timeout has occurred. > > Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp> > --- > Hi Yibin, > > I think this patch is much simpler than using timeout in poll(). > What do you think? > > I also tried tcp keepalive, but it doesn't work in my environment for > some reason. > > Thanks, > > Kazutaka > > > include/net.h | 3 +++ > lib/net.c | 27 +++++++++++++++++++++++++-- > sheep/sdnet.c | 7 +++++++ > 3 files changed, 35 insertions(+), 2 deletions(-) > > diff --git a/include/net.h b/include/net.h > index 9e51fea..2d087e2 100644 > --- a/include/net.h > +++ b/include/net.h > @@ -3,6 +3,8 @@ > > #include <sys/socket.h> > > +#define DEFAULT_SOCKET_TIMEOUT 5 /* seconds */ > + > enum conn_state { > C_IO_HEADER = 0, > C_IO_DATA_INIT, > @@ -45,5 +47,6 @@ int create_listen_ports(int port, int (*callback)(int fd, void *), void *data); > char *addr_to_str(char *str, int size, uint8_t *addr, uint16_t port); > int set_nonblocking(int fd); > int set_nodelay(int fd); > +int set_timeout(int fd); > > #endif > diff --git a/lib/net.c b/lib/net.c > index d4a5d9b..3caba0f 100644 > --- a/lib/net.c > +++ b/lib/net.c > @@ -242,7 +242,7 @@ int do_read(int sockfd, void *buf, int len) > reread: > ret = read(sockfd, buf, len); > if (ret < 0 || !ret) { > - if (errno == EINTR || errno == EAGAIN) > + if (errno == EINTR) > goto reread; > fprintf(stderr, "failed to read from socket: %m\n"); > 
return 1; > @@ -275,7 +275,7 @@ static int do_write(int sockfd, struct msghdr *msg, int len) > rewrite: > ret = sendmsg(sockfd, msg, 0); > if (ret < 0) { > - if (errno == EINTR || errno == EAGAIN) > + if (errno == EINTR) > goto rewrite; > fprintf(stderr, "failed to write to socket: %m\n"); > return 1; > @@ -397,3 +397,26 @@ int set_nodelay(int fd) > ret = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &opt, sizeof(opt)); > return ret; > } > + > +int set_timeout(int fd) > +{ > + int ret; > + const struct timeval tv = { > + .tv_sec = DEFAULT_SOCKET_TIMEOUT, > + .tv_usec = 0, > + }; > + > + ret = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)); > + if (ret) { > + eprintf("failed to set send timeout\n"); > + return ret; > + } > + > + ret = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); > + if (ret) { > + eprintf("failed to set recv timeout\n"); > + return ret; > + } > + > + return 0; > +} > diff --git a/sheep/sdnet.c b/sheep/sdnet.c > index 8455653..510fd4e 100644 > --- a/sheep/sdnet.c > +++ b/sheep/sdnet.c > @@ -841,6 +841,13 @@ int get_sheep_fd(uint8_t *addr, uint16_t port, int node_idx, > if (fd < 0) > return -1; > > + ret = set_timeout(fd); > + if (ret) { > + eprintf("%m\n"); > + close(fd); > + return -1; > + } > + > ret = set_nodelay(fd); > if (ret) { > eprintf("%m\n"); > -- > 1.7.2.5 > > -- > sheepdog mailing list > sheepdog at lists.wpkg.org > http://lists.wpkg.org/mailman/listinfo/sheepdog ________________________________ This email (including any attachments) is confidential and may be legally privileged. If you received this email in error, please delete it immediately and do not copy it or use it for any purpose or disclose its contents to any other person. Thank you. 本电邮(包括任何附件)可能含有机密资料并受法律保护。如您不是正确的收件人,请您立即删除本邮件。请不要将本电邮进行复制并用作任何其他用途、或透露本邮件之内容。谢谢。 |