[sheepdog] [PATCH 2/3] net: add read timeout for sockfd

Liu Yuan namei.unix at gmail.com
Thu Jan 17 16:30:29 CET 2013


From: Liu Yuan <tailai.ly at taobao.com>

This fixes the hang problem demonstrated by the refined test 035 on the sheep side.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 include/net.h   |   11 ++++++++++-
 lib/net.c       |   24 ++++++++++++++++++++++--
 sheep/gateway.c |    8 --------
 3 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/include/net.h b/include/net.h
index f795707..787ee79 100644
--- a/include/net.h
+++ b/include/net.h
@@ -6,6 +6,15 @@
 
 #include "sheepdog_proto.h"
 
+/*
+ * We observed that for a busy node, the response could be as long as 15s, so
+ * wait 30s would be a safe value. Even we are false timeouted, the gateway will
+ * retry the request and sockfd cache module will repair the false-closes.
+ */
+#define MAX_POLLTIME 30 /* seconds */
+#define POLL_TIMEOUT 5 /* seconds */
+#define MAX_RETRY_COUNT (MAX_POLLTIME / POLL_TIMEOUT)
+
 enum conn_state {
 	C_IO_HEADER = 0,
 	C_IO_DATA_INIT,
@@ -54,7 +63,7 @@ int set_nonblocking(int fd);
 int set_nodelay(int fd);
 int set_keepalive(int fd);
 int set_snd_timeout(int fd);
-int set_timeout(int fd);
+int set_rcv_timeout(int fd);
 int get_local_addr(uint8_t *bytes);
 bool inetaddr_is_valid(char *addr);
 
diff --git a/lib/net.c b/lib/net.c
index e6d8a56..365c1e1 100644
--- a/lib/net.c
+++ b/lib/net.c
@@ -238,6 +238,13 @@ int connect_to(const char *name, int port)
 			break;
 		}
 
+		ret = set_rcv_timeout(fd);
+		if (ret) {
+			eprintf("failed to set recv timeout: %m\n");
+			close(fd);
+			break;
+		}
+
 		ret = connect(fd, res->ai_addr, res->ai_addrlen);
 		if (ret) {
 			eprintf("failed to connect to %s:%d: %m\n",
@@ -436,18 +443,31 @@ int set_nonblocking(int fd)
 	return ret;
 }
 
-/* Send timeout for 5 second */
 int set_snd_timeout(int fd)
 {
 	struct timeval timeout;
 
-	timeout.tv_sec = 5;
+	timeout.tv_sec = POLL_TIMEOUT;
 	timeout.tv_usec = 0;
 
 	return setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout,
 			  sizeof(timeout));
 }
 
+int set_rcv_timeout(int fd)
+{
+	struct timeval timeout;
+/*
+ * We should wait longer for read than write because the target node might be
+ * busy doing IO
+ */
+	timeout.tv_sec = MAX_POLLTIME;
+	timeout.tv_usec = 0;
+
+	return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout,
+			  sizeof(timeout));
+}
+
 int set_nodelay(int fd)
 {
 	int ret, opt;
diff --git a/sheep/gateway.c b/sheep/gateway.c
index 44f9ee2..33fad42 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -149,14 +149,6 @@ static inline void pfd_info_init(struct write_info *wi, struct pfd_info *pi)
  */
 static int wait_forward_request(struct write_info *wi, struct request *req)
 {
-/*
- * We observed that for a busy node, the response could be as long as 15s, so
- * wait 30s would be a safe value. Even we are false timeouted, the gateway will
- * retry the request and sockfd cache module will repair the false-closes.
- */
-#define MAX_POLLTIME 30 /* seconds */
-#define POLL_TIMEOUT 5 /* seconds */
-#define MAX_RETRY_COUNT (MAX_POLLTIME / POLL_TIMEOUT)
 	int nr_sent, err_ret = SD_RES_SUCCESS, ret, pollret, i,
 	    repeat = MAX_RETRY_COUNT;
 	struct pfd_info pi;
-- 
1.7.9.5




More information about the sheepdog mailing list