[Sheepdog] [RFC PATCH] sheep: add client side timeout support for socket

zituan at taobao.com zituan at taobao.com
Fri Nov 25 12:28:57 CET 2011


From: Yibin Shen <zituan at taobao.com>

Currently, sheep uses an infinite timeout with poll(), which can cause
problems: e.g. if node A leaves the sheep cluster abnormally, in-flight
I/Os toward node A may never terminate, so the related confchg event
cannot be handled.

This patch also changes forward_read_obj_req() from a blocking read
to a non-blocking one that waits with poll().

Signed-off-by: Yibin Shen <zituan at taobao.com>
---
 include/net.h |    2 +
 sheep/store.c |   68 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 62 insertions(+), 8 deletions(-)

diff --git a/include/net.h b/include/net.h
index 9e51fea..a0aed3b 100644
--- a/include/net.h
+++ b/include/net.h
@@ -3,6 +3,8 @@
 
 #include <sys/socket.h>
 
+#define DEFAULT_POLL_TIMEOUT 30000
+
 enum conn_state {
 	C_IO_HEADER = 0,
 	C_IO_DATA_INIT,
diff --git a/sheep/store.c b/sheep/store.c
index d4c3f27..22feda4 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -243,14 +243,14 @@ static int do_local_io(struct request *req, uint32_t epoch);
 
 static int forward_read_obj_req(struct request *req, int idx)
 {
-	int i, n, nr, fd, ret;
+	int i, n, nr, fd, ret, pollret;
 	unsigned wlen, rlen;
 	struct sd_obj_req hdr = *(struct sd_obj_req *)&req->rq;
 	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr;
 	struct sheepdog_vnode_list_entry *e;
 	uint64_t oid = hdr.oid;
 	int copies;
-
+	struct pollfd pfds;
 	e = req->entry;
 	nr = req->nr_vnodes;
 
@@ -285,21 +285,69 @@ static int forward_read_obj_req(struct request *req, int idx)
 	wlen = 0;
 	rlen = hdr.data_length;
 
-	ret = exec_req(fd, (struct sd_req *)&hdr, req->data, &wlen, &rlen);
+	ret = send_req(fd, (struct sd_req *)&hdr, req->data, &wlen);
+	if (ret) { /* network errors */
+		ret = SD_RES_NETWORK_ERROR;
+		dprintf("fail %"PRIu32"\n", ret);
+		goto out;
+	}
+
+	pfds.fd = fd;
+	pfds.events = POLLIN;
+	ret = SD_RES_SUCCESS;
+
+ poll_again:
+	/*FIXME: what timout value is better? */
+	pollret= poll(&pfds, 1, DEFAULT_POLL_TIMEOUT);
+
+	if (pollret <  0) {
+		if (errno == EINTR)
+			goto poll_again;
+		ret = SD_RES_EIO;
+	} else if (pollret == 0) {
+		ret = SD_RES_EIO;
+		goto out;
+	}
+
+	if (pfds.fd < 0 || !(pfds.revents & POLLIN)) {
+		ret = SD_RES_NETWORK_ERROR;
+		goto out;
+	}
 
-	if (ret) /* network errors */
+	ret = do_read(pfds.fd, rsp, sizeof(*rsp));
+	if (ret) {
 		ret = SD_RES_NETWORK_ERROR;
-	else {
-		memcpy(&req->rp, rsp, sizeof(*rsp));
+		vprintf(SDOG_INFO, "failed to read a response: %m\n");
+		goto out;
+	}
+
+	/* read the extra data */
+	if (rlen > rsp->data_length)
+		rlen = rsp->data_length;
+
+	if (rlen) {
+		ret = do_read(pfds.fd, req->data, rlen);
+		if (ret) {
+			ret = SD_RES_NETWORK_ERROR;
+			vprintf(SDOG_INFO, "failed to read the response data: %m\n");
+			goto out;
+		}
+	}
+
+	if (rsp->result != SD_RES_SUCCESS) {
+		eprintf("fail %"PRIu32"\n", rsp->result);
 		ret = rsp->result;
+		goto out;
 	}
+
+	memcpy(&req->rp, rsp, sizeof(*rsp));
 out:
 	return ret;
 }
 
 static int forward_write_obj_req(struct request *req, int idx)
 {
-	int i, n, nr, fd, ret;
+	int i, n, nr, fd, ret, pollret;
 	unsigned wlen;
 	char name[128];
 	struct sd_obj_req hdr = *(struct sd_obj_req *)&req->rq;
@@ -377,11 +425,15 @@ static int forward_write_obj_req(struct request *req, int idx)
 
 	ret = SD_RES_SUCCESS;
 again:
-	if (poll(pfds, nr_fds, -1) < 0) {
+	pollret = poll(pfds, nr_fds, DEFAULT_POLL_TIMEOUT);
+	if (pollret < 0) {
 		if (errno == EINTR)
 			goto again;
 
 		ret = SD_RES_EIO;
+	} else if ( pollret == 0 ) {/* poll time out */
+		ret = SD_RES_EIO;
+		goto out;
 	}
 
 	for (i = 0; i < nr_fds; i++) {
-- 
1.7.7.3




More information about the sheepdog mailing list