From: Liu Yuan <tailai.ly at taobao.com> If the IO NIC is down but the sheep is still alive, the epoch isn't incremented, so we can't keep retrying poll forever. This problem can be demonstrated by 050. Signed-off-by: Liu Yuan <tailai.ly at taobao.com> --- sheep/gateway.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/sheep/gateway.c b/sheep/gateway.c index 2bd6e78..44f9ee2 100644 --- a/sheep/gateway.c +++ b/sheep/gateway.c @@ -149,12 +149,21 @@ static inline void pfd_info_init(struct write_info *wi, struct pfd_info *pi) */ static int wait_forward_request(struct write_info *wi, struct request *req) { - int nr_sent, err_ret = SD_RES_SUCCESS, ret, pollret, i; +/* + * We observed that for a busy node, the response could be as long as 15s, so + * wait 30s would be a safe value. Even we are false timeouted, the gateway will + * retry the request and sockfd cache module will repair the false-closes. + */ +#define MAX_POLLTIME 30 /* seconds */ +#define POLL_TIMEOUT 5 /* seconds */ +#define MAX_RETRY_COUNT (MAX_POLLTIME / POLL_TIMEOUT) + int nr_sent, err_ret = SD_RES_SUCCESS, ret, pollret, i, + repeat = MAX_RETRY_COUNT; struct pfd_info pi; struct sd_rsp *rsp = &req->rp; again: pfd_info_init(wi, &pi); - pollret = poll(pi.pfds, pi.nr, 5000); + pollret = poll(pi.pfds, pi.nr, 1000 * POLL_TIMEOUT); if (pollret < 0) { if (errno == EINTR) goto again; @@ -162,9 +171,14 @@ again: panic("%m\n"); } else if (pollret == 0) { eprintf("poll timeout %d\n", wi->nr_sent); - - if (req->rq.epoch == sys_epoch()) + /* + * If IO NIC is down, epoch isn't incremented, so we can't retry + * for ever. + */ + if (req->rq.epoch == sys_epoch() && repeat) { + repeat--; goto again; + } nr_sent = wi->nr_sent; /* XXX Blinedly close all the connections */ -- 1.7.9.5 |