[sheepdog] [PATCH 3/3] gateway: fix poll() hang
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Tue Sep 4 11:23:50 CEST 2012
At Tue, 4 Sep 2012 17:00:58 +0800,
Liu Yuan wrote:
>
> From: Liu Yuan <tailai.ly at taobao.com>
>
> poll() might wait for unbounded period of time, when
> a) header in send buffer is not acked
> b) the remote node is crashed
>
> So if we detect that the send buffer is not empty, we switch to timeout poll.
>
> Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
>
> ---
> sheep/gateway.c | 42 ++++++++++++++++++++++++++++++++++++------
> 1 file changed, 36 insertions(+), 6 deletions(-)
>
> diff --git a/sheep/gateway.c b/sheep/gateway.c
> index 41d712b..2fdaa58 100644
> --- a/sheep/gateway.c
> +++ b/sheep/gateway.c
> @@ -12,6 +12,7 @@
> #include <stdio.h>
> #include <stdlib.h>
> #include <poll.h>
> +#include <sys/ioctl.h>
>
> #include "sheep_priv.h"
>
> @@ -134,12 +135,32 @@ struct pfd_info {
> int nr;
> };
>
> -static inline void pfd_info_init(struct write_info *wi, struct pfd_info *pi)
> +static inline int pfd_info_init(struct write_info *wi, struct pfd_info *pi)
> {
> - int i;
> - for (i = 0; i < wi->nr_sent; i++)
> + int i, buf_empty = 1, bytes;
> + for (i = 0; i < wi->nr_sent; i++) {
> pi->pfds[i] = wi->ent[i].pfd;
> + if (buf_empty) {
> + /*
> + * Check if the sockfd send buffer is empty.
> + *
> + * For a non-empty buffer (not acked yet), there is
> + * possibilty that the remote node is down, in which
> + * case we use a timeout poll to avoid poll infinitely
> + */
> + if (ioctl(pi->pfds[i].fd, TIOCOUTQ, &bytes) < 0) {
> + eprintf("%m\n");
> + continue;
> + }
> + if (bytes != 0) {
> + buf_empty = 0;
> + dprintf("send buffer is not empty\n");
> + }
> + }
> + }
> +
> pi->nr = wi->nr_sent;
> + return buf_empty;
> }
>
> /*
> @@ -152,16 +173,25 @@ static inline void pfd_info_init(struct write_info *wi, struct pfd_info *pi)
> */
> static int wait_forward_request(struct write_info *wi, struct sd_rsp *rsp)
> {
> - int nr_sent, err_ret = SD_RES_SUCCESS, ret, pollret, i;
> + int nr_sent, err_ret = SD_RES_SUCCESS, ret, pollret, i, buf_empty;
> struct pfd_info pi;;
> again:
> - pfd_info_init(wi, &pi);
> - pollret = poll(pi.pfds, pi.nr, -1);
> + buf_empty= pfd_info_init(wi, &pi);
> + pollret = poll(pi.pfds, pi.nr, buf_empty ? -1 : 5000);
> if (pollret < 0) {
> if (errno == EINTR)
> goto again;
>
> panic("%m\n");
> + } else if (pollret == 0) {
> + eprintf("poll timeout\n");
> + nr_sent = wi->nr_sent;
> + /* XXX Blinedly close all the connections */
We should add the following code here to avoid false timeout.
if (req->rq.epoch == sys_epoch)
goto again;
And if we add this code here, sheep doesn't return
SD_RES_NETWORK_ERROR wrongly at all. If poll timeout works
efficiently, we don't need to check socket buffer, no?
Thanks,
Kazutaka
> + for (i = 0; i < nr_sent; i++)
> + finish_one_write_err(wi, i);
> +
> + err_ret = SD_RES_NETWORK_ERROR;
> + goto finish_write;
> }
>
> nr_sent = wi->nr_sent;
> --
> 1.7.10.2
>
> --
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog
More information about the sheepdog
mailing list