[Sheepdog] [RFC PATCH] sheep: add client side timeout support for socket

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Sat Nov 26 11:51:37 CET 2011


At Sat, 26 Nov 2011 17:53:10 +0800,
Yibin Shen wrote:
> 
> oops, I found a regression with this patch
> 
> Nov 26 11:19:13 store_queue_request(936) 3, 3, 412ca6000022d8 , 10
> Nov 26 11:19:13 forward_write_obj_req(368) 412ca6000022d8
> Nov 26 11:19:13 store_queue_request_local(843) 3, 412ca6000022d8 , 10
> Nov 26 11:19:43 store_queue_request(967) failed: 3, 3, 412ca6000022d8 , 10, 3
> Nov 26 11:19:43 io_op_done(147) leaving sheepdog cluster
> Nov 26 11:19:43 sd_leave_handler(1291) network partition bug: this
> sheep should have exited
> Nov 26 11:19:43 log_sigsegv(358) logger pid 9654 exiting abnormally
> 
> 
> e.g.: if an object has 3 copies and is hashed to (local, node A, node B),
> then in a write operation, if node A leaves the cluster, IO towards node A
> will time out after 30 sec,

How about using TCP keepalive?  I think it is much simpler.  Example
code is at the link below:
  https://github.com/collie/accord/blob/master/lib/net.c#L299

> but we use a strong consistency model, so the return value of
> store_queue_request() will be set to SD_RES_EIO,

Set SD_RES_NETWORK_ERROR instead, and the request will be retried
automatically after the epoch is incremented.

Thanks,

Kazutaka

> then the io_op_done() function (sdnet.c) will call leave_cluster().
> 
> 144        } else if (is_access_local(req->entry, req->nr_vnodes,
> 145                                   ((struct sd_obj_req
> *)&req->rq)->oid, copies) &&
> 146                   req->rp.result == SD_RES_EIO) {
> 147                eprintf("leaving sheepdog cluster\n");
> 148                leave_cluster();
> 
> IMO, maybe we should:
> 1) split store_queue_request() into multiple works.
> 2) replace strong consistency with eventual consistency or causal consistency.
> 
> any comments?
> 
> thanks
> 
> On Fri, Nov 25, 2011 at 7:28 PM,  <zituan at taobao.com> wrote:
> > From: Yibin Shen <zituan at taobao.com>
> >
> > Currently, sheep uses an infinite timeout with poll(), which can cause
> > problems: e.g. if node A leaves the sheep cluster abnormally, then
> > in-flight IOs toward node A may never be terminated, so the related
> > confchg event cannot be handled.
> >
> > This patch also changes forward_read_obj_req() from a blocking read
> > to a non-blocking read with poll().
> >
> > Signed-off-by: Yibin Shen <zituan at taobao.com>
> > ---
> >  include/net.h |    2 +
> >  sheep/store.c |   68 ++++++++++++++++++++++++++++++++++++++++++++++++++------
> >  2 files changed, 62 insertions(+), 8 deletions(-)
> >
> > diff --git a/include/net.h b/include/net.h
> > index 9e51fea..a0aed3b 100644
> > --- a/include/net.h
> > +++ b/include/net.h
> > @@ -3,6 +3,8 @@
> >
> >  #include <sys/socket.h>
> >
> > +#define DEFAULT_POLL_TIMEOUT 30000
> > +
> >  enum conn_state {
> >        C_IO_HEADER = 0,
> >        C_IO_DATA_INIT,
> > diff --git a/sheep/store.c b/sheep/store.c
> > index d4c3f27..22feda4 100644
> > --- a/sheep/store.c
> > +++ b/sheep/store.c
> > @@ -243,14 +243,14 @@ static int do_local_io(struct request *req, uint32_t epoch);
> >
> >  static int forward_read_obj_req(struct request *req, int idx)
> >  {
> > -       int i, n, nr, fd, ret;
> > +       int i, n, nr, fd, ret, pollret;
> >        unsigned wlen, rlen;
> >        struct sd_obj_req hdr = *(struct sd_obj_req *)&req->rq;
> >        struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr;
> >        struct sheepdog_vnode_list_entry *e;
> >        uint64_t oid = hdr.oid;
> >        int copies;
> > -
> > +       struct pollfd pfds;
> >        e = req->entry;
> >        nr = req->nr_vnodes;
> >
> > @@ -285,21 +285,69 @@ static int forward_read_obj_req(struct request *req, int idx)
> >        wlen = 0;
> >        rlen = hdr.data_length;
> >
> > -       ret = exec_req(fd, (struct sd_req *)&hdr, req->data, &wlen, &rlen);
> > +       ret = send_req(fd, (struct sd_req *)&hdr, req->data, &wlen);
> > +       if (ret) { /* network errors */
> > +               ret = SD_RES_NETWORK_ERROR;
> > +               dprintf("fail %"PRIu32"\n", ret);
> > +               goto out;
> > +       }
> > +
> > +       pfds.fd = fd;
> > +       pfds.events = POLLIN;
> > +       ret = SD_RES_SUCCESS;
> > +
> > + poll_again:
> > +       /*FIXME: what timout value is better? */
> > +       pollret= poll(&pfds, 1, DEFAULT_POLL_TIMEOUT);
> > +
> > +       if (pollret <  0) {
> > +               if (errno == EINTR)
> > +                       goto poll_again;
> > +               ret = SD_RES_EIO;
> > +       } else if (pollret == 0) {
> > +               ret = SD_RES_EIO;
> > +               goto out;
> > +       }
> > +
> > +       if (pfds.fd < 0 || !(pfds.revents & POLLIN)) {
> > +               ret = SD_RES_NETWORK_ERROR;
> > +               goto out;
> > +       }
> >
> > -       if (ret) /* network errors */
> > +       ret = do_read(pfds.fd, rsp, sizeof(*rsp));
> > +       if (ret) {
> >                ret = SD_RES_NETWORK_ERROR;
> > -       else {
> > -               memcpy(&req->rp, rsp, sizeof(*rsp));
> > +               vprintf(SDOG_INFO, "failed to read a response: %m\n");
> > +               goto out;
> > +       }
> > +
> > +       /* read the extra data */
> > +       if (rlen > rsp->data_length)
> > +               rlen = rsp->data_length;
> > +
> > +       if (rlen) {
> > +               ret = do_read(pfds.fd, req->data, rlen);
> > +               if (ret) {
> > +                       ret = SD_RES_NETWORK_ERROR;
> > +                       vprintf(SDOG_INFO, "failed to read the response data: %m\n");
> > +                       goto out;
> > +               }
> > +       }
> > +
> > +       if (rsp->result != SD_RES_SUCCESS) {
> > +               eprintf("fail %"PRIu32"\n", rsp->result);
> >                ret = rsp->result;
> > +               goto out;
> >        }
> > +
> > +       memcpy(&req->rp, rsp, sizeof(*rsp));
> >  out:
> >        return ret;
> >  }
> >
> >  static int forward_write_obj_req(struct request *req, int idx)
> >  {
> > -       int i, n, nr, fd, ret;
> > +       int i, n, nr, fd, ret, pollret;
> >        unsigned wlen;
> >        char name[128];
> >        struct sd_obj_req hdr = *(struct sd_obj_req *)&req->rq;
> > @@ -377,11 +425,15 @@ static int forward_write_obj_req(struct request *req, int idx)
> >
> >        ret = SD_RES_SUCCESS;
> >  again:
> > -       if (poll(pfds, nr_fds, -1) < 0) {
> > +       pollret = poll(pfds, nr_fds, DEFAULT_POLL_TIMEOUT);
> > +       if (pollret < 0) {
> >                if (errno == EINTR)
> >                        goto again;
> >
> >                ret = SD_RES_EIO;
> > +       } else if ( pollret == 0 ) {/* poll time out */
> > +               ret = SD_RES_EIO;
> > +               goto out;
> >        }
> >
> >        for (i = 0; i < nr_fds; i++) {
> > --
> > 1.7.7.3
> >
> >
> 
> ________________________________
> 
> This email (including any attachments) is confidential and may be legally privileged. If you received this email in error, please delete it immediately and do not copy it or use it for any purpose or disclose its contents to any other person. Thank you.
> 
> 本电邮(包括任何附件)可能含有机密资料并受法律保护。如您不是正确的收件人,请您立即删除本邮件。请不要将本电邮进行复制并用作任何其他用途、或透露本邮件之内容。谢谢。
> -- 
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog



More information about the sheepdog mailing list