[Sheepdog] [PATCH v2 1/6] sheep: add transient failure detection

HaiTing Yao yaohaiting.wujue at gmail.com
Wed May 2 09:17:56 CEST 2012


On Thu, Apr 26, 2012 at 1:20 AM, MORITA Kazutaka <
morita.kazutaka at lab.ntt.co.jp> wrote:

> Hi,
>
> Could you rebase this patchset onto the master branch?  Sorry for my
> late review.
>
> checkpatch.pl says that this patchset contains some coding style
> problems.  I hope you would fix them in the next patchset. :)
>
> Thanks,
>
> Kazutaka
>

Sorry for the delay. I have rebased the code and submitted a new patch.

I ran checkpatch.pl on my patch and fixed the coding style problems this time.

Thanks
Haiti


>
>
> At Thu, 12 Apr 2012 10:37:58 +0800,
>  yaohaiting.wujue at gmail.com wrote:
> >
> > From: HaiTing Yao <wujue.yht at taobao.com>
> >
> > Sometimes a node that leaves the cluster can be expected to come back
> > after a short while.
> >
> > When we need this:
> >
> > 1, restarting the sheepdog daemon for an upgrade or other purposes
> >
> > 2, the corosync driver loses its token for a short while
> >
> > How to implement this:
> >
> > Treat the node's departure as a transient failure and mark the node's
> > status as failed. Wait a short while for the node to come back. If the
> > time is up and the node has not come back, change the transient failure
> > to a permanent one.
> >
> > Here are some changes in header file.
> >
> > Signed-off-by: HaiTing Yao <wujue.yht at taobao.com>
> > ---
> >  include/sheep.h    |   12 ++++++++++++
> >  sheep/cluster.h    |    8 ++++++++
> >  sheep/sheep_priv.h |   10 ++++++++++
> >  3 files changed, 30 insertions(+), 0 deletions(-)
> >
> > diff --git a/include/sheep.h b/include/sheep.h
> > index d010fdf..c4166f4 100644
> > --- a/include/sheep.h
> > +++ b/include/sheep.h
> > @@ -65,6 +65,8 @@
> >  #define SD_RES_NOT_FORMATTED 0x43 /* Sheepdog is not formatted yet */
> >  #define SD_RES_INVALID_CTIME 0x44 /* Creation time of sheepdog is
> different */
> >  #define SD_RES_INVALID_EPOCH 0x45 /* Invalid epoch */
> > +#define SD_RES_NODE_COME_BACK 0x46 /* node come back again after
> temprorary failure */
> > +#define SD_RES_CLUSTER_TEMP_FAILURE 0x47 /* cluster has temporarily
> failed node */
> >
> >  #define SD_FLAG_NOHALT       0x0004 /* Serve the IO rquest even lack of
> nodes */
> >
> > @@ -144,11 +146,17 @@ struct sd_node_rsp {
> >       uint64_t        store_free;
> >  };
> >
> > +enum node_vnode_status {
> > +     NODE_STATUS_NORMAL,
> > +     NODE_STATUS_FAIL,
> > +};
> > +
> >  struct sd_node {
> >       uint8_t         addr[16];
> >       uint16_t        port;
> >       uint16_t        nr_vnodes;
> >       uint32_t        zone;
> > +     uint32_t        status;
> >  };
> >
> >  struct sd_vnode {
> > @@ -157,6 +165,7 @@ struct sd_vnode {
> >       uint16_t        port;
> >       uint16_t        node_idx;
> >       uint32_t        zone;
> > +     uint32_t        status;
> >  };
> >
> >  struct epoch_log {
> > @@ -293,6 +302,7 @@ static inline const char *sd_strerror(int err)
> >               {SD_RES_NOT_FORMATTED, "Cluster has not been formatted"},
> >               {SD_RES_INVALID_CTIME, "Creation times differ"},
> >               {SD_RES_INVALID_EPOCH, "Invalid epoch"},
> > +             {SD_RES_CLUSTER_TEMP_FAILURE, "Nodes occur temporary
> failure"},
> >       };
> >
> >       for (i = 0; i < ARRAY_SIZE(errors); ++i)
> > @@ -366,4 +376,6 @@ static inline int nodes_to_vnodes(struct sd_node
> *nodes, int nr,
> >       return nr_vnodes;
> >  }
> >
> > +int temp_failure_enabled(void);
> > +
> >  #endif
> > diff --git a/sheep/cluster.h b/sheep/cluster.h
> > index b50dbb2..58811e6 100644
> > --- a/sheep/cluster.h
> > +++ b/sheep/cluster.h
> > @@ -29,6 +29,7 @@ enum cluster_join_result {
> >       CJ_RES_MASTER_TRANSFER, /* Transfer mastership.  The joining
> >                                * node has a newer epoch, so this node
> >                                * will leave the cluster (restart later).
> */
> > +     CJ_RES_COME_BACK, /* node come back after reboot */
> >  };
> >
> >  struct cdrv_handlers {
> > @@ -114,6 +115,13 @@ struct cluster_driver {
> >        */
> >       int (*dispatch)(void);
> >
> > +     /*
> > +      * remove one useless node
> > +      *
> > +      * Returns zero on success, -1 on error
> > +      */
> > +     int (*remove_node)(void *node);
> > +
> >       struct list_head list;
> >  };
> >
> > diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> > index a9e8440..e18c696 100644
> > --- a/sheep/sheep_priv.h
> > +++ b/sheep/sheep_priv.h
> > @@ -49,6 +49,14 @@ struct cpg_event {
> >       struct list_head cpg_event_list;
> >  };
> >
> > +#define SD_MAX_NODES_TEMP_FAIL (SD_MAX_NODES - 1)
> > +
> > +struct temp_failure {
> > +     int busy;
> > +     struct sd_node node;
> > +     struct timer leave_timer;
> > +};
> > +
> >  struct client_info {
> >       struct connection conn;
> >
> > @@ -150,6 +158,8 @@ struct cluster_info {
> >       int use_directio;
> >       uint8_t sync_flush;
> >
> > +     uint32_t templeft_time;
> > +
> >       struct work_queue *cpg_wqueue;
> >       struct work_queue *gateway_wqueue;
> >       struct work_queue *io_wqueue;
> > --
> > 1.7.1
> >
> > --
> > sheepdog mailing list
> > sheepdog at lists.wpkg.org
> > http://lists.wpkg.org/mailman/listinfo/sheepdog
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.wpkg.org/pipermail/sheepdog/attachments/20120502/f81a56a6/attachment-0002.html>


More information about the sheepdog mailing list