[Sheepdog] [PATCH v2 1/6] sheep: add transient failure detection
HaiTing Yao
yaohaiting.wujue at gmail.com
Wed May 2 09:17:56 CEST 2012
On Thu, Apr 26, 2012 at 1:20 AM, MORITA Kazutaka <
morita.kazutaka at lab.ntt.co.jp> wrote:
> Hi,
>
> Could you rebase this patchset onto the master branch? Sorry for my
> late review.
>
> checkpatch.pl says that this patchset contains some coding style
> problems. I hope you would fix them in the next patchset. :)
>
> Thanks,
>
> Kazutaka
>
Sorry for the delay. I have rebased the code and submitted a new patch.
I ran checkpatch.pl on the patch and fixed the issues it reported this time.
Thanks
Haiti
>
>
> At Thu, 12 Apr 2012 10:37:58 +0800,
> yaohaiting.wujue at gmail.com wrote:
> >
> > From: HaiTing Yao <wujue.yht at taobao.com>
> >
> > Sometimes we need to allow a node to come back after a short while.
> >
> > When we need this:
> >
> > 1, restart the sheepdog daemon for an upgrade or other purpose
> >
> > 2, the corosync driver loses its token for a short while
> >
> > How to implement this:
> >
> > Treat the node's leave as a transient failure and mark the node's status
> > as failed. Wait a short while for the node to come back. If the time is
> > up and the node has not come back, change the transient failure to a
> > permanent one.
> >
> > This patch contains the changes to the header files.
> >
> > Signed-off-by: HaiTing Yao <wujue.yht at taobao.com>
> > ---
> > include/sheep.h | 12 ++++++++++++
> > sheep/cluster.h | 8 ++++++++
> > sheep/sheep_priv.h | 10 ++++++++++
> > 3 files changed, 30 insertions(+), 0 deletions(-)
> >
> > diff --git a/include/sheep.h b/include/sheep.h
> > index d010fdf..c4166f4 100644
> > --- a/include/sheep.h
> > +++ b/include/sheep.h
> > @@ -65,6 +65,8 @@
> > #define SD_RES_NOT_FORMATTED 0x43 /* Sheepdog is not formatted yet */
> > #define SD_RES_INVALID_CTIME 0x44 /* Creation time of sheepdog is
> different */
> > #define SD_RES_INVALID_EPOCH 0x45 /* Invalid epoch */
> > +#define SD_RES_NODE_COME_BACK 0x46 /* node come back again after
> temprorary failure */
> > +#define SD_RES_CLUSTER_TEMP_FAILURE 0x47 /* cluster has temporarily
> failed node */
> >
> > #define SD_FLAG_NOHALT 0x0004 /* Serve the IO rquest even lack of
> nodes */
> >
> > @@ -144,11 +146,17 @@ struct sd_node_rsp {
> > uint64_t store_free;
> > };
> >
> > +enum node_vnode_status {
> > + NODE_STATUS_NORMAL,
> > + NODE_STATUS_FAIL,
> > +};
> > +
> > struct sd_node {
> > uint8_t addr[16];
> > uint16_t port;
> > uint16_t nr_vnodes;
> > uint32_t zone;
> > + uint32_t status;
> > };
> >
> > struct sd_vnode {
> > @@ -157,6 +165,7 @@ struct sd_vnode {
> > uint16_t port;
> > uint16_t node_idx;
> > uint32_t zone;
> > + uint32_t status;
> > };
> >
> > struct epoch_log {
> > @@ -293,6 +302,7 @@ static inline const char *sd_strerror(int err)
> > {SD_RES_NOT_FORMATTED, "Cluster has not been formatted"},
> > {SD_RES_INVALID_CTIME, "Creation times differ"},
> > {SD_RES_INVALID_EPOCH, "Invalid epoch"},
> > + {SD_RES_CLUSTER_TEMP_FAILURE, "Nodes occur temporary
> failure"},
> > };
> >
> > for (i = 0; i < ARRAY_SIZE(errors); ++i)
> > @@ -366,4 +376,6 @@ static inline int nodes_to_vnodes(struct sd_node
> *nodes, int nr,
> > return nr_vnodes;
> > }
> >
> > +int temp_failure_enabled(void);
> > +
> > #endif
> > diff --git a/sheep/cluster.h b/sheep/cluster.h
> > index b50dbb2..58811e6 100644
> > --- a/sheep/cluster.h
> > +++ b/sheep/cluster.h
> > @@ -29,6 +29,7 @@ enum cluster_join_result {
> > CJ_RES_MASTER_TRANSFER, /* Transfer mastership. The joining
> > * node has a newer epoch, so this node
> > * will leave the cluster (restart later).
> */
> > + CJ_RES_COME_BACK, /* node come back after reboot */
> > };
> >
> > struct cdrv_handlers {
> > @@ -114,6 +115,13 @@ struct cluster_driver {
> > */
> > int (*dispatch)(void);
> >
> > + /*
> > + * remove one useless node
> > + *
> > + * Returns zero on success, -1 on error
> > + */
> > + int (*remove_node)(void *node);
> > +
> > struct list_head list;
> > };
> >
> > diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> > index a9e8440..e18c696 100644
> > --- a/sheep/sheep_priv.h
> > +++ b/sheep/sheep_priv.h
> > @@ -49,6 +49,14 @@ struct cpg_event {
> > struct list_head cpg_event_list;
> > };
> >
> > +#define SD_MAX_NODES_TEMP_FAIL (SD_MAX_NODES - 1)
> > +
> > +struct temp_failure {
> > + int busy;
> > + struct sd_node node;
> > + struct timer leave_timer;
> > +};
> > +
> > struct client_info {
> > struct connection conn;
> >
> > @@ -150,6 +158,8 @@ struct cluster_info {
> > int use_directio;
> > uint8_t sync_flush;
> >
> > + uint32_t templeft_time;
> > +
> > struct work_queue *cpg_wqueue;
> > struct work_queue *gateway_wqueue;
> > struct work_queue *io_wqueue;
> > --
> > 1.7.1
> >
> > --
> > sheepdog mailing list
> > sheepdog at lists.wpkg.org
> > http://lists.wpkg.org/mailman/listinfo/sheepdog
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.wpkg.org/pipermail/sheepdog/attachments/20120502/f81a56a6/attachment-0002.html>
More information about the sheepdog
mailing list