[Sheepdog] [PATCH v2 1/6] sheep: add transient failure detection

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Wed Apr 25 19:20:25 CEST 2012


Hi,

Could you rebase this patchset onto the master branch?  Sorry for my
late review.

checkpatch.pl says that this patchset contains some coding style
problems.  I hope you will fix them in the next patchset. :)

Thanks,

Kazutaka


At Thu, 12 Apr 2012 10:37:58 +0800,
yaohaiting.wujue at gmail.com wrote:
> 
> From: HaiTing Yao <wujue.yht at taobao.com>
> 
> Sometimes we need a node to be able to come back after a short while.
> 
> When we need this:
> 
> 1, restart the sheepdog daemon for an upgrade or another purpose
> 
> 2, the corosync driver loses its token for a short while
> 
> How to implement this:
> 
> Assume the node's departure is a transient failure and mark the node's
> status as failed. Wait a short while for the node to come back. If the
> time is up and the node has not come back, change the transient failure
> to a permanent one.
> 
> This patch contains the header file changes.
> 
> Signed-off-by: HaiTing Yao <wujue.yht at taobao.com>
> ---
>  include/sheep.h    |   12 ++++++++++++
>  sheep/cluster.h    |    8 ++++++++
>  sheep/sheep_priv.h |   10 ++++++++++
>  3 files changed, 30 insertions(+), 0 deletions(-)
> 
> diff --git a/include/sheep.h b/include/sheep.h
> index d010fdf..c4166f4 100644
> --- a/include/sheep.h
> +++ b/include/sheep.h
> @@ -65,6 +65,8 @@
>  #define SD_RES_NOT_FORMATTED 0x43 /* Sheepdog is not formatted yet */
>  #define SD_RES_INVALID_CTIME 0x44 /* Creation time of sheepdog is different */
>  #define SD_RES_INVALID_EPOCH 0x45 /* Invalid epoch */
> +#define SD_RES_NODE_COME_BACK 0x46 /* node come back again after temprorary failure */
> +#define SD_RES_CLUSTER_TEMP_FAILURE 0x47 /* cluster has temporarily failed node */
>  
>  #define SD_FLAG_NOHALT       0x0004 /* Serve the IO rquest even lack of nodes */
>  
> @@ -144,11 +146,17 @@ struct sd_node_rsp {
>  	uint64_t	store_free;
>  };
>  
> +enum node_vnode_status {
> +	NODE_STATUS_NORMAL,
> +	NODE_STATUS_FAIL,
> +};
> +
>  struct sd_node {
>  	uint8_t         addr[16];
>  	uint16_t        port;
>  	uint16_t	nr_vnodes;
>  	uint32_t	zone;
> +	uint32_t	status;
>  };
>  
>  struct sd_vnode {
> @@ -157,6 +165,7 @@ struct sd_vnode {
>  	uint16_t        port;
>  	uint16_t	node_idx;
>  	uint32_t	zone;
> +	uint32_t	status;
>  };
>  
>  struct epoch_log {
> @@ -293,6 +302,7 @@ static inline const char *sd_strerror(int err)
>  		{SD_RES_NOT_FORMATTED, "Cluster has not been formatted"},
>  		{SD_RES_INVALID_CTIME, "Creation times differ"},
>  		{SD_RES_INVALID_EPOCH, "Invalid epoch"},
> +		{SD_RES_CLUSTER_TEMP_FAILURE, "Nodes occur temporary failure"},
>  	};
>  
>  	for (i = 0; i < ARRAY_SIZE(errors); ++i)
> @@ -366,4 +376,6 @@ static inline int nodes_to_vnodes(struct sd_node *nodes, int nr,
>  	return nr_vnodes;
>  }
>  
> +int temp_failure_enabled(void);
> +
>  #endif
> diff --git a/sheep/cluster.h b/sheep/cluster.h
> index b50dbb2..58811e6 100644
> --- a/sheep/cluster.h
> +++ b/sheep/cluster.h
> @@ -29,6 +29,7 @@ enum cluster_join_result {
>  	CJ_RES_MASTER_TRANSFER, /* Transfer mastership.  The joining
>  				 * node has a newer epoch, so this node
>  				 * will leave the cluster (restart later). */
> +	CJ_RES_COME_BACK, /* node come back after reboot */
>  };
>  
>  struct cdrv_handlers {
> @@ -114,6 +115,13 @@ struct cluster_driver {
>  	 */
>  	int (*dispatch)(void);
>  
> +	/*
> +	 * remove one useless node
> +	 *
> +	 * Returns zero on success, -1 on error
> +	 */
> +	int (*remove_node)(void *node);
> +
>  	struct list_head list;
>  };
>  
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index a9e8440..e18c696 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -49,6 +49,14 @@ struct cpg_event {
>  	struct list_head cpg_event_list;
>  };
>  
> +#define SD_MAX_NODES_TEMP_FAIL (SD_MAX_NODES - 1)
> +
> +struct temp_failure {
> +	int busy;
> +	struct sd_node node;
> +	struct timer leave_timer;
> +};
> +
>  struct client_info {
>  	struct connection conn;
>  
> @@ -150,6 +158,8 @@ struct cluster_info {
>  	int use_directio;
>  	uint8_t sync_flush;
>  
> +	uint32_t templeft_time;
> +
>  	struct work_queue *cpg_wqueue;
>  	struct work_queue *gateway_wqueue;
>  	struct work_queue *io_wqueue;
> -- 
> 1.7.1
> 
> -- 
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog



More information about the sheepdog mailing list