[sheepdog] [PATCH v3 1/2] sheep: handle block/unblock/notify error

Liu Yuan namei.unix at gmail.com
Tue Jul 9 06:28:45 CEST 2013


On Mon, Jul 08, 2013 at 09:07:14PM -0700, Kai Zhang wrote:
> In group.c, it uses 3 broadcast operations: block, unblock and notify.
> These broadcast operations are implemented by cluster drivers.
> For example, corosync implements them with cpg_mcast_joined(), while
> zookeeper uses sequential nodes.
> They can fail if the network is unavailable for a while.
> 
> However, the current group.c doesn't handle errors from block/unblock/notify
> events and just ignores them.
> 
> This patch adds a new error, SD_RES_CLUSTER_ERROR, to indicate these errors.
> 
> Signed-off-by: Kai Zhang <kyle at zelin.io>
> ---
>  include/sheep.h           |    1 +
>  include/sheepdog_proto.h  |    1 +
>  sheep/cluster.h           |   10 +++++++---
>  sheep/cluster/corosync.c  |   17 ++++++++++-------
>  sheep/cluster/local.c     |   10 +++++++---
>  sheep/cluster/shepherd.c  |   12 ++++++++----
>  sheep/cluster/zookeeper.c |   13 ++++++++-----
>  sheep/group.c             |   40 +++++++++++++++++++++++++++++++++++-----
>  8 files changed, 77 insertions(+), 27 deletions(-)
> 
> diff --git a/include/sheep.h b/include/sheep.h
> index 0d3fae4..3541012 100644
> --- a/include/sheep.h
> +++ b/include/sheep.h
> @@ -204,6 +204,7 @@ static inline const char *sd_strerror(int err)
>  		[SD_RES_JOIN_FAILED] = "Node has failed to join cluster",
>  		[SD_RES_HALT] = "IO has halted as there are too few living nodes",
>  		[SD_RES_READONLY] = "Object is read-only",
> +		[SD_RES_CLUSTER_ERROR] = "Cluster error",
>  
>  		/* from internal_proto.h */
>  		[SD_RES_OLD_NODE_VER] = "Request has an old epoch",
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index 156457a..4e9c84e 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -71,6 +71,7 @@
>  #define SD_RES_JOIN_FAILED   0x18 /* Target node had failed to join sheepdog */
>  #define SD_RES_HALT          0x19 /* Sheepdog is stopped doing IO */
>  #define SD_RES_READONLY      0x1A /* Object is read-only */
> +#define SD_RES_CLUSTER_ERROR 0x1B /* Cluster error */
>  
>  /* errors above 0x80 are sheepdog-internal */
>  
> diff --git a/sheep/cluster.h b/sheep/cluster.h
> index a912985..4851290 100644
> --- a/sheep/cluster.h
> +++ b/sheep/cluster.h
> @@ -82,7 +82,7 @@ struct cluster_driver {
>  	 * can be read through sd_notify_handler() and totally ordered with
>  	 * node change events.
>  	 *
> -	 * Returns zero on success, -1 on error
> +	 * Returns SD_RES_XXX
>  	 */
>  	int (*notify)(void *msg, size_t msg_len);
>  
> @@ -92,14 +92,18 @@ struct cluster_driver {
>  	 * Once the cluster driver has ensured that events are blocked on all
>  	 * nodes it needs to call sd_block_handler() on the node where ->block
>  	 * was called.
> +	 *
> +	 * Returns SD_RES_XXX
>  	 */
> -	void (*block)(void);
> +	int (*block)(void);
>  
>  	/*
>  	 * Unblock events on all nodes, and send a total order message
>  	 * to all nodes.
> +	 *
> +	 * Returns SD_RES_XXX
>  	 */
> -	void (*unblock)(void *msg, size_t msg_len);
> +	int (*unblock)(void *msg, size_t msg_len);
>  
>  	/* Update the specific node in the driver's private copy of nodes */
>  	void (*update_node)(struct sd_node *);
> diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
> index bf90209..56c4737 100644
> --- a/sheep/cluster/corosync.c
> +++ b/sheep/cluster/corosync.c
> @@ -704,22 +704,25 @@ static int corosync_leave(void)
>  			    NULL, 0);
>  }
>  
> -static void corosync_block(void)
> +static int corosync_block(void)
>  {
> -	send_message(COROSYNC_MSG_TYPE_BLOCK, 0, &this_node, NULL, 0,
> -			    NULL, 0);
> +	return send_message(COROSYNC_MSG_TYPE_BLOCK, 0, &this_node, NULL, 0,
> +			    NULL, 0) == 0 ?
> +		SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR;
>  }
>  
> -static void corosync_unblock(void *msg, size_t msg_len)
> +static int corosync_unblock(void *msg, size_t msg_len)
>  {
> -	send_message(COROSYNC_MSG_TYPE_UNBLOCK, 0, &this_node, NULL, 0,
> -		     msg, msg_len);
> +	return send_message(COROSYNC_MSG_TYPE_UNBLOCK, 0, &this_node, NULL, 0,
> +			    msg, msg_len) == 0 ?
> +		SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR;
>  }
>  
>  static int corosync_notify(void *msg, size_t msg_len)
>  {
>  	return send_message(COROSYNC_MSG_TYPE_NOTIFY, 0, &this_node,
> -			   NULL, 0, msg, msg_len);
> +			   NULL, 0, msg, msg_len) ?
> +		SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR;
>  }

Make send_message() return SD_RES_XXX; then we don't need this ternary operation.

>  
>  static void corosync_handler(int listen_fd, int events, void *data)
> diff --git a/sheep/cluster/local.c b/sheep/cluster/local.c
> index 307a69e..e6fa149 100644
> --- a/sheep/cluster/local.c
> +++ b/sheep/cluster/local.c
> @@ -358,19 +358,21 @@ static int local_notify(void *msg, size_t msg_len)
>  
>  	shm_queue_unlock();
>  
> -	return 0;
> +	return SD_RES_SUCCESS;
>  }
>  
> -static void local_block(void)
> +static int local_block(void)
>  {
>  	shm_queue_lock();
>  
>  	add_event(EVENT_BLOCK, &this_node, NULL, 0);
>  
>  	shm_queue_unlock();
> +
> +	return SD_RES_SUCCESS;
>  }
>  
> -static void local_unblock(void *msg, size_t msg_len)
> +static int local_unblock(void *msg, size_t msg_len)
>  {
>  	struct local_event *ev;
>  
> @@ -384,6 +386,8 @@ static void local_unblock(void *msg, size_t msg_len)
>  	add_event(EVENT_NOTIFY, &this_node, msg, msg_len);
>  
>  	shm_queue_unlock();
> +
> +	return SD_RES_SUCCESS;
>  }
>  
>  /* Returns true if an event is processed */
> diff --git a/sheep/cluster/shepherd.c b/sheep/cluster/shepherd.c
> index fba329c..df8737f 100644
> --- a/sheep/cluster/shepherd.c
> +++ b/sheep/cluster/shepherd.c
> @@ -638,10 +638,11 @@ static int do_shepherd_notify(bool unblock, void *msg, size_t msg_len)
>  
>  static int shepherd_notify(void *msg, size_t msg_len)
>  {
> -	return do_shepherd_notify(false, msg, msg_len);
> +	return do_shepherd_notify(false, msg, msg_len) == 0 ?
> +		SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR;
>  }
>  
> -static void shepherd_block(void)
> +static int shepherd_block(void)
>  {
>  	int ret;
>  	struct sph_msg msg;
> @@ -654,11 +655,14 @@ static void shepherd_block(void)
>  		sd_eprintf("xwrite() failed: %m");
>  		exit(1);
>  	}
> +
> +	return SD_RES_SUCCESS;
>  }
>  
> -static void shepherd_unblock(void *msg, size_t msg_len)
> +static int shepherd_unblock(void *msg, size_t msg_len)
>  {
> -	do_shepherd_notify(true, msg, msg_len);
> +	return do_shepherd_notify(true, msg, msg_len) == 0 ?
> +		SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR;
>  }
>  
>  /* FIXME: shepherd server also has to udpate node information */
> diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
> index 0ac1677..f9f9511 100644
> --- a/sheep/cluster/zookeeper.c
> +++ b/sheep/cluster/zookeeper.c
> @@ -823,17 +823,20 @@ static int zk_leave(void)
>  
>  static int zk_notify(void *msg, size_t msg_len)
>  {
> -	return add_event(EVENT_NOTIFY, &this_node, msg, msg_len);
> +	return add_event(EVENT_NOTIFY, &this_node, msg, msg_len) == 0 ?
> +		SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR;;

Ditto: make add_event() return SD_RES_XXX so the ternary isn't needed here either.

Thanks
Yuan



More information about the sheepdog mailing list