[sheepdog] [PATCH v3 1/2] sheep: handle block/unblock/notify error
Liu Yuan
namei.unix at gmail.com
Tue Jul 9 06:28:45 CEST 2013
On Mon, Jul 08, 2013 at 09:07:14PM -0700, Kai Zhang wrote:
> In group.c, it uses 3 broadcast operations: block, unblock and notify.
> These broadcast operations are implemented by cluster drivers.
> For example, corosync implements them with cpg_mcast_joined(), while zookeeper
> uses sequential nodes.
> They can fail if the network is unavailable for a while.
>
> However, the current group.c doesn't handle errors of block/unblock/notify
> events and just ignores them.
>
> This patch adds a new error SD_RES_CLUSTER_ERROR to indicate these errors.
>
> Signed-off-by: Kai Zhang <kyle at zelin.io>
> ---
> include/sheep.h | 1 +
> include/sheepdog_proto.h | 1 +
> sheep/cluster.h | 10 +++++++---
> sheep/cluster/corosync.c | 17 ++++++++++-------
> sheep/cluster/local.c | 10 +++++++---
> sheep/cluster/shepherd.c | 12 ++++++++----
> sheep/cluster/zookeeper.c | 13 ++++++++-----
> sheep/group.c | 40 +++++++++++++++++++++++++++++++++++-----
> 8 files changed, 77 insertions(+), 27 deletions(-)
>
> diff --git a/include/sheep.h b/include/sheep.h
> index 0d3fae4..3541012 100644
> --- a/include/sheep.h
> +++ b/include/sheep.h
> @@ -204,6 +204,7 @@ static inline const char *sd_strerror(int err)
> [SD_RES_JOIN_FAILED] = "Node has failed to join cluster",
> [SD_RES_HALT] = "IO has halted as there are too few living nodes",
> [SD_RES_READONLY] = "Object is read-only",
> + [SD_RES_CLUSTER_ERROR] = "Cluster error",
>
> /* from internal_proto.h */
> [SD_RES_OLD_NODE_VER] = "Request has an old epoch",
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index 156457a..4e9c84e 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -71,6 +71,7 @@
> #define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */
> #define SD_RES_HALT 0x19 /* Sheepdog is stopped doing IO */
> #define SD_RES_READONLY 0x1A /* Object is read-only */
> +#define SD_RES_CLUSTER_ERROR 0x1B /* Cluster error */
>
> /* errors above 0x80 are sheepdog-internal */
>
> diff --git a/sheep/cluster.h b/sheep/cluster.h
> index a912985..4851290 100644
> --- a/sheep/cluster.h
> +++ b/sheep/cluster.h
> @@ -82,7 +82,7 @@ struct cluster_driver {
> * can be read through sd_notify_handler() and totally ordered with
> * node change events.
> *
> - * Returns zero on success, -1 on error
> + * Returns SD_RES_XXX
> */
> int (*notify)(void *msg, size_t msg_len);
>
> @@ -92,14 +92,18 @@ struct cluster_driver {
> * Once the cluster driver has ensured that events are blocked on all
> * nodes it needs to call sd_block_handler() on the node where ->block
> * was called.
> + *
> + * Returns SD_RES_XXX
> */
> - void (*block)(void);
> + int (*block)(void);
>
> /*
> * Unblock events on all nodes, and send a total order message
> * to all nodes.
> + *
> + * Returns SD_RES_XXX
> */
> - void (*unblock)(void *msg, size_t msg_len);
> + int (*unblock)(void *msg, size_t msg_len);
>
> /* Update the specific node in the driver's private copy of nodes */
> void (*update_node)(struct sd_node *);
> diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
> index bf90209..56c4737 100644
> --- a/sheep/cluster/corosync.c
> +++ b/sheep/cluster/corosync.c
> @@ -704,22 +704,25 @@ static int corosync_leave(void)
> NULL, 0);
> }
>
> -static void corosync_block(void)
> +static int corosync_block(void)
> {
> - send_message(COROSYNC_MSG_TYPE_BLOCK, 0, &this_node, NULL, 0,
> - NULL, 0);
> + return send_message(COROSYNC_MSG_TYPE_BLOCK, 0, &this_node, NULL, 0,
> + NULL, 0) == 0 ?
> + SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR;
> }
>
> -static void corosync_unblock(void *msg, size_t msg_len)
> +static int corosync_unblock(void *msg, size_t msg_len)
> {
> - send_message(COROSYNC_MSG_TYPE_UNBLOCK, 0, &this_node, NULL, 0,
> - msg, msg_len);
> + return send_message(COROSYNC_MSG_TYPE_UNBLOCK, 0, &this_node, NULL, 0,
> + msg, msg_len) == 0 ?
> + SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR;
> }
>
> static int corosync_notify(void *msg, size_t msg_len)
> {
> return send_message(COROSYNC_MSG_TYPE_NOTIFY, 0, &this_node,
> - NULL, 0, msg, msg_len);
> + NULL, 0, msg, msg_len) ?
> + SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR;
> }
Make send_message() return SD_RES_XXX; then we don't need this ternary operation.
>
> static void corosync_handler(int listen_fd, int events, void *data)
> diff --git a/sheep/cluster/local.c b/sheep/cluster/local.c
> index 307a69e..e6fa149 100644
> --- a/sheep/cluster/local.c
> +++ b/sheep/cluster/local.c
> @@ -358,19 +358,21 @@ static int local_notify(void *msg, size_t msg_len)
>
> shm_queue_unlock();
>
> - return 0;
> + return SD_RES_SUCCESS;
> }
>
> -static void local_block(void)
> +static int local_block(void)
> {
> shm_queue_lock();
>
> add_event(EVENT_BLOCK, &this_node, NULL, 0);
>
> shm_queue_unlock();
> +
> + return SD_RES_SUCCESS;
> }
>
> -static void local_unblock(void *msg, size_t msg_len)
> +static int local_unblock(void *msg, size_t msg_len)
> {
> struct local_event *ev;
>
> @@ -384,6 +386,8 @@ static void local_unblock(void *msg, size_t msg_len)
> add_event(EVENT_NOTIFY, &this_node, msg, msg_len);
>
> shm_queue_unlock();
> +
> + return SD_RES_SUCCESS;
> }
>
> /* Returns true if an event is processed */
> diff --git a/sheep/cluster/shepherd.c b/sheep/cluster/shepherd.c
> index fba329c..df8737f 100644
> --- a/sheep/cluster/shepherd.c
> +++ b/sheep/cluster/shepherd.c
> @@ -638,10 +638,11 @@ static int do_shepherd_notify(bool unblock, void *msg, size_t msg_len)
>
> static int shepherd_notify(void *msg, size_t msg_len)
> {
> - return do_shepherd_notify(false, msg, msg_len);
> + return do_shepherd_notify(false, msg, msg_len) == 0 ?
> + SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR;
> }
>
> -static void shepherd_block(void)
> +static int shepherd_block(void)
> {
> int ret;
> struct sph_msg msg;
> @@ -654,11 +655,14 @@ static void shepherd_block(void)
> sd_eprintf("xwrite() failed: %m");
> exit(1);
> }
> +
> + return SD_RES_SUCCESS;
> }
>
> -static void shepherd_unblock(void *msg, size_t msg_len)
> +static int shepherd_unblock(void *msg, size_t msg_len)
> {
> - do_shepherd_notify(true, msg, msg_len);
> + return do_shepherd_notify(true, msg, msg_len) == 0 ?
> + SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR;
> }
>
> /* FIXME: shepherd server also has to udpate node information */
> diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
> index 0ac1677..f9f9511 100644
> --- a/sheep/cluster/zookeeper.c
> +++ b/sheep/cluster/zookeeper.c
> @@ -823,17 +823,20 @@ static int zk_leave(void)
>
> static int zk_notify(void *msg, size_t msg_len)
> {
> - return add_event(EVENT_NOTIFY, &this_node, msg, msg_len);
> + return add_event(EVENT_NOTIFY, &this_node, msg, msg_len) == 0 ?
> + SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR;;
ditto.
Thanks
Yuan
More information about the sheepdog mailing list