[sheepdog] [PATCH v2 3/4] sheep: handle block/unblock/notify error
MORITA Kazutaka
morita.kazutaka at gmail.com
Tue Jul 9 02:43:07 CEST 2013
At Sun, 7 Jul 2013 21:20:50 -0700,
Kai Zhang wrote:
>
> In group.c, it uses 3 broadcast operations: block, unblock and notify.
> These broadcast operations are implemented by cluster drivers.
> For example, corosync implements it by cpg_mcast_joined() while zookeeper by
> sequential node.
> And they can fail if network is unavailable for a while.
>
> However, current group.c doesn't handle errors of block/unblock/notify events
> and just ignore them.
>
> This patch add a new error SD_RES_CLUSTER_ERROR to indicate these errors.
>
> Signed-off-by: Kai Zhang <kyle at zelin.io>
> ---
> include/sheep.h | 1 +
> include/sheepdog_proto.h | 1 +
> sheep/cluster.h | 4 ++--
> sheep/cluster/corosync.c | 10 +++++-----
> sheep/cluster/local.c | 8 ++++++--
> sheep/cluster/shepherd.c | 8 +++++---
> sheep/cluster/zookeeper.c | 8 ++++----
> sheep/group.c | 27 ++++++++++++++++++++++-----
> 8 files changed, 46 insertions(+), 21 deletions(-)
>
> diff --git a/include/sheep.h b/include/sheep.h
> index 0d3fae4..3541012 100644
> --- a/include/sheep.h
> +++ b/include/sheep.h
> @@ -204,6 +204,7 @@ static inline const char *sd_strerror(int err)
> [SD_RES_JOIN_FAILED] = "Node has failed to join cluster",
> [SD_RES_HALT] = "IO has halted as there are too few living nodes",
> [SD_RES_READONLY] = "Object is read-only",
> + [SD_RES_CLUSTER_ERROR] = "Cluster error",
>
> /* from internal_proto.h */
> [SD_RES_OLD_NODE_VER] = "Request has an old epoch",
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index 156457a..4e9c84e 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -71,6 +71,7 @@
> #define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */
> #define SD_RES_HALT 0x19 /* Sheepdog is stopped doing IO */
> #define SD_RES_READONLY 0x1A /* Object is read-only */
> +#define SD_RES_CLUSTER_ERROR 0x1B /* Cluster error */
>
> /* errors above 0x80 are sheepdog-internal */
>
> diff --git a/sheep/cluster.h b/sheep/cluster.h
> index a912985..16a09b8 100644
> --- a/sheep/cluster.h
> +++ b/sheep/cluster.h
> @@ -93,13 +93,13 @@ struct cluster_driver {
> * nodes it needs to call sd_block_handler() on the node where ->block
> * was called.
> */
> - void (*block)(void);
> + int (*block)(void);
>
> /*
> * Unblock events on all nodes, and send a total order message
> * to all nodes.
> */
> - void (*unblock)(void *msg, size_t msg_len);
> + int (*unblock)(void *msg, size_t msg_len);
>
> /* Update the specific node in the driver's private copy of nodes */
> void (*update_node)(struct sd_node *);
> diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
> index bf90209..8db12cf 100644
> --- a/sheep/cluster/corosync.c
> +++ b/sheep/cluster/corosync.c
> @@ -704,16 +704,16 @@ static int corosync_leave(void)
> NULL, 0);
> }
>
> -static void corosync_block(void)
> +static int corosync_block(void)
> {
> - send_message(COROSYNC_MSG_TYPE_BLOCK, 0, &this_node, NULL, 0,
> + return send_message(COROSYNC_MSG_TYPE_BLOCK, 0, &this_node, NULL, 0,
> NULL, 0);
> }
>
> -static void corosync_unblock(void *msg, size_t msg_len)
> +static int corosync_unblock(void *msg, size_t msg_len)
> {
> - send_message(COROSYNC_MSG_TYPE_UNBLOCK, 0, &this_node, NULL, 0,
> - msg, msg_len);
> + return send_message(COROSYNC_MSG_TYPE_UNBLOCK, 0, &this_node, NULL, 0,
> + msg, msg_len);
> }
I think block/unblock/notify should return SD_RES_xxx since they know
the error reason rather than the callers. Currently, all the
callbacks only can return SD_RES_CLUSTER_ERROR or SD_RES_SUCCESS,
though.
> diff --git a/sheep/group.c b/sheep/group.c
> index 83c3445..2fa4091 100644
> --- a/sheep/group.c
> +++ b/sheep/group.c
> @@ -254,7 +254,16 @@ static void cluster_op_done(struct work *work)
> sd_dprintf("%s (%p)", op_name(req->op), req);
>
> msg = prepare_cluster_msg(req, &size);
> - sys->cdrv->unblock(msg, size);
> +
> + if (sys->cdrv->unblock(msg, size) != 0) {
> + /*
> + * Failed to unblock, shoot myself to let other sheep
> + * unblock the event.
> + * FIXME: handle it gracefully.
> + */
> + sd_eprintf("Failed to unblock, exiting.");
The log level should be SDOG_EMERG.
> + exit(1);
> + }
>
> free(msg);
> }
> @@ -301,22 +310,30 @@ void queue_cluster_request(struct request *req)
> sd_dprintf("%s (%p)", op_name(req->op), req);
>
> if (has_process_work(req->op)) {
> + if (sys->cdrv->block() != 0)
I'd suggest printing an error message here.
> + goto error;
> list_add_tail(&req->pending_list,
> main_thread_get(pending_block_list));
> - sys->cdrv->block();
> +
> } else {
> struct vdi_op_message *msg;
> size_t size;
>
> msg = prepare_cluster_msg(req, &size);
> + msg->rsp.result = SD_RES_SUCCESS;
> + if (sys->cdrv->notify(msg, size) != 0)
Print an error message here.
> + goto error;
> +
> list_add_tail(&req->pending_list,
> main_thread_get(pending_notify_list));
Thanks,
Kazutaka
More information about the sheepdog
mailing list