[sheepdog] [PATCH] zookeeper: exit program on unrecoverable error
Kai Zhang
kyle at zelin.io
Fri Jun 7 09:40:06 CEST 2013
Hi
In my thought, the set of zk_* APIs are just wrappers for retrying zookeeper operations when retryable errors occur.
They just focus on:
- which error is retryable
- how to retry
And they should not do some other things like 'panic'.
This logic should be left to upper level by returning a zookeeper return code.
Thanks,
Kyle
On Jun 7, 2013, at 1:08 PM, MORITA Kazutaka <morita.kazutaka at gmail.com> wrote:
> From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
>
> This adds panic() where sheep receives unrecoverable error from
> ZooKeeper. It's not good to abort the program on error, but much
> better than destroying membership information on each node. It is
> future work to remove those panic().
>
> This also removes the ZOOAPI modifiers from the function definitions,
> which are not ZooKeeper APIs.
>
> Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> ---
> sheep/cluster/zookeeper.c | 109 ++++++++++++++++++++-------------------------
> 1 file changed, 48 insertions(+), 61 deletions(-)
>
> diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
> index 45db10a..b9759ab 100644
> --- a/sheep/cluster/zookeeper.c
> +++ b/sheep/cluster/zookeeper.c
> @@ -135,19 +135,18 @@ static inline struct zk_node *zk_tree_search(const struct node_id *nid)
> static zhandle_t *zhandle;
> static struct zk_node this_node;
>
> -static inline ZOOAPI int zk_delete_node(const char *path, int version)
> +static inline void zk_delete_node(const char *path, int version)
> {
> int rc;
> do {
> rc = zoo_delete(zhandle, path, version);
> } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
> +
> if (rc != ZOK)
> - sd_eprintf("failed, path:%s, %s", path, zerror(rc));
> - return rc;
> + panic("failed, path:%s, %s", path, zerror(rc));
> }
>
> -static inline ZOOAPI void
> -zk_init_node(const char *path)
> +static inline void zk_init_node(const char *path)
> {
> int rc;
> do {
> @@ -159,7 +158,8 @@ zk_init_node(const char *path)
> panic("failed, path:%s, %s", path, zerror(rc));
> }
>
> -static inline ZOOAPI int
> +/* return zero on success, -1 if path already exists */
> +static inline int
> zk_create_node(const char *path, const char *value, int valuelen,
> const struct ACL_vector *acl, int flags, char *path_buffer,
> int path_buffer_len)
> @@ -168,70 +168,74 @@ zk_create_node(const char *path, const char *value, int valuelen,
> do {
> rc = zoo_create(zhandle, path, value, valuelen, acl,
> flags, path_buffer, path_buffer_len);
> - if (rc != ZOK && rc != ZNODEEXISTS)
> - sd_eprintf("failed, path:%s, %s", path, zerror(rc));
> } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
> - return rc;
> +
> + if (rc != ZOK && rc != ZNODEEXISTS)
> + panic("failed, path:%s, %s", path, zerror(rc));
> +
> + return rc == ZOK ? 0 : -1;
> }
>
> /*
> * Create a znode after adding a unique monotonically increasing sequence number
> * to the path name.
> *
> - * Note that the caller has to retry this function when this returns
> - * ZOPERATIONTIMEOUT or ZCONNECTIONLOSS and the znode is not created.
> + * return zero on success, -1 if it is not sure whether znode is created.
> */
> -static inline ZOOAPI int
> +static inline int
> zk_create_seq_node(const char *path, const char *value, int valuelen,
> char *path_buffer, int path_buffer_len)
> {
> int rc;
> rc = zoo_create(zhandle, path, value, valuelen, &ZOO_OPEN_ACL_UNSAFE,
> ZOO_SEQUENCE, path_buffer, path_buffer_len);
> - if (rc != ZOK)
> - sd_iprintf("failed, path:%s, %s", path, zerror(rc));
> + if (rc != ZOK && rc != ZOPERATIONTIMEOUT && rc != ZCONNECTIONLOSS)
> + panic("failed, path:%s, %s", path, zerror(rc));
>
> - return rc;
> + return rc == ZOK ? 0 : -1;
> }
>
> -static inline ZOOAPI int zk_get_data(const char *path, void *buffer,
> - int *buffer_len)
> +/* return zero on success, -1 if path doesn't exist */
> +static inline int zk_get_data(const char *path, void *buffer, int *buffer_len)
> {
> int rc;
> do {
> rc = zoo_get(zhandle, path, 1, (char *)buffer,
> buffer_len, NULL);
> - if (rc != ZOK)
> - sd_eprintf("failed, path:%s, %s", path, zerror(rc));
> } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
> - return rc;
> +
> + if (rc != ZOK && rc != ZNONODE)
> + panic("failed, path:%s, %s", path, zerror(rc));
> +
> + return rc == ZOK ? 0 : -1;
> }
>
> -static inline ZOOAPI int
> +static inline void
> zk_set_data(const char *path, const char *buffer, int buflen, int version)
> {
> int rc;
> do {
> rc = zoo_set(zhandle, path, buffer, buflen, version);
> } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
> +
> if (rc != ZOK)
> panic("failed, path:%s, %s", path, zerror(rc));
> - return rc;
> }
>
> -static inline ZOOAPI int zk_node_exists(const char *path)
> +static inline bool zk_node_exists(const char *path)
> {
> int rc;
> do {
> rc = zoo_exists(zhandle, path, 1, NULL);
> - if (rc != ZOK && rc != ZNONODE)
> - sd_eprintf("failed, path:%s, %s", path, zerror(rc));
> } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
>
> - return rc;
> + if (rc != ZOK && rc != ZNONODE)
> + panic("failed, path:%s, %s", path, zerror(rc));
> +
> + return rc == ZOK;
> }
>
> -static inline ZOOAPI void zk_get_children(const char *path,
> +static inline void zk_get_children(const char *path,
> struct String_vector *strings)
> {
> int rc;
> @@ -248,16 +252,11 @@ static int32_t queue_pos;
>
> static bool zk_queue_peek(void)
> {
> - int rc;
> char path[MAX_NODE_STR_LEN];
>
> snprintf(path, sizeof(path), QUEUE_ZNODE "/%010"PRId32, queue_pos);
>
> - rc = zk_node_exists(path);
> - if (rc == ZOK)
> - return true;
> -
> - return false;
> + return zk_node_exists(path);
> }
>
> /* return true if there is a node with 'id' in the queue. */
> @@ -271,20 +270,15 @@ static bool zk_find_seq_node(uint64_t id, char *seq_path, int seq_path_len)
> snprintf(seq_path, seq_path_len, QUEUE_ZNODE"/%010"PRId32, seq);
> len = offsetof(typeof(ev), id) + sizeof(ev.id);
> rc = zk_get_data(seq_path, &ev, &len);
> - switch (rc) {
> - case ZOK:
> - if (ev.id == id) {
> - sd_dprintf("id %"PRIx64" is found in %s", id,
> - seq_path);
> - return true;
> - }
> - break;
> - case ZNONODE:
> + if (rc < 0) {
> sd_dprintf("id %"PRIx64" is not found", id);
> return false;
> - default:
> - panic("failed, %s", zerror(rc));
> - break;
> + }
> +
> + if (ev.id == id) {
> + sd_dprintf("id %"PRIx64" is found in %s", id,
> + seq_path);
> + return true;
> }
> }
> }
> @@ -299,19 +293,12 @@ static void zk_queue_push(struct zk_event *ev)
> snprintf(path, sizeof(path), "%s/", QUEUE_ZNODE);
> again:
> rc = zk_create_seq_node(path, (char *)ev, len, buf, sizeof(buf));
> - switch (rc) {
> - case ZOK:
> - /* Success */
> - break;
> - case ZOPERATIONTIMEOUT:
> - case ZCONNECTIONLOSS:
> + if (rc < 0) {
> if (!zk_find_seq_node(ev->id, buf, sizeof(buf)))
> /* retry if seq_node was not created */
> goto again;
> - break;
> - default:
> - panic("failed, path:%s, %s", path, zerror(rc));
> }
> +
> if (first_push) {
> int32_t seq;
>
> @@ -357,8 +344,8 @@ static void zk_queue_pop_advance(struct zk_event *ev)
> len = sizeof(*ev);
> snprintf(path, sizeof(path), QUEUE_ZNODE "/%010"PRId32, queue_pos);
> rc = zk_get_data(path, ev, &len);
> - if (rc != ZOK)
> - panic("failed to get data from %s, %s", path, zerror(rc));
> + if (rc < 0)
> + panic("failed to get data from %s", path);
> sd_dprintf("%s, type:%d, len:%d, pos:%"PRId32, path, ev->type, len,
> queue_pos);
> queue_pos++;
> @@ -553,20 +540,18 @@ static int add_join_event(void *msg, size_t msg_len)
> static int zk_join(const struct sd_node *myself,
> void *opaque, size_t opaque_len)
> {
> - int rc;
> char path[MAX_NODE_STR_LEN];
>
> this_node.node = *myself;
>
> snprintf(path, sizeof(path), MEMBER_ZNODE "/%s", node_to_str(myself));
> - rc = zk_node_exists(path);
> - if (rc == ZOK) {
> + if (zk_node_exists(path)) {
> sd_eprintf("Previous zookeeper session exist, shoot myself.");
> exit(1);
> }
>
> /* For concurrent nodes setup, we allow only one to continue */
> - while (zk_member_empty() && zk_master_create() != ZOK)
> + while (zk_member_empty() && zk_master_create() < 0)
> ;/* wait */
>
> return add_join_event(opaque, opaque_len);
> @@ -631,7 +616,9 @@ static void watch_all_nodes(void)
> return;
>
> FOR_EACH_ZNODE(MEMBER_ZNODE, path, &strs) {
> - zk_get_data(path, &znode, &len);
> + int rc = zk_get_data(path, &znode, &len);
> + if (rc < 0)
> + panic("failed to get data from %s", path);
> }
> }
>
> --
> 1.7.9.5
>
> --
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog
More information about the sheepdog
mailing list