[sheepdog] [PATCH] zookeeper: exit program on unrecoverable error

Kai Zhang kyle at zelin.io
Fri Jun 7 09:40:06 CEST 2013


Hi

In my thought, the set of zk_* APIs are just wrappers for retrying zookeeper operations when retryable errors occur.
They just focus on:
- which error is retryable
- how to retry

And they should not do some other things like 'panic'.

This logic should be left to upper level by returning a zookeeper return code.

Thanks,
Kyle


On Jun 7, 2013, at 1:08 PM, MORITA Kazutaka <morita.kazutaka at gmail.com> wrote:

> From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> 
> This adds panic() where sheep receives unrecoverable error from
> ZooKeeper.  It's not good to abort the program on error, but much
> better than destroying membership information on each node.  It is
> future work to remove those panic().
> 
> This also removes the ZOOAPI modifiers from the function definitions,
> which are not ZooKeeper APIs.
> 
> Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> ---
> sheep/cluster/zookeeper.c |  109 ++++++++++++++++++++-------------------------
> 1 file changed, 48 insertions(+), 61 deletions(-)
> 
> diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
> index 45db10a..b9759ab 100644
> --- a/sheep/cluster/zookeeper.c
> +++ b/sheep/cluster/zookeeper.c
> @@ -135,19 +135,18 @@ static inline struct zk_node *zk_tree_search(const struct node_id *nid)
> static zhandle_t *zhandle;
> static struct zk_node this_node;
> 
> -static inline ZOOAPI int zk_delete_node(const char *path, int version)
> +static inline void zk_delete_node(const char *path, int version)
> {
> 	int rc;
> 	do {
> 		rc = zoo_delete(zhandle, path, version);
> 	} while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
> +
> 	if (rc != ZOK)
> -		sd_eprintf("failed, path:%s, %s", path, zerror(rc));
> -	return rc;
> +		panic("failed, path:%s, %s", path, zerror(rc));
> }
> 
> -static inline ZOOAPI void
> -zk_init_node(const char *path)
> +static inline void zk_init_node(const char *path)
> {
> 	int rc;
> 	do {
> @@ -159,7 +158,8 @@ zk_init_node(const char *path)
> 		panic("failed, path:%s, %s", path, zerror(rc));
> }
> 
> -static inline ZOOAPI int
> +/* return zero on success, -1 if path already exists */
> +static inline int
> zk_create_node(const char *path, const char *value, int valuelen,
> 	       const struct ACL_vector *acl, int flags, char *path_buffer,
> 	       int path_buffer_len)
> @@ -168,70 +168,74 @@ zk_create_node(const char *path, const char *value, int valuelen,
> 	do {
> 		rc = zoo_create(zhandle, path, value, valuelen, acl,
> 				flags, path_buffer, path_buffer_len);
> -		if (rc != ZOK && rc != ZNODEEXISTS)
> -			sd_eprintf("failed, path:%s, %s", path, zerror(rc));
> 	} while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
> -	return rc;
> +
> +	if (rc != ZOK && rc != ZNODEEXISTS)
> +		panic("failed, path:%s, %s", path, zerror(rc));
> +
> +	return rc == ZOK ? 0 : -1;
> }
> 
> /*
>  * Create a znode after adding a unique monotonically increasing sequence number
>  * to the path name.
>  *
> - * Note that the caller has to retry this function when this returns
> - * ZOPERATIONTIMEOUT or ZCONNECTIONLOSS and the znode is not created.
> + * return zero on success, -1 if it is not sure whether znode is created.
>  */
> -static inline ZOOAPI int
> +static inline int
> zk_create_seq_node(const char *path, const char *value, int valuelen,
> 		   char *path_buffer, int path_buffer_len)
> {
> 	int rc;
> 	rc = zoo_create(zhandle, path, value, valuelen, &ZOO_OPEN_ACL_UNSAFE,
> 			ZOO_SEQUENCE, path_buffer, path_buffer_len);
> -	if (rc != ZOK)
> -		sd_iprintf("failed, path:%s, %s", path, zerror(rc));
> +	if (rc != ZOK && rc != ZOPERATIONTIMEOUT && rc != ZCONNECTIONLOSS)
> +		panic("failed, path:%s, %s", path, zerror(rc));
> 
> -	return rc;
> +	return rc == ZOK ? 0 : -1;
> }
> 
> -static inline ZOOAPI int zk_get_data(const char *path, void *buffer,
> -				     int *buffer_len)
> +/* return zero on success, -1 if path doesn't exist */
> +static inline int zk_get_data(const char *path, void *buffer, int *buffer_len)
> {
> 	int rc;
> 	do {
> 		rc = zoo_get(zhandle, path, 1, (char *)buffer,
> 			     buffer_len, NULL);
> -		if (rc != ZOK)
> -			sd_eprintf("failed, path:%s, %s", path, zerror(rc));
> 	} while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
> -	return rc;
> +
> +	if (rc != ZOK && rc != ZNONODE)
> +		panic("failed, path:%s, %s", path, zerror(rc));
> +
> +	return rc == ZOK ? 0 : -1;
> }
> 
> -static inline ZOOAPI int
> +static inline void
> zk_set_data(const char *path, const char *buffer, int buflen, int version)
> {
> 	int rc;
> 	do {
> 		rc = zoo_set(zhandle, path, buffer, buflen, version);
> 	} while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
> +
> 	if (rc != ZOK)
> 		panic("failed, path:%s, %s", path, zerror(rc));
> -	return rc;
> }
> 
> -static inline ZOOAPI int zk_node_exists(const char *path)
> +static inline bool zk_node_exists(const char *path)
> {
> 	int rc;
> 	do {
> 		rc = zoo_exists(zhandle, path, 1, NULL);
> -		if (rc != ZOK && rc != ZNONODE)
> -			sd_eprintf("failed, path:%s, %s", path, zerror(rc));
> 	} while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
> 
> -	return rc;
> +	if (rc != ZOK && rc != ZNONODE)
> +		panic("failed, path:%s, %s", path, zerror(rc));
> +
> +	return rc == ZOK;
> }
> 
> -static inline ZOOAPI void zk_get_children(const char *path,
> +static inline void zk_get_children(const char *path,
> 					  struct String_vector *strings)
> {
> 	int rc;
> @@ -248,16 +252,11 @@ static int32_t queue_pos;
> 
> static bool zk_queue_peek(void)
> {
> -	int rc;
> 	char path[MAX_NODE_STR_LEN];
> 
> 	snprintf(path, sizeof(path), QUEUE_ZNODE "/%010"PRId32, queue_pos);
> 
> -	rc = zk_node_exists(path);
> -	if (rc == ZOK)
> -		return true;
> -
> -	return false;
> +	return zk_node_exists(path);
> }
> 
> /* return true if there is a node with 'id' in the queue. */
> @@ -271,20 +270,15 @@ static bool zk_find_seq_node(uint64_t id, char *seq_path, int seq_path_len)
> 		snprintf(seq_path, seq_path_len, QUEUE_ZNODE"/%010"PRId32, seq);
> 		len = offsetof(typeof(ev), id) + sizeof(ev.id);
> 		rc = zk_get_data(seq_path, &ev, &len);
> -		switch (rc) {
> -		case ZOK:
> -			if (ev.id == id) {
> -				sd_dprintf("id %"PRIx64" is found in %s", id,
> -					   seq_path);
> -				return true;
> -			}
> -			break;
> -		case ZNONODE:
> +		if (rc < 0) {
> 			sd_dprintf("id %"PRIx64" is not found", id);
> 			return false;
> -		default:
> -			panic("failed, %s", zerror(rc));
> -			break;
> +		}
> +
> +		if (ev.id == id) {
> +			sd_dprintf("id %"PRIx64" is found in %s", id,
> +				   seq_path);
> +			return true;
> 		}
> 	}
> }
> @@ -299,19 +293,12 @@ static void zk_queue_push(struct zk_event *ev)
> 	snprintf(path, sizeof(path), "%s/", QUEUE_ZNODE);
> again:
> 	rc = zk_create_seq_node(path, (char *)ev, len, buf, sizeof(buf));
> -	switch (rc) {
> -	case ZOK:
> -		/* Success */
> -		break;
> -	case ZOPERATIONTIMEOUT:
> -	case ZCONNECTIONLOSS:
> +	if (rc < 0) {
> 		if (!zk_find_seq_node(ev->id, buf, sizeof(buf)))
> 			/* retry if seq_node was not created */
> 			goto again;
> -		break;
> -	default:
> -		panic("failed, path:%s, %s", path, zerror(rc));
> 	}
> +
> 	if (first_push) {
> 		int32_t seq;
> 
> @@ -357,8 +344,8 @@ static void zk_queue_pop_advance(struct zk_event *ev)
> 	len = sizeof(*ev);
> 	snprintf(path, sizeof(path), QUEUE_ZNODE "/%010"PRId32, queue_pos);
> 	rc = zk_get_data(path, ev, &len);
> -	if (rc != ZOK)
> -		panic("failed to get data from %s, %s", path, zerror(rc));
> +	if (rc < 0)
> +		panic("failed to get data from %s", path);
> 	sd_dprintf("%s, type:%d, len:%d, pos:%"PRId32, path, ev->type, len,
> 		   queue_pos);
> 	queue_pos++;
> @@ -553,20 +540,18 @@ static int add_join_event(void *msg, size_t msg_len)
> static int zk_join(const struct sd_node *myself,
> 		   void *opaque, size_t opaque_len)
> {
> -	int rc;
> 	char path[MAX_NODE_STR_LEN];
> 
> 	this_node.node = *myself;
> 
> 	snprintf(path, sizeof(path), MEMBER_ZNODE "/%s", node_to_str(myself));
> -	rc = zk_node_exists(path);
> -	if (rc == ZOK) {
> +	if (zk_node_exists(path)) {
> 		sd_eprintf("Previous zookeeper session exist, shoot myself.");
> 		exit(1);
> 	}
> 
> 	/* For concurrent nodes setup, we allow only one to continue */
> -	while (zk_member_empty() && zk_master_create() != ZOK)
> +	while (zk_member_empty() && zk_master_create() < 0)
> 		;/* wait */
> 
> 	return add_join_event(opaque, opaque_len);
> @@ -631,7 +616,9 @@ static void watch_all_nodes(void)
> 		return;
> 
> 	FOR_EACH_ZNODE(MEMBER_ZNODE, path, &strs) {
> -		zk_get_data(path, &znode, &len);
> +		int rc = zk_get_data(path, &znode, &len);
> +		if (rc < 0)
> +			panic("failed to get data from %s", path);
> 	}
> }
> 
> -- 
> 1.7.9.5
> 
> -- 
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog




More information about the sheepdog mailing list