[sheepdog] [PATCH v6 1/6] zookeeper: fixed concurrent startup error

Liu Yuan namei.unix at gmail.com
Sat Jun 22 07:02:15 CEST 2013


On 06/21/2013 08:34 PM, Kai Zhang wrote:
> Current implementation of zookeeper driver has a risk when multiple sheep
> start up concurrently.
> 
> Consider the following situation:
> 1. There is a 3 node cluster: sheep1, sheep2, sheep3.
> 2. Both sheep2 and sheep3 leave cluster.
> 3. Both sheep2 and sheep3 start up after previous zookeeper session timeout.
> 4. Sheep1 leaves the cluster before sheep2 and sheep3 receiving join requests
>    from zookeeper.
> 5. When sheep2 and sheep3 receive the join requests, both of them assume they
>    are master due to zk_member_empty() returns true.
> 
> The new implementation is following the standard way that zookeeper sugguests.
> At start up, each sheep creates a ephemeral sequencial node under MASTER_ZNODE.
> The sheep who has the least number will be the master.
> 
> However, due to the join logic in zookeeper, there is a risk of hang during
> join cluster.
> This is because zk_compete_master and add_join_event are two seperate operations,
> and there is no transection protection.
> For example sheep1 got a less sequence number for master competition than sheep2,
> but a larger sequence number for join request event.
> So if the current master sheep3 quit before handling sheep2's join request,
> sheep2 will hang on joining cluster.
> 
> To solve this problem, we force sheep send join request only when it can confirm
> its a sheep with a less sequence number of master competition has joined.
> 
> Signed-off-by: Kai Zhang <kyle at zelin.io>
> ---
>  sheep/cluster/zookeeper.c |  221 ++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 187 insertions(+), 34 deletions(-)
> 
> diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
> index 45db10a..2ee1b49 100644
> --- a/sheep/cluster/zookeeper.c
> +++ b/sheep/cluster/zookeeper.c
> @@ -33,7 +33,7 @@
>  
>  /* iterate child znodes */
>  #define FOR_EACH_ZNODE(parent, path, strs)			       \
> -	for (zk_get_children(parent, strs),		               \
> +	for (zk_get_children(parent, strs),			       \
>  		     (strs)->data += (strs)->count;		       \
>  	     (strs)->count-- ?					       \
>  		     snprintf(path, sizeof(path), "%s/%s", parent,     \
> @@ -72,7 +72,13 @@ static struct sd_node sd_nodes[SD_MAX_NODES];
>  static size_t nr_sd_nodes;
>  static struct rb_root zk_node_root = RB_ROOT;
>  static pthread_rwlock_t zk_tree_lock = PTHREAD_RWLOCK_INITIALIZER;
> +static pthread_rwlock_t zk_compete_master_lock = PTHREAD_RWLOCK_INITIALIZER;
>  static LIST_HEAD(zk_block_list);
> +static uatomic_bool is_master;
> +static uatomic_bool stop;
> +static bool joined;
> +
> +static void zk_compete_master(void);
>  
>  static struct zk_node *zk_tree_insert(struct zk_node *new)
>  {
> @@ -183,11 +189,14 @@ zk_create_node(const char *path, const char *value, int valuelen,
>   */
>  static inline ZOOAPI int
>  zk_create_seq_node(const char *path, const char *value, int valuelen,
> -		   char *path_buffer, int path_buffer_len)
> +		   char *path_buffer, int path_buffer_len, bool ephemeral)
>  {
>  	int rc;
> +	int flags = ZOO_SEQUENCE;
> +	if (ephemeral)
> +		flags = flags | ZOO_EPHEMERAL;
>  	rc = zoo_create(zhandle, path, value, valuelen, &ZOO_OPEN_ACL_UNSAFE,
> -			ZOO_SEQUENCE, path_buffer, path_buffer_len);
> +			flags, path_buffer, path_buffer_len);
>  	if (rc != ZOK)
>  		sd_iprintf("failed, path:%s, %s", path, zerror(rc));
>  
> @@ -298,7 +307,7 @@ static void zk_queue_push(struct zk_event *ev)
>  	len = offsetof(typeof(*ev), buf) + ev->buf_len;
>  	snprintf(path, sizeof(path), "%s/", QUEUE_ZNODE);
>  again:
> -	rc = zk_create_seq_node(path, (char *)ev, len, buf, sizeof(buf));
> +	rc = zk_create_seq_node(path, (char *)ev, len, buf, sizeof(buf), false);
>  	switch (rc) {
>  	case ZOK:
>  		/* Success */
> @@ -436,32 +445,10 @@ static inline int zk_master_create(void)
>  			      ZOO_EPHEMERAL, NULL, 0);
>  }
>  
> -static bool is_master(void)
> -{
> -	struct rb_node *n;
> -	struct zk_node *zk = NULL;
> -
> -	if (!nr_sd_nodes) {
> -		if (zk_member_empty())
> -			return true;
> -		else
> -			return false;
> -	}
> -
> -	for (n = rb_first(&zk_node_root); n; n = rb_next(n)) {
> -		zk = rb_entry(n, struct zk_node, rb);
> -		if (!zk->gone)
> -			break;
> -	}
> -	if (zk && node_eq(&zk->node, &this_node.node))
> -		return true;
> -
> -	return false;
> -}
> -
>  static void zk_queue_init(void)
>  {
>  	zk_init_node(BASE_ZNODE);
> +	zk_init_node(MASTER_ZNONE);
>  	zk_init_node(QUEUE_ZNODE);
>  	zk_init_node(MEMBER_ZNODE);
>  }
> @@ -511,6 +498,12 @@ static void zk_watcher(zhandle_t *zh, int type, int state, const char *path,
>  	} else if (type == ZOO_DELETED_EVENT) {
>  		struct zk_node *n;
>  
> +		ret = sscanf(path, MASTER_ZNONE "/%s", str);
> +		if (ret == 1) {
> +			zk_compete_master();
> +			return;
> +		}
> +
>  		ret = sscanf(path, MEMBER_ZNODE "/%s", str);
>  		if (ret != 1)
>  			return;
> @@ -550,6 +543,162 @@ static int add_join_event(void *msg, size_t msg_len)
>  	return 0;
>  }
>  
> +static void zk_get_least_seq(const char *parent, char *least_seq_path,
> +			     int path_len, void *buf, int *buf_len)
> +{
> +	char path[MAX_NODE_STR_LEN], *p, *tmp;
> +	struct String_vector strs;
> +	int rc, least_seq = INT_MAX , seq;
> +
> +	while (true) {
> +		FOR_EACH_ZNODE(parent, path, &strs) {
> +			p = strrchr(path, '/');
> +			seq = strtol(++p, &tmp, 10);
> +			if (seq < least_seq)
> +				least_seq = seq;
> +		}
> +
> +		snprintf(path, MAX_NODE_STR_LEN, "%s/%010"PRId32,
> +			 parent, least_seq);
> +		rc = zk_get_data(path, buf, buf_len);
> +
> +		if (rc == ZOK) {
> +			strncpy(least_seq_path, path, path_len);
> +			break;
> +		} else if (rc == ZNONODE)
> +			continue;
> +		else
> +			panic("failed, path:%s, %s", path, zerror(rc));
> +	}
> +}
> +
> +static void zk_find_master(int *master_seq, char *master_name)
> +{
> +	int rc, len = MAX_NODE_STR_LEN;
> +	char master_compete_path[MAX_NODE_STR_LEN];
> +
> +	if (*master_seq < 0) {
> +		zk_get_least_seq(MASTER_ZNONE, master_compete_path,
> +				 MAX_NODE_STR_LEN, master_name, &len);
> +		sscanf(master_compete_path, MASTER_ZNONE "/%"PRId32,
> +		       master_seq);
> +	} else {
> +		while(true) {
> +			snprintf(master_compete_path, len,
> +				 MASTER_ZNONE "/%010"PRId32, *master_seq);
> +			rc = zk_get_data(master_compete_path, master_name,
> +					 &len);
> +			if (rc == ZOK)
> +				break;
> +			else if (rc == ZNONODE) {
> +				sd_iprintf("detect master leave, "
> +					   "start to compete master");
> +				(*master_seq)++;
> +				continue;
> +			} else
> +				panic("failed, path:%s, %s",
> +				      master_compete_path, zerror(rc));
> +		}
> +	}
> +}
> +
> +/*
> + * block until last sheep joined
> + * return sequence number of last sheep or -1 if no previous sheep
> + */
> +static int zk_verify_last_sheep_join(int seq)
> +{
> +	int rc, i, len = MAX_NODE_STR_LEN;
> +	char path[MAX_NODE_STR_LEN], name[MAX_NODE_STR_LEN];
> +
> +	for (i = seq - 1; i >= 0; i--) {
> +		snprintf(path, MAX_NODE_STR_LEN, MASTER_ZNONE "/%010"PRId32, i);
> +		rc = zk_get_data(path, name, &len);
> +		if (rc == ZNONODE)
> +			continue;
> +		else if (rc != ZOK)
> +			panic("failed, path:%s, %s", path, zerror(rc));
> +
> +		if (!strcmp(name, node_to_str(&this_node.node)))
> +			continue;
> +
> +		snprintf(path, MAX_NODE_STR_LEN, MEMBER_ZNODE "/%s", name);
> +		rc = zk_node_exists(path);
> +		if (rc == ZOK)
> +			break;
> +		else if (rc == ZNONODE) {
> +			i++;
> +			continue;
> +		} else
> +			panic("failed, path:%s, %s", path, zerror(rc));
> +	}
> +	return i;
> +}
> +
> +/*
> + * Create sequential node under MASTER_ZNODE.
> + * Sheep with least sequential number win the competition.
> + */
> +static void zk_compete_master(void)
> +{
> +	int rc;
> +	char master_name[MAX_NODE_STR_LEN];
> +	char my_compete_path[MAX_NODE_STR_LEN];
> +	static int master_seq = -1, my_seq;
> +
> +	/*
> +	 * This is to protect master_seq and my_seq because this function will
> +	 * be called by both main thread and zookeeper's event thread.
> +	 */
> +	pthread_rwlock_wrlock(&zk_compete_master_lock);
> +
> +	if (uatomic_is_true(&is_master) || uatomic_is_true(&stop))
> +		goto out;
> +
> +	if (!joined) {
> +		sd_iprintf("start to compete master for the first time");
> +		do {
> +			if (uatomic_is_true(&stop))
> +				goto out;
> +			/* duplicate sequential node has no side-effect */
> +			rc = zk_create_seq_node(MASTER_ZNONE "/",
> +						node_to_str(&this_node.node),
> +						MAX_NODE_STR_LEN,
> +						my_compete_path,
> +						MAX_NODE_STR_LEN, true);
> +		} while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
> +
> +		if (rc != ZOK)
> +			panic("failed, %s", zerror(rc));
> +
> +		sd_dprintf("my compete path: %s", my_compete_path);
> +		sscanf(my_compete_path, MASTER_ZNONE "/%"PRId32,
> +		       &my_seq);
> +	}
> +
> +	zk_find_master(&master_seq, master_name);
> +
> +	if (!strcmp(master_name, node_to_str(&this_node.node)))
> +		goto success;
> +	else if (joined)
> +		goto lost;
> +	else if (zk_verify_last_sheep_join(my_seq) < 0) {
> +		/* all previous sheep has quit, i'm master */
> +		master_seq = my_seq;
> +		goto success;
> +	} else
> +		goto lost;
> +success:
> +	uatomic_set_true(&is_master);
> +	sd_iprintf("success");
> +	goto out;
> +lost:
> +	sd_iprintf("lost");
> +	goto out;

this goto isn't necessary. I think you can iprintf in above else clause.
Then the out handling can be reduced as fowllowing:

out:
    uatomic_set_true(&is_master);
    sd_iprintf("success"); # Do you really need bother user here ?
out_unlock:
    pthread_rwlock_unlock(&zk_compete_master_lock);

Thanks,
Yuan



More information about the sheepdog mailing list