[sheepdog] [PATCH v6 1/6] zookeeper: fixed concurrent startup error

Kai Zhang kyle at zelin.io
Sat Jun 22 10:22:01 CEST 2013


Comments inline.

From Kyle's iPhone

在 2013-6-22,13:02,Liu Yuan <namei.unix at gmail.com> 写道:

> On 06/21/2013 08:34 PM, Kai Zhang wrote:
>> Current implementation of zookeeper driver has a risk when multiple sheep
>> start up concurrently.
>> 
>> Consider the following situation:
>> 1. There is a 3 node cluster: sheep1, sheep2, sheep3.
>> 2. Both sheep2 and sheep3 leave cluster.
>> 3. Both sheep2 and sheep3 start up after previous zookeeper session timeout.
>> 4. Sheep1 leaves the cluster before sheep2 and sheep3 receiving join requests
>>   from zookeeper.
>> 5. When sheep2 and sheep3 receive the join requests, both of them assume they
>>   are master due to zk_member_empty() returns true.
>> 
>> The new implementation is following the standard way that zookeeper sugguests.
>> At start up, each sheep creates a ephemeral sequencial node under MASTER_ZNODE.
>> The sheep who has the least number will be the master.
>> 
>> However, due to the join logic in zookeeper, there is a risk of hang during
>> join cluster.
>> This is because zk_compete_master and add_join_event are two seperate operations,
>> and there is no transection protection.
>> For example sheep1 got a less sequence number for master competition than sheep2,
>> but a larger sequence number for join request event.
>> So if the current master sheep3 quit before handling sheep2's join request,
>> sheep2 will hang on joining cluster.
>> 
>> To solve this problem, we force sheep send join request only when it can confirm
>> its a sheep with a less sequence number of master competition has joined.
>> 
>> Signed-off-by: Kai Zhang <kyle at zelin.io>
>> ---
>> sheep/cluster/zookeeper.c |  221 ++++++++++++++++++++++++++++++++++++++-------
>> 1 file changed, 187 insertions(+), 34 deletions(-)
>> 
>> diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
>> index 45db10a..2ee1b49 100644
>> --- a/sheep/cluster/zookeeper.c
>> +++ b/sheep/cluster/zookeeper.c
>> @@ -33,7 +33,7 @@
>> 
>> /* iterate child znodes */
>> #define FOR_EACH_ZNODE(parent, path, strs)                   \
>> -    for (zk_get_children(parent, strs),                       \
>> +    for (zk_get_children(parent, strs),                   \
>>             (strs)->data += (strs)->count;               \
>>         (strs)->count-- ?                           \
>>             snprintf(path, sizeof(path), "%s/%s", parent,     \
>> @@ -72,7 +72,13 @@ static struct sd_node sd_nodes[SD_MAX_NODES];
>> static size_t nr_sd_nodes;
>> static struct rb_root zk_node_root = RB_ROOT;
>> static pthread_rwlock_t zk_tree_lock = PTHREAD_RWLOCK_INITIALIZER;
>> +static pthread_rwlock_t zk_compete_master_lock = PTHREAD_RWLOCK_INITIALIZER;
>> static LIST_HEAD(zk_block_list);
>> +static uatomic_bool is_master;
>> +static uatomic_bool stop;
>> +static bool joined;
>> +
>> +static void zk_compete_master(void);
>> 
>> static struct zk_node *zk_tree_insert(struct zk_node *new)
>> {
>> @@ -183,11 +189,14 @@ zk_create_node(const char *path, const char *value, int valuelen,
>>  */
>> static inline ZOOAPI int
>> zk_create_seq_node(const char *path, const char *value, int valuelen,
>> -           char *path_buffer, int path_buffer_len)
>> +           char *path_buffer, int path_buffer_len, bool ephemeral)
>> {
>>    int rc;
>> +    int flags = ZOO_SEQUENCE;
>> +    if (ephemeral)
>> +        flags = flags | ZOO_EPHEMERAL;
>>    rc = zoo_create(zhandle, path, value, valuelen, &ZOO_OPEN_ACL_UNSAFE,
>> -            ZOO_SEQUENCE, path_buffer, path_buffer_len);
>> +            flags, path_buffer, path_buffer_len);
>>    if (rc != ZOK)
>>        sd_iprintf("failed, path:%s, %s", path, zerror(rc));
>> 
>> @@ -298,7 +307,7 @@ static void zk_queue_push(struct zk_event *ev)
>>    len = offsetof(typeof(*ev), buf) + ev->buf_len;
>>    snprintf(path, sizeof(path), "%s/", QUEUE_ZNODE);
>> again:
>> -    rc = zk_create_seq_node(path, (char *)ev, len, buf, sizeof(buf));
>> +    rc = zk_create_seq_node(path, (char *)ev, len, buf, sizeof(buf), false);
>>    switch (rc) {
>>    case ZOK:
>>        /* Success */
>> @@ -436,32 +445,10 @@ static inline int zk_master_create(void)
>>                  ZOO_EPHEMERAL, NULL, 0);
>> }
>> 
>> -static bool is_master(void)
>> -{
>> -    struct rb_node *n;
>> -    struct zk_node *zk = NULL;
>> -
>> -    if (!nr_sd_nodes) {
>> -        if (zk_member_empty())
>> -            return true;
>> -        else
>> -            return false;
>> -    }
>> -
>> -    for (n = rb_first(&zk_node_root); n; n = rb_next(n)) {
>> -        zk = rb_entry(n, struct zk_node, rb);
>> -        if (!zk->gone)
>> -            break;
>> -    }
>> -    if (zk && node_eq(&zk->node, &this_node.node))
>> -        return true;
>> -
>> -    return false;
>> -}
>> -
>> static void zk_queue_init(void)
>> {
>>    zk_init_node(BASE_ZNODE);
>> +    zk_init_node(MASTER_ZNONE);
>>    zk_init_node(QUEUE_ZNODE);
>>    zk_init_node(MEMBER_ZNODE);
>> }
>> @@ -511,6 +498,12 @@ static void zk_watcher(zhandle_t *zh, int type, int state, const char *path,
>>    } else if (type == ZOO_DELETED_EVENT) {
>>        struct zk_node *n;
>> 
>> +        ret = sscanf(path, MASTER_ZNONE "/%s", str);
>> +        if (ret == 1) {
>> +            zk_compete_master();
>> +            return;
>> +        }
>> +
>>        ret = sscanf(path, MEMBER_ZNODE "/%s", str);
>>        if (ret != 1)
>>            return;
>> @@ -550,6 +543,162 @@ static int add_join_event(void *msg, size_t msg_len)
>>    return 0;
>> }
>> 
>> +static void zk_get_least_seq(const char *parent, char *least_seq_path,
>> +                 int path_len, void *buf, int *buf_len)
>> +{
>> +    char path[MAX_NODE_STR_LEN], *p, *tmp;
>> +    struct String_vector strs;
>> +    int rc, least_seq = INT_MAX , seq;
>> +
>> +    while (true) {
>> +        FOR_EACH_ZNODE(parent, path, &strs) {
>> +            p = strrchr(path, '/');
>> +            seq = strtol(++p, &tmp, 10);
>> +            if (seq < least_seq)
>> +                least_seq = seq;
>> +        }
>> +
>> +        snprintf(path, MAX_NODE_STR_LEN, "%s/%010"PRId32,
>> +             parent, least_seq);
>> +        rc = zk_get_data(path, buf, buf_len);
>> +
>> +        if (rc == ZOK) {
>> +            strncpy(least_seq_path, path, path_len);
>> +            break;
>> +        } else if (rc == ZNONODE)
>> +            continue;
>> +        else
>> +            panic("failed, path:%s, %s", path, zerror(rc));
>> +    }
>> +}
>> +
>> +static void zk_find_master(int *master_seq, char *master_name)
>> +{
>> +    int rc, len = MAX_NODE_STR_LEN;
>> +    char master_compete_path[MAX_NODE_STR_LEN];
>> +
>> +    if (*master_seq < 0) {
>> +        zk_get_least_seq(MASTER_ZNONE, master_compete_path,
>> +                 MAX_NODE_STR_LEN, master_name, &len);
>> +        sscanf(master_compete_path, MASTER_ZNONE "/%"PRId32,
>> +               master_seq);
>> +    } else {
>> +        while(true) {
>> +            snprintf(master_compete_path, len,
>> +                 MASTER_ZNONE "/%010"PRId32, *master_seq);
>> +            rc = zk_get_data(master_compete_path, master_name,
>> +                     &len);
>> +            if (rc == ZOK)
>> +                break;
>> +            else if (rc == ZNONODE) {
>> +                sd_iprintf("detect master leave, "
>> +                       "start to compete master");
>> +                (*master_seq)++;
>> +                continue;
>> +            } else
>> +                panic("failed, path:%s, %s",
>> +                      master_compete_path, zerror(rc));
>> +        }
>> +    }
>> +}
>> +
>> +/*
>> + * block until last sheep joined
>> + * return sequence number of last sheep or -1 if no previous sheep
>> + */
>> +static int zk_verify_last_sheep_join(int seq)
>> +{
>> +    int rc, i, len = MAX_NODE_STR_LEN;
>> +    char path[MAX_NODE_STR_LEN], name[MAX_NODE_STR_LEN];
>> +
>> +    for (i = seq - 1; i >= 0; i--) {
>> +        snprintf(path, MAX_NODE_STR_LEN, MASTER_ZNONE "/%010"PRId32, i);
>> +        rc = zk_get_data(path, name, &len);
>> +        if (rc == ZNONODE)
>> +            continue;
>> +        else if (rc != ZOK)
>> +            panic("failed, path:%s, %s", path, zerror(rc));
>> +
>> +        if (!strcmp(name, node_to_str(&this_node.node)))
>> +            continue;
>> +
>> +        snprintf(path, MAX_NODE_STR_LEN, MEMBER_ZNODE "/%s", name);
>> +        rc = zk_node_exists(path);
>> +        if (rc == ZOK)
>> +            break;
>> +        else if (rc == ZNONODE) {
>> +            i++;
>> +            continue;
>> +        } else
>> +            panic("failed, path:%s, %s", path, zerror(rc));
>> +    }
>> +    return i;
>> +}
>> +
>> +/*
>> + * Create sequential node under MASTER_ZNODE.
>> + * Sheep with least sequential number win the competition.
>> + */
>> +static void zk_compete_master(void)
>> +{
>> +    int rc;
>> +    char master_name[MAX_NODE_STR_LEN];
>> +    char my_compete_path[MAX_NODE_STR_LEN];
>> +    static int master_seq = -1, my_seq;
>> +
>> +    /*
>> +     * This is to protect master_seq and my_seq because this function will
>> +     * be called by both main thread and zookeeper's event thread.
>> +     */
>> +    pthread_rwlock_wrlock(&zk_compete_master_lock);
>> +
>> +    if (uatomic_is_true(&is_master) || uatomic_is_true(&stop))
>> +        goto out;
>> +
>> +    if (!joined) {
>> +        sd_iprintf("start to compete master for the first time");
>> +        do {
>> +            if (uatomic_is_true(&stop))
>> +                goto out;
>> +            /* duplicate sequential node has no side-effect */
>> +            rc = zk_create_seq_node(MASTER_ZNONE "/",
>> +                        node_to_str(&this_node.node),
>> +                        MAX_NODE_STR_LEN,
>> +                        my_compete_path,
>> +                        MAX_NODE_STR_LEN, true);
>> +        } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
>> +
>> +        if (rc != ZOK)
>> +            panic("failed, %s", zerror(rc));
>> +
>> +        sd_dprintf("my compete path: %s", my_compete_path);
>> +        sscanf(my_compete_path, MASTER_ZNONE "/%"PRId32,
>> +               &my_seq);
>> +    }
>> +
>> +    zk_find_master(&master_seq, master_name);
>> +
>> +    if (!strcmp(master_name, node_to_str(&this_node.node)))
>> +        goto success;
>> +    else if (joined)
>> +        goto lost;
>> +    else if (zk_verify_last_sheep_join(my_seq) < 0) {
>> +        /* all previous sheep has quit, i'm master */
>> +        master_seq = my_seq;
>> +        goto success;
>> +    } else
>> +        goto lost;
>> +success:
>> +    uatomic_set_true(&is_master);
>> +    sd_iprintf("success");
>> +    goto out;
>> +lost:
>> +    sd_iprintf("lost");
>> +    goto out;
> 
> this goto isn't necessary. I think you can iprintf in above else clause.
> Then the out handling can be reduced as fowllowing:
> 
> out:
>    uatomic_set_true(&is_master);
>    sd_iprintf("success"); # Do you really need bother user here ?
> out_unlock:
>    pthread_rwlock_unlock(&zk_compete_master_lock);
> 

Good suggestion, thanks.
But I think 'success' is so important to us. And this is only printed for once.

> Thanks,
> Yuan



More information about the sheepdog mailing list