[sheepdog] [PATCH] zookeeper: hande node joining race

Yunkai Zhang yunkai.me at gmail.com
Mon May 28 17:54:32 CEST 2012


On Mon, May 28, 2012 at 11:25 PM, Christoph Hellwig <hch at infradead.org> wrote:
> I got a bug report where the nr_sd_nodes == nr_zk_nodes assert in
> build_node_list is trigger by a larger number of sheep joining at the same
> time.

We should not start sheeps at the same time. Are you read this commit log from
this patch:8567aae281c75502c0a267bf76b771a2af8392f2 ?

    ==*Note*==:
    When sheep startups, it fetchs member list by reading /sheepdog/member/*,
    if the result is empty, sheep will consider itself as *master*.

    Now we have removed lock from zk_join,if we start multiple sheeps
    simultaneously, one problem arises: there may exist more than one *master*,
    this is bad.

    To prevent this problem, we can start multiple sheeps like this:
    - start the fist sheep alone, and sleep 2 seconds:
      $ sheep -d /store/0 -z 0 -p 7000 -c zookeeper:localhost:2181
      $ sleep 2

    - start other sheeps simultaneously(need not to sleep between them):
      $ for i in {1..100}; do sheep -d /store/$i -z $i -p $((7000 + $i)) \
       -c zookeeper:localhost:2181

>
> From code inspection it seems like this case can be triggered if
> node_btree_add is called twice for the same node, and tsearch returns
> the existing node, but nr_zk_nodes is still incremented.
>
> It seems like node_btree_add could be called twice if the startup of
> the zookeeper driver hits a race between reading out the list of existing
> nodes and new events.
>
> The patch below fixes the problem by handling this case in node_btree_add
> and asserting it only happens for valid case.  I'm not overly happy with
> this fix, fixes based on better knowledge of zookeeper would be welcome.
>
> Signed-off-by: Christoph Hellwig <hch at lst.de>
>
> diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
> index 859f2b0..a09d55a 100644
> --- a/sheep/cluster/zookeeper.c
> +++ b/sheep/cluster/zookeeper.c
> @@ -330,7 +330,7 @@ static inline int zk_node_cmp(const void *a, const void *b)
>        return node_cmp(&znode1->node, &znode2->node);
>  }
>
> -static void node_btree_add(void **btroot, struct zk_node *znode)
> +static int node_btree_add(void **btroot, struct zk_node *znode)
>  {
>        struct zk_node *n, **p;
>
> @@ -343,11 +343,14 @@ static void node_btree_add(void **btroot, struct zk_node *znode)
>        p = (struct zk_node **)tsearch((void *)n, btroot, zk_node_cmp);
>        if (p == NULL)
>                panic("tsearch, oom\n");
> -       else if (*p != n) {
> +
> +       if (*p != n) {
>                **p = *n;
>                free(n);
> +               return -1;
>        }
>        nr_zk_nodes++;
> +       return 0;
>  }
>
>  static inline void node_btree_del(void **btroot, struct zk_node *znode)
> @@ -466,7 +469,10 @@ static void zk_member_init(zhandle_t *zh)
>
>                        switch (rc) {
>                        case ZOK:
> -                               node_btree_add(&zk_node_btroot, &znode);
> +                               rc = node_btree_add(&zk_node_btroot, &znode);
> +                               if (rc)
> +                                       panic("node %s already exists\n", path);
> +                               break;
>                        case ZNONODE:
>                                break;
>                        default:
> @@ -770,7 +776,13 @@ static void zk_handler(int listen_fd, int events, void *data)
>                         */
>                        node_btree_clear(&zk_node_btroot);
>
> -               node_btree_add(&zk_node_btroot, &ev.sender);
> +               rc = node_btree_add(&zk_node_btroot, &ev.sender);
> +               if (rc) {
> +                       if (node_eq(&ev.sender.node, &this_node.node))
> +                               panic("this_node already in btree\n");
> +                       dprintf("node %s already existed\n",
> +                               node_to_str(&ev.sender.node));
> +               }
>                dprintf("one sheep joined[down], nr_nodes:%ld, sender:%s, joined:%d\n",
>                                nr_zk_nodes, node_to_str(&ev.sender.node), ev.sender.joined);
>
> --
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog



-- 
Yunkai Zhang
Work at Taobao



More information about the sheepdog mailing list