[sheepdog] [PATCH v6] sheep/cluster: add distributed-lock implemented by zookeeper
Liu Yuan
namei.unix at gmail.com
Tue Dec 3 11:00:29 CET 2013
On Tue, Dec 03, 2013 at 04:05:25PM +0800, Robin Dong wrote:
> Implement the distributed lock with zookeeper
> (see: http://zookeeper.apache.org/doc/trunk/recipes.html)
>
> The routine is:
> 1. create a seq-ephemeral znode in the lock directory
> (the lock-id is used as the directory name)
> 2. the znode with the smallest sequence number becomes the owner of the
> lock; the other threads wait on a pthread_mutex_t (cluster_lock->wait_release)
> 3. when the owner releases the lock (or is killed by accident), zookeeper
> triggers zk_watch(), which wakes up all waiting threads so they can compete
> to become the new owner of the lock
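For readers who haven't seen the recipe, the acquisition path described above maps
roughly onto the sketch below, built on the synchronous ZooKeeper C client API.
The names zhandle, try_acquire() and the "/sheepdog/lock" prefix are illustrative
stand-ins rather than the patch's actual code; only struct cluster_lock mirrors the
definition added in include/sheep.h.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <pthread.h>
#include <zookeeper/zookeeper.h>

#define MAX_NODE_STR_LEN 256		/* as in include/sheep.h */

/* mirrors the struct added by this patch */
struct cluster_lock {
	uint64_t id;
	pthread_mutex_t wait_release;
	pthread_mutex_t id_lock;
	char lock_path[MAX_NODE_STR_LEN];
};

static zhandle_t *zhandle;	/* connected elsewhere, as in zookeeper.c */

/*
 * Try to become the owner of 'lock': create a sequential-ephemeral znode
 * (step 1), then check whether it carries the smallest sequence number
 * among the children of the per-id lock directory (step 2).
 */
static bool try_acquire(struct cluster_lock *lock)
{
	char parent[MAX_NODE_STR_LEN], node[MAX_NODE_STR_LEN];
	char lowest[MAX_NODE_STR_LEN];
	struct String_vector children;
	bool owner;
	int rc, i;

	/* the per-id directory is assumed to be created by init_lock() */
	snprintf(parent, sizeof(parent), "/sheepdog/lock/%" PRIu64, lock->id);

	if (lock->lock_path[0] == '\0') {
		/* ZOO_EPHEMERAL: a crashed owner releases the lock
		 * implicitly when its session expires */
		snprintf(node, sizeof(node), "%s/lock_", parent);
		rc = zoo_create(zhandle, node, "", 0, &ZOO_OPEN_ACL_UNSAFE,
				ZOO_SEQUENCE | ZOO_EPHEMERAL,
				lock->lock_path, sizeof(lock->lock_path));
		if (rc != ZOK)
			return false;
	}

	rc = zoo_get_children(zhandle, parent, 0, &children);
	if (rc != ZOK || children.count == 0)
		return false;

	/* the child with the smallest sequence suffix owns the lock */
	strcpy(lowest, children.data[0]);
	for (i = 1; i < children.count; i++)
		if (strcmp(children.data[i], lowest) < 0)
			strcpy(lowest, children.data[i]);

	/* lock_path is a full path; children hold only the last component */
	owner = strstr(lock->lock_path, lowest) != NULL;
	if (!owner) {
		/* watch the owner's znode: its removal delivers the
		 * ZOO_DELETED_EVENT that wakes the sleepers (step 3) */
		snprintf(node, sizeof(node), "%s/%s", parent, lowest);
		zoo_exists(zhandle, node, 1, NULL);
	}
	deallocate_String_vector(&children);
	return owner;
}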
>
> We use dlock_array to store pointers to the cluster_locks in this sheep daemon,
> so when the ZOO_DELETED_EVENT arrives the program wakes up all waiters (in this
> sheep daemon) that are sleeping on the lock id and lets them compete to become
> the new owner.
>
> dlock_array is just a normal array using the lock-id as index, so imagine a
> scenario: two threads (A and B) in one sheep daemon call zk_lock() for the same
> lock-id. They will create two znodes in zookeeper, but dlock_array[lock_id] can
> point to only one of them (say, B). After that, when the ZOO_DELETED_EVENT
> comes, zk_waiter() will only wake up thread B, and thread A will sleep on
> '->wait_release' forever because no one can wake it up.
>
> We have two methods to solve this problem:
> A. use a more complicated structure instead of dlock_array to store
> both A's and B's lock handles.
> B. add a lock so that A and B cannot call zk_lock() at the same time.
> We prefer method B because it also avoids creating too many files in a
> zookeeper directory, which would put too much pressure on the zookeeper
> servers when the number of sheep daemons is huge. Therefore we add 'id_lock'
> to 'struct cluster_lock'.
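Continuing the sketch above, method B in code form (again illustrative, not the
patch itself): the per-handle mutex serializes threads of the same daemon, so at
most one znode per lock id is created by each sheep process at a time.

static void zk_lock_sketch(struct cluster_lock *lock)
{
	/* method B: threads of the same daemon queue up here */
	pthread_mutex_lock(&lock->id_lock);

	while (!try_acquire(lock))
		/* wait_release is kept locked, so this blocks until
		 * zk_watch() unlocks it after the owner's znode is gone */
		pthread_mutex_lock(&lock->wait_release);
}

static void zk_unlock_sketch(struct cluster_lock *lock)
{
	/* deleting our znode is what fires the watch for the waiters */
	zoo_delete(zhandle, lock->lock_path, -1);
	lock->lock_path[0] = '\0';

	pthread_mutex_unlock(&lock->id_lock);
}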
>
> Signed-off-by: Robin Dong <sanbai at taobao.com>
> ---
> include/sheep.h | 12 +++
> sheep/cluster.h | 29 ++++++
> sheep/cluster/corosync.c | 16 ++++
> sheep/cluster/local.c | 16 ++++
> sheep/cluster/zookeeper.c | 209 ++++++++++++++++++++++++++++++++++++++++++++-
> 5 files changed, 280 insertions(+), 2 deletions(-)
>
> diff --git a/include/sheep.h b/include/sheep.h
> index 293e057..e5726e8 100644
> --- a/include/sheep.h
> +++ b/include/sheep.h
> @@ -255,6 +255,18 @@ static inline void nodes_to_buffer(struct rb_root *nroot, void *buffer)
>
> #define MAX_NODE_STR_LEN 256
>
> +/* structure for distributed lock */
> +struct cluster_lock {
> + struct hlist_node hnode;
> + /* id is passed by users to represent a lock handle */
> + uint64_t id;
> + /* wait for the release of id by other lock owner */
> + pthread_mutex_t wait_release;
> + /* lock for different threads of the same node on the same id */
> + pthread_mutex_t id_lock;
> + char lock_path[MAX_NODE_STR_LEN];
> +};
> +
> static inline const char *node_to_str(const struct sd_node *id)
> {
> static __thread char str[MAX_NODE_STR_LEN];
> diff --git a/sheep/cluster.h b/sheep/cluster.h
> index 81b5ae4..08df91c 100644
> --- a/sheep/cluster.h
> +++ b/sheep/cluster.h
> @@ -109,6 +109,35 @@ struct cluster_driver {
> int (*unblock)(void *msg, size_t msg_len);
>
> /*
> +	 * Init a distributed mutually exclusive lock to avoid race conditions
> +	 * when the whole sheepdog cluster processes one exclusive resource.
> +	 *
> +	 * This function uses 'lock_id' as the id of this distributed lock.
> + * A thread can create many locks in one sheep daemon.
> + *
> + * Returns SD_RES_XXX
> + */
> + int (*init_lock)(struct cluster_lock *lock, uint64_t lock_id);
> +
> + /*
> + * Acquire the distributed lock.
> + *
> + * The cluster_lock referenced by 'lock' shall be locked by calling
> + * cluster->lock(). If the cluster_lock is already locked, the calling
> + * thread shall block until the cluster_lock becomes available.
> + */
> + void (*lock)(struct cluster_lock *lock);
> +
> + /*
> + * Release the distributed lock.
> + *
> +	 * If the owner of the cluster_lock releases it (or the owner is
> +	 * killed by accident), zookeeper will trigger zk_watch(), which will
> +	 * wake up all waiting threads to compete for new ownership of the lock.
> + */
> + void (*unlock)(struct cluster_lock *lock);
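For context, a hypothetical caller inside sheep would presumably use the three
callbacks roughly as below; the lock id and the sys->cdrv driver pointer are
assumptions of this sketch, not something this hunk introduces.

static int do_exclusive_work(void)
{
	struct cluster_lock lock;
	uint64_t lock_id = 0x1234;	/* made-up id, just for illustration */
	int ret;

	ret = sys->cdrv->init_lock(&lock, lock_id);
	if (ret != SD_RES_SUCCESS)
		return ret;

	sys->cdrv->lock(&lock);
	/* ... touch the cluster-wide exclusive resource ... */
	sys->cdrv->unlock(&lock);

	return SD_RES_SUCCESS;
}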
> +
> + /*
> * Update the specific node in the driver's private copy of nodes
> *
> * Returns SD_RES_XXX
> diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
> index ea4421b..7382d9a 100644
> --- a/sheep/cluster/corosync.c
> +++ b/sheep/cluster/corosync.c
> @@ -774,6 +774,19 @@ again:
> return 0;
> }
>
> +static int corosync_init_lock(struct cluster_lock *cluster_lock, uint32_t id)
This should be 'uint64_t id' to match the prototype in cluster.h.
> +{
> + return -1;
> +}
> +
> +static void corosync_lock(struct cluster_lock *cluster_lock)
> +{
> +}
> +
> +static void corosync_unlock(struct cluster_lock *cluster_lock)
> +{
> +}
> +
> static int corosync_update_node(struct sd_node *node)
> {
> struct cpg_node cnode = this_node;
> @@ -794,6 +807,9 @@ static struct cluster_driver cdrv_corosync = {
> .notify = corosync_notify,
> .block = corosync_block,
> .unblock = corosync_unblock,
> + .init_lock = corosync_init_lock,
> + .lock = corosync_lock,
> + .unlock = corosync_unlock,
> .update_node = corosync_update_node,
> };
>
> diff --git a/sheep/cluster/local.c b/sheep/cluster/local.c
> index b8cbb5c..4c4d83b 100644
> --- a/sheep/cluster/local.c
> +++ b/sheep/cluster/local.c
> @@ -547,6 +547,19 @@ static int local_init(const char *option)
> return 0;
> }
>
> +static int local_init_lock(struct cluster_lock *cluster_lock, uint64_t id)
> +{
> + return -1;
> +}
> +
> +static void local_lock(struct cluster_lock *cluster_lock)
> +{
> +}
> +
> +static void local_unlock(struct cluster_lock *cluster_lock)
> +{
> +}
> +
> static int local_update_node(struct sd_node *node)
> {
> struct local_node lnode = this_node;
> @@ -566,6 +579,9 @@ static struct cluster_driver cdrv_local = {
> .notify = local_notify,
> .block = local_block,
> .unblock = local_unblock,
> + .init_lock = local_init_lock,
> + .lock = local_lock,
> + .unlock = local_unlock,
> .update_node = local_update_node,
> };
>
> diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
> index fa89c46..84f1608 100644
> --- a/sheep/cluster/zookeeper.c
> +++ b/sheep/cluster/zookeeper.c
> @@ -30,6 +30,72 @@
> #define QUEUE_ZNODE BASE_ZNODE "/queue"
> #define MEMBER_ZNODE BASE_ZNODE "/member"
> #define MASTER_ZNONE BASE_ZNODE "/master"
> +#define LOCK_ZNODE BASE_ZNODE "/lock"
> +
> +#define WAIT_TIME 1 /* second */
> +
> +#define HASH_BUCKET_NR 4097
4097 looks a bit too big to me; how about making it smaller?
> +static struct hlist_head *cluster_locks_table;
> +static pthread_mutex_t table_locks[HASH_BUCKET_NR];
> +
> +/*
> + * All the operations on the lock table are protected by
> + * cluster_lock->id_lock, so we don't need to add a lock here
> + */
> +
> +static void lock_table_del(uint64_t lock_id)
> +{
> + uint64_t hval = lock_id % HASH_BUCKET_NR;
we have sd_hash(), which is more evenly distributed than your simple hash.
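Something like the helper below, assuming sd_hash() keeps its usual sheepdog
signature of a buffer plus its length; that would spread ids across the buckets
even when callers hand out sequential lock ids.

static inline uint64_t lock_table_hash(uint64_t lock_id)
{
	/* hash the raw id bytes instead of taking a plain modulo of the id */
	return sd_hash(&lock_id, sizeof(lock_id)) % HASH_BUCKET_NR;
}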
Thanks
Yuan