On Tue, Dec 03, 2013 at 04:05:25PM +0800, Robin Dong wrote:
> Implement the distributed lock by zookeeper
> (refer: http://zookeeper.apache.org/doc/trunk/recipes.html)
>
> The routine is:
>   1. create a seq-ephemeral znode in the lock directory
>      (use the lock-id as the directory name)
>   2. get the smallest file path as the owner of the lock; the other threads
>      wait on a pthread_mutex_t (cluster_lock->wait_release)
>   3. if the owner of the lock releases it (or the owner is killed by
>      accident), zookeeper will trigger zk_watch(), which will wake up all
>      waiting threads to compete for new ownership of the lock
>
> We use dlock_array to store pointers to the cluster_locks in this sheep
> daemon, so when the ZOO_DELETED_EVENT event is received the program will
> wake up all waiters (in this sheep daemon) that are sleeping on the lock id
> and let them compete for the new owner.
>
> dlock_array is just a normal array using the lock-id as index, so imagine a
> scenario: two threads (A and B) in one sheep daemon call zk_lock() for the
> same lock-id. They will create two znodes in zookeeper but set
> dlock_array[lock_id] to only one of them (for example, to B). After that,
> when the ZOO_DELETED_EVENT comes, zk_waiter() will only wake up thread B,
> and thread A will sleep on '->wait_release' forever because no one can wake
> it up.
>
> We have two methods to solve this problem:
>   A. using a more complicated structure instead of dlock_array to store
>      both A's and B's lock handles.
>   B. adding a lock to prevent A and B from calling zk_lock() at the same
>      time.
> We prefer method B because it also avoids creating too many files in a
> zookeeper directory, which would put too much pressure on the zookeeper
> servers if the number of sheep daemons is huge. Therefore we add 'id_lock'
> to 'struct cluster_lock'.
>
> Signed-off-by: Robin Dong <sanbai at taobao.com>
> ---
>  include/sheep.h           |  12 +++
>  sheep/cluster.h           |  29 ++++++
>  sheep/cluster/corosync.c  |  16 ++++
>  sheep/cluster/local.c     |  16 ++++
>  sheep/cluster/zookeeper.c | 209 ++++++++++++++++++++++++++++++++++++++++++++-
>  5 files changed, 280 insertions(+), 2 deletions(-)
>
> diff --git a/include/sheep.h b/include/sheep.h
> index 293e057..e5726e8 100644
> --- a/include/sheep.h
> +++ b/include/sheep.h
> @@ -255,6 +255,18 @@ static inline void nodes_to_buffer(struct rb_root *nroot, void *buffer)
>
>  #define MAX_NODE_STR_LEN 256
>
> +/* structure for the distributed lock */
> +struct cluster_lock {
> +        struct hlist_node hnode;
> +        /* id is passed by users to represent a lock handle */
> +        uint64_t id;
> +        /* wait for the release of the id by the other lock owner */
> +        pthread_mutex_t wait_release;
> +        /* lock for different threads of the same node on the same id */
> +        pthread_mutex_t id_lock;
> +        char lock_path[MAX_NODE_STR_LEN];
> +};
> +
>  static inline const char *node_to_str(const struct sd_node *id)
>  {
>          static __thread char str[MAX_NODE_STR_LEN];
> diff --git a/sheep/cluster.h b/sheep/cluster.h
> index 81b5ae4..08df91c 100644
> --- a/sheep/cluster.h
> +++ b/sheep/cluster.h
> @@ -109,6 +109,35 @@ struct cluster_driver {
>          int (*unblock)(void *msg, size_t msg_len);
>
>          /*
> +         * Init a distributed, mutually exclusive lock to avoid race
> +         * conditions when the whole sheepdog cluster processes one
> +         * exclusive resource.
> +         *
> +         * This function uses 'lock_id' as the id of this distributed lock.
> +         * A thread can create many locks in one sheep daemon.
> +         *
> +         * Returns SD_RES_XXX
> +         */
> +        int (*init_lock)(struct cluster_lock *lock, uint64_t lock_id);
> +
> +        /*
> +         * Acquire the distributed lock.
> +         *
> +         * The cluster_lock referenced by 'lock' shall be locked by calling
> +         * cluster->lock(). If the cluster_lock is already locked, the
> +         * calling thread shall block until the cluster_lock becomes
> +         * available.
> +         */
> +        void (*lock)(struct cluster_lock *lock);
> +
> +        /*
> +         * Release the distributed lock.
> +         *
> +         * If the owner of the cluster_lock releases it (or the owner is
> +         * killed by accident), zookeeper will trigger zk_watch(), which
> +         * will wake up all waiting threads to compete for new ownership
> +         * of the lock.
> +         */
> +        void (*unlock)(struct cluster_lock *lock);
> +
> +        /*
>           * Update the specific node in the driver's private copy of nodes
>           *
>           * Returns SD_RES_XXX
> diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
> index ea4421b..7382d9a 100644
> --- a/sheep/cluster/corosync.c
> +++ b/sheep/cluster/corosync.c
> @@ -774,6 +774,19 @@ again:
>          return 0;
>  }
>
> +static int corosync_init_lock(struct cluster_lock *cluster_lock, uint32_t id)

uint64_t id, to match the init_lock prototype in cluster.h (local_init_lock
below already takes uint64_t).

> +{
> +        return -1;
> +}
> +
> +static void corosync_lock(struct cluster_lock *cluster_lock)
> +{
> +}
> +
> +static void corosync_unlock(struct cluster_lock *cluster_lock)
> +{
> +}
> +
>  static int corosync_update_node(struct sd_node *node)
>  {
>          struct cpg_node cnode = this_node;
> @@ -794,6 +807,9 @@ static struct cluster_driver cdrv_corosync = {
>          .notify = corosync_notify,
>          .block = corosync_block,
>          .unblock = corosync_unblock,
> +        .init_lock = corosync_init_lock,
> +        .lock = corosync_lock,
> +        .unlock = corosync_unlock,
>          .update_node = corosync_update_node,
>  };
>
> diff --git a/sheep/cluster/local.c b/sheep/cluster/local.c
> index b8cbb5c..4c4d83b 100644
> --- a/sheep/cluster/local.c
> +++ b/sheep/cluster/local.c
> @@ -547,6 +547,19 @@ static int local_init(const char *option)
>          return 0;
>  }
>
> +static int local_init_lock(struct cluster_lock *cluster_lock, uint64_t id)
> +{
> +        return -1;
> +}
> +
> +static void local_lock(struct cluster_lock *cluster_lock)
> +{
> +}
> +
> +static void local_unlock(struct cluster_lock *cluster_lock)
> +{
> +}
> +
>  static int local_update_node(struct sd_node *node)
>  {
>          struct local_node lnode = this_node;
> @@ -566,6 +579,9 @@ static struct cluster_driver cdrv_local = {
>          .notify = local_notify,
>          .block = local_block,
>          .unblock = local_unblock,
> +        .init_lock = local_init_lock,
> +        .lock = local_lock,
> +        .unlock = local_unlock,
>          .update_node = local_update_node,
>  };
>
> diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
> index fa89c46..84f1608 100644
> --- a/sheep/cluster/zookeeper.c
> +++ b/sheep/cluster/zookeeper.c
> @@ -30,6 +30,72 @@
>  #define QUEUE_ZNODE BASE_ZNODE "/queue"
>  #define MEMBER_ZNODE BASE_ZNODE "/member"
>  #define MASTER_ZNONE BASE_ZNODE "/master"
> +#define LOCK_ZNODE BASE_ZNODE "/lock"
> +
> +#define WAIT_TIME 1 /* second */
> +
> +#define HASH_BUCKET_NR 4097

4097 looks a bit too big to me, how about making it smaller?

> +static struct hlist_head *cluster_locks_table;
> +static pthread_mutex_t table_locks[HASH_BUCKET_NR];
> +
> +/*
> + * All the operations on the lock table are protected by
> + * cluster_lock->id_lock, so we don't need to add a lock here
> + */
> +
> +static void lock_table_del(uint64_t lock_id)
> +{
> +        uint64_t hval = lock_id % HASH_BUCKET_NR;

We have sd_hash(), which is more evenly distributed than your simple hash.
I have put a sketch of what I mean at the end of this mail, together with
how I read the locking routine from your commit message.

Thanks
Yuan
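P.S. This is the shape I have in mind for the sd_hash() change. It is only a
sketch: the helper name is mine, and I am quoting the sd_hash() prototype
from memory, so please double-check it against include/util.h.

/* hypothetical helper; assumes sd_hash(const void *, size_t) from util.h */
static inline uint64_t lock_table_hval(uint64_t lock_id)
{
        /* hash the 64-bit id before folding it into the bucket range */
        return sd_hash(&lock_id, sizeof(lock_id)) % HASH_BUCKET_NR;
}

Then lock_table_del() (and the add/lookup paths) would use
lock_table_hval(lock_id) instead of lock_id % HASH_BUCKET_NR, so that
neighbouring lock ids do not all land in neighbouring buckets.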
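While I am at it, here is roughly how I read the acquire path described in
your commit message. This is only my sketch to confirm my understanding, not
your zk_lock(): the function name, the 'zhandle' variable and the omitted
error handling are placeholders, and I assume init_lock() has already
created the per-id directory under LOCK_ZNODE.

/*
 * Sketch of the acquire path: create a sequential-ephemeral znode, the
 * smallest sequence number owns the lock, everyone else watches the owner
 * and sleeps until zk_watch() releases wait_release on ZOO_DELETED_EVENT.
 * Assumes the usual zookeeper.c context (<zookeeper/zookeeper.h>,
 * <pthread.h>, <inttypes.h>) plus the definitions added by this patch.
 */
static void zk_lock_sketch(struct cluster_lock *cluster_lock)
{
        char parent[MAX_NODE_STR_LEN], prefix[MAX_NODE_STR_LEN];
        const char *my_seq;
        struct String_vector children;
        int i;

        /* serialize threads of this daemon on the same id (your method B) */
        pthread_mutex_lock(&cluster_lock->id_lock);

        /* 1. create a sequential-ephemeral znode under the lock-id dir */
        snprintf(parent, sizeof(parent), LOCK_ZNODE "/%" PRIu64,
                 cluster_lock->id);
        snprintf(prefix, sizeof(prefix), "%s/lock_", parent);
        zoo_create(zhandle, prefix, "", 0, &ZOO_OPEN_ACL_UNSAFE,
                   ZOO_SEQUENCE | ZOO_EPHEMERAL,
                   cluster_lock->lock_path, MAX_NODE_STR_LEN);
        my_seq = cluster_lock->lock_path + strlen(parent) + 1;

        for (;;) {
                const char *smallest = my_seq;
                char owner_path[MAX_NODE_STR_LEN];
                struct Stat st;

                /*
                 * 2. the child with the smallest sequence number owns the
                 * lock; plain strcmp() is enough because zookeeper
                 * zero-pads the sequence numbers to a fixed width
                 */
                zoo_get_children(zhandle, parent, 0, &children);
                for (i = 0; i < children.count; i++)
                        if (strcmp(children.data[i], smallest) < 0)
                                smallest = children.data[i];
                if (strcmp(smallest, my_seq) == 0) {
                        deallocate_String_vector(&children);
                        return; /* we hold the lock now */
                }

                /*
                 * 3. watch the current owner so that its deletion reaches
                 * zk_watch() as ZOO_DELETED_EVENT, then sleep until
                 * zk_watch() unlocks wait_release (the mutex is used as a
                 * binary semaphore here)
                 */
                snprintf(owner_path, sizeof(owner_path), "%s/%s",
                         parent, smallest);
                deallocate_String_vector(&children);
                if (zoo_exists(zhandle, owner_path, 1, &st) == ZNONODE)
                        continue; /* owner already vanished, re-check */
                pthread_mutex_lock(&cluster_lock->wait_release);
        }
}

zk_unlock() would then zoo_delete() lock_path and unlock id_lock, and that
deletion is what wakes the next waiter up. If this does not match what the
patch intends, please correct me.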