[sheepdog] [PATCH v2 1/2] sheep/cluster: use pthread_cond instead of pthread_mutex to avoid panic
Liu Yuan
namei.unix at gmail.com
Fri Jan 17 09:00:56 CET 2014
On Fri, Jan 17, 2014 at 02:34:12PM +0800, Robin Dong wrote:
> From: Robin Dong <sanbai at taobao.com>
>
> If we use pthread like this:
>
> pthread_mutex_t mt;
> pthread_mutex_init(&mt, NULL);
> pthread_mutex_unlock(&mt);
> pthread_mutex_destroy(&mt);
>
> The last 'pthread_mutex_destroy()' will return EBUSY because the mutex has
> been 'unlocked' even though no one had locked it.
> (ref: https://bugzilla.redhat.com/show_bug.cgi?id=113588)
>
> This will cause a problem in cluster/zookeeper.c because any event from zk_watcher()
> would unlock the 'wait_wakeup' mutex. Therefore we need to use a pthread condition
> variable to replace the pthread mutex.
>
> Signed-off-by: Robin Dong <sanbai at taobao.com>
> ---
> include/util.h | 9 +++++++++
> sheep/cluster/zookeeper.c | 18 +++++++++++-------
> 2 files changed, 20 insertions(+), 7 deletions(-)
>
> diff --git a/include/util.h b/include/util.h
> index b7e0e8b..83f3c08 100644
> --- a/include/util.h
> +++ b/include/util.h
> @@ -371,6 +371,15 @@ static inline int sd_cond_wait(struct sd_cond *cond, struct sd_mutex *mutex)
> return pthread_cond_wait(&cond->cond, &mutex->mutex);
> }
>
> +static inline int sd_cond_wait_timeout(struct sd_cond *cond,
> + struct sd_mutex *mutex, int second)
> +{
> + struct timespec wait_time;
> + wait_time.tv_sec = second;
> + wait_time.tv_nsec = 0;
> + return pthread_cond_timedwait(&cond->cond, &mutex->mutex, &wait_time);
> +}
> +
> static inline int sd_cond_broadcast(struct sd_cond *cond)
> {
> return pthread_cond_broadcast(&cond->cond);
> diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
> index 3f6c146..4ef3a9a 100644
> --- a/sheep/cluster/zookeeper.c
> +++ b/sheep/cluster/zookeeper.c
> @@ -42,7 +42,8 @@ struct cluster_lock {
> /* referenced by different threads in one sheepdog daemon */
> uint64_t ref;
> /* wait for the release of id by other lock owner */
> - struct sd_mutex wait_wakeup;
> + struct sd_cond wait_wakeup;
> + struct sd_mutex wait_mutex;
> /* lock for different threads of the same node on the same id */
> struct sd_mutex id_lock;
> char lock_path[MAX_NODE_STR_LEN];
> @@ -314,7 +315,7 @@ static int lock_table_lookup_wakeup(uint64_t lock_id)
> sd_mutex_lock(table_locks + hval);
> hlist_for_each_entry(lock, iter, cluster_locks_table + hval, hnode) {
> if (lock->id == lock_id) {
> - sd_mutex_unlock(&lock->wait_wakeup);
> + sd_cond_broadcast(&lock->wait_wakeup);
> res = 0;
> break;
> }
> @@ -351,7 +352,7 @@ static struct cluster_lock *lock_table_lookup_acquire(uint64_t lock_id)
> if (rc)
> panic("Failed to init node %s", path);
>
> - sd_init_mutex(&ret_lock->wait_wakeup);
> + sd_cond_init(&ret_lock->wait_wakeup);
> sd_init_mutex(&ret_lock->id_lock);
>
> hlist_add_head(&(ret_lock->hnode), cluster_locks_table + hval);
> @@ -394,7 +395,8 @@ static void lock_table_lookup_release(uint64_t lock_id)
> hlist_del(iter);
> /* free all resource used by this lock */
> sd_destroy_mutex(&lock->id_lock);
> - sd_destroy_mutex(&lock->wait_wakeup);
> + sd_destroy_mutex(&lock->wait_mutex);
> + sd_destroy_cond(&lock->wait_wakeup);
> snprintf(path, MAX_NODE_STR_LEN, LOCK_ZNODE "/%"PRIu64,
> lock->id);
> /*
> @@ -1229,7 +1231,7 @@ kick_block_event:
> * This operation will create a seq-ephemeral znode in lock directory
> * of zookeeper (use lock-id as dir name). The smallest file path in
> * this directory wil be the owner of the lock; the other threads will
> - * wait on a struct sd_mutex (cluster_lock->wait_wakeup)
> + * wait on a struct sd_cond (cluster_lock->wait_wakeup)
> */
> static void zk_lock(uint64_t lock_id)
> {
> @@ -1259,7 +1261,7 @@ static void zk_lock(uint64_t lock_id)
> sd_debug("create path %s success", my_path);
>
> /* create node ok now */
> - snprintf(parent, MAX_NODE_STR_LEN, LOCK_ZNODE "/%"PRIu64"",
> + snprintf(parent, MAX_NODE_STR_LEN, LOCK_ZNODE "/%"PRIu64,
> cluster_lock->id);
> while (true) {
> zk_get_least_seq(parent, lowest_seq_path, MAX_NODE_STR_LEN,
> @@ -1275,7 +1277,9 @@ static void zk_lock(uint64_t lock_id)
> rc = zoo_exists(zhandle, lowest_seq_path, 1, NULL);
> if (rc == ZOK) {
> sd_debug("call zoo_exits success %s", lowest_seq_path);
> - sd_mutex_lock(&cluster_lock->wait_wakeup);
> + sd_cond_wait_timeout(&cluster_lock->wait_wakeup,
> + &cluster_lock->wait_mutex,
> + WAIT_TIME);
This looks overly complicated. So this mutex or condition is used to wake up
other lock contender(s) in the same node to save zookeeper traffic. But with
the current sd_cond, we can't save any traffic now because we time out and retry
zk_get_least_seq. So why not simply remove this kind of mutex completely, like:
while (true) {
if get_lock == success
return;
zk_wait();
}
Thanks
Yuan
More information about the sheepdog
mailing list