[sheepdog] [PATCH v3] cluster: add the distributed lock implemented by zookeeper for object-storage

Liu Yuan namei.unix at gmail.com
Fri Nov 29 12:36:45 CET 2013


On Fri, Nov 29, 2013 at 06:35:11PM +0800, Robin Dong wrote:

The patch title needs updating, e.g. to "sheep/cluster: introduce distributed lock".

> Implement the distributed lock with zookeeper (reference: http://zookeeper.apache.org/doc/trunk/recipes.html)
> The routine is:
>         1. create a seq-ephemeral znode in the lock directory (using the lock-id as the directory name)
>         2. the znode with the smallest sequence number becomes the owner of the lock; the other threads wait on a pthread_mutex_t
>         3. when the owner releases the lock (or the owner is killed by accident), zookeeper will
>            trigger zk_watch(), which wakes up all waiting threads to compete for new ownership of the lock
> 
> We add ->local_lock to dist_mutex to keep many threads in one sheepdog daemon from creating too many files
> in a lock directory.

What kind of scenario exactly does this handle? It would be great if you could
show us a use case or a clear problem that ->local_lock solves. Unless there is
a definite use case, I think we should avoid premature optimization.
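
Also, just to make sure we read steps 1-3 of the commit message the same way,
here is that recipe sketched against the raw ZooKeeper C API. The patch goes
through sheep's own zk_* wrappers and the global zk_watcher(), so every name
below is illustrative only and error handling is omitted:

#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <zookeeper/zookeeper.h>

static void lock_recipe(zhandle_t *zh, const char *lock_dir,
			pthread_mutex_t *wait,
			char *my_path, int my_path_len)
{
	char seq[256], owner[256];
	struct String_vector kids;

	/* 1. create a sequence + ephemeral znode under the lock directory */
	snprintf(seq, sizeof(seq), "%s/lock-", lock_dir);
	zoo_create(zh, seq, "", 0, &ZOO_OPEN_ACL_UNSAFE,
		   ZOO_SEQUENCE | ZOO_EPHEMERAL, my_path, my_path_len);

	for (;;) {
		/*
		 * 2. the child with the smallest sequence number owns the
		 * lock (sequence numbers are zero-padded, so plain strcmp
		 * orders them correctly).
		 */
		zoo_get_children(zh, lock_dir, 0, &kids);
		snprintf(owner, sizeof(owner), "%s/%s", lock_dir, kids.data[0]);
		for (int i = 1; i < kids.count; i++) {
			char p[256];

			snprintf(p, sizeof(p), "%s/%s", lock_dir, kids.data[i]);
			if (strcmp(p, owner) < 0)
				strcpy(owner, p);
		}
		deallocate_String_vector(&kids);

		if (strcmp(owner, my_path) == 0)
			return;	/* our znode is the smallest: we own the lock */

		/*
		 * 3. otherwise arm a watch on the owner's znode and sleep on
		 * the mutex; the watcher (zk_watcher() in the patch) unlocks
		 * it on ZOO_DELETED_EVENT and we loop to compete again.
		 */
		if (zoo_exists(zh, owner, 1, NULL) == ZOK)
			pthread_mutex_lock(wait);
	}
}

If I read the patch right, ->local_lock only serializes the threads inside one
daemon so that a single ephemeral znode per daemon exists at a time, hence the
question above about whether that actually matters in practice.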

> 
> v2 --> v3:
> 	1. change cluster interface to init_lock()/lock()/unlock()
> 	2. change 'struct zk_mutex' to 'struct cluster_lock'
> 	3. add empty implementation to local/corosync
> 
> v1 --> v2:
> 	move code from sheep/http/lock.c into sheep/cluster/zookeeper.c using cluster framework
> 
> Signed-off-by: Robin Dong <sanbai at taobao.com>
> ---
>  sheep/cluster.h           |   25 ++++++++
>  sheep/cluster/corosync.c  |   16 +++++
>  sheep/cluster/local.c     |   16 +++++
>  sheep/cluster/zookeeper.c |  140 ++++++++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 196 insertions(+), 1 deletions(-)
> 
> diff --git a/sheep/cluster.h b/sheep/cluster.h
> index 81b5ae4..260573c 100644
> --- a/sheep/cluster.h
> +++ b/sheep/cluster.h
> @@ -23,6 +23,13 @@
>  #include "sheep.h"
>  #include "config.h"
>  
> +struct cluster_lock {
> +	uint32_t id;		/* id of this mutex */
> +	pthread_mutex_t wait;
> +	pthread_mutex_t local_lock;
> +	char ephemeral_path[MAX_NODE_STR_LEN];
> +};

I suppose cluster_lock will be used throughout the sheep daemon, so this
should be declared in include/sheep.h

>  /*
>   * maximum payload size sent in ->notify and ->unblock, it should be large
>   * enough to support COROSYNC_MAX_NODES * struct sd_node
> @@ -109,6 +116,24 @@ struct cluster_driver {
>  	int (*unblock)(void *msg, size_t msg_len);
>  
>  	/*
> +	 * Init a distributed mutually exclusive lock to avoid race condition
> +	 * when using swift interface to add/delete/list object.
> +	 *
> +	 * Returns SD_RES_XXX
> +	 */

The 'swift' reference should be removed from the comment. And I think you'd better
describe 'id' in the comment too.
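
Something along these lines, maybe (wording is only a suggestion):

	/*
	 * Initialize a distributed, mutually exclusive lock.
	 *
	 * 'id' names the lock cluster-wide: callers on any node (or in any
	 * thread) that pass the same id to init_lock() contend for the same
	 * lock.
	 *
	 * Returns SD_RES_XXX
	 */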

>
> +	int (*init_lock)(struct cluster_lock *lock, uint32_t id);

The uint32_t id is supposed to be a per-vdi lock, right? I am wondering if it can
handle the following case:

(AG) Allocation Group

 vdi: AG1, AG2, AG3 ...

Every AG needs its own lock to allow concurrent allocation and deallocation in the
vdi, so your lock can't handle that, right?

What about extending it to uint64_t? Then we can use 'vdi + lock_id' to solve the
above case.
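
Something like this is what I have in mind (names are made up, only to illustrate
the 'vdi + lock_id' idea):

	int (*init_lock)(struct cluster_lock *lock, uint64_t lock_id);

	/* e.g. vdi id in the high 32 bits, allocation group index in the low 32 */
	static inline uint64_t vdi_ag_lock_id(uint32_t vid, uint32_t ag_idx)
	{
		return ((uint64_t)vid << 32) | ag_idx;
	}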
 

> +	/*
> +	 * Get the distributed lock.

Better as "Acquire the distributed lock" and we need more comment on a bit how
it works, I'd suggest man pthread_mutex_lock as a reference.
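
Roughly in the spirit of that man page, something like (again, only a suggestion):

	/*
	 * Acquire the distributed lock.
	 *
	 * Like pthread_mutex_lock(), this blocks the calling thread until the
	 * lock becomes available: if another node (or another thread in this
	 * daemon) currently owns the lock, the caller sleeps and is woken up
	 * when the owner releases it or dies, then competes for ownership
	 * again.  There is no error return and no timeout; the call returns
	 * only once the caller owns the lock.
	 */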

> +	 *
> +	 * It will never return if it can't acquire the lock.
> +	 */
>
> +	void (*lock)(struct cluster_lock *lock);
> +
> +	/* Release the distributed lock. */

Ditto

> +	void (*unlock)(struct cluster_lock *lock);
> +
> +	/*
>  	 * Update the specific node in the driver's private copy of nodes
>  	 *
>  	 * Returns SD_RES_XXX
> diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
> index ea4421b..35c95d3 100644
> --- a/sheep/cluster/corosync.c
> +++ b/sheep/cluster/corosync.c
> @@ -774,6 +774,19 @@ again:
>  	return 0;
>  }
>  
> +static int corosync_init_lock(struct cluster_lock *dlock, uint32_t id)
> +{
> +	return -1;
> +}
> +
> +static void corosync_lock(struct cluster_lock *dlock)
> +{
> +}
> +
> +static void corosync_unlock(struct cluster_lock *dlock)
> +{
> +}
> +
>  static int corosync_update_node(struct sd_node *node)
>  {
>  	struct cpg_node cnode = this_node;
> @@ -794,6 +807,9 @@ static struct cluster_driver cdrv_corosync = {
>  	.notify		= corosync_notify,
>  	.block		= corosync_block,
>  	.unblock	= corosync_unblock,
> +	.init_lock	= corosync_init_lock,
> +	.lock		= corosync_lock,
> +	.unlock		= corosync_unlock,
>  	.update_node	= corosync_update_node,
>  };
>  
> diff --git a/sheep/cluster/local.c b/sheep/cluster/local.c
> index b8cbb5c..d71cd20 100644
> --- a/sheep/cluster/local.c
> +++ b/sheep/cluster/local.c
> @@ -547,6 +547,19 @@ static int local_init(const char *option)
>  	return 0;
>  }
>  
> +static int local_init_lock(struct cluster_lock *dlock, uint32_t id)
> +{
> +	return -1;
> +}
> +
> +static void local_lock(struct cluster_lock *dlock)
> +{
> +}
> +
> +static void local_unlock(struct cluster_lock *dlock)
> +{
> +}
> +
>  static int local_update_node(struct sd_node *node)
>  {
>  	struct local_node lnode = this_node;
> @@ -566,6 +579,9 @@ static struct cluster_driver cdrv_local = {
>  	.notify		= local_notify,
>  	.block		= local_block,
>  	.unblock	= local_unblock,
> +	.init_lock	= local_init_lock,
> +	.lock		= local_lock,
> +	.unlock		= local_unlock,
>  	.update_node    = local_update_node,
>  };
>  
> diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
> index fa89c46..fc6c207 100644
> --- a/sheep/cluster/zookeeper.c
> +++ b/sheep/cluster/zookeeper.c
> @@ -30,6 +30,21 @@
>  #define QUEUE_ZNODE BASE_ZNODE "/queue"
>  #define MEMBER_ZNODE BASE_ZNODE "/member"
>  #define MASTER_ZNONE BASE_ZNODE "/master"
> +#define LOCK_ZNODE BASE_ZNODE "/lock"
> +
> +#define MAX_MUTEX_NR	4096
> +#define WAIT_TIME	1		/* second */
> +
> +static struct cluster_lock **dlock_array;
> +
> +/*
> + * Wait a while when create, delete or get_children fail on
> + * zookeeper lock so it will not print too much loop log
> + */
> +static void zk_wait(void)
> +{
> +	sleep(WAIT_TIME);
> +}
>  
>  /* iterate child znodes */
>  #define FOR_EACH_ZNODE(parent, path, strs)			       \
> @@ -506,6 +521,7 @@ static void zk_watcher(zhandle_t *zh, int type, int state, const char *path,
>  {
>  	struct zk_node znode;
>  	char str[MAX_NODE_STR_LEN], *p;
> +	uint32_t lock_id;
>  	int ret;
>  
>  	if (type == ZOO_SESSION_EVENT && state == ZOO_EXPIRED_SESSION_STATE) {
> @@ -528,6 +544,14 @@ static void zk_watcher(zhandle_t *zh, int type, int state, const char *path,
>  	} else if (type == ZOO_DELETED_EVENT) {
>  		struct zk_node *n;
>  
> +		/* process distributed lock */
> +		ret = sscanf(path, LOCK_ZNODE "/%u/%s", &lock_id, str);
> +		if (ret == 2 && lock_id < MAX_MUTEX_NR &&
> +		    dlock_array && dlock_array[lock_id]) {
> +			pthread_mutex_unlock(&(dlock_array[lock_id]->wait));
> +			sd_debug("release lock %u %s", lock_id, str);
> +		}
> +
>  		ret = sscanf(path, MASTER_ZNONE "/%s", str);
>  		if (ret == 1) {
>  			zk_compete_master();
> @@ -1058,6 +1082,108 @@ kick_block_event:
>  	kick_block_event();
>  }
>  
> +static int zk_init_lock(struct cluster_lock *dlock, uint32_t id)
> +{
> +	int rc;
> +	char path[MAX_NODE_STR_LEN];
> +
> +	if (id > MAX_MUTEX_NR) {
> +		sd_err("lock-id is too large!");
> +		rc = -1;
> +		goto err;
> +	}
> +
> +	dlock_array[id] = dlock;
> +	dlock->id = id;
> +	snprintf(path, MAX_NODE_STR_LEN, LOCK_ZNODE "/%u", dlock->id);
> +	rc = zk_init_node(path);
> +	if (rc)
> +		goto err;
> +
> +	rc = pthread_mutex_init(&dlock->wait, NULL);
> +	if (rc) {
> +		sd_err("failed to init dlock->wait");
> +		goto err;
> +	}
> +
> +	rc = pthread_mutex_init(&dlock->local_lock, NULL);
> +	if (rc) {
> +		sd_err("failed to init dlock->local_lock");
> +		goto err;
> +	}
> +
> +	return 0;
> +err:
> +	dlock_array[id] = NULL;
> +	return rc;
> +}
> +
> +static void zk_lock(struct cluster_lock *dlock)
> +{
> +	int flags = ZOO_SEQUENCE | ZOO_EPHEMERAL;
> +	int rc, len = MAX_NODE_STR_LEN;
> +	char *my_path;
> +	char parent[MAX_NODE_STR_LEN];
> +	char lowest_seq_path[MAX_NODE_STR_LEN];
> +	char owner_name[MAX_NODE_STR_LEN];
> +
> +	/*
> +	 * if many threads use locks with same id, we should use
> +	 * ->local_lock to avoid the only zookeeper handler to
> +	 * create many seq-ephemeral files.
> +	 */
> +	pthread_mutex_lock(&dlock->local_lock);
> +
> +	my_path = dlock->ephemeral_path;
> +
> +	/* compete owner of lock is just like zk_compete_master() */

Though this is not relevant to this patch, I am wondering if zk_compete_master()
could make use of your proposed lock?

Thanks
Yuan


