[Sheepdog] [PATCH v4] object cache: incorrect lock may lead to update lost

Liu Yuan namei.unix at gmail.com
Mon Apr 16 09:21:51 CEST 2012


On 04/16/2012 11:41 AM, Yunkai Zhang wrote:

> Currently, we use a dirty_rb tree to record which cache objects
> have been updated.
> 
> Two kinds of threads will operate this dirty tree concurrently:
>  a. multiple io-workers: write something to cache objects
>  b. one flush-worker: flush all updates to cluster
> 
> In the following scenario, update will be lost:
> 
> flush-worker                                  one io-worker
> [object_cache_push]                         [object_cache_rw]
>  |-(1) get a cache object from dirty tree                 |
>  |-(2) read the data file of this object                  |
>  |                    modify data file of this object (3)-|
>  |-(4) forward_write_obj_req()                            |
>  |                      add this object to dirty tree (5)-|
>  |-(6) rb_erase: remove this object from dirty tree       |
> 
> Note: the io-worker generates a *new update* in step (3), but the flush-worker
> removes this cache object from the dirty tree in step (6).
> 
> I use two dirty trees to fix this bug and avoid heavy locking between
> the flush-worker and io-worker threads.
> 
> There is only one *active* dirty tree for io-workers at any time.
> After an io-worker modifies something, it operates on this active dirty tree.
> 
> When the flush-worker wants to flush all updates to the cluster, it:
>  1. mark another tree as *active* dirty tree.
>  2. get update info from *inactive* dirty tree.
>  3. if something wrong occurs in the flushing process,
>     merge the two dirty trees into the active dirty tree.
> 
> Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
> ---
>  sheep/object_cache.c |  110 ++++++++++++++++++++++++++++++++++++++++----------
>  sheep/sheep_priv.h   |    9 +++-
>  2 files changed, 95 insertions(+), 24 deletions(-)
> 
> diff --git a/sheep/object_cache.c b/sheep/object_cache.c
> index df9ab49..d75f532 100644
> --- a/sheep/object_cache.c
> +++ b/sheep/object_cache.c
> @@ -127,9 +127,16 @@ not_found:
>  		cache = xzalloc(sizeof(*cache));
>  		cache->vid = vid;
>  		create_dir_for(vid);
> -		cache->dirty_rb = RB_ROOT;
> +
> +		cache->dirty_trees[0] = RB_ROOT;
> +		cache->dirty_trees[1] = RB_ROOT;
> +		cache->active_dirty_tree = &cache->dirty_trees[0];
> +
> +		INIT_LIST_HEAD(&cache->dirty_lists[0]);
> +		INIT_LIST_HEAD(&cache->dirty_lists[1]);
> +		cache->active_dirty_list = &cache->dirty_lists[0];
> +
>  		pthread_mutex_init(&cache->lock, NULL);
> -		INIT_LIST_HEAD(&cache->dirty_list);
>  		hlist_add_head(&cache->hash, head);
>  	} else
>  		cache = NULL;
> @@ -138,18 +145,61 @@ out:
>  	return cache;
>  }
>  
> -static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx, int create)
> +static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
> +		struct object_cache_entry *entry, int create)
> +{
> +	if (!entry) {
> +		entry = xzalloc(sizeof(*entry));
> +		entry->idx = idx;
> +		entry->create = create;
> +	}
> +	if (!dirty_tree_insert(oc->active_dirty_tree, entry))
> +		list_add(&entry->list, oc->active_dirty_list);
> +	else
> +		free(entry);
> +}
> +
> +static inline void del_from_dirty_tree_and_list(
> +		struct object_cache_entry *entry,
> +		struct rb_root *dirty_tree)
>  {
> -	struct object_cache_entry *entry = xzalloc(sizeof(*entry));
> +	rb_erase(&entry->rb, dirty_tree);
> +	list_del(&entry->list);
> +}
>  
> -	entry->idx = idx;
> +static void switch_dirty_tree_and_list(struct object_cache *oc,
> +		struct rb_root ** inactive_dirty_tree,
> +		struct list_head **inactive_dirty_list)
> +{
>  	pthread_mutex_lock(&oc->lock);
> -	if (!dirty_tree_insert(&oc->dirty_rb, entry)) {
> -		if (create)
> -			entry->create = 1;
> -		list_add(&entry->list, &oc->dirty_list);
> -	} else
> -		free(entry);
> +
> +	*inactive_dirty_list = oc->active_dirty_list;
> +	*inactive_dirty_tree = oc->active_dirty_tree;
> +
> +	if (oc->active_dirty_tree == &oc->dirty_trees[0]) {
> +		oc->active_dirty_list = &oc->dirty_lists[1];
> +		oc->active_dirty_tree = &oc->dirty_trees[1];
> +	} else {
> +		oc->active_dirty_list = &oc->dirty_lists[0];
> +		oc->active_dirty_tree = &oc->dirty_trees[0];
> +	}
> +
> +	pthread_mutex_unlock(&oc->lock);
> +}
> +
> +static void merge_dirty_tree_and_list(struct object_cache *oc,
> +		struct rb_root *inactive_dirty_tree,
> +		struct list_head *inactive_dirty_list)
> +{
> +	struct object_cache_entry *entry, *t;
> +
> +	pthread_mutex_lock(&oc->lock);
> +
> +	list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
> +		del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
> +		add_to_dirty_tree_and_list(oc, entry->idx, entry, 0);
> +	}
> +
>  	pthread_mutex_unlock(&oc->lock);
>  }
>  
> @@ -181,8 +231,11 @@ int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
>  		ret = prealloc(fd, data_length);
>  		if (ret != SD_RES_SUCCESS)
>  			ret = -1;
> -		else
> -			add_to_dirty_tree_and_list(oc, idx, 1);
> +		else {
> +			pthread_mutex_lock(&oc->lock);
> +			add_to_dirty_tree_and_list(oc, idx, NULL, 1);
> +			pthread_mutex_unlock(&oc->lock);
> +		}
>  	}
>  	close(fd);
>  out:
> @@ -268,7 +321,9 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
>  		ret = write_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
>  		if (ret != SD_RES_SUCCESS)
>  			goto out;
> -		add_to_dirty_tree_and_list(oc, idx, 0);
> +		pthread_mutex_lock(&oc->lock);
> +		add_to_dirty_tree_and_list(oc, idx, NULL, 0);
> +		pthread_mutex_unlock(&oc->lock);
>  	} else {
>  		ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
>  		if (ret != SD_RES_SUCCESS)
> @@ -464,23 +519,34 @@ out:
>  int object_cache_push(struct object_cache *oc)
>  {
>  	struct object_cache_entry *entry, *t;
> +	struct rb_root *inactive_dirty_tree;
> +	struct list_head *inactive_dirty_list;
>  	int ret = SD_RES_SUCCESS;
>  
>  	if (node_in_recovery())
>  		/* We don't do flushing in recovery */
>  		return SD_RES_SUCCESS;
>  
> -	list_for_each_entry_safe(entry, t, &oc->dirty_list, list) {
> +	switch_dirty_tree_and_list(oc,
> +			&inactive_dirty_tree,
> +			&inactive_dirty_list);
> +
> +	/* 1. for async flush, there is only one worker
> +	 * 2. for sync flush, Guest assure us of that only one sync
> +	 * request is issued in one of gateway worker threads
> +	 * So we need not to protect inactive dirty tree and list */
> +	list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
>  		ret = push_cache_object(oc->vid, entry->idx, entry->create);
>  		if (ret != SD_RES_SUCCESS)
> -			goto out;
> -		pthread_mutex_lock(&oc->lock);
> -		rb_erase(&entry->rb, &oc->dirty_rb);
> -		list_del(&entry->list);
> -		pthread_mutex_unlock(&oc->lock);
> +			goto push_failed;
> +		del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
>  		free(entry);
>  	}
> -out:
> +	return ret;
> +push_failed:
> +	merge_dirty_tree_and_list(oc,
> +			inactive_dirty_tree,
> +			inactive_dirty_list);
>  	return ret;
>  }
>  
> @@ -518,7 +584,7 @@ void object_cache_delete(uint32_t vid)
>  		hlist_del(&cache->hash);
>  		pthread_mutex_unlock(&hashtable_lock[h]);
>  
> -		list_for_each_entry_safe(entry, t, &cache->dirty_list, list) {
> +		list_for_each_entry_safe(entry, t, cache->active_dirty_list, list) {
>  			free(entry);
>  		}
>  		free(cache);
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index a9e8440..8e982cf 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -410,9 +410,14 @@ static inline int sys_can_halt(void)
>  
>  struct object_cache {
>  	uint32_t vid;
> -	struct list_head dirty_list;
>  	struct hlist_node hash;
> -	struct rb_root dirty_rb;
> +
> +	struct list_head dirty_lists[2];
> +	struct list_head *active_dirty_list;
> +
> +	struct rb_root dirty_trees[2];
> +	struct rb_root *active_dirty_tree;
> +
>  	pthread_mutex_t lock;
>  };
>  


Applied, thanks.

Yuan



More information about the sheepdog mailing list