[Sheepdog] [PATCH] object cache: incorrect lock may lead to update lost

Sun Apr 15 12:25:41 CEST 2012

On 04/14/2012 09:35 PM, Yunkai Zhang wrote:

> Currently, we use a dirty_rb tree to record which cache object
> have been updated.
> 
> Two kinds of threads will operate this dirty tree concurrently:
>  a. mulitple io-workers: write something to cache objects
>  b. one flush-worker: flush all updates to cluster
> 
> In the following scenario, update will be lost:
> 
> flush-worker                                  one io-worker
> [object_cache_push]                         [object_cache_rw]
>  |-(1) get a cache object from dirty tree                 |
>  |-(2) read the data file of this object                  |
>  |                    modify data file of this object (3)-|
>  |-(4) forward_write_obj_req()                            |
>  |                      add this object to dirty tree (5)-|
>  |-(6) rb_erase: remove this object from dirty tree       |
> 
> Note: io-worker generate *new update* in step (3), but flush-worker
> remove this cache object from dirty tree in step (6).
> 
> I use two dirty trees to fix this bug and avoid heavy lock between
> flush-worker and io-wroker threads.
> 
> There is only one *active* dirty tree for io-workers in any time.
> After io-worker modify something, it operate this active dirty tree.
> 
> When flush-worker want to flush all updates to cluster, it:
>  1. mark another tree as *active* dirty tree.
>  2. get update info from *unactive* dirty tree.

I guess you mean *inactive* here.

>  3. if something wrong occur in flushing process,
>     merge two dirty trees into active drity tree.
> 

Hi Yunkai,
   Really nice catch and commit log. Comments bellow

> Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
> ---
>  sheep/object_cache.c |   73 +++++++++++++++++++++++++++++++++++++++++++-------
>  sheep/sheep_priv.h   |    9 +++++-
>  2 files changed, 70 insertions(+), 12 deletions(-)
> 
> diff --git a/sheep/object_cache.c b/sheep/object_cache.c
> index df9ab49..9756fb6 100644
> --- a/sheep/object_cache.c
> +++ b/sheep/object_cache.c
> @@ -127,9 +127,16 @@ not_found:
>  		cache = xzalloc(sizeof(*cache));
>  		cache->vid = vid;
>  		create_dir_for(vid);
> -		cache->dirty_rb = RB_ROOT;
> +
> +		cache->dirty_trees[0] = RB_ROOT;
> +		cache->dirty_trees[1] = RB_ROOT;
> +		cache->active_dirty_tree = &cache->dirty_trees[0];
> +
> +		INIT_LIST_HEAD(&cache->dirty_lists[0]);
> +		INIT_LIST_HEAD(&cache->dirty_lists[1]);
> +		cache->active_dirty_list = &cache->dirty_lists[0];
> +
>  		pthread_mutex_init(&cache->lock, NULL);
> -		INIT_LIST_HEAD(&cache->dirty_list);
>  		hlist_add_head(&cache->hash, head);
>  	} else
>  		cache = NULL;
> @@ -144,15 +151,50 @@ static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx, in
>  
>  	entry->idx = idx;
>  	pthread_mutex_lock(&oc->lock);
> -	if (!dirty_tree_insert(&oc->dirty_rb, entry)) {
> +	if (!dirty_tree_insert(oc->active_dirty_tree, entry)) {
>  		if (create)
>  			entry->create = 1;
> -		list_add(&entry->list, &oc->dirty_list);
> +		list_add(&entry->list, oc->active_dirty_list);
>  	} else
>  		free(entry);
>  	pthread_mutex_unlock(&oc->lock);
>  }
>  
> +static void active_dirty_tree_and_list_switch(struct object_cache *oc,
> +		struct rb_root ** prev_dirty_tree,
> +		struct list_head **prev_dirty_list)
> +{
> +	pthread_mutex_lock(&oc->lock);
> +
> +	*prev_dirty_list = oc->active_dirty_list;
> +	*prev_dirty_tree = oc->active_dirty_tree;
> +
> +	if (oc->active_dirty_tree == &oc->dirty_trees[0]) {
> +		oc->active_dirty_list = &oc->dirty_lists[1];
> +		oc->active_dirty_tree = &oc->dirty_trees[1];
> +	} else {
> +		oc->active_dirty_list = &oc->dirty_lists[0];
> +		oc->active_dirty_tree = &oc->dirty_trees[0];
> +	}
> +
> +	pthread_mutex_unlock(&oc->lock);
> +}
> +

For internal functions, we don't need to prefix a namespace for function
name, so I think we'd better simply place verb in the head for easy
reading, such as, switch_dirty_tree_and_list().

Further more, it would be better to use 'inactive_dirty_tree',
'inactive_dirty_list' to denote what is returned by the function.

> +static void active_dirty_tree_merge(struct object_cache *oc,
> +		struct rb_root *prev_dirty_tree,
> +		struct list_head *prev_dirty_list)
> +{
> +	struct object_cache_entry *entry, *t;
> +
> +	pthread_mutex_lock(&oc->lock);
> +	list_for_each_entry_safe(entry, t, prev_dirty_list, list) {
> +		dirty_tree_insert(oc->active_dirty_tree, entry);
> +		rb_erase(&entry->rb, prev_dirty_tree);
> +		list_del(&entry->list);
> +	}
> +	pthread_mutex_unlock(&oc->lock);
> +}
> +

Ditto, use merge_dirty_tree_and_list()

IIUC, there are some problems for this function:

1) you need to add entry to dirty list too.
2) check if entry is successfully added, if not, free(entry) to avoid
memory leak.
   - I'd suggest use wrapper function add_to_dirty_tree_and_list() to
assure correctness
3) use inactive_dirty_list instead of "prev_dirty_list"

>  int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
>  {
>  	struct strbuf buf;
> @@ -464,23 +506,34 @@ out:
>  int object_cache_push(struct object_cache *oc)
>  {
>  	struct object_cache_entry *entry, *t;
> +	struct rb_root *prev_dirty_tree;
> +	struct list_head *prev_dirty_list;
>  	int ret = SD_RES_SUCCESS;
>  
>  	if (node_in_recovery())
>  		/* We don't do flushing in recovery */
>  		return SD_RES_SUCCESS;
>  
> -	list_for_each_entry_safe(entry, t, &oc->dirty_list, list) {
> +	active_dirty_tree_and_list_switch(oc,
> +			&prev_dirty_tree,
> +			&prev_dirty_list);
> +
> +	/* Only one flush worker thread would operate prev_dirty_list
> +	 * and prev_dirty_tree, needn't to use lock to protect them */

There is not only async flush, we need to take *sync* flush into account
too.

So a more complete picture is:
1) for async flush, there is only one worker
2) for sync flush, Guest assure us of that only one sync request is
issued in one of gatework worker threads.

Above characteristics assure us of no lock for the function.

> +	list_for_each_entry_safe(entry, t, prev_dirty_list, list) {
>  		ret = push_cache_object(oc->vid, entry->idx, entry->create);
> -		if (ret != SD_RES_SUCCESS)
> +		if (ret != SD_RES_SUCCESS) {
> +			active_dirty_tree_merge(oc,
> +					prev_dirty_tree,
> +					prev_dirty_list);
>  			goto out;
> -		pthread_mutex_lock(&oc->lock);
> -		rb_erase(&entry->rb, &oc->dirty_rb);
> +		}
> +		rb_erase(&entry->rb, prev_dirty_tree);
>  		list_del(&entry->list);
> -		pthread_mutex_unlock(&oc->lock);
>  		free(entry);

This logic is hard to catch, how about

	list_for_each_entry_safe(entry, t, prev_dirty_list, list) {
		ret = push_cache_object(...);
		if (ret != SD_RES_SUCCESS)
			goto push_failed;
		del_from_dirty_tree_and_list(...)
	}
	return ret;
push_failed:
	merge_dirty_tree_and_list(...);
	return ret;

Another problem is that you should take account *sync* flush mode, in
which case, we don't need waste cpu cycles to switch and merge trees and
lists.

>  	}
>  out:
> +	assert(list_empty(prev_dirty_list));

>  	return ret;
>  }
>  
> @@ -518,7 +571,7 @@ void object_cache_delete(uint32_t vid)
>  		hlist_del(&cache->hash);
>  		pthread_mutex_unlock(&hashtable_lock[h]);
>  
> -		list_for_each_entry_safe(entry, t, &cache->dirty_list, list) {
> +		list_for_each_entry_safe(entry, t, cache->active_dirty_list, list) {
>  			free(entry);
>  		}
>  		free(cache);
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index a9e8440..8e982cf 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -410,9 +410,14 @@ static inline int sys_can_halt(void)
>  
>  struct object_cache {
>  	uint32_t vid;
> -	struct list_head dirty_list;
>  	struct hlist_node hash;
> -	struct rb_root dirty_rb;
> +
> +	struct list_head dirty_lists[2];
> +	struct list_head *active_dirty_list;
> +
> +	struct rb_root dirty_trees[2];
> +	struct rb_root *active_dirty_tree;
> +
>  	pthread_mutex_t lock;
>  };
>