[Sheepdog] [PATCH v3] object cache: incorrect lock may lead to update lost

Yunkai Zhang yunkai.me at gmail.com
Mon Apr 16 05:11:18 CEST 2012


Currently, we use a dirty_rb tree to record which cache object
have been updated.

Two kinds of threads will operate this dirty tree concurrently:
 a. mulitple io-workers: write something to cache objects
 b. one flush-worker: flush all updates to cluster

In the following scenario, update will be lost:

flush-worker                                  one io-worker
[object_cache_push]                         [object_cache_rw]
 |-(1) get a cache object from dirty tree                 |
 |-(2) read the data file of this object                  |
 |                    modify data file of this object (3)-|
 |-(4) forward_write_obj_req()                            |
 |                      add this object to dirty tree (5)-|
 |-(6) rb_erase: remove this object from dirty tree       |

Note: io-worker generate *new update* in step (3), but flush-worker
remove this cache object from dirty tree in step (6).

I use two dirty trees to fix this bug and avoid heavy lock between
flush-worker and io-wroker threads.

There is only one *active* dirty tree for io-workers in any time.
After io-worker modify something, it operate this active dirty tree.

When flush-worker want to flush all updates to cluster, it:
 1. mark another tree as *active* dirty tree.
 2. get update info from *inactive* dirty tree.
 3. if something wrong occur in flushing process,
    merge two dirty trees into active drity tree.

Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
---
 sheep/object_cache.c |  115 ++++++++++++++++++++++++++++++++++++++++----------
 sheep/sheep_priv.h   |    9 +++-
 2 files changed, 100 insertions(+), 24 deletions(-)

diff --git a/sheep/object_cache.c b/sheep/object_cache.c
index df9ab49..79c2bc8 100644
--- a/sheep/object_cache.c
+++ b/sheep/object_cache.c
@@ -127,9 +127,16 @@ not_found:
 		cache = xzalloc(sizeof(*cache));
 		cache->vid = vid;
 		create_dir_for(vid);
-		cache->dirty_rb = RB_ROOT;
+
+		cache->dirty_trees[0] = RB_ROOT;
+		cache->dirty_trees[1] = RB_ROOT;
+		cache->active_dirty_tree = &cache->dirty_trees[0];
+
+		INIT_LIST_HEAD(&cache->dirty_lists[0]);
+		INIT_LIST_HEAD(&cache->dirty_lists[1]);
+		cache->active_dirty_list = &cache->dirty_lists[0];
+
 		pthread_mutex_init(&cache->lock, NULL);
-		INIT_LIST_HEAD(&cache->dirty_list);
 		hlist_add_head(&cache->hash, head);
 	} else
 		cache = NULL;
@@ -138,18 +145,64 @@ out:
 	return cache;
 }
 
-static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx, int create)
+static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
+		struct object_cache_entry *entry, int create)
+{
+	if (!entry) {
+		entry = xzalloc(sizeof(*entry));
+		entry->idx = idx;
+		entry->create = create;
+	}
+	if (!dirty_tree_insert(oc->active_dirty_tree, entry))
+		list_add(&entry->list, oc->active_dirty_list);
+	else
+		free(entry);
+}
+
+static inline void del_from_dirty_tree_and_list(
+		struct object_cache_entry *entry,
+		struct rb_root *inactive_dirty_tree,
+		struct list_head *inactive_dirty_list)
 {
-	struct object_cache_entry *entry = xzalloc(sizeof(*entry));
+	rb_erase(&entry->rb, inactive_dirty_tree);
+	list_del(&entry->list);
+}
 
-	entry->idx = idx;
+static void switch_dirty_tree_and_list(struct object_cache *oc,
+		struct rb_root ** inactive_dirty_tree,
+		struct list_head **inactive_dirty_list)
+{
 	pthread_mutex_lock(&oc->lock);
-	if (!dirty_tree_insert(&oc->dirty_rb, entry)) {
-		if (create)
-			entry->create = 1;
-		list_add(&entry->list, &oc->dirty_list);
-	} else
-		free(entry);
+
+	*inactive_dirty_list = oc->active_dirty_list;
+	*inactive_dirty_tree = oc->active_dirty_tree;
+
+	if (oc->active_dirty_tree == &oc->dirty_trees[0]) {
+		oc->active_dirty_list = &oc->dirty_lists[1];
+		oc->active_dirty_tree = &oc->dirty_trees[1];
+	} else {
+		oc->active_dirty_list = &oc->dirty_lists[0];
+		oc->active_dirty_tree = &oc->dirty_trees[0];
+	}
+
+	pthread_mutex_unlock(&oc->lock);
+}
+
+static void merge_dirty_tree_and_list(struct object_cache *oc,
+		struct rb_root *inactive_dirty_tree,
+		struct list_head *inactive_dirty_list)
+{
+	struct object_cache_entry *entry, *t;
+
+	pthread_mutex_lock(&oc->lock);
+
+	list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
+		del_from_dirty_tree_and_list(entry,
+				inactive_dirty_tree,
+				inactive_dirty_list);
+		add_to_dirty_tree_and_list(oc, entry->idx, entry, 0);
+	}
+
 	pthread_mutex_unlock(&oc->lock);
 }
 
@@ -181,8 +234,11 @@ int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
 		ret = prealloc(fd, data_length);
 		if (ret != SD_RES_SUCCESS)
 			ret = -1;
-		else
-			add_to_dirty_tree_and_list(oc, idx, 1);
+		else {
+			pthread_mutex_lock(&oc->lock);
+			add_to_dirty_tree_and_list(oc, idx, NULL, 1);
+			pthread_mutex_unlock(&oc->lock);
+		}
 	}
 	close(fd);
 out:
@@ -268,7 +324,9 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
 		ret = write_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
 		if (ret != SD_RES_SUCCESS)
 			goto out;
-		add_to_dirty_tree_and_list(oc, idx, 0);
+		pthread_mutex_lock(&oc->lock);
+		add_to_dirty_tree_and_list(oc, idx, NULL, 0);
+		pthread_mutex_unlock(&oc->lock);
 	} else {
 		ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
 		if (ret != SD_RES_SUCCESS)
@@ -464,23 +522,36 @@ out:
 int object_cache_push(struct object_cache *oc)
 {
 	struct object_cache_entry *entry, *t;
+	struct rb_root *inactive_dirty_tree;
+	struct list_head *inactive_dirty_list;
 	int ret = SD_RES_SUCCESS;
 
 	if (node_in_recovery())
 		/* We don't do flushing in recovery */
 		return SD_RES_SUCCESS;
 
-	list_for_each_entry_safe(entry, t, &oc->dirty_list, list) {
+	switch_dirty_tree_and_list(oc,
+			&inactive_dirty_tree,
+			&inactive_dirty_list);
+
+	/* 1. for async flush, there is only one worker
+	 * 2. for sync flush, Guest assure us of that only one sync
+	 * request is issued in one of gateway worker threads
+	 * So we need not to protect inactive dirty tree and list */
+	list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
 		ret = push_cache_object(oc->vid, entry->idx, entry->create);
 		if (ret != SD_RES_SUCCESS)
-			goto out;
-		pthread_mutex_lock(&oc->lock);
-		rb_erase(&entry->rb, &oc->dirty_rb);
-		list_del(&entry->list);
-		pthread_mutex_unlock(&oc->lock);
+			goto push_failed;
+		del_from_dirty_tree_and_list(entry,
+				inactive_dirty_tree,
+				inactive_dirty_list);
 		free(entry);
 	}
-out:
+	return ret;
+push_failed:
+	merge_dirty_tree_and_list(oc,
+			inactive_dirty_tree,
+			inactive_dirty_list);
 	return ret;
 }
 
@@ -518,7 +589,7 @@ void object_cache_delete(uint32_t vid)
 		hlist_del(&cache->hash);
 		pthread_mutex_unlock(&hashtable_lock[h]);
 
-		list_for_each_entry_safe(entry, t, &cache->dirty_list, list) {
+		list_for_each_entry_safe(entry, t, cache->active_dirty_list, list) {
 			free(entry);
 		}
 		free(cache);
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index a9e8440..8e982cf 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -410,9 +410,14 @@ static inline int sys_can_halt(void)
 
 struct object_cache {
 	uint32_t vid;
-	struct list_head dirty_list;
 	struct hlist_node hash;
-	struct rb_root dirty_rb;
+
+	struct list_head dirty_lists[2];
+	struct list_head *active_dirty_list;
+
+	struct rb_root dirty_trees[2];
+	struct rb_root *active_dirty_tree;
+
 	pthread_mutex_t lock;
 };
 
-- 
1.7.7.6




More information about the sheepdog mailing list