Currently, we use a dirty_rb tree to record which cache object have been updated. Two kinds of threads will operate this dirty tree concurrently: a. mulitple io-workers: write something to cache objects b. one flush-worker: flush all updates to cluster In the following scenario, update will be lost: flush-worker one io-worker [object_cache_push] [object_cache_rw] |-(1) get a cache object from dirty tree | |-(2) read the data file of this object | | modify data file of this object (3)-| |-(4) forward_write_obj_req() | | add this object to dirty tree (5)-| |-(6) rb_erase: remove this object from dirty tree | Note: io-worker generate *new update* in step (3), but flush-worker remove this cache object from dirty tree in step (6). I use two dirty trees to fix this bug and avoid heavy lock between flush-worker and io-wroker threads. There is only one *active* dirty tree for io-workers in any time. After io-worker modify something, it operate this active dirty tree. When flush-worker want to flush all updates to cluster, it: 1. mark another tree as *active* dirty tree. 2. get update info from *inactive* dirty tree. 3. if something wrong occur in flushing process, merge two dirty trees into active drity tree. Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com> --- sheep/object_cache.c | 115 ++++++++++++++++++++++++++++++++++++++++---------- sheep/sheep_priv.h | 9 +++- 2 files changed, 100 insertions(+), 24 deletions(-) diff --git a/sheep/object_cache.c b/sheep/object_cache.c index df9ab49..79c2bc8 100644 --- a/sheep/object_cache.c +++ b/sheep/object_cache.c @@ -127,9 +127,16 @@ not_found: cache = xzalloc(sizeof(*cache)); cache->vid = vid; create_dir_for(vid); - cache->dirty_rb = RB_ROOT; + + cache->dirty_trees[0] = RB_ROOT; + cache->dirty_trees[1] = RB_ROOT; + cache->active_dirty_tree = &cache->dirty_trees[0]; + + INIT_LIST_HEAD(&cache->dirty_lists[0]); + INIT_LIST_HEAD(&cache->dirty_lists[1]); + cache->active_dirty_list = &cache->dirty_lists[0]; + pthread_mutex_init(&cache->lock, NULL); - INIT_LIST_HEAD(&cache->dirty_list); hlist_add_head(&cache->hash, head); } else cache = NULL; @@ -138,18 +145,64 @@ out: return cache; } -static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx, int create) +static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx, + struct object_cache_entry *entry, int create) +{ + if (!entry) { + entry = xzalloc(sizeof(*entry)); + entry->idx = idx; + entry->create = create; + } + if (!dirty_tree_insert(oc->active_dirty_tree, entry)) + list_add(&entry->list, oc->active_dirty_list); + else + free(entry); +} + +static inline void del_from_dirty_tree_and_list( + struct object_cache_entry *entry, + struct rb_root *inactive_dirty_tree, + struct list_head *inactive_dirty_list) { - struct object_cache_entry *entry = xzalloc(sizeof(*entry)); + rb_erase(&entry->rb, inactive_dirty_tree); + list_del(&entry->list); +} - entry->idx = idx; +static void switch_dirty_tree_and_list(struct object_cache *oc, + struct rb_root ** inactive_dirty_tree, + struct list_head **inactive_dirty_list) +{ pthread_mutex_lock(&oc->lock); - if (!dirty_tree_insert(&oc->dirty_rb, entry)) { - if (create) - entry->create = 1; - list_add(&entry->list, &oc->dirty_list); - } else - free(entry); + + *inactive_dirty_list = oc->active_dirty_list; + *inactive_dirty_tree = oc->active_dirty_tree; + + if (oc->active_dirty_tree == &oc->dirty_trees[0]) { + oc->active_dirty_list = &oc->dirty_lists[1]; + oc->active_dirty_tree = &oc->dirty_trees[1]; + } else { + oc->active_dirty_list = &oc->dirty_lists[0]; + oc->active_dirty_tree = &oc->dirty_trees[0]; + } + + pthread_mutex_unlock(&oc->lock); +} + +static void merge_dirty_tree_and_list(struct object_cache *oc, + struct rb_root *inactive_dirty_tree, + struct list_head *inactive_dirty_list) +{ + struct object_cache_entry *entry, *t; + + pthread_mutex_lock(&oc->lock); + + list_for_each_entry_safe(entry, t, inactive_dirty_list, list) { + del_from_dirty_tree_and_list(entry, + inactive_dirty_tree, + inactive_dirty_list); + add_to_dirty_tree_and_list(oc, entry->idx, entry, 0); + } + pthread_mutex_unlock(&oc->lock); } @@ -181,8 +234,11 @@ int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create) ret = prealloc(fd, data_length); if (ret != SD_RES_SUCCESS) ret = -1; - else - add_to_dirty_tree_and_list(oc, idx, 1); + else { + pthread_mutex_lock(&oc->lock); + add_to_dirty_tree_and_list(oc, idx, NULL, 1); + pthread_mutex_unlock(&oc->lock); + } } close(fd); out: @@ -268,7 +324,9 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req) ret = write_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset); if (ret != SD_RES_SUCCESS) goto out; - add_to_dirty_tree_and_list(oc, idx, 0); + pthread_mutex_lock(&oc->lock); + add_to_dirty_tree_and_list(oc, idx, NULL, 0); + pthread_mutex_unlock(&oc->lock); } else { ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset); if (ret != SD_RES_SUCCESS) @@ -464,23 +522,36 @@ out: int object_cache_push(struct object_cache *oc) { struct object_cache_entry *entry, *t; + struct rb_root *inactive_dirty_tree; + struct list_head *inactive_dirty_list; int ret = SD_RES_SUCCESS; if (node_in_recovery()) /* We don't do flushing in recovery */ return SD_RES_SUCCESS; - list_for_each_entry_safe(entry, t, &oc->dirty_list, list) { + switch_dirty_tree_and_list(oc, + &inactive_dirty_tree, + &inactive_dirty_list); + + /* 1. for async flush, there is only one worker + * 2. for sync flush, Guest assure us of that only one sync + * request is issued in one of gateway worker threads + * So we need not to protect inactive dirty tree and list */ + list_for_each_entry_safe(entry, t, inactive_dirty_list, list) { ret = push_cache_object(oc->vid, entry->idx, entry->create); if (ret != SD_RES_SUCCESS) - goto out; - pthread_mutex_lock(&oc->lock); - rb_erase(&entry->rb, &oc->dirty_rb); - list_del(&entry->list); - pthread_mutex_unlock(&oc->lock); + goto push_failed; + del_from_dirty_tree_and_list(entry, + inactive_dirty_tree, + inactive_dirty_list); free(entry); } -out: + return ret; +push_failed: + merge_dirty_tree_and_list(oc, + inactive_dirty_tree, + inactive_dirty_list); return ret; } @@ -518,7 +589,7 @@ void object_cache_delete(uint32_t vid) hlist_del(&cache->hash); pthread_mutex_unlock(&hashtable_lock[h]); - list_for_each_entry_safe(entry, t, &cache->dirty_list, list) { + list_for_each_entry_safe(entry, t, cache->active_dirty_list, list) { free(entry); } free(cache); diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h index a9e8440..8e982cf 100644 --- a/sheep/sheep_priv.h +++ b/sheep/sheep_priv.h @@ -410,9 +410,14 @@ static inline int sys_can_halt(void) struct object_cache { uint32_t vid; - struct list_head dirty_list; struct hlist_node hash; - struct rb_root dirty_rb; + + struct list_head dirty_lists[2]; + struct list_head *active_dirty_list; + + struct rb_root dirty_trees[2]; + struct rb_root *active_dirty_tree; + pthread_mutex_t lock; }; -- 1.7.7.6 |