On 04/16/2012 11:41 AM, Yunkai Zhang wrote:
> Currently, we use a dirty_rb tree to record which cache objects
> have been updated.
>
> Two kinds of threads operate on this dirty tree concurrently:
> a. multiple io-workers: write something to cache objects
> b. one flush-worker: flush all updates to the cluster
>
> In the following scenario, an update will be lost:
>
>  flush-worker                       one io-worker
> [object_cache_push]               [object_cache_rw]
> |-(1) get a cache object from dirty tree            |
> |-(2) read the data file of this object             |
> |               modify data file of this object (3)-|
> |-(4) forward_write_obj_req()                       |
> |                 add this object to dirty tree (5)-|
> |-(6) rb_erase: remove this object from dirty tree  |
>
> Note: the io-worker generates a *new update* in step (3), but the
> flush-worker removes this cache object from the dirty tree in step (6).
>
> I use two dirty trees to fix this bug and to avoid heavy locking
> between the flush-worker and io-worker threads.
>
> There is only one *active* dirty tree for io-workers at any time.
> After an io-worker modifies something, it operates on this active
> dirty tree.
>
> When the flush-worker wants to flush all updates to the cluster, it:
> 1. marks the other tree as the *active* dirty tree.
> 2. gets the update info from the *inactive* dirty tree.
> 3. if something goes wrong in the flushing process, merges the two
>    dirty trees into the active dirty tree.
>
> Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
> ---
>  sheep/object_cache.c |  110 ++++++++++++++++++++++++++++++++++++++++----------
>  sheep/sheep_priv.h   |    9 +++-
>  2 files changed, 95 insertions(+), 24 deletions(-)
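The switch-and-drain scheme is easiest to see in isolation. Below is a
minimal standalone sketch of the same double-buffering idea. It is not
sheep code: a plain singly linked list stands in for the rbtree/list_head
pair, and struct cache, mark_dirty() and flush() are made-up names.
Writers record updates in the *active* set under the mutex; the single
flusher swaps the two sets under the same mutex and then drains the
now-inactive set without holding it:

  /*
   * Standalone sketch (assumed names, not sheep code): double-buffered
   * dirty sets. io-workers call mark_dirty(); the one flush-worker
   * calls flush().
   */
  #include <pthread.h>
  #include <stdio.h>
  #include <stdlib.h>

  struct dirty_entry {
          unsigned idx;
          struct dirty_entry *next;
  };

  struct cache {
          struct dirty_entry *sets[2];    /* two dirty sets */
          int active;                     /* index of the active set */
          pthread_mutex_t lock;
  };

  /* io-worker path: record an update in the active set */
  static void mark_dirty(struct cache *c, unsigned idx)
  {
          struct dirty_entry *e = malloc(sizeof(*e));

          e->idx = idx;
          pthread_mutex_lock(&c->lock);
          e->next = c->sets[c->active];
          c->sets[c->active] = e;
          pthread_mutex_unlock(&c->lock);
  }

  /* flush-worker path: swap the sets, then drain the inactive one */
  static void flush(struct cache *c)
  {
          struct dirty_entry *inactive, *e;

          pthread_mutex_lock(&c->lock);
          inactive = c->sets[c->active];
          c->sets[c->active] = NULL;
          c->active ^= 1;         /* new updates go to the other set */
          pthread_mutex_unlock(&c->lock);

          /* only one flusher exists, so no lock is needed from here on */
          while ((e = inactive) != NULL) {
                  inactive = e->next;
                  printf("flushing object %u\n", e->idx);
                  free(e);
          }
  }

  int main(void)
  {
          struct cache c = { { NULL, NULL }, 0, PTHREAD_MUTEX_INITIALIZER };

          mark_dirty(&c, 1);
          mark_dirty(&c, 2);
          flush(&c);      /* later mark_dirty() calls land in the other set */
          return 0;
  }

Only the pointer swap happens under the lock, so io-workers are never
blocked for the whole duration of a flush; that is the point of the
two-tree design.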
> diff --git a/sheep/object_cache.c b/sheep/object_cache.c
> index df9ab49..d75f532 100644
> --- a/sheep/object_cache.c
> +++ b/sheep/object_cache.c
> @@ -127,9 +127,16 @@ not_found:
>                 cache = xzalloc(sizeof(*cache));
>                 cache->vid = vid;
>                 create_dir_for(vid);
> -               cache->dirty_rb = RB_ROOT;
> +
> +               cache->dirty_trees[0] = RB_ROOT;
> +               cache->dirty_trees[1] = RB_ROOT;
> +               cache->active_dirty_tree = &cache->dirty_trees[0];
> +
> +               INIT_LIST_HEAD(&cache->dirty_lists[0]);
> +               INIT_LIST_HEAD(&cache->dirty_lists[1]);
> +               cache->active_dirty_list = &cache->dirty_lists[0];
> +
>                 pthread_mutex_init(&cache->lock, NULL);
> -               INIT_LIST_HEAD(&cache->dirty_list);
>                 hlist_add_head(&cache->hash, head);
>         } else
>                 cache = NULL;
> @@ -138,18 +145,61 @@ out:
>         return cache;
>  }
>
> -static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx, int create)
> +static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
> +               struct object_cache_entry *entry, int create)
> +{
> +       if (!entry) {
> +               entry = xzalloc(sizeof(*entry));
> +               entry->idx = idx;
> +               entry->create = create;
> +       }
> +       if (!dirty_tree_insert(oc->active_dirty_tree, entry))
> +               list_add(&entry->list, oc->active_dirty_list);
> +       else
> +               free(entry);
> +}
> +
> +static inline void del_from_dirty_tree_and_list(
> +               struct object_cache_entry *entry,
> +               struct rb_root *dirty_tree)
>  {
> -       struct object_cache_entry *entry = xzalloc(sizeof(*entry));
> +       rb_erase(&entry->rb, dirty_tree);
> +       list_del(&entry->list);
> +}
>
> -       entry->idx = idx;
> +static void switch_dirty_tree_and_list(struct object_cache *oc,
> +               struct rb_root **inactive_dirty_tree,
> +               struct list_head **inactive_dirty_list)
> +{
>         pthread_mutex_lock(&oc->lock);
> -       if (!dirty_tree_insert(&oc->dirty_rb, entry)) {
> -               if (create)
> -                       entry->create = 1;
> -               list_add(&entry->list, &oc->dirty_list);
> -       } else
> -               free(entry);
> +
> +       *inactive_dirty_list = oc->active_dirty_list;
> +       *inactive_dirty_tree = oc->active_dirty_tree;
> +
> +       if (oc->active_dirty_tree == &oc->dirty_trees[0]) {
> +               oc->active_dirty_list = &oc->dirty_lists[1];
> +               oc->active_dirty_tree = &oc->dirty_trees[1];
> +       } else {
> +               oc->active_dirty_list = &oc->dirty_lists[0];
> +               oc->active_dirty_tree = &oc->dirty_trees[0];
> +       }
> +
> +       pthread_mutex_unlock(&oc->lock);
> +}
> +
> +static void merge_dirty_tree_and_list(struct object_cache *oc,
> +               struct rb_root *inactive_dirty_tree,
> +               struct list_head *inactive_dirty_list)
> +{
> +       struct object_cache_entry *entry, *t;
> +
> +       pthread_mutex_lock(&oc->lock);
> +
> +       list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
> +               del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
> +               add_to_dirty_tree_and_list(oc, entry->idx, entry, 0);
> +       }
> +
>         pthread_mutex_unlock(&oc->lock);
>  }
>
> @@ -181,8 +231,11 @@ int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
>                 ret = prealloc(fd, data_length);
>                 if (ret != SD_RES_SUCCESS)
>                         ret = -1;
> -               else
> -                       add_to_dirty_tree_and_list(oc, idx, 1);
> +               else {
> +                       pthread_mutex_lock(&oc->lock);
> +                       add_to_dirty_tree_and_list(oc, idx, NULL, 1);
> +                       pthread_mutex_unlock(&oc->lock);
> +               }
>         }
>         close(fd);
>  out:
> @@ -268,7 +321,9 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
>                 ret = write_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
>                 if (ret != SD_RES_SUCCESS)
>                         goto out;
> -               add_to_dirty_tree_and_list(oc, idx, 0);
> +               pthread_mutex_lock(&oc->lock);
> +               add_to_dirty_tree_and_list(oc, idx, NULL, 0);
> +               pthread_mutex_unlock(&oc->lock);
>         } else {
>                 ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
>                 if (ret != SD_RES_SUCCESS)
> @@ -464,23 +519,34 @@ out:
>  int object_cache_push(struct object_cache *oc)
>  {
>         struct object_cache_entry *entry, *t;
> +       struct rb_root *inactive_dirty_tree;
> +       struct list_head *inactive_dirty_list;
>         int ret = SD_RES_SUCCESS;
>
>         if (node_in_recovery())
>                 /* We don't do flushing in recovery */
>                 return SD_RES_SUCCESS;
>
> -       list_for_each_entry_safe(entry, t, &oc->dirty_list, list) {
> +       switch_dirty_tree_and_list(oc,
> +                       &inactive_dirty_tree,
> +                       &inactive_dirty_list);
> +
> +       /* 1. For async flush, there is only one worker.
> +        * 2. For sync flush, the guest assures us that only one sync
> +        *    request is issued in one of the gateway worker threads.
> +        * So we need not protect the inactive dirty tree and list. */
> +       list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
>                 ret = push_cache_object(oc->vid, entry->idx, entry->create);
>                 if (ret != SD_RES_SUCCESS)
> -                       goto out;
> -               pthread_mutex_lock(&oc->lock);
> -               rb_erase(&entry->rb, &oc->dirty_rb);
> -               list_del(&entry->list);
> -               pthread_mutex_unlock(&oc->lock);
> +                       goto push_failed;
> +               del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
>                 free(entry);
>         }
> -out:
> +       return ret;
> +push_failed:
> +       merge_dirty_tree_and_list(oc,
> +                       inactive_dirty_tree,
> +                       inactive_dirty_list);
>         return ret;
>  }
>
> @@ -518,7 +584,7 @@ void object_cache_delete(uint32_t vid)
>                 hlist_del(&cache->hash);
>                 pthread_mutex_unlock(&hashtable_lock[h]);
>
> -               list_for_each_entry_safe(entry, t, &cache->dirty_list, list) {
> +               list_for_each_entry_safe(entry, t, cache->active_dirty_list, list) {
>                         free(entry);
>                 }
>                 free(cache);
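One detail worth noting above: merge_dirty_tree_and_list() re-adds the
leftover entries through add_to_dirty_tree_and_list(), which frees an
entry whenever dirty_tree_insert() reports the idx is already present.
That is what keeps an object that an io-worker re-dirtied during a
failed flush from being tracked twice. A standalone sketch of that
dedup-on-merge behaviour (again not sheep code: a sorted singly linked
list stands in for the rbtree, and set_add()/new_entry() are made-up
names):

  /*
   * Standalone sketch (assumed names, not sheep code): merging a
   * failed flush's leftovers back into the active set deduplicates
   * by idx, mirroring merge_dirty_tree_and_list().
   */
  #include <stdio.h>
  #include <stdlib.h>

  struct entry {
          unsigned idx;
          struct entry *next;
  };

  /* insert into a sorted set; free the entry if idx is already there */
  static void set_add(struct entry **set, struct entry *e)
  {
          struct entry **p = set;

          while (*p && (*p)->idx < e->idx)
                  p = &(*p)->next;
          if (*p && (*p)->idx == e->idx) {
                  free(e);        /* already dirty: drop the duplicate */
                  return;
          }
          e->next = *p;
          *p = e;
  }

  static struct entry *new_entry(unsigned idx)
  {
          struct entry *e = malloc(sizeof(*e));

          e->idx = idx;
          e->next = NULL;
          return e;
  }

  int main(void)
  {
          struct entry *active = NULL, *inactive = NULL, *e;

          set_add(&inactive, new_entry(1));  /* was being flushed */
          set_add(&active, new_entry(1));    /* re-dirtied meanwhile */

          /* the flush failed: merge inactive back into active */
          while ((e = inactive) != NULL) {
                  inactive = e->next;
                  set_add(&active, e);       /* idx 1 is deduplicated */
          }

          for (e = active; e; e = e->next)
                  printf("dirty: %u\n", e->idx);  /* printed once */
          return 0;
  }

Passing the existing entry into add_to_dirty_tree_and_list() rather
than allocating a fresh one is what lets the duplicate case be resolved
with a simple free().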
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index a9e8440..8e982cf 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -410,9 +410,14 @@ static inline int sys_can_halt(void)
>
>  struct object_cache {
>         uint32_t vid;
> -       struct list_head dirty_list;
>         struct hlist_node hash;
> -       struct rb_root dirty_rb;
> +
> +       struct list_head dirty_lists[2];
> +       struct list_head *active_dirty_list;
> +
> +       struct rb_root dirty_trees[2];
> +       struct rb_root *active_dirty_tree;
> +
>         pthread_mutex_t lock;
>  };

Applied, thanks.

Yuan