At Fri, 27 Jul 2012 17:48:07 +0800, levin li wrote: > > From: levin li <xingke.lwp at taobao.com> > > This patch does reclaiming work when the total size of cached objects > reaches the max size specified by user, I did it in the following way: > > 1. check the object tree for the object entry to determine whether the > cache entry exists and whether it's reclaiming, if it's reclaiming > we make sheep ignore the cache. > 2. In object_cache_rw() we search the cache entry, after passing the sanity > check, we increment its refcnt to tell the reclaiming worker that this > entry is being referenced, we should not reclaim it now. > 3. In add_to_object_cache(), when the cached size reaches the max size, > we start a reclaiming thread, only one such thread can be running at > one time. > 4. In reclaim_work(), we reclaim cached objects until the cache size is reduced > to 80% of the max size. > 5. In reclaim_object(), we start to reclaim an object, before this, we check > that if the cache is flushing, we don't reclaim it, and if the refcnt of > the object is not zero, we also don't reclaim it. > If the cached object is dirty, we flush it by push_cache_object(), and > then try to remove the object. 
> > Signed-off-by: levin li <xingke.lwp at taobao.com> > --- > include/list.h | 10 + > include/sheepdog_proto.h | 1 + > sheep/object_cache.c | 757 +++++++++++++++++++++++++++++++--------------- > sheep/sheep.c | 3 +- > sheep/sheep_priv.h | 1 + > sheep/store.c | 8 + > 6 files changed, 537 insertions(+), 243 deletions(-) > > diff --git a/include/list.h b/include/list.h > index 30ee3c4..e1d645d 100644 > --- a/include/list.h > +++ b/include/list.h > @@ -259,3 +259,13 @@ static inline void hlist_add_after(struct hlist_node *n, > pos = n) > > #endif > + > + > +#define list_for_each_entry_revert_safe_rcu(pos, n, head, member) \ > + for (pos = cds_list_entry(rcu_dereference((head)->prev), \ > + typeof(*pos), member), \ > + n = cds_list_entry(rcu_dereference(pos->member.prev), \ > + typeof(*pos), member); \ > + &pos->member != (head); \ > + pos = n, n = cds_list_entry(rcu_dereference(pos->member.prev),\ > + typeof(*pos), member)) It might be simpler and more useful to define cds_list_add_tail_rcu and use it for lru_list. > diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h > index 45a4b81..05597fb 100644 > --- a/include/sheepdog_proto.h > +++ b/include/sheepdog_proto.h > @@ -68,6 +68,7 @@ > #define SD_RES_CLUSTER_RECOVERING 0x22 /* Cluster is recovering. */ > #define SD_RES_OBJ_RECOVERING 0x23 /* Object is recovering */ > #define SD_RES_KILLED 0x24 /* Node is killed */ > +#define SD_RES_NO_CACHE 0x25 /* No cache object found */ This is an internal error code and should be moved to include/internal_proto.h, no? 
> > /* errors above 0x80 are sheepdog-internal */ > > diff --git a/sheep/object_cache.c b/sheep/object_cache.c > index 19480c2..b98335f 100644 > --- a/sheep/object_cache.c > +++ b/sheep/object_cache.c > @@ -40,7 +40,13 @@ > #define CACHE_VDI_BIT (UINT32_C(1) << CACHE_VDI_SHIFT) > #define CACHE_BLOCK_SIZE ((UINT64_C(1) << 10) * 64) /* 64 KB */ > > -#define ENTRY_CREATE_BIT (1) > +#define CACHE_RECLAIM_SHIFT 27 > +#define CACHE_RECLAIM_BIT (UINT32_C(1) << CACHE_RECLAIM_SHIFT) > + > +#define CACHE_CREATE_SHIFT 26 > +#define CACHE_CREATE_BIT (UINT32_C(1) << CACHE_CREATE_SHIFT) > + > +#define CACHE_INDEX_MASK (CACHE_RECLAIM_BIT | CACHE_CREATE_BIT) > > struct global_cache { > uint64_t cache_size; > @@ -50,6 +56,7 @@ struct global_cache { > > struct object_cache { > uint32_t vid; > + uint8_t in_flush; > struct hlist_node hash; > > struct list_head dirty_list; > @@ -64,7 +71,6 @@ struct object_cache_entry { > uint64_t bmap; /* each bit represents one dirty > * block which should be flushed */ > int refcnt; > - int flags; > struct rb_node node; > struct rb_node dirty_node; > struct object_cache *oc; > @@ -85,11 +91,29 @@ static pthread_mutex_t hashtable_lock[HASH_SIZE] = { > > static struct hlist_head cache_hashtable[HASH_SIZE]; > > +static inline int cache_in_reclaim(int start) > +{ > + if (start) > + return uatomic_cmpxchg(&sys_cache.reclaiming, 0, 1); > + else > + return uatomic_read(&sys_cache.reclaiming); > +} > + > +static inline int entry_is_dirty(struct object_cache_entry *entry) > +{ > + return !!entry->bmap; > +} > + > static inline int hash(uint64_t vid) > { > return hash_64(vid, HASH_BITS); > } > > +static inline uint32_t idx_mask(uint32_t idx) > +{ > + return idx &= ~CACHE_INDEX_MASK; > +} > + > static inline uint32_t object_cache_oid_to_idx(uint64_t oid) > { > uint32_t idx = data_oid_to_idx(oid); > @@ -124,14 +148,15 @@ object_cache_insert(struct rb_root *root, struct object_cache_entry *new) > struct rb_node **p = &root->rb_node; > struct rb_node *parent = 
NULL; > struct object_cache_entry *entry; > + uint32_t idx = idx_mask(new->idx); > > while (*p) { > parent = *p; > entry = rb_entry(parent, struct object_cache_entry, node); > > - if (new->idx < entry->idx) > + if (idx < idx_mask(entry->idx)) > p = &(*p)->rb_left; > - else if (new->idx > entry->idx) > + else if (idx > idx_mask(entry->idx)) > p = &(*p)->rb_right; > else { > /* already has this entry */ > @@ -149,13 +174,14 @@ static struct object_cache_entry *object_tree_search(struct rb_root *root, > { > struct rb_node *n = root->rb_node; > struct object_cache_entry *t; > + idx = idx_mask(idx); > > while (n) { > t = rb_entry(n, struct object_cache_entry, node); > > - if (idx < t->idx) > + if (idx < idx_mask(t->idx)) > n = n->rb_left; > - else if (idx > t->idx) > + else if (idx > idx_mask(t->idx)) > n = n->rb_right; > else > return t; /* found it */ > @@ -164,6 +190,339 @@ static struct object_cache_entry *object_tree_search(struct rb_root *root, > return NULL; > } > > +static struct object_cache_entry *dirty_tree_search(struct rb_root *root, > + uint32_t idx) > +{ > + struct rb_node *n = root->rb_node; > + struct object_cache_entry *t; > + idx = idx_mask(idx); > + > + while (n) { > + t = rb_entry(n, struct object_cache_entry, dirty_node); > + > + if (idx < idx_mask(t->idx)) > + n = n->rb_left; > + else if (idx > idx_mask(t->idx)) > + n = n->rb_right; > + else > + return t; /* found it */ > + } > + > + return NULL; > +} > + > +static inline void > +del_from_dirty_tree_and_list(struct object_cache_entry *entry, > + struct rb_root *dirty_tree) > +{ > + rb_erase(&entry->dirty_node, dirty_tree); > + list_del(&entry->list); > +} > + > +static inline void > +del_from_object_tree_and_list(struct object_cache_entry *entry, > + struct rb_root *object_tree) > +{ > + rb_erase(&entry->node, object_tree); > + cds_list_del_rcu(&entry->lru_list); > +} > + > +static uint64_t cache_vid_to_data_oid(uint32_t vid, uint32_t idx) > +{ > + idx = idx_mask(idx); > + > + return 
vid_to_data_oid(vid, idx); > +} > + > +static uint64_t idx_to_oid(uint32_t vid, uint32_t idx) > +{ > + if (idx_has_vdi_bit(idx)) > + return vid_to_vdi_oid(vid); > + else > + return cache_vid_to_data_oid(vid, idx); > +} > + > +static int remove_cache_object(struct object_cache *oc, uint32_t idx) > +{ > + struct strbuf buf; > + int ret = SD_RES_SUCCESS; > + > + idx = idx_mask(idx); > + > + strbuf_init(&buf, PATH_MAX); > + strbuf_addstr(&buf, cache_dir); > + strbuf_addf(&buf, "/%06"PRIx32"/%08"PRIx32, oc->vid, idx); > + > + dprintf("removing cache object %s\n", buf.buf); > + if (unlink(buf.buf) < 0) { > + ret = SD_RES_EIO; > + eprintf("failed to remove cached object %m\n"); > + goto out; > + } > +out: > + strbuf_release(&buf); > + > + return ret; > +} > + > +static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, > + size_t count, off_t offset) > +{ > + size_t size; > + int fd, flags = def_open_flags, ret = SD_RES_SUCCESS; > + struct strbuf p; > + > + strbuf_init(&p, PATH_MAX); > + strbuf_addstr(&p, cache_dir); > + strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx); > + > + if (sys->use_directio && !idx_has_vdi_bit(idx)) > + flags |= O_DIRECT; > + > + fd = open(p.buf, flags, def_fmode); > + if (fd < 0) { > + eprintf("%m\n"); > + ret = SD_RES_EIO; > + goto out; > + } > + > + if (flock(fd, LOCK_EX) < 0) { > + ret = SD_RES_EIO; > + eprintf("%m\n"); > + goto out_close; > + } > + size = xpwrite(fd, buf, count, offset); > + if (flock(fd, LOCK_UN) < 0) { > + ret = SD_RES_EIO; > + eprintf("%m\n"); > + goto out_close; > + } > + > + if (size != count) { > + eprintf("size %zu, count:%zu, offset %jd %m\n", > + size, count, (intmax_t)offset); > + ret = SD_RES_EIO; > + } > +out_close: > + close(fd); > +out: > + strbuf_release(&p); > + return ret; > +} > + > +static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, > + size_t count, off_t offset) > +{ > + size_t size; > + int fd, flags = def_open_flags, ret = SD_RES_SUCCESS; > + struct strbuf p; > + > + 
strbuf_init(&p, PATH_MAX); > + strbuf_addstr(&p, cache_dir); > + strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx); > + > + if (sys->use_directio && !idx_has_vdi_bit(idx)) > + flags |= O_DIRECT; > + > + fd = open(p.buf, flags, def_fmode); > + if (fd < 0) { > + eprintf("%m\n"); > + ret = SD_RES_EIO; > + goto out; > + } > + > + if (flock(fd, LOCK_SH) < 0) { > + ret = SD_RES_EIO; > + eprintf("%m\n"); > + goto out_close; > + } > + size = xpread(fd, buf, count, offset); > + if (flock(fd, LOCK_UN) < 0) { > + ret = SD_RES_EIO; > + eprintf("%m\n"); > + goto out_close; > + } > + > + if (size != count) { > + eprintf("size %zu, count:%zu, offset %jd %m\n", > + size, count, (intmax_t)offset); > + ret = SD_RES_EIO; > + } > + > +out_close: > + close(fd); > +out: > + strbuf_release(&p); > + return ret; > +} > + > +static int push_cache_object(uint32_t vid, uint32_t idx, uint64_t bmap, > + int create) > +{ > + struct sd_req hdr; > + void *buf; > + off_t offset; > + unsigned data_length; > + int ret = SD_RES_NO_MEM; > + uint64_t oid = idx_to_oid(vid, idx); > + int first_bit, last_bit; > + > + dprintf("%"PRIx64", create %d\n", oid, create); > + > + idx = idx_mask(idx); > + > + if (!bmap) { > + dprintf("WARN: nothing to flush\n"); > + return SD_RES_SUCCESS; > + } > + > + first_bit = ffsll(bmap) - 1; > + last_bit = fls64(bmap) - 1; > + > + dprintf("bmap:0x%"PRIx64", first_bit:%d, last_bit:%d\n", > + bmap, first_bit, last_bit); > + offset = first_bit * CACHE_BLOCK_SIZE; > + data_length = (last_bit - first_bit + 1) * CACHE_BLOCK_SIZE; > + > + /* > + * CACHE_BLOCK_SIZE may not be divisible by SD_INODE_SIZE, > + * so (offset + data_length) could larger than SD_INODE_SIZE > + */ > + if (is_vdi_obj(oid) && (offset + data_length) > SD_INODE_SIZE) > + data_length = SD_INODE_SIZE - offset; > + > + buf = valloc(data_length); > + if (buf == NULL) { > + eprintf("failed to allocate memory\n"); > + goto out; > + } > + > + ret = read_cache_object(vid, idx, buf, data_length, offset); > + if (ret != 
SD_RES_SUCCESS) > + goto out; > + > + if (create) > + sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ); > + else > + sd_init_req(&hdr, SD_OP_WRITE_OBJ); > + hdr.flags = SD_FLAG_CMD_WRITE; > + hdr.data_length = data_length; > + hdr.obj.oid = oid; > + hdr.obj.offset = offset; > + > + ret = exec_local_req(&hdr, buf); > + if (ret != SD_RES_SUCCESS) > + eprintf("failed to push object %x\n", ret); > + > +out: > + free(buf); > + return ret; > +} > + > +static int reclaim_object(struct object_cache_entry *entry) > +{ > + struct object_cache *oc = entry->oc; > + int ret = SD_RES_SUCCESS; > + > + pthread_rwlock_wrlock(&oc->lock); > + dprintf("reclaiming /%06"PRIx32"/%08"PRIx32", cache_size: %ld\n", > + oc->vid, entry->idx, uatomic_read(&sys_cache.cache_size)); > + > + if (uatomic_read(&entry->refcnt) > 0) { > + ret = -1; > + goto out; > + } > + > + if (entry_is_dirty(entry)) { > + uint64_t bmap = entry->bmap; > + int create = entry->idx & CACHE_CREATE_BIT; > + > + if (oc->in_flush) { > + ret = -1; > + goto out; > + } > + > + entry->bmap = 0; > + del_from_dirty_tree_and_list(entry, &oc->dirty_tree); > + pthread_rwlock_unlock(&oc->lock); > + > + ret = push_cache_object(oc->vid, entry->idx, bmap, create); > + > + pthread_rwlock_wrlock(&oc->lock); > + if (ret != SD_RES_SUCCESS) { > + /* still dirty */ > + entry->bmap |= bmap; > + ret = -1; > + goto out; > + } > + > + entry->idx &= ~CACHE_CREATE_BIT; > + /* dirty again */ > + if (entry_is_dirty(entry)) { > + dprintf("object cache is dirty again %06" PRIx32 "\n", > + entry->idx); > + ret = -1; > + goto out; > + } > + > + if (oc->in_flush) { > + ret = -1; > + goto out; > + } > + > + if (uatomic_read(&entry->refcnt) > 0) { > + ret = -1; > + goto out; > + } > + } > + > + entry->idx |= CACHE_RECLAIM_BIT; > + > + ret = remove_cache_object(oc, entry->idx); > + if (ret == SD_RES_SUCCESS) > + del_from_object_tree_and_list(entry, &oc->object_tree); > +out: > + pthread_rwlock_unlock(&oc->lock); > + return ret; > +} > + > +static void 
reclaim_work(struct work *work) > +{ > + struct object_cache_entry *entry, *n; > + int ret; > + > + if (node_in_recovery()) > + return; Can you comment briefly why we cannot reclaim caches while recovering? I think it will help newcomers to understand the code. > + > + list_for_each_entry_revert_safe_rcu(entry, n, > + &sys_cache.cache_lru_list, lru_list) { > + unsigned data_length; > + /* Reclaim cache to 80% of max size */ > + if (uatomic_read(&sys_cache.cache_size) <= > + sys->cache_size * 8 / 10) > + break; > + > + ret = reclaim_object(entry); > + if (ret != SD_RES_SUCCESS) > + continue; > + if (idx_has_vdi_bit(entry->idx)) > + data_length = SD_INODE_SIZE; > + else > + data_length = SD_DATA_OBJ_SIZE; > + > + uatomic_sub(&sys_cache.cache_size, data_length); > + free(entry); > + } > + > + dprintf("cache reclaim complete\n"); > +} > + > +static void reclaim_done(struct work *work) > +{ > + uatomic_set(&sys_cache.reclaiming, 0); > + free(work); > +} > + > static struct object_cache_entry * > dirty_tree_insert(struct object_cache *oc, uint32_t idx, > uint64_t bmap, int create) > @@ -171,18 +530,21 @@ dirty_tree_insert(struct object_cache *oc, uint32_t idx, > struct rb_node **p = &oc->dirty_tree.rb_node; > struct rb_node *parent = NULL; > struct object_cache_entry *entry; > + idx = idx_mask(idx); > > while (*p) { > parent = *p; > entry = rb_entry(parent, struct object_cache_entry, dirty_node); > > - if (idx < entry->idx) > p = &(*p)->rb_left; > - else if (idx > entry->idx) > p = &(*p)->rb_right; > else { > /* already has this entry, merge bmap */ > entry->bmap |= bmap; > + if (create) > + entry->idx |= CACHE_CREATE_BIT; > return entry; > } > } > @@ -192,7 +554,8 @@ dirty_tree_insert(struct object_cache *oc, uint32_t idx, > return NULL; > > entry->bmap |= bmap; > - entry->flags |= ENTRY_CREATE_BIT; > + if (create) > + entry->idx |= CACHE_CREATE_BIT; > rb_link_node(&entry->dirty_node, parent, p); 
> rb_insert_color(&entry->dirty_node, &oc->dirty_tree); > list_add(&entry->list, &oc->dirty_list); > @@ -200,26 +563,6 @@ dirty_tree_insert(struct object_cache *oc, uint32_t idx, > return entry; > } > > -static struct object_cache_entry *dirty_tree_search(struct rb_root *root, > - uint32_t idx) > -{ > - struct rb_node *n = root->rb_node; > - struct object_cache_entry *t; > - > - while (n) { > - t = rb_entry(n, struct object_cache_entry, dirty_node); > - > - if (idx < t->idx) > - n = n->rb_left; > - else if (idx > t->idx) > - n = n->rb_right; > - else > - return t; /* found it */ > - } > - > - return NULL; > -} > - > static int create_dir_for(uint32_t vid) > { > int ret = 0; > @@ -257,6 +600,7 @@ not_found: > if (create) { > cache = xzalloc(sizeof(*cache)); > cache->vid = vid; > + cache->object_tree = RB_ROOT; > create_dir_for(vid); > > cache->dirty_tree = RB_ROOT; > @@ -271,14 +615,6 @@ out: > return cache; > } > > -static inline void > -del_from_dirty_tree_and_list(struct object_cache_entry *entry, > - struct rb_root *dirty_tree) > -{ > - rb_erase(&entry->dirty_node, dirty_tree); > - list_del(&entry->list); > -} > - > /* Caller should hold the oc->lock */ > static inline void > add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx, > @@ -289,6 +625,9 @@ add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx, > if (!entry) > panic("Can not find object entry %" PRIx32 "\n", idx); > > + if (cache_in_reclaim(0)) > + return; > + > /* If cache isn't in reclaiming, move it > * to the head of lru list */ > cds_list_del_rcu(&entry->lru_list); > @@ -312,15 +651,18 @@ static void add_to_object_cache(struct object_cache *oc, uint32_t idx) > uint32_t data_length; > > if (idx_has_vdi_bit(idx)) > - data_length = SD_INODE_SIZE / 1024; > + data_length = SD_INODE_SIZE; > else > - data_length = SD_DATA_OBJ_SIZE / 1024; > + data_length = SD_DATA_OBJ_SIZE; What's the intention of this change? 
And if you want to use the value 1024, please use a macro and specify what it is. > > entry = xzalloc(sizeof(*entry)); > entry->oc = oc; > entry->idx = idx; > CDS_INIT_LIST_HEAD(&entry->lru_list); > > + dprintf("cache object for vdi %" PRIx32 ", idx %08" PRIx32 "added\n", > + oc->vid, idx); > + > pthread_rwlock_wrlock(&oc->lock); > old = object_cache_insert(&oc->object_tree, entry); > if (!old) { > @@ -331,142 +673,76 @@ static void add_to_object_cache(struct object_cache *oc, uint32_t idx) > entry = old; > } > pthread_rwlock_unlock(&oc->lock); > + > + if (sys->cache_size && > + uatomic_read(&sys_cache.cache_size) > sys->cache_size && > + !cache_in_reclaim(1)) { > + struct work *work = xzalloc(sizeof(struct work)); > + work->fn = reclaim_work; > + work->done = reclaim_done; > + queue_work(sys->reclaim_wqueue, work); > + } > +} > + > +static inline struct object_cache_entry * > +find_cache_entry(struct object_cache *oc, uint32_t idx) > +{ > + struct object_cache_entry *entry; > + > + entry = object_tree_search(&oc->object_tree, idx); > + if (!entry || entry->idx & CACHE_RECLAIM_BIT) > + return NULL; > + > + return entry; > } > > static int object_cache_lookup(struct object_cache *oc, uint32_t idx, > int create) > { > struct strbuf buf; > - int fd, ret = 0, flags = def_open_flags; > + int fd, ret = SD_RES_SUCCESS, flags = def_open_flags; > + unsigned data_length; > + > + if (!create) { > + pthread_rwlock_wrlock(&oc->lock); > + if (!find_cache_entry(oc, idx)) > + ret = SD_RES_NO_CACHE; > + pthread_rwlock_unlock(&oc->lock); > + return ret; > + } > > strbuf_init(&buf, PATH_MAX); > strbuf_addstr(&buf, cache_dir); > strbuf_addf(&buf, "/%06"PRIx32"/%08"PRIx32, oc->vid, idx); > > - if (create) > - flags |= O_CREAT | O_TRUNC; > + flags |= O_CREAT | O_TRUNC; > > fd = open(buf.buf, flags, def_fmode); > if (fd < 0) { > - ret = -1; > - goto out; > - } > - > - if (create) { > - unsigned data_length; > - > - if (idx_has_vdi_bit(idx)) > - data_length = SD_INODE_SIZE; > - else > - 
data_length = SD_DATA_OBJ_SIZE; > - > - ret = prealloc(fd, data_length); > - if (ret != SD_RES_SUCCESS) > - ret = -1; > - else { > - uint64_t bmap = UINT64_MAX; > - > - add_to_object_cache(oc, idx); > - > - pthread_rwlock_wrlock(&oc->lock); > - add_to_dirty_tree_and_list(oc, idx, bmap, 1); > - pthread_rwlock_unlock(&oc->lock); > - } > - } > - close(fd); > -out: > - strbuf_release(&buf); > - return ret; > -} > - > -static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, > - size_t count, off_t offset) > -{ > - size_t size; > - int fd, flags = def_open_flags, ret = SD_RES_SUCCESS; > - struct strbuf p; > - > - strbuf_init(&p, PATH_MAX); > - strbuf_addstr(&p, cache_dir); > - strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx); > - > - if (sys->use_directio && !idx_has_vdi_bit(idx)) > - flags |= O_DIRECT; > - > - fd = open(p.buf, flags, def_fmode); > - if (fd < 0) { > - eprintf("%m\n"); > ret = SD_RES_EIO; > goto out; > } > > - if (flock(fd, LOCK_EX) < 0) { > - ret = SD_RES_EIO; > - eprintf("%m\n"); > - goto out_close; > - } > - size = xpwrite(fd, buf, count, offset); > - if (flock(fd, LOCK_UN) < 0) { > - ret = SD_RES_EIO; > - eprintf("%m\n"); > - goto out_close; > - } > + if (idx_has_vdi_bit(idx)) > + data_length = SD_INODE_SIZE; > + else > + data_length = SD_DATA_OBJ_SIZE; > > - if (size != count) { > - eprintf("size %zu, count:%zu, offset %jd %m\n", > - size, count, (intmax_t)offset); > + ret = prealloc(fd, data_length); > + if (ret != SD_RES_SUCCESS) > ret = SD_RES_EIO; > - } > -out_close: > - close(fd); > -out: > - strbuf_release(&p); > - return ret; > -} > - > -static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, > - size_t count, off_t offset) > -{ > - size_t size; > - int fd, flags = def_open_flags, ret = SD_RES_SUCCESS; > - struct strbuf p; > - > - strbuf_init(&p, PATH_MAX); > - strbuf_addstr(&p, cache_dir); > - strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx); > - > - if (sys->use_directio && !idx_has_vdi_bit(idx)) > - flags |= 
O_DIRECT; > + else { > + uint64_t bmap = UINT64_MAX; > > - fd = open(p.buf, flags, def_fmode); > - if (fd < 0) { > - eprintf("%m\n"); > - ret = SD_RES_EIO; > - goto out; > - } > + add_to_object_cache(oc, idx); > > - if (flock(fd, LOCK_SH) < 0) { > - ret = SD_RES_EIO; > - eprintf("%m\n"); > - goto out_close; > - } > - size = xpread(fd, buf, count, offset); > - if (flock(fd, LOCK_UN) < 0) { > - ret = SD_RES_EIO; > - eprintf("%m\n"); > - goto out_close; > - } > - > - if (size != count) { > - eprintf("size %zu, count:%zu, offset %jd %m\n", > - size, count, (intmax_t)offset); > - ret = SD_RES_EIO; > + pthread_rwlock_wrlock(&oc->lock); > + add_to_dirty_tree_and_list(oc, idx, bmap, 1); > + pthread_rwlock_unlock(&oc->lock); > } > - > -out_close: > close(fd); > out: > - strbuf_release(&p); > + strbuf_release(&buf); > return ret; > } > > @@ -531,7 +807,7 @@ static int object_cache_pull(struct object_cache *oc, uint32_t idx) > oid = vid_to_vdi_oid(oc->vid); > data_length = SD_INODE_SIZE; > } else { > - oid = vid_to_data_oid(oc->vid, idx); > + oid = cache_vid_to_data_oid(oc->vid, idx); > data_length = SD_DATA_OBJ_SIZE; > } > > @@ -558,75 +834,6 @@ out: > return ret; > } > > -static uint64_t idx_to_oid(uint32_t vid, uint32_t idx) > -{ > - if (idx_has_vdi_bit(idx)) > - return vid_to_vdi_oid(vid); > - else > - return vid_to_data_oid(vid, idx); > -} > - > -static int push_cache_object(uint32_t vid, uint32_t idx, uint64_t bmap, > - int create) > -{ > - struct sd_req hdr; > - void *buf; > - off_t offset; > - unsigned data_length; > - int ret = SD_RES_NO_MEM; > - uint64_t oid = idx_to_oid(vid, idx); > - int first_bit, last_bit; > - > - dprintf("%"PRIx64", create %d\n", oid, create); > - > - if (!bmap) { > - dprintf("WARN: nothing to flush\n"); > - return SD_RES_SUCCESS; > - } > - > - first_bit = ffsll(bmap) - 1; > - last_bit = fls64(bmap) - 1; > - > - dprintf("bmap:0x%"PRIx64", first_bit:%d, last_bit:%d\n", > - bmap, first_bit, last_bit); > - offset = first_bit * CACHE_BLOCK_SIZE; > 
- data_length = (last_bit - first_bit + 1) * CACHE_BLOCK_SIZE; > - > - /* > - * CACHE_BLOCK_SIZE may not be divisible by SD_INODE_SIZE, > - * so (offset + data_length) could larger than SD_INODE_SIZE > - */ > - if (is_vdi_obj(oid) && (offset + data_length) > SD_INODE_SIZE) > - data_length = SD_INODE_SIZE - offset; > - > - buf = valloc(data_length); > - if (buf == NULL) { > - eprintf("failed to allocate memory\n"); > - goto out; > - } > - > - ret = read_cache_object(vid, idx, buf, data_length, offset); > - if (ret != SD_RES_SUCCESS) > - goto out; > - > - if (create) > - sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ); > - else > - sd_init_req(&hdr, SD_OP_WRITE_OBJ); > - hdr.flags = SD_FLAG_CMD_WRITE; > - hdr.data_length = data_length; > - hdr.obj.oid = oid; > - hdr.obj.offset = offset; > - > - ret = exec_local_req(&hdr, buf); > - if (ret != SD_RES_SUCCESS) > - eprintf("failed to push object %x\n", ret); > - > -out: > - free(buf); > - return ret; > -} > - > /* Push back all the dirty objects to sheep cluster storage */ > static int object_cache_push(struct object_cache *oc) > { > @@ -640,27 +847,33 @@ static int object_cache_push(struct object_cache *oc) > return SD_RES_SUCCESS; > > pthread_rwlock_wrlock(&oc->lock); > + oc->in_flush = 1; > list_splice_init(&oc->dirty_list, &inactive_dirty_list); > pthread_rwlock_unlock(&oc->lock); > > list_for_each_entry_safe(entry, t, &inactive_dirty_list, list) { > - pthread_rwlock_rdlock(&oc->lock); > + pthread_rwlock_wrlock(&oc->lock); > bmap = entry->bmap; > - create = entry->flags & ENTRY_CREATE_BIT; > + create = entry->idx & CACHE_CREATE_BIT; > + entry->bmap = 0; > + del_from_dirty_tree_and_list(entry, &oc->dirty_tree); > pthread_rwlock_unlock(&oc->lock); > > ret = push_cache_object(oc->vid, entry->idx, bmap, create); > - if (ret != SD_RES_SUCCESS) > - goto push_failed; > > pthread_rwlock_wrlock(&oc->lock); > - del_from_dirty_tree_and_list(entry, &oc->dirty_tree); > + if (ret != SD_RES_SUCCESS) { > + entry->bmap |= bmap; > + 
goto push_failed; > + } > + entry->idx &= ~CACHE_CREATE_BIT; > pthread_rwlock_unlock(&oc->lock); > } > + oc->in_flush = 0; > return ret; > > push_failed: > - pthread_rwlock_wrlock(&oc->lock); > + oc->in_flush = 0; > list_splice_init(&inactive_dirty_list, &oc->dirty_list); > pthread_rwlock_unlock(&oc->lock); > > @@ -677,10 +890,10 @@ int object_is_cached(uint64_t oid) > if (!cache) > return 0; > > - if (object_cache_lookup(cache, idx, 0) < 0) > - return 0; > + if (object_cache_lookup(cache, idx, 0) == SD_RES_SUCCESS) > + return 1; > else > - return 1; /* found it */ > + return 0; More simply, how about the following? return (object_cache_lookup(cache, idx, 0) == SD_RES_SUCCESS) Thanks, Kazutaka |