[sheepdog] [PATCH v3 6/8] object cache: reclaim cached objects when cache reaches the max size
Liu Yuan
namei.unix at gmail.com
Thu Jul 26 09:55:55 CEST 2012
On 07/26/2012 03:17 PM, levin li wrote:
> +static inline int cache_is_flushing(struct object_cache *cache)
> +{
> + return cache->in_flush;
> +}
> +
better renamed as cache_in_flush()
> +static inline int entry_is_reclaiming(struct object_cache_entry *entry)
> +{
> + int flags = uatomic_read(&entry->flags);
> + if (flags & ENTRY_RECLAIM_BIT)
> + return 1;
> + return 0;
> +}
> +
ditto, entry_in_flush()
> +static inline int entry_is_dirty(struct object_cache_entry *entry)
> +{
> + return !!entry->bmap;
> +}
> +
> +static inline void mark_entry_reclaiming(struct object_cache_entry *entry)
> +{
> + int flags = uatomic_read(&entry->flags);
> + flags |= ENTRY_RECLAIM_BIT;
> + uatomic_set(&entry->flags, flags);
> +}
> +
I think mark_entry_reclaim() would be simpler.
> +static int check_cache_status(struct object_cache *oc,
> + struct object_cache_entry *entry)
> +{
> + if (cache_is_flushing(oc)) {
> + dprintf("cache %" PRIx32 " is flushing, don't reclaim it.\n",
> + oc->vid);
> + return SD_RES_CACHE_FLUSHING;
> + }
> +
> + /* If entry is being accessed, we don't reclaim it */
> + if (uatomic_read(&entry->refcnt) > 0) {
> + dprintf("cache object %" PRIx32 "(%08" PRIx32 ") "
> + "can't be reclaimed, refcnt: %d\n",
> + oc->vid, entry->idx, uatomic_read(&entry->refcnt));
> + return SD_RES_CACHE_REFERENCING;
> + }
use a variable for uatomic_read(&entry->refcnt), we should avoid use
uatomic_read(&entry->refcnt in dprintf().
> +
> + return SD_RES_SUCCESS;
> +}
> +
> +static int reclaim_object(struct object_cache_entry *entry)
> +{
> + struct object_cache *oc = entry->oc;
> + uint32_t idx = entry->idx;
> + int ret = SD_RES_SUCCESS;
> +
> + pthread_rwlock_wrlock(&oc->lock);
> + dprintf("reclaiming /%06"PRIx32"/%08"PRIx32", cache_size: %ld\n",
> + oc->vid, idx, uatomic_read(&sys_cache.cache_size));
> +
> + ret = check_cache_status(oc, entry);
> + if (ret != SD_RES_SUCCESS)
> + goto out;
> +
> + if (entry_is_dirty(entry)) {
> + uint64_t bmap = entry->bmap;
> + int create = entry->flags & ENTRY_CREATE_BIT;
> +
> + entry->bmap = 0;
> + del_from_dirty_tree_and_list(entry, &oc->dirty_tree);
> + pthread_rwlock_unlock(&oc->lock);
> +
> + ret = push_cache_object(oc->vid, idx, bmap, create);
> +
> + pthread_rwlock_wrlock(&oc->lock);
> + if (ret == SD_RES_SUCCESS) {
> + entry->flags &= ~ENTRY_CREATE_BIT;
> + /* dirty again */
> + if (entry_is_dirty(entry)) {
> + dprintf("object cache is dirty again %06" PRIx32 "\n", idx);
> + ret = SD_RES_CACHE_REFERENCING;
> + goto out;
> + }
> + } else {
> + /* still dirty */
> + entry->bmap |= bmap;
> + ret = SD_RES_CACHE_REFERENCING;
> + goto out;
> + }
> +
> + /* Now we get lock again, check cache status again */
> + ret = check_cache_status(oc, entry);
> + if (ret != SD_RES_SUCCESS)
> + goto out;
> + }
> +
> + mark_entry_reclaiming(entry);
> +
> + ret = remove_cache_object(oc, idx);
> + if (ret == SD_RES_SUCCESS)
> + del_from_object_tree_and_list(entry, &oc->object_tree);
> +out:
> + pthread_rwlock_unlock(&oc->lock);
> + return ret;
> +}
> +
> +static void reclaim_work(struct work *work)
> +{
> + struct object_cache_entry *entry, *n;
> + int ret;
> +
> + if (node_in_recovery())
> + return;
> +
> + list_for_each_entry_revert_safe_rcu(entry, n,
> + &sys_cache.cache_lru_list, lru_list) {
> + /* Reclaim cache to 80% of max size */
> + if (uatomic_read(&sys_cache.cache_size) <=
> + sys->cache_size * 8 / 10)
> + break;
> +
> + ret = reclaim_object(entry);
> + if (ret == SD_RES_CACHE_FLUSHING)
> + /* If cache is flushing, stop reclaiming. */
> + break;
> +
> + if (ret == SD_RES_SUCCESS) {
> + unsigned data_length;
> + if (idx_has_vdi_bit(entry->idx))
> + data_length = SD_INODE_SIZE;
> + else
> + data_length = SD_DATA_OBJ_SIZE;
> +
> + uatomic_sub(&sys_cache.cache_size, data_length);
> + free(entry);
> + }
> + }
> +
> + dprintf("cache reclaim complete\n");
> +}
> +
> +static void reclaim_done(struct work *work)
> +{
> + uatomic_set(&sys_cache.reclaiming, 0);
> + free(work);
> +}
> +
> static struct object_cache_entry *
> dirty_tree_insert(struct object_cache *oc, uint32_t idx,
> uint64_t bmap, int create)
> @@ -183,6 +556,8 @@ dirty_tree_insert(struct object_cache *oc, uint32_t idx,
> else {
> /* already has this entry, merge bmap */
> entry->bmap |= bmap;
> + if (create)
> + entry->flags |= ENTRY_CREATE_BIT;
> return entry;
> }
> }
> @@ -192,7 +567,8 @@ dirty_tree_insert(struct object_cache *oc, uint32_t idx,
> return NULL;
>
> entry->bmap |= bmap;
> - entry->flags |= ENTRY_CREATE_BIT;
> + if (create)
> + entry->flags |= ENTRY_CREATE_BIT;
> rb_link_node(&entry->dirty_node, parent, p);
> rb_insert_color(&entry->dirty_node, &oc->dirty_tree);
> list_add(&entry->list, &oc->dirty_list);
> @@ -200,26 +576,6 @@ dirty_tree_insert(struct object_cache *oc, uint32_t idx,
> return entry;
> }
>
> -static struct object_cache_entry *dirty_tree_search(struct rb_root *root,
> - uint32_t idx)
> -{
> - struct rb_node *n = root->rb_node;
> - struct object_cache_entry *t;
> -
> - while (n) {
> - t = rb_entry(n, struct object_cache_entry, dirty_node);
> -
> - if (idx < t->idx)
> - n = n->rb_left;
> - else if (idx > t->idx)
> - n = n->rb_right;
> - else
> - return t; /* found it */
> - }
> -
> - return NULL;
> -}
> -
> static int create_dir_for(uint32_t vid)
> {
> int ret = 0;
> @@ -257,6 +613,7 @@ not_found:
> if (create) {
> cache = xzalloc(sizeof(*cache));
> cache->vid = vid;
> + cache->object_tree = RB_ROOT;
> create_dir_for(vid);
>
> cache->dirty_tree = RB_ROOT;
> @@ -271,14 +628,6 @@ out:
> return cache;
> }
>
> -static inline void
> -del_from_dirty_tree_and_list(struct object_cache_entry *entry,
> - struct rb_root *dirty_tree)
> -{
> - rb_erase(&entry->dirty_node, dirty_tree);
> - list_del(&entry->list);
> -}
> -
> /* Caller should hold the oc->lock */
> static inline void
> add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
> @@ -289,6 +638,9 @@ add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
> if (!entry)
> panic("Can not find object entry %" PRIx32 "\n", idx);
>
> + if (cache_in_reclaim(0))
> + return;
> +
> /* If cache isn't in reclaiming, move it
> * to the head of lru list */
> cds_list_del_rcu(&entry->lru_list);
> @@ -312,15 +664,18 @@ static void add_to_object_cache(struct object_cache *oc, uint32_t idx)
> uint32_t data_length;
>
> if (idx_has_vdi_bit(idx))
> - data_length = SD_INODE_SIZE / 1024;
> + data_length = SD_INODE_SIZE;
> else
> - data_length = SD_DATA_OBJ_SIZE / 1024;
> + data_length = SD_DATA_OBJ_SIZE;
>
> entry = xzalloc(sizeof(*entry));
> entry->oc = oc;
> entry->idx = idx;
> CDS_INIT_LIST_HEAD(&entry->lru_list);
>
> + dprintf("cache object for vdi %" PRIx32 ", idx %08" PRIx32 "added\n",
> + oc->vid, idx);
> +
> pthread_rwlock_wrlock(&oc->lock);
> old = object_cache_insert(&oc->object_tree, entry);
> if (!old) {
> @@ -331,20 +686,51 @@ static void add_to_object_cache(struct object_cache *oc, uint32_t idx)
> entry = old;
> }
> pthread_rwlock_unlock(&oc->lock);
> +
> + dprintf("sys_cache.cache_size %" PRIx64 ", sys->cache_size %" PRIx64 "\n",
> + uatomic_read(&sys_cache.cache_size), sys->cache_size);
> + if (sys->cache_size &&
> + uatomic_read(&sys_cache.cache_size) > sys->cache_size &&
> + !cache_in_reclaim(1)) {
> + struct work *work = xzalloc(sizeof(struct work));
> + work->fn = reclaim_work;
> + work->done = reclaim_done;
> + queue_work(sys->reclaim_wqueue, work);
> + }
> +}
> +
> +static inline struct object_cache_entry *
> +find_cache_entry(struct object_cache *oc, uint32_t idx)
> +{
> + struct object_cache_entry *entry;
> +
> + entry = object_tree_search(&oc->object_tree, idx);
> + if (!entry || entry_is_reclaiming(entry))
> + return NULL;
> +
> + return entry;
> }
>
> static int object_cache_lookup(struct object_cache *oc, uint32_t idx,
> int create)
> {
> struct strbuf buf;
> - int fd, ret = 0, flags = def_open_flags;
> + int fd, ret = SD_RES_SUCCESS, flags = def_open_flags;
> + unsigned data_length;
> +
> + if (!create) {
> + pthread_rwlock_wrlock(&oc->lock);
> + if (!find_cache_entry(oc, idx))
> + ret = SD_RES_NO_CACHE;
> + pthread_rwlock_unlock(&oc->lock);
> + return ret;
> + }
>
> strbuf_init(&buf, PATH_MAX);
> strbuf_addstr(&buf, cache_dir);
> strbuf_addf(&buf, "/%06"PRIx32"/%08"PRIx32, oc->vid, idx);
>
> - if (create)
> - flags |= O_CREAT | O_TRUNC;
> + flags |= O_CREAT | O_TRUNC;
>
> fd = open(buf.buf, flags, def_fmode);
> if (fd < 0) {
> @@ -352,26 +738,22 @@ static int object_cache_lookup(struct object_cache *oc, uint32_t idx,
> goto out;
> }
>
> - if (create) {
> - unsigned data_length;
> -
> - if (idx_has_vdi_bit(idx))
> - data_length = SD_INODE_SIZE;
> - else
> - data_length = SD_DATA_OBJ_SIZE;
> + if (idx_has_vdi_bit(idx))
> + data_length = SD_INODE_SIZE;
> + else
> + data_length = SD_DATA_OBJ_SIZE;
>
> - ret = prealloc(fd, data_length);
> - if (ret != SD_RES_SUCCESS)
> - ret = -1;
> - else {
> - uint64_t bmap = UINT64_MAX;
> + ret = prealloc(fd, data_length);
> + if (ret != SD_RES_SUCCESS)
> + ret = -1;
> + else {
> + uint64_t bmap = UINT64_MAX;
>
> - add_to_object_cache(oc, idx);
> + add_to_object_cache(oc, idx);
>
> - pthread_rwlock_wrlock(&oc->lock);
> - add_to_dirty_tree_and_list(oc, idx, bmap, 1);
> - pthread_rwlock_unlock(&oc->lock);
> - }
> + pthread_rwlock_wrlock(&oc->lock);
> + add_to_dirty_tree_and_list(oc, idx, bmap, 1);
> + pthread_rwlock_unlock(&oc->lock);
> }
> close(fd);
> out:
> @@ -379,97 +761,6 @@ out:
> return ret;
> }
>
> -static int write_cache_object(uint32_t vid, uint32_t idx, void *buf,
> - size_t count, off_t offset)
> -{
> - size_t size;
> - int fd, flags = def_open_flags, ret = SD_RES_SUCCESS;
> - struct strbuf p;
> -
> - strbuf_init(&p, PATH_MAX);
> - strbuf_addstr(&p, cache_dir);
> - strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
> -
> - if (sys->use_directio && !idx_has_vdi_bit(idx))
> - flags |= O_DIRECT;
> -
> - fd = open(p.buf, flags, def_fmode);
> - if (fd < 0) {
> - eprintf("%m\n");
> - ret = SD_RES_EIO;
> - goto out;
> - }
> -
> - if (flock(fd, LOCK_EX) < 0) {
> - ret = SD_RES_EIO;
> - eprintf("%m\n");
> - goto out_close;
> - }
> - size = xpwrite(fd, buf, count, offset);
> - if (flock(fd, LOCK_UN) < 0) {
> - ret = SD_RES_EIO;
> - eprintf("%m\n");
> - goto out_close;
> - }
> -
> - if (size != count) {
> - eprintf("size %zu, count:%zu, offset %jd %m\n",
> - size, count, (intmax_t)offset);
> - ret = SD_RES_EIO;
> - }
> -out_close:
> - close(fd);
> -out:
> - strbuf_release(&p);
> - return ret;
> -}
> -
> -static int read_cache_object(uint32_t vid, uint32_t idx, void *buf,
> - size_t count, off_t offset)
> -{
> - size_t size;
> - int fd, flags = def_open_flags, ret = SD_RES_SUCCESS;
> - struct strbuf p;
> -
> - strbuf_init(&p, PATH_MAX);
> - strbuf_addstr(&p, cache_dir);
> - strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
> -
> - if (sys->use_directio && !idx_has_vdi_bit(idx))
> - flags |= O_DIRECT;
> -
> - fd = open(p.buf, flags, def_fmode);
> - if (fd < 0) {
> - eprintf("%m\n");
> - ret = SD_RES_EIO;
> - goto out;
> - }
> -
> - if (flock(fd, LOCK_SH) < 0) {
> - ret = SD_RES_EIO;
> - eprintf("%m\n");
> - goto out_close;
> - }
> - size = xpread(fd, buf, count, offset);
> - if (flock(fd, LOCK_UN) < 0) {
> - ret = SD_RES_EIO;
> - eprintf("%m\n");
> - goto out_close;
> - }
> -
> - if (size != count) {
> - eprintf("size %zu, count:%zu, offset %jd %m\n",
> - size, count, (intmax_t)offset);
> - ret = SD_RES_EIO;
> - }
> -
> -out_close:
> - close(fd);
> -out:
> - strbuf_release(&p);
> - return ret;
> -}
> -
> static int create_cache_object(struct object_cache *oc, uint32_t idx,
> void *buffer, size_t buf_size)
> {
> @@ -558,75 +849,6 @@ out:
> return ret;
> }
>
> -static uint64_t idx_to_oid(uint32_t vid, uint32_t idx)
> -{
> - if (idx_has_vdi_bit(idx))
> - return vid_to_vdi_oid(vid);
> - else
> - return vid_to_data_oid(vid, idx);
> -}
> -
> -static int push_cache_object(uint32_t vid, uint32_t idx, uint64_t bmap,
> - int create)
> -{
> - struct sd_req hdr;
> - void *buf;
> - off_t offset;
> - unsigned data_length;
> - int ret = SD_RES_NO_MEM;
> - uint64_t oid = idx_to_oid(vid, idx);
> - int first_bit, last_bit;
> -
> - dprintf("%"PRIx64", create %d\n", oid, create);
> -
> - if (!bmap) {
> - dprintf("WARN: nothing to flush\n");
> - return SD_RES_SUCCESS;
> - }
> -
> - first_bit = ffsll(bmap) - 1;
> - last_bit = fls64(bmap) - 1;
> -
> - dprintf("bmap:0x%"PRIx64", first_bit:%d, last_bit:%d\n",
> - bmap, first_bit, last_bit);
> - offset = first_bit * CACHE_BLOCK_SIZE;
> - data_length = (last_bit - first_bit + 1) * CACHE_BLOCK_SIZE;
> -
> - /*
> - * CACHE_BLOCK_SIZE may not be divisible by SD_INODE_SIZE,
> - * so (offset + data_length) could larger than SD_INODE_SIZE
> - */
> - if (is_vdi_obj(oid) && (offset + data_length) > SD_INODE_SIZE)
> - data_length = SD_INODE_SIZE - offset;
> -
> - buf = valloc(data_length);
> - if (buf == NULL) {
> - eprintf("failed to allocate memory\n");
> - goto out;
> - }
> -
> - ret = read_cache_object(vid, idx, buf, data_length, offset);
> - if (ret != SD_RES_SUCCESS)
> - goto out;
> -
> - if (create)
> - sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ);
> - else
> - sd_init_req(&hdr, SD_OP_WRITE_OBJ);
> - hdr.flags = SD_FLAG_CMD_WRITE;
> - hdr.data_length = data_length;
> - hdr.obj.oid = oid;
> - hdr.obj.offset = offset;
> -
> - ret = exec_local_req(&hdr, buf);
> - if (ret != SD_RES_SUCCESS)
> - eprintf("failed to push object %x\n", ret);
> -
> -out:
> - free(buf);
> - return ret;
> -}
> -
> /* Push back all the dirty objects to sheep cluster storage */
> static int object_cache_push(struct object_cache *oc)
> {
> @@ -647,20 +869,23 @@ static int object_cache_push(struct object_cache *oc)
> pthread_rwlock_rdlock(&oc->lock);
> bmap = entry->bmap;
> create = entry->flags & ENTRY_CREATE_BIT;
> + entry->bmap = 0;
> pthread_rwlock_unlock(&oc->lock);
>
> ret = push_cache_object(oc->vid, entry->idx, bmap, create);
> - if (ret != SD_RES_SUCCESS)
> - goto push_failed;
>
> pthread_rwlock_wrlock(&oc->lock);
> + if (ret != SD_RES_SUCCESS) {
> + entry->bmap |= bmap;
> + goto push_failed;
> + }
> + entry->flags &= ~ENTRY_CREATE_BIT;
> del_from_dirty_tree_and_list(entry, &oc->dirty_tree);
> pthread_rwlock_unlock(&oc->lock);
> }
> return ret;
>
> push_failed:
> - pthread_rwlock_wrlock(&oc->lock);
> list_splice_init(&inactive_dirty_list, &oc->dirty_list);
> pthread_rwlock_unlock(&oc->lock);
>
> @@ -677,10 +902,10 @@ int object_is_cached(uint64_t oid)
> if (!cache)
> return 0;
>
> - if (object_cache_lookup(cache, idx, 0) < 0)
> - return 0;
> + if (object_cache_lookup(cache, idx, 0) == SD_RES_SUCCESS)
> + return 1;
> else
> - return 1; /* found it */
> + return 0;
> }
>
> void object_cache_delete(uint32_t vid)
> @@ -712,6 +937,44 @@ void object_cache_delete(uint32_t vid)
>
> }
>
> +static void object_cache_flush_begin(struct object_cache *oc)
> +{
> + pthread_rwlock_wrlock(&oc->lock);
> + oc->in_flush = 1;
> + pthread_rwlock_unlock(&oc->lock);
> +}
> +
> +static void object_cache_flush_end(struct object_cache *oc)
> +{
> + pthread_rwlock_wrlock(&oc->lock);
> + oc->in_flush = 0;
> + pthread_rwlock_unlock(&oc->lock);
> +}
> +
I think we can use uatomic_cmpchg() for oc->in_flush, instead of mutex
lock.Also, as I have mentioned, if these two helpers only operate on
oc->in_flush, we'd better fold them into the caller to code the logic
more explicit.
Thanks,
Yuan
More information about the sheepdog
mailing list