[sheepdog] [PATCH UPDATE] object cache: reclaim cached objects when cache reaches the max size

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Fri Jul 27 12:47:59 CEST 2012


At Fri, 27 Jul 2012 17:48:07 +0800,
levin li wrote:
> 
> From: levin li <xingke.lwp at taobao.com>
> 
> This patch does the reclaiming work when the total size of cached objects
> reaches the max size specified by the user. I did it in the following way:
> 
> 1. Check the object tree for the object entry to determine whether the
>    cache entry exists and whether it is being reclaimed; if it is being
>    reclaimed, we make sheep ignore the cache.
> 2. In object_cache_rw() we search for the cache entry; after it passes the
>    sanity check, we increment its refcnt to tell the reclaiming worker that
>    this entry is being referenced and should not be reclaimed now.
> 3. In add_to_object_cache(), when the cached size reaches the max size,
>    we start a reclaiming thread; only one such thread can be running at
>    a time.
> 4. In reclaim_work(), we reclaim cached objects until the cache size is
>    reduced to 80% of the max size.
> 5. In reclaim_object(), we start to reclaim an object. Before doing so, we
>    check whether the cache is being flushed; if it is, we don't reclaim the
>    object, and if its refcnt is not zero, we don't reclaim it either.
>    If the cached object is dirty, we flush it with push_cache_object() and
>    then try to remove the object.
> 
> Signed-off-by: levin li <xingke.lwp at taobao.com>
> ---
>  include/list.h           |   10 +
>  include/sheepdog_proto.h |    1 +
>  sheep/object_cache.c     |  757 +++++++++++++++++++++++++++++++---------------
>  sheep/sheep.c            |    3 +-
>  sheep/sheep_priv.h       |    1 +
>  sheep/store.c            |    8 +
>  6 files changed, 537 insertions(+), 243 deletions(-)
> 
> diff --git a/include/list.h b/include/list.h
> index 30ee3c4..e1d645d 100644
> --- a/include/list.h
> +++ b/include/list.h
> @@ -259,3 +259,13 @@ static inline void hlist_add_after(struct hlist_node *n,
>               pos = n)
>  
>  #endif
> +
> +
> +#define list_for_each_entry_revert_safe_rcu(pos, n, head, member)          \
> +	for (pos = cds_list_entry(rcu_dereference((head)->prev),           \
> +				  typeof(*pos), member),                   \
> +	     n = cds_list_entry(rcu_dereference(pos->member.prev),         \
> +				typeof(*pos), member);                     \
> +				&pos->member != (head);                    \
> +	     pos = n, n = cds_list_entry(rcu_dereference(pos->member.prev),\
> +					 typeof(*pos), member))

It might be simpler and more useful to define cds_list_add_tail_rcu
and use it for lru_list.
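
Something like this rough sketch, perhaps (untested; modeled on the
kernel's list_add_tail_rcu() and assuming liburcu's cds_list_head
layout):

  static inline void cds_list_add_tail_rcu(struct cds_list_head *newp,
                                           struct cds_list_head *head)
  {
          /* Readers traverse ->next under rcu_read_lock(), so the
           * single rcu_assign_pointer() store below publishes the
           * new node to them. */
          newp->next = head;
          newp->prev = head->prev;
          rcu_assign_pointer(head->prev->next, newp);
          head->prev = newp;
  }

New entries would then go to the tail (most recently used), and the
reclaimer could walk forward from the head (least recently used) with
the existing forward-traversal helpers instead of chasing ->prev
pointers, which are not published with rcu_assign_pointer() anyway.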


> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index 45a4b81..05597fb 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -68,6 +68,7 @@
>  #define SD_RES_CLUSTER_RECOVERING 0x22 /* Cluster is recovering. */
>  #define SD_RES_OBJ_RECOVERING     0x23 /* Object is recovering */
>  #define SD_RES_KILLED           0x24 /* Node is killed */
> +#define SD_RES_NO_CACHE      0x25 /* No cache object found */

This is an internal error code and should be moved to
include/internal_proto.h, no?
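
That is, something along these lines in include/internal_proto.h,
keeping the value from the patch:

  #define SD_RES_NO_CACHE      0x25 /* No cached object found */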


>  
>  /* errors above 0x80 are sheepdog-internal */
>  
> diff --git a/sheep/object_cache.c b/sheep/object_cache.c
> index 19480c2..b98335f 100644
> --- a/sheep/object_cache.c
> +++ b/sheep/object_cache.c
> @@ -40,7 +40,13 @@
>  #define CACHE_VDI_BIT         (UINT32_C(1) << CACHE_VDI_SHIFT)
>  #define CACHE_BLOCK_SIZE      ((UINT64_C(1) << 10) * 64) /* 64 KB */
>  
> -#define ENTRY_CREATE_BIT      (1)
> +#define CACHE_RECLAIM_SHIFT   27
> +#define CACHE_RECLAIM_BIT     (UINT32_C(1) << CACHE_RECLAIM_SHIFT)
> +
> +#define CACHE_CREATE_SHIFT    26
> +#define CACHE_CREATE_BIT      (UINT32_C(1) << CACHE_CREATE_SHIFT)
> +
> +#define CACHE_INDEX_MASK      (CACHE_RECLAIM_BIT | CACHE_CREATE_BIT)
>  
>  struct global_cache {
>  	uint64_t cache_size;
> @@ -50,6 +56,7 @@ struct global_cache {
>  
>  struct object_cache {
>  	uint32_t vid;
> +	uint8_t in_flush;
>  	struct hlist_node hash;
>  
>  	struct list_head dirty_list;
> @@ -64,7 +71,6 @@ struct object_cache_entry {
>  	uint64_t bmap; /* each bit represents one dirty
>  			* block which should be flushed */
>  	int refcnt;
> -	int flags;
>  	struct rb_node node;
>  	struct rb_node dirty_node;
>  	struct object_cache *oc;
> @@ -85,11 +91,29 @@ static pthread_mutex_t hashtable_lock[HASH_SIZE] = {
>  
>  static struct hlist_head cache_hashtable[HASH_SIZE];
>  
> +static inline int cache_in_reclaim(int start)
> +{
> +	if (start)
> +		return uatomic_cmpxchg(&sys_cache.reclaiming, 0, 1);
> +	else
> +		return uatomic_read(&sys_cache.reclaiming);
> +}
> +
> +static inline int entry_is_dirty(struct object_cache_entry *entry)
> +{
> +	return !!entry->bmap;
> +}
> +
>  static inline int hash(uint64_t vid)
>  {
>  	return hash_64(vid, HASH_BITS);
>  }
>  
> +static inline uint32_t idx_mask(uint32_t idx)
> +{
> +	return idx &= ~CACHE_INDEX_MASK;
> +}
> +
>  static inline uint32_t object_cache_oid_to_idx(uint64_t oid)
>  {
>  	uint32_t idx = data_oid_to_idx(oid);
> @@ -124,14 +148,15 @@ object_cache_insert(struct rb_root *root, struct object_cache_entry *new)
>  	struct rb_node **p = &root->rb_node;
>  	struct rb_node *parent = NULL;
>  	struct object_cache_entry *entry;
> +	uint32_t idx = idx_mask(new->idx);
>  
>  	while (*p) {
>  		parent = *p;
>  		entry = rb_entry(parent, struct object_cache_entry, node);
>  
> -		if (new->idx < entry->idx)
> +		if (idx < idx_mask(entry->idx))
>  			p = &(*p)->rb_left;
> -		else if (new->idx > entry->idx)
> +		else if (idx > idx_mask(entry->idx))
>  			p = &(*p)->rb_right;
>  		else {
>  			/* already has this entry */
> @@ -149,13 +174,14 @@ static struct object_cache_entry *object_tree_search(struct rb_root *root,
>  {
>  	struct rb_node *n = root->rb_node;
>  	struct object_cache_entry *t;
> +	idx = idx_mask(idx);
>  
>  	while (n) {
>  		t = rb_entry(n, struct object_cache_entry, node);
>  
> -		if (idx < t->idx)
> +		if (idx < idx_mask(t->idx))
>  			n = n->rb_left;
> -		else if (idx > t->idx)
> +		else if (idx > idx_mask(t->idx))
>  			n = n->rb_right;
>  		else
>  			return t; /* found it */
> @@ -164,6 +190,339 @@ static struct object_cache_entry *object_tree_search(struct rb_root *root,
>  	return NULL;
>  }
>  
> +static struct object_cache_entry *dirty_tree_search(struct rb_root *root,
> +						    uint32_t idx)
> +{
> +	struct rb_node *n = root->rb_node;
> +	struct object_cache_entry *t;
> +	idx = idx_mask(idx);
> +
> +	while (n) {
> +		t = rb_entry(n, struct object_cache_entry, dirty_node);
> +
> +		if (idx < idx_mask(t->idx))
> +			n = n->rb_left;
> +		else if (idx > idx_mask(t->idx))
> +			n = n->rb_right;
> +		else
> +			return t; /* found it */
> +	}
> +
> +	return NULL;
> +}
> +
> +static inline void
> +del_from_dirty_tree_and_list(struct object_cache_entry *entry,
> +			     struct rb_root *dirty_tree)
> +{
> +	rb_erase(&entry->dirty_node, dirty_tree);
> +	list_del(&entry->list);
> +}
> +
> +static inline void
> +del_from_object_tree_and_list(struct object_cache_entry *entry,
> +			      struct rb_root *object_tree)
> +{
> +	rb_erase(&entry->node, object_tree);
> +	cds_list_del_rcu(&entry->lru_list);
> +}
> +
> +static uint64_t cache_vid_to_data_oid(uint32_t vid, uint32_t idx)
> +{
> +	idx = idx_mask(idx);
> +
> +	return vid_to_data_oid(vid, idx);
> +}
> +
> +static uint64_t idx_to_oid(uint32_t vid, uint32_t idx)
> +{
> +	if (idx_has_vdi_bit(idx))
> +		return vid_to_vdi_oid(vid);
> +	else
> +		return cache_vid_to_data_oid(vid, idx);
> +}
> +
> +static int remove_cache_object(struct object_cache *oc, uint32_t idx)
> +{
> +	struct strbuf buf;
> +	int ret = SD_RES_SUCCESS;
> +
> +	idx = idx_mask(idx);
> +
> +	strbuf_init(&buf, PATH_MAX);
> +	strbuf_addstr(&buf, cache_dir);
> +	strbuf_addf(&buf, "/%06"PRIx32"/%08"PRIx32, oc->vid, idx);
> +
> +	dprintf("removing cache object %s\n", buf.buf);
> +	if (unlink(buf.buf) < 0) {
> +		ret = SD_RES_EIO;
> +		eprintf("failed to remove cached object %m\n");
> +		goto out;
> +	}
> +out:
> +	strbuf_release(&buf);
> +
> +	return ret;
> +}
> +
> +static int write_cache_object(uint32_t vid, uint32_t idx, void *buf,
> +			      size_t count, off_t offset)
> +{
> +	size_t size;
> +	int fd, flags = def_open_flags, ret = SD_RES_SUCCESS;
> +	struct strbuf p;
> +
> +	strbuf_init(&p, PATH_MAX);
> +	strbuf_addstr(&p, cache_dir);
> +	strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
> +
> +	if (sys->use_directio && !idx_has_vdi_bit(idx))
> +		flags |= O_DIRECT;
> +
> +	fd = open(p.buf, flags, def_fmode);
> +	if (fd < 0) {
> +		eprintf("%m\n");
> +		ret = SD_RES_EIO;
> +		goto out;
> +	}
> +
> +	if (flock(fd, LOCK_EX) < 0) {
> +		ret = SD_RES_EIO;
> +		eprintf("%m\n");
> +		goto out_close;
> +	}
> +	size = xpwrite(fd, buf, count, offset);
> +	if (flock(fd, LOCK_UN) < 0) {
> +		ret = SD_RES_EIO;
> +		eprintf("%m\n");
> +		goto out_close;
> +	}
> +
> +	if (size != count) {
> +		eprintf("size %zu, count:%zu, offset %jd %m\n",
> +			size, count, (intmax_t)offset);
> +		ret = SD_RES_EIO;
> +	}
> +out_close:
> +	close(fd);
> +out:
> +	strbuf_release(&p);
> +	return ret;
> +}
> +
> +static int read_cache_object(uint32_t vid, uint32_t idx, void *buf,
> +			     size_t count, off_t offset)
> +{
> +	size_t size;
> +	int fd, flags = def_open_flags, ret = SD_RES_SUCCESS;
> +	struct strbuf p;
> +
> +	strbuf_init(&p, PATH_MAX);
> +	strbuf_addstr(&p, cache_dir);
> +	strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
> +
> +	if (sys->use_directio && !idx_has_vdi_bit(idx))
> +		flags |= O_DIRECT;
> +
> +	fd = open(p.buf, flags, def_fmode);
> +	if (fd < 0) {
> +		eprintf("%m\n");
> +		ret = SD_RES_EIO;
> +		goto out;
> +	}
> +
> +	if (flock(fd, LOCK_SH) < 0) {
> +		ret = SD_RES_EIO;
> +		eprintf("%m\n");
> +		goto out_close;
> +	}
> +	size = xpread(fd, buf, count, offset);
> +	if (flock(fd, LOCK_UN) < 0) {
> +		ret = SD_RES_EIO;
> +		eprintf("%m\n");
> +		goto out_close;
> +	}
> +
> +	if (size != count) {
> +		eprintf("size %zu, count:%zu, offset %jd %m\n",
> +			size, count, (intmax_t)offset);
> +		ret = SD_RES_EIO;
> +	}
> +
> +out_close:
> +	close(fd);
> +out:
> +	strbuf_release(&p);
> +	return ret;
> +}
> +
> +static int push_cache_object(uint32_t vid, uint32_t idx, uint64_t bmap,
> +			     int create)
> +{
> +	struct sd_req hdr;
> +	void *buf;
> +	off_t offset;
> +	unsigned data_length;
> +	int ret = SD_RES_NO_MEM;
> +	uint64_t oid = idx_to_oid(vid, idx);
> +	int first_bit, last_bit;
> +
> +	dprintf("%"PRIx64", create %d\n", oid, create);
> +
> +	idx = idx_mask(idx);
> +
> +	if (!bmap) {
> +		dprintf("WARN: nothing to flush\n");
> +		return SD_RES_SUCCESS;
> +	}
> +
> +	first_bit = ffsll(bmap) - 1;
> +	last_bit = fls64(bmap) - 1;
> +
> +	dprintf("bmap:0x%"PRIx64", first_bit:%d, last_bit:%d\n",
> +		bmap, first_bit, last_bit);
> +	offset = first_bit * CACHE_BLOCK_SIZE;
> +	data_length = (last_bit - first_bit + 1) * CACHE_BLOCK_SIZE;
> +
> +	/*
> +	 * CACHE_BLOCK_SIZE may not be divisible by SD_INODE_SIZE,
> +	 * so (offset + data_length) could larger than SD_INODE_SIZE
> +	 */
> +	if (is_vdi_obj(oid) && (offset + data_length) > SD_INODE_SIZE)
> +		data_length = SD_INODE_SIZE - offset;
> +
> +	buf = valloc(data_length);
> +	if (buf == NULL) {
> +		eprintf("failed to allocate memory\n");
> +		goto out;
> +	}
> +
> +	ret = read_cache_object(vid, idx, buf, data_length, offset);
> +	if (ret != SD_RES_SUCCESS)
> +		goto out;
> +
> +	if (create)
> +		sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ);
> +	else
> +		sd_init_req(&hdr, SD_OP_WRITE_OBJ);
> +	hdr.flags = SD_FLAG_CMD_WRITE;
> +	hdr.data_length = data_length;
> +	hdr.obj.oid = oid;
> +	hdr.obj.offset = offset;
> +
> +	ret = exec_local_req(&hdr, buf);
> +	if (ret != SD_RES_SUCCESS)
> +		eprintf("failed to push object %x\n", ret);
> +
> +out:
> +	free(buf);
> +	return ret;
> +}
> +
> +static int reclaim_object(struct object_cache_entry *entry)
> +{
> +	struct object_cache *oc = entry->oc;
> +	int ret = SD_RES_SUCCESS;
> +
> +	pthread_rwlock_wrlock(&oc->lock);
> +	dprintf("reclaiming /%06"PRIx32"/%08"PRIx32", cache_size: %ld\n",
> +		oc->vid, entry->idx, uatomic_read(&sys_cache.cache_size));
> +
> +	if (uatomic_read(&entry->refcnt) > 0) {
> +		ret = -1;
> +		goto out;
> +	}
> +
> +	if (entry_is_dirty(entry)) {
> +		uint64_t bmap = entry->bmap;
> +		int create = entry->idx & CACHE_CREATE_BIT;
> +
> +		if (oc->in_flush) {
> +			ret = -1;
> +			goto out;
> +		}
> +
> +		entry->bmap = 0;
> +		del_from_dirty_tree_and_list(entry, &oc->dirty_tree);
> +		pthread_rwlock_unlock(&oc->lock);
> +
> +		ret = push_cache_object(oc->vid, entry->idx, bmap, create);
> +
> +		pthread_rwlock_wrlock(&oc->lock);
> +		if (ret != SD_RES_SUCCESS) {
> +			/* still dirty */
> +			entry->bmap |= bmap;
> +			ret = -1;
> +			goto out;
> +		}
> +
> +		entry->idx &= ~CACHE_CREATE_BIT;
> +		/* dirty again */
> +		if (entry_is_dirty(entry)) {
> +			dprintf("object cache is dirty again %06" PRIx32 "\n",
> +				entry->idx);
> +			ret = -1;
> +			goto out;
> +		}
> +
> +		if (oc->in_flush) {
> +			ret = -1;
> +			goto out;
> +		}
> +
> +		if (uatomic_read(&entry->refcnt) > 0) {
> +			ret = -1;
> +			goto out;
> +		}
> +	}
> +
> +	entry->idx |= CACHE_RECLAIM_BIT;
> +
> +	ret = remove_cache_object(oc, entry->idx);
> +	if (ret == SD_RES_SUCCESS)
> +		del_from_object_tree_and_list(entry, &oc->object_tree);
> +out:
> +	pthread_rwlock_unlock(&oc->lock);
> +	return ret;
> +}
> +
> +static void reclaim_work(struct work *work)
> +{
> +	struct object_cache_entry *entry, *n;
> +	int ret;
> +
> +	if (node_in_recovery())
> +		return;

Can you comment briefly on why we cannot reclaim caches while recovering?
I think it will help newcomers understand the code.
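
For example, something like the following, though the reason given in
the comment is only my guess (that reclaim might unlink cached objects
which recovery is concurrently reading or repopulating):

  /* Don't reclaim while this node is recovering: reclaim could
   * race with recovery over the same cached objects.  (Guessed
   * reason; please replace with the real one.) */
  if (node_in_recovery())
          return;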


> +
> +	list_for_each_entry_revert_safe_rcu(entry, n,
> +		       &sys_cache.cache_lru_list, lru_list) {
> +		unsigned data_length;
> +		/* Reclaim cache to 80% of max size */
> +		if (uatomic_read(&sys_cache.cache_size) <=
> +		    sys->cache_size * 8 / 10)
> +			break;
> +
> +		ret = reclaim_object(entry);
> +		if (ret != SD_RES_SUCCESS)
> +			continue;
> +		if (idx_has_vdi_bit(entry->idx))
> +			data_length = SD_INODE_SIZE;
> +		else
> +			data_length = SD_DATA_OBJ_SIZE;
> +
> +		uatomic_sub(&sys_cache.cache_size, data_length);
> +		free(entry);
> +	}
> +
> +	dprintf("cache reclaim complete\n");
> +}
> +
> +static void reclaim_done(struct work *work)
> +{
> +	uatomic_set(&sys_cache.reclaiming, 0);
> +	free(work);
> +}
> +
>  static struct object_cache_entry *
>  dirty_tree_insert(struct object_cache *oc, uint32_t idx,
>  		  uint64_t bmap, int create)
> @@ -171,18 +530,21 @@ dirty_tree_insert(struct object_cache *oc, uint32_t idx,
>  	struct rb_node **p = &oc->dirty_tree.rb_node;
>  	struct rb_node *parent = NULL;
>  	struct object_cache_entry *entry;
> +	idx = idx_mask(idx);
>  
>  	while (*p) {
>  		parent = *p;
>  		entry = rb_entry(parent, struct object_cache_entry, dirty_node);
>  
> -		if (idx < entry->idx)
> +		if (idx < idx_mask(entry->idx))
>  			p = &(*p)->rb_left;
> -		else if (idx > entry->idx)
> +		else if (idx > idx_mask(entry->idx))
>  			p = &(*p)->rb_right;
>  		else {
>  			/* already has this entry, merge bmap */
>  			entry->bmap |= bmap;
> +			if (create)
> +				entry->idx |= CACHE_CREATE_BIT;
>  			return entry;
>  		}
>  	}
> @@ -192,7 +554,8 @@ dirty_tree_insert(struct object_cache *oc, uint32_t idx,
>  		return NULL;
>  
>  	entry->bmap |= bmap;
> -	entry->flags |= ENTRY_CREATE_BIT;
> +	if (create)
> +		entry->idx |= CACHE_CREATE_BIT;
>  	rb_link_node(&entry->dirty_node, parent, p);
>  	rb_insert_color(&entry->dirty_node, &oc->dirty_tree);
>  	list_add(&entry->list, &oc->dirty_list);
> @@ -200,26 +563,6 @@ dirty_tree_insert(struct object_cache *oc, uint32_t idx,
>  	return entry;
>  }
>  
> -static struct object_cache_entry *dirty_tree_search(struct rb_root *root,
> -						    uint32_t idx)
> -{
> -	struct rb_node *n = root->rb_node;
> -	struct object_cache_entry *t;
> -
> -	while (n) {
> -		t = rb_entry(n, struct object_cache_entry, dirty_node);
> -
> -		if (idx < t->idx)
> -			n = n->rb_left;
> -		else if (idx > t->idx)
> -			n = n->rb_right;
> -		else
> -			return t; /* found it */
> -	}
> -
> -	return NULL;
> -}
> -
>  static int create_dir_for(uint32_t vid)
>  {
>  	int ret = 0;
> @@ -257,6 +600,7 @@ not_found:
>  	if (create) {
>  		cache = xzalloc(sizeof(*cache));
>  		cache->vid = vid;
> +		cache->object_tree = RB_ROOT;
>  		create_dir_for(vid);
>  
>  		cache->dirty_tree = RB_ROOT;
> @@ -271,14 +615,6 @@ out:
>  	return cache;
>  }
>  
> -static inline void
> -del_from_dirty_tree_and_list(struct object_cache_entry *entry,
> -			     struct rb_root *dirty_tree)
> -{
> -	rb_erase(&entry->dirty_node, dirty_tree);
> -	list_del(&entry->list);
> -}
> -
>  /* Caller should hold the oc->lock */
>  static inline void
>  add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
> @@ -289,6 +625,9 @@ add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
>  	if (!entry)
>  		panic("Can not find object entry %" PRIx32 "\n", idx);
>  
> +	if (cache_in_reclaim(0))
> +		return;
> +
>  	/* If cache isn't in reclaiming, move it
>  	 * to the head of lru list */
>  	cds_list_del_rcu(&entry->lru_list);
> @@ -312,15 +651,18 @@ static void add_to_object_cache(struct object_cache *oc, uint32_t idx)
>  	uint32_t data_length;
>  
>  	if (idx_has_vdi_bit(idx))
> -		data_length = SD_INODE_SIZE / 1024;
> +		data_length = SD_INODE_SIZE;
>  	else
> -		data_length = SD_DATA_OBJ_SIZE / 1024;
> +		data_length = SD_DATA_OBJ_SIZE;

What's the intention of this change?  And if you want to use the value
1024, please define a macro for it and document what it means.
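
For instance, if the KB-based accounting were to be kept, something
like this (CACHE_SIZE_UNIT is a made-up name):

  #define CACHE_SIZE_UNIT 1024 /* account cache_size in KB */

  if (idx_has_vdi_bit(idx))
          data_length = SD_INODE_SIZE / CACHE_SIZE_UNIT;
  else
          data_length = SD_DATA_OBJ_SIZE / CACHE_SIZE_UNIT;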


>  
>  	entry = xzalloc(sizeof(*entry));
>  	entry->oc = oc;
>  	entry->idx = idx;
>  	CDS_INIT_LIST_HEAD(&entry->lru_list);
>  
> +	dprintf("cache object for vdi %" PRIx32 ", idx %08" PRIx32 "added\n",
> +		oc->vid, idx);
> +
>  	pthread_rwlock_wrlock(&oc->lock);
>  	old = object_cache_insert(&oc->object_tree, entry);
>  	if (!old) {
> @@ -331,142 +673,76 @@ static void add_to_object_cache(struct object_cache *oc, uint32_t idx)
>  		entry = old;
>  	}
>  	pthread_rwlock_unlock(&oc->lock);
> +
> +	if (sys->cache_size &&
> +	    uatomic_read(&sys_cache.cache_size) > sys->cache_size &&
> +	    !cache_in_reclaim(1)) {
> +		struct work *work = xzalloc(sizeof(struct work));
> +		work->fn = reclaim_work;
> +		work->done = reclaim_done;
> +		queue_work(sys->reclaim_wqueue, work);
> +	}
> +}
> +
> +static inline struct object_cache_entry *
> +find_cache_entry(struct object_cache *oc, uint32_t idx)
> +{
> +	struct object_cache_entry *entry;
> +
> +	entry = object_tree_search(&oc->object_tree, idx);
> +	if (!entry || entry->idx & CACHE_RECLAIM_BIT)
> +		return NULL;
> +
> +	return entry;
>  }
>  
>  static int object_cache_lookup(struct object_cache *oc, uint32_t idx,
>  			       int create)
>  {
>  	struct strbuf buf;
> -	int fd, ret = 0, flags = def_open_flags;
> +	int fd, ret = SD_RES_SUCCESS, flags = def_open_flags;
> +	unsigned data_length;
> +
> +	if (!create) {
> +		pthread_rwlock_wrlock(&oc->lock);
> +		if (!find_cache_entry(oc, idx))
> +			ret = SD_RES_NO_CACHE;
> +		pthread_rwlock_unlock(&oc->lock);
> +		return ret;
> +	}
>  
>  	strbuf_init(&buf, PATH_MAX);
>  	strbuf_addstr(&buf, cache_dir);
>  	strbuf_addf(&buf, "/%06"PRIx32"/%08"PRIx32, oc->vid, idx);
>  
> -	if (create)
> -		flags |= O_CREAT | O_TRUNC;
> +	flags |= O_CREAT | O_TRUNC;
>  
>  	fd = open(buf.buf, flags, def_fmode);
>  	if (fd < 0) {
> -		ret = -1;
> -		goto out;
> -	}
> -
> -	if (create) {
> -		unsigned data_length;
> -
> -		if (idx_has_vdi_bit(idx))
> -			data_length = SD_INODE_SIZE;
> -		else
> -			data_length = SD_DATA_OBJ_SIZE;
> -
> -		ret = prealloc(fd, data_length);
> -		if (ret != SD_RES_SUCCESS)
> -			ret = -1;
> -		else {
> -			uint64_t bmap = UINT64_MAX;
> -
> -			add_to_object_cache(oc, idx);
> -
> -			pthread_rwlock_wrlock(&oc->lock);
> -			add_to_dirty_tree_and_list(oc, idx, bmap, 1);
> -			pthread_rwlock_unlock(&oc->lock);
> -		}
> -	}
> -	close(fd);
> -out:
> -	strbuf_release(&buf);
> -	return ret;
> -}
> -
> -static int write_cache_object(uint32_t vid, uint32_t idx, void *buf,
> -			      size_t count, off_t offset)
> -{
> -	size_t size;
> -	int fd, flags = def_open_flags, ret = SD_RES_SUCCESS;
> -	struct strbuf p;
> -
> -	strbuf_init(&p, PATH_MAX);
> -	strbuf_addstr(&p, cache_dir);
> -	strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
> -
> -	if (sys->use_directio && !idx_has_vdi_bit(idx))
> -		flags |= O_DIRECT;
> -
> -	fd = open(p.buf, flags, def_fmode);
> -	if (fd < 0) {
> -		eprintf("%m\n");
>  		ret = SD_RES_EIO;
>  		goto out;
>  	}
>  
> -	if (flock(fd, LOCK_EX) < 0) {
> -		ret = SD_RES_EIO;
> -		eprintf("%m\n");
> -		goto out_close;
> -	}
> -	size = xpwrite(fd, buf, count, offset);
> -	if (flock(fd, LOCK_UN) < 0) {
> -		ret = SD_RES_EIO;
> -		eprintf("%m\n");
> -		goto out_close;
> -	}
> +	if (idx_has_vdi_bit(idx))
> +		data_length = SD_INODE_SIZE;
> +	else
> +		data_length = SD_DATA_OBJ_SIZE;
>  
> -	if (size != count) {
> -		eprintf("size %zu, count:%zu, offset %jd %m\n",
> -			size, count, (intmax_t)offset);
> +	ret = prealloc(fd, data_length);
> +	if (ret != SD_RES_SUCCESS)
>  		ret = SD_RES_EIO;
> -	}
> -out_close:
> -	close(fd);
> -out:
> -	strbuf_release(&p);
> -	return ret;
> -}
> -
> -static int read_cache_object(uint32_t vid, uint32_t idx, void *buf,
> -			     size_t count, off_t offset)
> -{
> -	size_t size;
> -	int fd, flags = def_open_flags, ret = SD_RES_SUCCESS;
> -	struct strbuf p;
> -
> -	strbuf_init(&p, PATH_MAX);
> -	strbuf_addstr(&p, cache_dir);
> -	strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
> -
> -	if (sys->use_directio && !idx_has_vdi_bit(idx))
> -		flags |= O_DIRECT;
> +	else {
> +		uint64_t bmap = UINT64_MAX;
>  
> -	fd = open(p.buf, flags, def_fmode);
> -	if (fd < 0) {
> -		eprintf("%m\n");
> -		ret = SD_RES_EIO;
> -		goto out;
> -	}
> +		add_to_object_cache(oc, idx);
>  
> -	if (flock(fd, LOCK_SH) < 0) {
> -		ret = SD_RES_EIO;
> -		eprintf("%m\n");
> -		goto out_close;
> -	}
> -	size = xpread(fd, buf, count, offset);
> -	if (flock(fd, LOCK_UN) < 0) {
> -		ret = SD_RES_EIO;
> -		eprintf("%m\n");
> -		goto out_close;
> -	}
> -
> -	if (size != count) {
> -		eprintf("size %zu, count:%zu, offset %jd %m\n",
> -			size, count, (intmax_t)offset);
> -		ret = SD_RES_EIO;
> +		pthread_rwlock_wrlock(&oc->lock);
> +		add_to_dirty_tree_and_list(oc, idx, bmap, 1);
> +		pthread_rwlock_unlock(&oc->lock);
>  	}
> -
> -out_close:
>  	close(fd);
>  out:
> -	strbuf_release(&p);
> +	strbuf_release(&buf);
>  	return ret;
>  }
>  
> @@ -531,7 +807,7 @@ static int object_cache_pull(struct object_cache *oc, uint32_t idx)
>  		oid = vid_to_vdi_oid(oc->vid);
>  		data_length = SD_INODE_SIZE;
>  	} else {
> -		oid = vid_to_data_oid(oc->vid, idx);
> +		oid = cache_vid_to_data_oid(oc->vid, idx);
>  		data_length = SD_DATA_OBJ_SIZE;
>  	}
>  
> @@ -558,75 +834,6 @@ out:
>  	return ret;
>  }
>  
> -static uint64_t idx_to_oid(uint32_t vid, uint32_t idx)
> -{
> -	if (idx_has_vdi_bit(idx))
> -		return vid_to_vdi_oid(vid);
> -	else
> -		return vid_to_data_oid(vid, idx);
> -}
> -
> -static int push_cache_object(uint32_t vid, uint32_t idx, uint64_t bmap,
> -			     int create)
> -{
> -	struct sd_req hdr;
> -	void *buf;
> -	off_t offset;
> -	unsigned data_length;
> -	int ret = SD_RES_NO_MEM;
> -	uint64_t oid = idx_to_oid(vid, idx);
> -	int first_bit, last_bit;
> -
> -	dprintf("%"PRIx64", create %d\n", oid, create);
> -
> -	if (!bmap) {
> -		dprintf("WARN: nothing to flush\n");
> -		return SD_RES_SUCCESS;
> -	}
> -
> -	first_bit = ffsll(bmap) - 1;
> -	last_bit = fls64(bmap) - 1;
> -
> -	dprintf("bmap:0x%"PRIx64", first_bit:%d, last_bit:%d\n",
> -		bmap, first_bit, last_bit);
> -	offset = first_bit * CACHE_BLOCK_SIZE;
> -	data_length = (last_bit - first_bit + 1) * CACHE_BLOCK_SIZE;
> -
> -	/*
> -	 * CACHE_BLOCK_SIZE may not be divisible by SD_INODE_SIZE,
> -	 * so (offset + data_length) could larger than SD_INODE_SIZE
> -	 */
> -	if (is_vdi_obj(oid) && (offset + data_length) > SD_INODE_SIZE)
> -		data_length = SD_INODE_SIZE - offset;
> -
> -	buf = valloc(data_length);
> -	if (buf == NULL) {
> -		eprintf("failed to allocate memory\n");
> -		goto out;
> -	}
> -
> -	ret = read_cache_object(vid, idx, buf, data_length, offset);
> -	if (ret != SD_RES_SUCCESS)
> -		goto out;
> -
> -	if (create)
> -		sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ);
> -	else
> -		sd_init_req(&hdr, SD_OP_WRITE_OBJ);
> -	hdr.flags = SD_FLAG_CMD_WRITE;
> -	hdr.data_length = data_length;
> -	hdr.obj.oid = oid;
> -	hdr.obj.offset = offset;
> -
> -	ret = exec_local_req(&hdr, buf);
> -	if (ret != SD_RES_SUCCESS)
> -		eprintf("failed to push object %x\n", ret);
> -
> -out:
> -	free(buf);
> -	return ret;
> -}
> -
>  /* Push back all the dirty objects to sheep cluster storage */
>  static int object_cache_push(struct object_cache *oc)
>  {
> @@ -640,27 +847,33 @@ static int object_cache_push(struct object_cache *oc)
>  		return SD_RES_SUCCESS;
>  
>  	pthread_rwlock_wrlock(&oc->lock);
> +	oc->in_flush = 1;
>  	list_splice_init(&oc->dirty_list, &inactive_dirty_list);
>  	pthread_rwlock_unlock(&oc->lock);
>  
>  	list_for_each_entry_safe(entry, t, &inactive_dirty_list, list) {
> -		pthread_rwlock_rdlock(&oc->lock);
> +		pthread_rwlock_wrlock(&oc->lock);
>  		bmap = entry->bmap;
> -		create = entry->flags & ENTRY_CREATE_BIT;
> +		create = entry->idx & CACHE_CREATE_BIT;
> +		entry->bmap = 0;
> +		del_from_dirty_tree_and_list(entry, &oc->dirty_tree);
>  		pthread_rwlock_unlock(&oc->lock);
>  
>  		ret = push_cache_object(oc->vid, entry->idx, bmap, create);
> -		if (ret != SD_RES_SUCCESS)
> -			goto push_failed;
>  
>  		pthread_rwlock_wrlock(&oc->lock);
> -		del_from_dirty_tree_and_list(entry, &oc->dirty_tree);
> +		if (ret != SD_RES_SUCCESS) {
> +			entry->bmap |= bmap;
> +			goto push_failed;
> +		}
> +		entry->idx &= ~CACHE_CREATE_BIT;
>  		pthread_rwlock_unlock(&oc->lock);
>  	}
> +	oc->in_flush = 0;
>  	return ret;
>  
>  push_failed:
> -	pthread_rwlock_wrlock(&oc->lock);
> +	oc->in_flush = 0;
>  	list_splice_init(&inactive_dirty_list, &oc->dirty_list);
>  	pthread_rwlock_unlock(&oc->lock);
>  
> @@ -677,10 +890,10 @@ int object_is_cached(uint64_t oid)
>  	if (!cache)
>  		return 0;
>  
> -	if (object_cache_lookup(cache, idx, 0) < 0)
> -		return 0;
> +	if (object_cache_lookup(cache, idx, 0) == SD_RES_SUCCESS)
> +		return 1;
>  	else
> -		return 1; /* found it */
> +		return 0;

More simply, how about the following?
  return (object_cache_lookup(cache, idx, 0) == SD_RES_SUCCESS);


Thanks,

Kazutaka


