[sheepdog] [PATCH v3 6/8] object cache: reclaim cached objects when cache reaches the max size

levin li levin108 at gmail.com
Thu Jul 26 10:11:05 CEST 2012


On 07/26/2012 15:55, Liu Yuan wrote:
> On 07/26/2012 03:17 PM, levin li wrote:
>> +static inline int cache_is_flushing(struct object_cache *cache)
>> +{
>> +	return cache->in_flush;
>> +}
>> +
> 
> better renamed as cache_in_flush()
> 
>> +static inline int entry_is_reclaiming(struct object_cache_entry *entry)
>> +{
>> +	int flags = uatomic_read(&entry->flags);
>> +	if (flags & ENTRY_RECLAIM_BIT)
>> +		return 1;
>> +	return 0;
>> +}
>> +
> 
> ditto, entry_in_reclaim()
> 
>> +static inline int entry_is_dirty(struct object_cache_entry *entry)
>> +{
>> +	return !!entry->bmap;
>> +}
>> +
>> +static inline void mark_entry_reclaiming(struct object_cache_entry *entry)
>> +{
>> +	int flags = uatomic_read(&entry->flags);
>> +	flags |= ENTRY_RECLAIM_BIT;
>> +	uatomic_set(&entry->flags, flags);
>> +}
>> +
> 
> I think mark_entry_reclaim() would be simpler.
> 
>> +static int check_cache_status(struct object_cache *oc,
>> +			      struct object_cache_entry *entry)
>> +{
>> +	if (cache_is_flushing(oc)) {
>> +		dprintf("cache %" PRIx32 " is flushing, don't reclaim it.\n",
>> +			oc->vid);
>> +		return SD_RES_CACHE_FLUSHING;
>> +	}
>> +
>> +	/* If entry is being accessed, we don't reclaim it */
>> +	if (uatomic_read(&entry->refcnt) > 0) {
>> +		dprintf("cache object %" PRIx32 "(%08" PRIx32 ") "
>> +			"can't be reclaimed, refcnt: %d\n",
>> +			oc->vid, entry->idx, uatomic_read(&entry->refcnt));
>> +		return SD_RES_CACHE_REFERENCING;
>> +	}
> 
> Use a variable for uatomic_read(&entry->refcnt); we should avoid calling
> uatomic_read(&entry->refcnt) inside dprintf().
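> 
> Something like this, say (untested sketch built from the code above):
> 
> 	int refcnt = uatomic_read(&entry->refcnt);
> 
> 	/* If entry is being accessed, we don't reclaim it */
> 	if (refcnt > 0) {
> 		dprintf("cache object %" PRIx32 "(%08" PRIx32 ") "
> 			"can't be reclaimed, refcnt: %d\n",
> 			oc->vid, entry->idx, refcnt);
> 		return SD_RES_CACHE_REFERENCING;
> 	}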
> 
>> +
>> +	return SD_RES_SUCCESS;
>> +}
>> +
>> +static int reclaim_object(struct object_cache_entry *entry)
>> +{
>> +	struct object_cache *oc = entry->oc;
>> +	uint32_t idx = entry->idx;
>> +	int ret = SD_RES_SUCCESS;
>> +
>> +	pthread_rwlock_wrlock(&oc->lock);
>> +	dprintf("reclaiming /%06"PRIx32"/%08"PRIx32", cache_size: %ld\n",
>> +		oc->vid, idx, uatomic_read(&sys_cache.cache_size));
>> +
>> +	ret = check_cache_status(oc, entry);
>> +	if (ret != SD_RES_SUCCESS)
>> +		goto out;
>> +
>> +	if (entry_is_dirty(entry)) {
>> +		uint64_t bmap = entry->bmap;
>> +		int create = entry->flags & ENTRY_CREATE_BIT;
>> +
>> +		entry->bmap = 0;
>> +		del_from_dirty_tree_and_list(entry, &oc->dirty_tree);
>> +		pthread_rwlock_unlock(&oc->lock);
>> +
>> +		ret = push_cache_object(oc->vid, idx, bmap, create);
>> +
>> +		pthread_rwlock_wrlock(&oc->lock);
>> +		if (ret == SD_RES_SUCCESS) {
>> +			entry->flags &= ~ENTRY_CREATE_BIT;
>> +			/* dirty again */
>> +			if (entry_is_dirty(entry)) {
>> +				dprintf("object cache is dirty again %06" PRIx32 "\n", idx);
>> +				ret = SD_RES_CACHE_REFERENCING;
>> +				goto out;
>> +			}
>> +		} else {
>> +			/* still dirty */
>> +			entry->bmap |= bmap;
>> +			ret = SD_RES_CACHE_REFERENCING;
>> +			goto out;
>> +		}
>> +
>> +		/* Now we get lock again, check cache status again */
>> +		ret = check_cache_status(oc, entry);
>> +		if (ret != SD_RES_SUCCESS)
>> +			goto out;
>> +	}
>> +
>> +	mark_entry_reclaiming(entry);
>> +
>> +	ret = remove_cache_object(oc, idx);
>> +	if (ret == SD_RES_SUCCESS)
>> +		del_from_object_tree_and_list(entry, &oc->object_tree);
>> +out:
>> +	pthread_rwlock_unlock(&oc->lock);
>> +	return ret;
>> +}
>> +
>> +static void reclaim_work(struct work *work)
>> +{
>> +	struct object_cache_entry *entry, *n;
>> +	int ret;
>> +
>> +	if (node_in_recovery())
>> +		return;
>> +
>> +	list_for_each_entry_revert_safe_rcu(entry, n,
>> +		       &sys_cache.cache_lru_list, lru_list) {
>> +		/* Reclaim cache to 80% of max size */
>> +		if (uatomic_read(&sys_cache.cache_size) <=
>> +		    sys->cache_size * 8 / 10)
>> +			break;
>> +
>> +		ret = reclaim_object(entry);
>> +		if (ret == SD_RES_CACHE_FLUSHING)
>> +			/* If cache is flushing, stop reclaiming. */
>> +			break;
>> +
>> +		if (ret == SD_RES_SUCCESS) {
>> +			unsigned data_length;
>> +			if (idx_has_vdi_bit(entry->idx))
>> +				data_length = SD_INODE_SIZE;
>> +			else
>> +				data_length = SD_DATA_OBJ_SIZE;
>> +
>> +			uatomic_sub(&sys_cache.cache_size, data_length);
>> +			free(entry);
>> +		}
>> +	}
>> +
>> +	dprintf("cache reclaim complete\n");
>> +}
>> +
>> +static void reclaim_done(struct work *work)
>> +{
>> +	uatomic_set(&sys_cache.reclaiming, 0);
>> +	free(work);
>> +}
>> +
>>  static struct object_cache_entry *
>>  dirty_tree_insert(struct object_cache *oc, uint32_t idx,
>>  		  uint64_t bmap, int create)
>> @@ -183,6 +556,8 @@ dirty_tree_insert(struct object_cache *oc, uint32_t idx,
>>  		else {
>>  			/* already has this entry, merge bmap */
>>  			entry->bmap |= bmap;
>> +			if (create)
>> +				entry->flags |= ENTRY_CREATE_BIT;
>>  			return entry;
>>  		}
>>  	}
>> @@ -192,7 +567,8 @@ dirty_tree_insert(struct object_cache *oc, uint32_t idx,
>>  		return NULL;
>>  
>>  	entry->bmap |= bmap;
>> -	entry->flags |= ENTRY_CREATE_BIT;
>> +	if (create)
>> +		entry->flags |= ENTRY_CREATE_BIT;
>>  	rb_link_node(&entry->dirty_node, parent, p);
>>  	rb_insert_color(&entry->dirty_node, &oc->dirty_tree);
>>  	list_add(&entry->list, &oc->dirty_list);
>> @@ -200,26 +576,6 @@ dirty_tree_insert(struct object_cache *oc, uint32_t idx,
>>  	return entry;
>>  }
>>  
>> -static struct object_cache_entry *dirty_tree_search(struct rb_root *root,
>> -						    uint32_t idx)
>> -{
>> -	struct rb_node *n = root->rb_node;
>> -	struct object_cache_entry *t;
>> -
>> -	while (n) {
>> -		t = rb_entry(n, struct object_cache_entry, dirty_node);
>> -
>> -		if (idx < t->idx)
>> -			n = n->rb_left;
>> -		else if (idx > t->idx)
>> -			n = n->rb_right;
>> -		else
>> -			return t; /* found it */
>> -	}
>> -
>> -	return NULL;
>> -}
>> -
>>  static int create_dir_for(uint32_t vid)
>>  {
>>  	int ret = 0;
>> @@ -257,6 +613,7 @@ not_found:
>>  	if (create) {
>>  		cache = xzalloc(sizeof(*cache));
>>  		cache->vid = vid;
>> +		cache->object_tree = RB_ROOT;
>>  		create_dir_for(vid);
>>  
>>  		cache->dirty_tree = RB_ROOT;
>> @@ -271,14 +628,6 @@ out:
>>  	return cache;
>>  }
>>  
>> -static inline void
>> -del_from_dirty_tree_and_list(struct object_cache_entry *entry,
>> -			     struct rb_root *dirty_tree)
>> -{
>> -	rb_erase(&entry->dirty_node, dirty_tree);
>> -	list_del(&entry->list);
>> -}
>> -
>>  /* Caller should hold the oc->lock */
>>  static inline void
>>  add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
>> @@ -289,6 +638,9 @@ add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
>>  	if (!entry)
>>  		panic("Can not find object entry %" PRIx32 "\n", idx);
>>  
>> +	if (cache_in_reclaim(0))
>> +		return;
>> +
>>  	/* If cache isn't in reclaiming, move it
>>  	 * to the head of lru list */
>>  	cds_list_del_rcu(&entry->lru_list);
>> @@ -312,15 +664,18 @@ static void add_to_object_cache(struct object_cache *oc, uint32_t idx)
>>  	uint32_t data_length;
>>  
>>  	if (idx_has_vdi_bit(idx))
>> -		data_length = SD_INODE_SIZE / 1024;
>> +		data_length = SD_INODE_SIZE;
>>  	else
>> -		data_length = SD_DATA_OBJ_SIZE / 1024;
>> +		data_length = SD_DATA_OBJ_SIZE;
>>  
>>  	entry = xzalloc(sizeof(*entry));
>>  	entry->oc = oc;
>>  	entry->idx = idx;
>>  	CDS_INIT_LIST_HEAD(&entry->lru_list);
>>  
>> +	dprintf("cache object for vdi %" PRIx32 ", idx %08" PRIx32 "added\n",
>> +		oc->vid, idx);
>> +
>>  	pthread_rwlock_wrlock(&oc->lock);
>>  	old = object_cache_insert(&oc->object_tree, entry);
>>  	if (!old) {
>> @@ -331,20 +686,51 @@ static void add_to_object_cache(struct object_cache *oc, uint32_t idx)
>>  		entry = old;
>>  	}
>>  	pthread_rwlock_unlock(&oc->lock);
>> +
>> +	dprintf("sys_cache.cache_size %" PRIx64 ", sys->cache_size %" PRIx64 "\n",
>> +		uatomic_read(&sys_cache.cache_size), sys->cache_size);
>> +	if (sys->cache_size &&
>> +	    uatomic_read(&sys_cache.cache_size) > sys->cache_size &&
>> +	    !cache_in_reclaim(1)) {
>> +		struct work *work = xzalloc(sizeof(struct work));
>> +		work->fn = reclaim_work;
>> +		work->done = reclaim_done;
>> +		queue_work(sys->reclaim_wqueue, work);
>> +	}
>> +}
>> +
>> +static inline struct object_cache_entry *
>> +find_cache_entry(struct object_cache *oc, uint32_t idx)
>> +{
>> +	struct object_cache_entry *entry;
>> +
>> +	entry = object_tree_search(&oc->object_tree, idx);
>> +	if (!entry || entry_is_reclaiming(entry))
>> +		return NULL;
>> +
>> +	return entry;
>>  }
>>  
>>  static int object_cache_lookup(struct object_cache *oc, uint32_t idx,
>>  			       int create)
>>  {
>>  	struct strbuf buf;
>> -	int fd, ret = 0, flags = def_open_flags;
>> +	int fd, ret = SD_RES_SUCCESS, flags = def_open_flags;
>> +	unsigned data_length;
>> +
>> +	if (!create) {
>> +		pthread_rwlock_wrlock(&oc->lock);
>> +		if (!find_cache_entry(oc, idx))
>> +			ret = SD_RES_NO_CACHE;
>> +		pthread_rwlock_unlock(&oc->lock);
>> +		return ret;
>> +	}
>>  
>>  	strbuf_init(&buf, PATH_MAX);
>>  	strbuf_addstr(&buf, cache_dir);
>>  	strbuf_addf(&buf, "/%06"PRIx32"/%08"PRIx32, oc->vid, idx);
>>  
>> -	if (create)
>> -		flags |= O_CREAT | O_TRUNC;
>> +	flags |= O_CREAT | O_TRUNC;
>>  
>>  	fd = open(buf.buf, flags, def_fmode);
>>  	if (fd < 0) {
>> @@ -352,26 +738,22 @@ static int object_cache_lookup(struct object_cache *oc, uint32_t idx,
>>  		goto out;
>>  	}
>>  
>> -	if (create) {
>> -		unsigned data_length;
>> -
>> -		if (idx_has_vdi_bit(idx))
>> -			data_length = SD_INODE_SIZE;
>> -		else
>> -			data_length = SD_DATA_OBJ_SIZE;
>> +	if (idx_has_vdi_bit(idx))
>> +		data_length = SD_INODE_SIZE;
>> +	else
>> +		data_length = SD_DATA_OBJ_SIZE;
>>  
>> -		ret = prealloc(fd, data_length);
>> -		if (ret != SD_RES_SUCCESS)
>> -			ret = -1;
>> -		else {
>> -			uint64_t bmap = UINT64_MAX;
>> +	ret = prealloc(fd, data_length);
>> +	if (ret != SD_RES_SUCCESS)
>> +		ret = -1;
>> +	else {
>> +		uint64_t bmap = UINT64_MAX;
>>  
>> -			add_to_object_cache(oc, idx);
>> +		add_to_object_cache(oc, idx);
>>  
>> -			pthread_rwlock_wrlock(&oc->lock);
>> -			add_to_dirty_tree_and_list(oc, idx, bmap, 1);
>> -			pthread_rwlock_unlock(&oc->lock);
>> -		}
>> +		pthread_rwlock_wrlock(&oc->lock);
>> +		add_to_dirty_tree_and_list(oc, idx, bmap, 1);
>> +		pthread_rwlock_unlock(&oc->lock);
>>  	}
>>  	close(fd);
>>  out:
>> @@ -379,97 +761,6 @@ out:
>>  	return ret;
>>  }
>>  
>> -static int write_cache_object(uint32_t vid, uint32_t idx, void *buf,
>> -			      size_t count, off_t offset)
>> -{
>> -	size_t size;
>> -	int fd, flags = def_open_flags, ret = SD_RES_SUCCESS;
>> -	struct strbuf p;
>> -
>> -	strbuf_init(&p, PATH_MAX);
>> -	strbuf_addstr(&p, cache_dir);
>> -	strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
>> -
>> -	if (sys->use_directio && !idx_has_vdi_bit(idx))
>> -		flags |= O_DIRECT;
>> -
>> -	fd = open(p.buf, flags, def_fmode);
>> -	if (fd < 0) {
>> -		eprintf("%m\n");
>> -		ret = SD_RES_EIO;
>> -		goto out;
>> -	}
>> -
>> -	if (flock(fd, LOCK_EX) < 0) {
>> -		ret = SD_RES_EIO;
>> -		eprintf("%m\n");
>> -		goto out_close;
>> -	}
>> -	size = xpwrite(fd, buf, count, offset);
>> -	if (flock(fd, LOCK_UN) < 0) {
>> -		ret = SD_RES_EIO;
>> -		eprintf("%m\n");
>> -		goto out_close;
>> -	}
>> -
>> -	if (size != count) {
>> -		eprintf("size %zu, count:%zu, offset %jd %m\n",
>> -			size, count, (intmax_t)offset);
>> -		ret = SD_RES_EIO;
>> -	}
>> -out_close:
>> -	close(fd);
>> -out:
>> -	strbuf_release(&p);
>> -	return ret;
>> -}
>> -
>> -static int read_cache_object(uint32_t vid, uint32_t idx, void *buf,
>> -			     size_t count, off_t offset)
>> -{
>> -	size_t size;
>> -	int fd, flags = def_open_flags, ret = SD_RES_SUCCESS;
>> -	struct strbuf p;
>> -
>> -	strbuf_init(&p, PATH_MAX);
>> -	strbuf_addstr(&p, cache_dir);
>> -	strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
>> -
>> -	if (sys->use_directio && !idx_has_vdi_bit(idx))
>> -		flags |= O_DIRECT;
>> -
>> -	fd = open(p.buf, flags, def_fmode);
>> -	if (fd < 0) {
>> -		eprintf("%m\n");
>> -		ret = SD_RES_EIO;
>> -		goto out;
>> -	}
>> -
>> -	if (flock(fd, LOCK_SH) < 0) {
>> -		ret = SD_RES_EIO;
>> -		eprintf("%m\n");
>> -		goto out_close;
>> -	}
>> -	size = xpread(fd, buf, count, offset);
>> -	if (flock(fd, LOCK_UN) < 0) {
>> -		ret = SD_RES_EIO;
>> -		eprintf("%m\n");
>> -		goto out_close;
>> -	}
>> -
>> -	if (size != count) {
>> -		eprintf("size %zu, count:%zu, offset %jd %m\n",
>> -			size, count, (intmax_t)offset);
>> -		ret = SD_RES_EIO;
>> -	}
>> -
>> -out_close:
>> -	close(fd);
>> -out:
>> -	strbuf_release(&p);
>> -	return ret;
>> -}
>> -
>>  static int create_cache_object(struct object_cache *oc, uint32_t idx,
>>  			       void *buffer, size_t buf_size)
>>  {
>> @@ -558,75 +849,6 @@ out:
>>  	return ret;
>>  }
>>  
>> -static uint64_t idx_to_oid(uint32_t vid, uint32_t idx)
>> -{
>> -	if (idx_has_vdi_bit(idx))
>> -		return vid_to_vdi_oid(vid);
>> -	else
>> -		return vid_to_data_oid(vid, idx);
>> -}
>> -
>> -static int push_cache_object(uint32_t vid, uint32_t idx, uint64_t bmap,
>> -			     int create)
>> -{
>> -	struct sd_req hdr;
>> -	void *buf;
>> -	off_t offset;
>> -	unsigned data_length;
>> -	int ret = SD_RES_NO_MEM;
>> -	uint64_t oid = idx_to_oid(vid, idx);
>> -	int first_bit, last_bit;
>> -
>> -	dprintf("%"PRIx64", create %d\n", oid, create);
>> -
>> -	if (!bmap) {
>> -		dprintf("WARN: nothing to flush\n");
>> -		return SD_RES_SUCCESS;
>> -	}
>> -
>> -	first_bit = ffsll(bmap) - 1;
>> -	last_bit = fls64(bmap) - 1;
>> -
>> -	dprintf("bmap:0x%"PRIx64", first_bit:%d, last_bit:%d\n",
>> -		bmap, first_bit, last_bit);
>> -	offset = first_bit * CACHE_BLOCK_SIZE;
>> -	data_length = (last_bit - first_bit + 1) * CACHE_BLOCK_SIZE;
>> -
>> -	/*
>> -	 * CACHE_BLOCK_SIZE may not be divisible by SD_INODE_SIZE,
>> -	 * so (offset + data_length) could larger than SD_INODE_SIZE
>> -	 */
>> -	if (is_vdi_obj(oid) && (offset + data_length) > SD_INODE_SIZE)
>> -		data_length = SD_INODE_SIZE - offset;
>> -
>> -	buf = valloc(data_length);
>> -	if (buf == NULL) {
>> -		eprintf("failed to allocate memory\n");
>> -		goto out;
>> -	}
>> -
>> -	ret = read_cache_object(vid, idx, buf, data_length, offset);
>> -	if (ret != SD_RES_SUCCESS)
>> -		goto out;
>> -
>> -	if (create)
>> -		sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ);
>> -	else
>> -		sd_init_req(&hdr, SD_OP_WRITE_OBJ);
>> -	hdr.flags = SD_FLAG_CMD_WRITE;
>> -	hdr.data_length = data_length;
>> -	hdr.obj.oid = oid;
>> -	hdr.obj.offset = offset;
>> -
>> -	ret = exec_local_req(&hdr, buf);
>> -	if (ret != SD_RES_SUCCESS)
>> -		eprintf("failed to push object %x\n", ret);
>> -
>> -out:
>> -	free(buf);
>> -	return ret;
>> -}
>> -
>>  /* Push back all the dirty objects to sheep cluster storage */
>>  static int object_cache_push(struct object_cache *oc)
>>  {
>> @@ -647,20 +869,23 @@ static int object_cache_push(struct object_cache *oc)
>>  		pthread_rwlock_rdlock(&oc->lock);
>>  		bmap = entry->bmap;
>>  		create = entry->flags & ENTRY_CREATE_BIT;
>> +		entry->bmap = 0;
>>  		pthread_rwlock_unlock(&oc->lock);
>>  
>>  		ret = push_cache_object(oc->vid, entry->idx, bmap, create);
>> -		if (ret != SD_RES_SUCCESS)
>> -			goto push_failed;
>>  
>>  		pthread_rwlock_wrlock(&oc->lock);
>> +		if (ret != SD_RES_SUCCESS) {
>> +			entry->bmap |= bmap;
>> +			goto push_failed;
>> +		}
>> +		entry->flags &= ~ENTRY_CREATE_BIT;
>>  		del_from_dirty_tree_and_list(entry, &oc->dirty_tree);
>>  		pthread_rwlock_unlock(&oc->lock);
>>  	}
>>  	return ret;
>>  
>>  push_failed:
>> -	pthread_rwlock_wrlock(&oc->lock);
>>  	list_splice_init(&inactive_dirty_list, &oc->dirty_list);
>>  	pthread_rwlock_unlock(&oc->lock);
>>  
>> @@ -677,10 +902,10 @@ int object_is_cached(uint64_t oid)
>>  	if (!cache)
>>  		return 0;
>>  
>> -	if (object_cache_lookup(cache, idx, 0) < 0)
>> -		return 0;
>> +	if (object_cache_lookup(cache, idx, 0) == SD_RES_SUCCESS)
>> +		return 1;
>>  	else
>> -		return 1; /* found it */
>> +		return 0;
>>  }
>>  
>>  void object_cache_delete(uint32_t vid)
>> @@ -712,6 +937,44 @@ void object_cache_delete(uint32_t vid)
>>  
>>  }
>>  
>> +static void object_cache_flush_begin(struct object_cache *oc)
>> +{
>> +	pthread_rwlock_wrlock(&oc->lock);
>> +	oc->in_flush = 1;
>> +	pthread_rwlock_unlock(&oc->lock);
>> +}
>> +
>> +static void object_cache_flush_end(struct object_cache *oc)
>> +{
>> +	pthread_rwlock_wrlock(&oc->lock);
>> +	oc->in_flush = 0;
>> +	pthread_rwlock_unlock(&oc->lock);
>> +}
>> +
> 
> I think we can use uatomic_cmpxchg() for oc->in_flush instead of the
> lock. Also, as I have mentioned, if these two helpers only operate on
> oc->in_flush, we'd better fold them into the caller to make the logic
> more explicit.
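> 
> Rough sketch of what I mean (untested; assuming the usual urcu semantics
> where uatomic_cmpxchg() returns the old value, so winning the 0 -> 1
> transition means no one else is flushing):
> 
> 	/* flush begin: bail out if another flusher got there first */
> 	if (uatomic_cmpxchg(&oc->in_flush, 0, 1) != 0)
> 		return;
> 
> 	/* ... push dirty objects ... */
> 
> 	/* flush end */
> 	uatomic_set(&oc->in_flush, 0);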
> 
> Thanks,
> Yuan
> 

The lock is necessary here. An atomic operation only guarantees that the
variable itself is updated consistently; it doesn't act as a barrier. We
must make sure that while a cached object is being prepared for reclaim,
the flush work waits until that preparation finishes. At first I didn't
use the rwlock and just set this variable atomically, but that did indeed
cause a problem leading to VM errors.
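
To illustrate with the code above: reclaim_object() checks in_flush and
prepares the entry while holding oc->lock, so taking the same lock in
object_cache_flush_begin() forces the flusher to wait for any reclaim
preparation already in progress (simplified sketch, not the exact code):

	/* reclaimer: check-and-mark is atomic under the lock */
	pthread_rwlock_wrlock(&oc->lock);
	if (!cache_is_flushing(oc))
		mark_entry_reclaiming(entry);
	pthread_rwlock_unlock(&oc->lock);

	/* flusher: blocks until the reclaimer above drops the lock */
	pthread_rwlock_wrlock(&oc->lock);
	oc->in_flush = 1;
	pthread_rwlock_unlock(&oc->lock);

With a plain atomic store, the flusher could set in_flush between the
reclaimer's check and its mark, and both would proceed at once.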

'begin/end' makes the behavior easy to understand; if you have a better
name, please suggest one. I don't mind merging these two helper functions.

thanks,

levin


