[Sheepdog] [PATCH RFC] Make the flushing data more fine grained

Liu Yuan namei.unix at gmail.com
Fri May 11 11:50:56 CEST 2012


On 05/08/2012 08:12 PM, Yunkai Zhang wrote:

> From: Yunkai Zhang <qiushu.zyk at taobao.com>
> 
> Each time sheep flushes a cache object, it flushes the object's whole
> data (4 MB) to the cluster, even if only one byte of the object is
> dirty.
> 
> Now I split a cache object into multiple blocks; each block is 512 KB
> (this size can be switched within the range [64, 128, 256, 512] KB,
> defined by the BLOCK_SIZE macro).
> 
> Each time a client writes a cache object, sheep calculates which blocks
> will be modified and marks them as dirty. This dirty information is
> saved in a bitmap.
> 
> When sheep flushes a cache object, it no longer flushes the whole data;
> instead, it reads the bitmap of this object and only sends the blocks
> marked as dirty.
> 
> In addition, I replaced flock() with lockf() so that we only lock the
> section of an object file that we care about.
> 
> Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
> ---
>  include/util.h       |   10 +++++
>  sheep/object_cache.c |   95 ++++++++++++++++++++++++++++++++++++++++----------
>  sheep/sdnet.c        |    2 +-
>  sheep/sheep_priv.h   |    2 +
>  4 files changed, 89 insertions(+), 20 deletions(-)
> 
> diff --git a/include/util.h b/include/util.h
> index ca43ab9..1c32954 100644
> --- a/include/util.h
> +++ b/include/util.h
> @@ -4,6 +4,7 @@
>  #include <string.h>
>  #include <limits.h>
>  #include <stdint.h>
> +#include <unistd.h>
>  
>  #include "bitops.h"
>  #include "list.h"
> @@ -58,6 +59,15 @@ static inline void *zalloc(size_t size)
>  	return calloc(1, size);
>  }
>  
> +static inline int xlockf(int fd, int cmd, off_t offset, off_t len)
> +{
> +	if (lseek(fd, offset, SEEK_SET) < 0)
> +		return -1;
> +
> +	return lockf(fd, cmd, len);
> +}
> +
> +
>  typedef void (*try_to_free_t)(size_t);
>  extern try_to_free_t set_try_to_free_routine(try_to_free_t);
>  
> diff --git a/sheep/object_cache.c b/sheep/object_cache.c
> index 0b3c911..43bf109 100644
> --- a/sheep/object_cache.c
> +++ b/sheep/object_cache.c
> @@ -41,6 +41,29 @@ static inline int hash(uint64_t vid)
>  	return hash_64(vid, HASH_BITS);
>  }
>  
> +static inline uint32_t div_ceil(uint32_t numerator, uint32_t denominator)
> +{
> +	return (numerator / denominator) + ((numerator % denominator) > 0);
> +}
> +


Use DIV_ROUND_UP().
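
That is, the usual kernel-style round-up macro; a minimal sketch, in
case util.h does not already carry one:

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

It would replace the open-coded div_ceil() above.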

> +static uint64_t object_bmap_calc(size_t len, off_t offset)
> +{
> +        int i, j;
> +        uint64_t bmap = 1;
> +
> +        if (!len)
> +                return 0;
> +
> +        i = offset / BLOCK_SIZE;
> +        j = (offset + len - 1) / BLOCK_SIZE;
> +
> +        bmap <<= i;
> +        while (j - i++)
> +                bmap = bmap | (bmap << 1);
> +
> +        return bmap;
> +}
> +


Better to name it calc_object_bmap(): it is a static function, so we
don't need a namespace prefix.
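
For reference, with 512 KB blocks the function above yields, e.g.
(hand-computed from its definition, using the suggested name):

	calc_object_bmap(4096, 1 << 20)        == 0x4 /* 4 KB inside block 2 */
	calc_object_bmap(200 << 10, 400 << 10) == 0x3 /* spans blocks 0 and 1 */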

>  static struct object_cache_entry *dirty_tree_insert(struct rb_root *root,
>  		struct object_cache_entry *new)
>  {
> @@ -56,8 +79,11 @@ static struct object_cache_entry *dirty_tree_insert(struct rb_root *root,
>  			p = &(*p)->rb_left;
>  		else if (new->idx > entry->idx)
>  			p = &(*p)->rb_right;
> -		else
> -			return entry; /* already has this entry */
> +		else {
> +			/* already has this entry, merge bmap */
> +			entry->bmap |= new->bmap;
> +			return entry;
> +		}
>  	}
>  	rb_link_node(&new->rb, parent, p);
>  	rb_insert_color(&new->rb, root);
> @@ -143,11 +169,12 @@ out:
>  }
>  
>  static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
> -		struct object_cache_entry *entry, int create)
> +		uint64_t bmap, struct object_cache_entry *entry, int create)
>  {
>  	if (!entry) {
>  		entry = xzalloc(sizeof(*entry));
>  		entry->idx = idx;
> +		entry->bmap = bmap;
>  		entry->create = create;
>  	}
>  	if (!dirty_tree_insert(oc->active_dirty_tree, entry))
> @@ -194,7 +221,7 @@ static void merge_dirty_tree_and_list(struct object_cache *oc,
>  
>  	list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
>  		del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
> -		add_to_dirty_tree_and_list(oc, entry->idx, entry, 0);
> +		add_to_dirty_tree_and_list(oc, entry->idx, entry->bmap, entry, 0);
>  	}
>  
>  	pthread_mutex_unlock(&oc->lock);
> @@ -204,6 +231,7 @@ int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
>  {
>  	struct strbuf buf;
>  	int fd, ret = 0, flags = def_open_flags;
> +	uint64_t bmap = 0;
>  
>  	strbuf_init(&buf, PATH_MAX);
>  	strbuf_addstr(&buf, cache_dir);
> @@ -230,7 +258,7 @@ int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
>  			ret = -1;
>  		else {
>  			pthread_mutex_lock(&oc->lock);
> -			add_to_dirty_tree_and_list(oc, idx, NULL, 1);
> +			add_to_dirty_tree_and_list(oc, idx, ~bmap, NULL, 1);


We shouldn't set bmap as full for object creation, or the newly created
object will always be flushed as a full 4 MB.
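
One possible fix, just a sketch (it assumes the creating request's
sd_obj_req header is still reachable at this point, and uses the
calc_object_bmap() rename suggested above), is to mark only the blocks
the creating write actually touches:

	bmap = calc_object_bmap(hdr->data_length, hdr->offset);
	add_to_dirty_tree_and_list(oc, idx, bmap, NULL, 1);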


>  			pthread_mutex_unlock(&oc->lock);
>  		}
>  	}
> @@ -254,13 +282,13 @@ static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t coun
>  		flags |= O_DIRECT;
>  
>  	fd = open(p.buf, flags, def_fmode);
> -	if (flock(fd, LOCK_EX) < 0) {
> +	if (xlockf(fd, F_LOCK, offset, count) < 0) {
>  		ret = SD_RES_EIO;
>  		eprintf("%m\n");
>  		goto out;
>  	}
>  	size = xpwrite(fd, buf, count, offset);
> -	if (flock(fd, LOCK_UN) < 0) {
> +	if (xlockf(fd, F_UNLCK, offset, count) < 0) {
>  		ret = SD_RES_EIO;
>  		eprintf("%m\n");
>  		goto out;
> @@ -287,14 +315,14 @@ static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count
>  		flags |= O_DIRECT;
>  
>  	fd = open(p.buf, flags, def_fmode);
> -	if (flock(fd, LOCK_SH) < 0) {
> +	if (xlockf(fd, F_LOCK, offset, count) < 0) {
>  		ret = SD_RES_EIO;
>  		eprintf("%m\n");
>  		goto out;
>  	}
>  
>  	size = xpread(fd, buf, count, offset);
> -	if (flock(fd, LOCK_UN) < 0) {
> +	if (xlockf(fd, F_UNLCK, offset, count) < 0) {
>  		ret = SD_RES_EIO;
>  		eprintf("%m\n");
>  		goto out;
> @@ -311,6 +339,7 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
>  {
>  	struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
>  	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp;
> +	uint64_t bmap = 0;
>  	int ret;
>  
>  	dprintf("%08"PRIx32", len %"PRIu32", off %"PRIu64"\n", idx, hdr->data_length, hdr->offset);
> @@ -319,7 +348,8 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
>  		if (ret != SD_RES_SUCCESS)
>  			goto out;
>  		pthread_mutex_lock(&oc->lock);
> -		add_to_dirty_tree_and_list(oc, idx, NULL, 0);
> +		bmap = object_bmap_calc(hdr->data_length, hdr->offset);
> +		add_to_dirty_tree_and_list(oc, idx, bmap, NULL, 0);
>  		pthread_mutex_unlock(&oc->lock);
>  	} else {
>  		ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
> @@ -476,22 +506,44 @@ static uint64_t idx_to_oid(uint32_t vid, uint32_t idx)
>  		return vid_to_data_oid(vid, idx);
>  }
>  
> -static int push_cache_object(uint32_t vid, uint32_t idx, int create)
> +static int push_cache_object(uint32_t vid, uint32_t idx,
> +		uint64_t bmap, int create)
>  {
>  	struct request fake_req;
>  	struct sd_obj_req *hdr = (struct sd_obj_req *)&fake_req.rq;
>  	void *buf;
> +	off_t offset;
>  	unsigned data_length;
>  	int ret = SD_RES_NO_MEM;
>  	uint64_t oid = idx_to_oid(vid, idx);
> +	int nbits, shift = 0;
> +	struct vnode_info *vnodes = NULL;
>  
>  	dprintf("%"PRIx64", create %d\n", oid, create);
>  
>  	memset(&fake_req, 0, sizeof(fake_req));
>  	if (is_vdi_obj(oid))
> -		data_length = SD_INODE_SIZE;
> +		nbits = div_ceil(SD_INODE_SIZE, BLOCK_SIZE);
>  	else
> -		data_length = SD_DATA_OBJ_SIZE;
> +		nbits = div_ceil(SD_DATA_OBJ_SIZE, BLOCK_SIZE);
> +
> +	vnodes = get_vnode_info();
> +loop:
> +	if (shift == nbits) {
> +		put_vnode_info(vnodes);
> +		return SD_RES_SUCCESS;
> +	}
> +
> +	data_length = 0;
> +	offset = shift * BLOCK_SIZE;
> +	while (shift < nbits) {
> +		if (bmap & (UINT64_C(1) << shift++))
> +			data_length += BLOCK_SIZE;
> +		else if (data_length > 0)
> +			break;
> +		else
> +			goto loop;
> +	}
>  
>  	buf = valloc(data_length);
>  	if (buf == NULL) {
> @@ -499,7 +551,7 @@ static int push_cache_object(uint32_t vid, uint32_t idx, int create)
>  		goto out;
>  	}
>  
> -	ret = read_cache_object(vid, idx, buf, data_length, 0);
> +	ret = read_cache_object(vid, idx, buf, data_length, offset);
>  	if (ret != SD_RES_SUCCESS)
>  		goto out;
>  
> @@ -512,15 +564,18 @@ static int push_cache_object(uint32_t vid, uint32_t idx, int create)
>  	hdr->epoch = sys->epoch;
>  	fake_req.data = buf;
>  	fake_req.op = get_sd_op(hdr->opcode);
> -	fake_req.vnodes = get_vnode_info();
> +	fake_req.vnodes = vnodes;
>  
>  	ret = forward_write_obj_req(&fake_req);
>  	if (ret != SD_RES_SUCCESS)
>  		eprintf("failed to push object %x\n", ret);
> -
> -	put_vnode_info(fake_req.vnodes);
> +	else {
> +		free(buf);
> +		goto loop;
> +	}
>  
>  out:
> +	put_vnode_info(vnodes);
>  	free(buf);
>  	return ret;
>  }
> @@ -546,7 +601,8 @@ int object_cache_push(struct object_cache *oc)
>  	 * request is issued in one of gateway worker threads
>  	 * So we need not to protect inactive dirty tree and list */
>  	list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
> -		ret = push_cache_object(oc->vid, entry->idx, entry->create);
> +		ret = push_cache_object(oc->vid, entry->idx,
> +				entry->bmap, entry->create);
>  		if (ret != SD_RES_SUCCESS)
>  			goto push_failed;
>  		del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
> @@ -614,6 +670,7 @@ int object_cache_flush_and_delete(struct object_cache *oc)
>  	struct dirent *d;
>  	uint32_t vid = oc->vid;
>  	uint32_t idx;
> +	uint64_t bmap = 0;
>  	struct strbuf p;
>  	int ret = 0;
>  
> @@ -635,7 +692,7 @@ int object_cache_flush_and_delete(struct object_cache *oc)
>  		idx = strtoul(d->d_name, NULL, 16);
>  		if (idx == ULLONG_MAX)
>  			continue;
> -		if (push_cache_object(vid, idx, 1) != SD_RES_SUCCESS) {

> +		if (push_cache_object(vid, idx, ~bmap, 1) != SD_RES_SUCCESS) {


Better add a comment on why we need ~bmap here.
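
Something like this would do (wording is only a suggestion):

	/* Pass a full bitmap (~0): the cache file is about to be
	 * deleted, so every block must be treated as dirty and the
	 * whole object pushed back to the cluster. */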

>  			dprintf("failed to push %"PRIx64"\n",
>  				idx_to_oid(vid, idx));
>  			ret = -1;
> diff --git a/sheep/sdnet.c b/sheep/sdnet.c
> index 10be245..aaad630 100644
> --- a/sheep/sdnet.c
> +++ b/sheep/sdnet.c
> @@ -256,7 +256,7 @@ static int check_request(struct request *req)
>  	return 0;
>  }
>  
> -static do_nothing(struct work *work) {}
> +static void do_nothing(struct work *work) {}
>  
>  static void queue_request(struct request *req)
>  {
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index 2275a93..dc28572 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -414,6 +414,7 @@ static inline int sys_can_halt(void)
>  
>  #define CACHE_VDI_SHIFT       31
>  #define CACHE_VDI_BIT         (UINT32_C(1) << CACHE_VDI_SHIFT)
> +#define BLOCK_SIZE            (UINT32_C(512) << 10) /* 512 KB */
>


We'd better use CACHE_BLOCK_SIZE
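
A quick sketch of the rename (same value, just a less generic name):

	#define CACHE_BLOCK_SIZE      (UINT32_C(512) << 10) /* 512 KB */

plus s/BLOCK_SIZE/CACHE_BLOCK_SIZE/ at the use sites.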

  
>  struct object_cache {
>  	uint32_t vid;
> @@ -430,6 +431,7 @@ struct object_cache {
>  
>  struct object_cache_entry {
>  	uint32_t idx;
> +	uint64_t bmap;


We'd better comment on what bmap means, e.g. "smallest flush unit".
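
For example (phrasing is only a suggestion):

	uint64_t bmap; /* bitmap of dirty blocks; each bit covers one
			* CACHE_BLOCK_SIZE block, the smallest flush unit */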

>  	struct rb_node rb;
>  	struct list_head list;
>  	int create;


For the flush strategy, it might work better if we merge the flush
requests into one, even though this includes some flush amplification.
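
A rough sketch of that idea (assuming the CACHE_BLOCK_SIZE rename above
and a non-zero bmap; ffsll() needs _GNU_SOURCE, and __builtin_clzll()
is a GCC builtin):

	int first = ffsll(bmap) - 1;             /* lowest dirty block */
	int last = 63 - __builtin_clzll(bmap);   /* highest dirty block */
	off_t offset = (off_t)first * CACHE_BLOCK_SIZE;
	size_t data_length = (size_t)(last - first + 1) * CACHE_BLOCK_SIZE;
	/* then a single read_cache_object() + forward_write_obj_req()
	 * over [offset, offset + data_length), accepting that the clean
	 * blocks in between get pushed as well */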

Thanks,
Yuan


