[Sheepdog] [PATCH V3] Make the flushing data more fine grained
Liu Yuan
namei.unix at gmail.com
Tue May 15 10:13:59 CEST 2012
On 05/14/2012 09:56 PM, Yunkai Zhang wrote:
> From: Yunkai Zhang <qiushu.zyk at taobao.com>
>
> Each time sheep flushes a cache object, it flushes the object's whole
> data (4 MB) to the cluster, even if only one byte of that object is
> dirty.
>
> Now I split a cache object into multiple blocks of 128 KB each (this
> size can be switched among [64, 128, 256, 512] KB via the
> CACHE_BLOCK_SIZE macro).
>
> Each time a client writes a cache object, sheep calculates which blocks
> are modified and marks them as dirty. This dirty information is saved
> in a bitmap.
>
> When sheep flushes a cache object, it no longer flushes the whole data;
> instead, it reads the object's bitmap and only sends the data of the
> blocks marked dirty.
>
> In addition, I replaced flock with lockf so that we only lock the
> section of the object file we care about.
>
> Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
> ---
> - rename BLOCK_SIZE to CACHE_BLOCK_SIZE
> - rename object_bmap_calc to calc_object_bmap
> - change the flush algorithm
> - fix bugs when pushing the vdi object
> include/util.h | 10 +++++
> sheep/object_cache.c | 98 +++++++++++++++++++++++++++++++++++++++++---------
> sheep/sheep_priv.h | 3 ++
> 3 files changed, 94 insertions(+), 17 deletions(-)
>
> diff --git a/include/util.h b/include/util.h
> index ca43ab9..1c32954 100644
> --- a/include/util.h
> +++ b/include/util.h
> @@ -4,6 +4,7 @@
> #include <string.h>
> #include <limits.h>
> #include <stdint.h>
> +#include <unistd.h>
>
> #include "bitops.h"
> #include "list.h"
> @@ -58,6 +59,15 @@ static inline void *zalloc(size_t size)
> return calloc(1, size);
> }
>
> +static inline int xlockf(int fd, int cmd, off_t offset, off_t len)
> +{
> + if (lseek(fd, offset, SEEK_SET) < 0)
> + return -1;
> +
> + return lockf(fd, cmd, len);
> +}
> +
> +
> typedef void (*try_to_free_t)(size_t);
> extern try_to_free_t set_try_to_free_routine(try_to_free_t);
>
> diff --git a/sheep/object_cache.c b/sheep/object_cache.c
> index 0b3c911..4d3864c 100644
> --- a/sheep/object_cache.c
> +++ b/sheep/object_cache.c
> @@ -41,6 +41,24 @@ static inline int hash(uint64_t vid)
> return hash_64(vid, HASH_BITS);
> }
>
> +static uint64_t calc_object_bmap(size_t len, off_t offset)
> +{
> + int i, j;
> + uint64_t bmap, shift = 1;
> +
> + if (!len)
> + return 0;
> +
> + i = offset / CACHE_BLOCK_SIZE;
> + j = (offset + len - 1) / CACHE_BLOCK_SIZE;
> +
> + bmap = (shift <<= i);
> + while (j - i++)
> + bmap |= (shift <<= 1);
> +
> + return bmap;
> +}
> +
I think this code is a bit confusing: normally 'shift' means the number
of bits to shift by, not the shifted value itself. But I am okay with
this, since it is a matter of personal taste.
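Something like the following reads more naturally to me; just an
untested sketch with the same semantics, reusing your signature and
CACHE_BLOCK_SIZE, not a request to change it:

static uint64_t calc_object_bmap(size_t len, off_t offset)
{
	int i, start, end;
	uint64_t bmap = 0;

	if (!len)
		return 0;

	/* first and last block touched by [offset, offset + len) */
	start = offset / CACHE_BLOCK_SIZE;
	end = (offset + len - 1) / CACHE_BLOCK_SIZE;

	for (i = start; i <= end; i++)
		bmap |= UINT64_C(1) << i;

	return bmap;
}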
> static struct object_cache_entry *dirty_tree_insert(struct rb_root *root,
> struct object_cache_entry *new)
> {
> @@ -56,8 +74,11 @@ static struct object_cache_entry *dirty_tree_insert(struct rb_root *root,
> p = &(*p)->rb_left;
> else if (new->idx > entry->idx)
> p = &(*p)->rb_right;
> - else
> - return entry; /* already has this entry */
> + else {
> + /* already has this entry, merge bmap */
> + entry->bmap |= new->bmap;
> + return entry;
> + }
> }
> rb_link_node(&new->rb, parent, p);
> rb_insert_color(&new->rb, root);
> @@ -143,11 +164,12 @@ out:
> }
>
> static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
> - struct object_cache_entry *entry, int create)
> + uint64_t bmap, struct object_cache_entry *entry, int create)
> {
> if (!entry) {
> entry = xzalloc(sizeof(*entry));
> entry->idx = idx;
> + entry->bmap = bmap;
> entry->create = create;
> }
> if (!dirty_tree_insert(oc->active_dirty_tree, entry))
> @@ -194,7 +216,8 @@ static void merge_dirty_tree_and_list(struct object_cache *oc,
>
> list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
> del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
> - add_to_dirty_tree_and_list(oc, entry->idx, entry, 0);
> + add_to_dirty_tree_and_list(oc, entry->idx,
> + entry->bmap, entry, 0);
It seems that we don't need to pass entry->bmap to
add_to_dirty_tree_and_list() here: entry is non-NULL on this path, so
the bmap argument is never read.
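In other words, dirty_tree_insert() already merges entry->bmap on
collision, so the loop could just as well read (sketch only):

	list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
		del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
		/* entry != NULL, so add_to_dirty_tree_and_list()
		 * ignores the bmap argument on this path */
		add_to_dirty_tree_and_list(oc, entry->idx, 0, entry, 0);
	}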
> }
>
> pthread_mutex_unlock(&oc->lock);
> @@ -230,7 +253,7 @@ int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
> ret = -1;
> else {
> pthread_mutex_lock(&oc->lock);
> - add_to_dirty_tree_and_list(oc, idx, NULL, 1);
> + add_to_dirty_tree_and_list(oc, idx, 0, NULL, 1);
> pthread_mutex_unlock(&oc->lock);
> }
> }
> @@ -254,13 +277,13 @@ static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t coun
> flags |= O_DIRECT;
>
> fd = open(p.buf, flags, def_fmode);
> - if (flock(fd, LOCK_EX) < 0) {
> + if (xlockf(fd, F_LOCK, offset, count) < 0) {
> ret = SD_RES_EIO;
> eprintf("%m\n");
> goto out;
> }
> size = xpwrite(fd, buf, count, offset);
> - if (flock(fd, LOCK_UN) < 0) {
> + if (xlockf(fd, F_UNLCK, offset, count) < 0) {
> ret = SD_RES_EIO;
> eprintf("%m\n");
> goto out;
> @@ -287,14 +310,14 @@ static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count
> flags |= O_DIRECT;
>
> fd = open(p.buf, flags, def_fmode);
> - if (flock(fd, LOCK_SH) < 0) {
> + if (xlockf(fd, F_LOCK, offset, count) < 0) {
> ret = SD_RES_EIO;
> eprintf("%m\n");
> goto out;
> }
>
> size = xpread(fd, buf, count, offset);
> - if (flock(fd, LOCK_UN) < 0) {
> + if (xlockf(fd, F_UNLCK, offset, count) < 0) {
> ret = SD_RES_EIO;
> eprintf("%m\n");
> goto out;
> @@ -311,6 +334,7 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
> {
> struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
> struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp;
> + uint64_t bmap = 0;
> int ret;
>
> dprintf("%08"PRIx32", len %"PRIu32", off %"PRIu64"\n", idx, hdr->data_length, hdr->offset);
> @@ -319,7 +343,8 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
> if (ret != SD_RES_SUCCESS)
> goto out;
> pthread_mutex_lock(&oc->lock);
> - add_to_dirty_tree_and_list(oc, idx, NULL, 0);
> + bmap = calc_object_bmap(hdr->data_length, hdr->offset);
> + add_to_dirty_tree_and_list(oc, idx, bmap, NULL, 0);
> pthread_mutex_unlock(&oc->lock);
> } else {
> ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
> @@ -476,22 +501,59 @@ static uint64_t idx_to_oid(uint32_t vid, uint32_t idx)
> return vid_to_data_oid(vid, idx);
> }
>
> -static int push_cache_object(uint32_t vid, uint32_t idx, int create)
> +static int push_cache_object(uint32_t vid, uint32_t idx,
> + uint64_t bmap, int create)
> {
> struct request fake_req;
> struct sd_obj_req *hdr = (struct sd_obj_req *)&fake_req.rq;
> void *buf;
> + off_t offset;
> unsigned data_length;
> int ret = SD_RES_NO_MEM;
> - uint64_t oid = idx_to_oid(vid, idx);
> + uint64_t shift, oid = idx_to_oid(vid, idx);
> + int i, nbits, first_dirty_bit, last_dirty_bit;
>
> dprintf("%"PRIx64", create %d\n", oid, create);
>
> + if (!bmap)
> + return SD_RES_SUCCESS;
> +
> memset(&fake_req, 0, sizeof(fake_req));
> if (is_vdi_obj(oid))
> - data_length = SD_INODE_SIZE;
> + nbits = DIV_ROUND_UP(SD_INODE_SIZE, CACHE_BLOCK_SIZE);
> else
> - data_length = SD_DATA_OBJ_SIZE;
> + nbits = DIV_ROUND_UP(SD_DATA_OBJ_SIZE, CACHE_BLOCK_SIZE);
> +
> + shift = 1;
> + first_dirty_bit = 0;
> +
> + for (i = 0; i < nbits; i++) {
> + if (bmap & (shift << i)) {
> + first_dirty_bit = i;
> + break;
> + }
> + }
> +
> + shift = (UINT64_C(1) << (nbits - 1));
> + last_dirty_bit = 0;
> + for (i = 0; i < nbits; i++) {
> + if (bmap & (shift >> i)) {
> + last_dirty_bit = nbits - i - 1;
> + break;
> + }
> + }
> +
> + dprintf("bmap:0x%"PRIx64", first_dirty_bit:%d, last_dirty_bit:%d\n",
> + bmap, first_dirty_bit, last_dirty_bit);
> + offset = first_dirty_bit * CACHE_BLOCK_SIZE;
> + data_length = (last_dirty_bit - first_dirty_bit + 1) * CACHE_BLOCK_SIZE;
> +
> + /*
> + * SD_INODE_SIZE may not be a multiple of CACHE_BLOCK_SIZE,
> + * so data_length could be larger than SD_INODE_SIZE
> + */
> + if (is_vdi_obj(oid) && data_length > SD_INODE_SIZE)
> + data_length = SD_INODE_SIZE;
>
> buf = valloc(data_length);
> if (buf == NULL) {
> @@ -499,7 +561,7 @@ static int push_cache_object(uint32_t vid, uint32_t idx, int create)
> goto out;
> }
>
> - ret = read_cache_object(vid, idx, buf, data_length, 0);
> + ret = read_cache_object(vid, idx, buf, data_length, offset);
> if (ret != SD_RES_SUCCESS)
> goto out;
>
> @@ -546,7 +608,8 @@ int object_cache_push(struct object_cache *oc)
> * request is issued in one of gateway worker threads
> * So we need not to protect inactive dirty tree and list */
> list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
> - ret = push_cache_object(oc->vid, entry->idx, entry->create);
> + ret = push_cache_object(oc->vid, entry->idx,
> + entry->bmap, entry->create);
> if (ret != SD_RES_SUCCESS)
> goto push_failed;
> del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
> @@ -614,6 +677,7 @@ int object_cache_flush_and_delete(struct object_cache *oc)
> struct dirent *d;
> uint32_t vid = oc->vid;
> uint32_t idx;
> + uint64_t bmap = 0;
> struct strbuf p;
> int ret = 0;
>
> @@ -635,7 +699,7 @@ int object_cache_flush_and_delete(struct object_cache *oc)
> idx = strtoul(d->d_name, NULL, 16);
> if (idx == ULLONG_MAX)
> continue;
> - if (push_cache_object(vid, idx, 1) != SD_RES_SUCCESS) {
> + if (push_cache_object(vid, idx, ~bmap, 1) != SD_RES_SUCCESS) {
Better to use another variable (e.g., all = ~0) to mean "flush the
whole object", instead of '~bmap'.
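Something like this (a sketch only; the surrounding error handling is
assumed, not copied from your patch):

	uint64_t all = UINT64_MAX;	/* every block dirty */

	/* flush the whole object, not just the blocks marked dirty */
	if (push_cache_object(vid, idx, all, 1) != SD_RES_SUCCESS)
		ret = -1;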
Thanks,
Yuan