[Sheepdog] [PATCH V3] Make the flushing data more fine grained

Yunkai Zhang yunkai.me at gmail.com
Tue May 15 10:47:17 CEST 2012


On Tue, May 15, 2012 at 4:33 PM, Yunkai Zhang <yunkai.me at gmail.com> wrote:
> On Tue, May 15, 2012 at 4:13 PM, Liu Yuan <namei.unix at gmail.com> wrote:
>> On 05/14/2012 09:56 PM, Yunkai Zhang wrote:
>>
>>> From: Yunkai Zhang <qiushu.zyk at taobao.com>
>>>
>>> Each time sheep flushes a cache object, it flushes the object's whole
>>> data (4 MB) to the cluster, even if only one byte of that object is
>>> dirty.
>>>
>>> Now I split a cache object into multiple blocks, each 128 KB (this
>>> size can be switched among 64, 128, 256 and 512 KB via the
>>> CACHE_BLOCK_SIZE macro).
>>>
>>> Each time a client writes to a cache object, sheep calculates which
>>> blocks are modified and marks them as dirty. This dirty information is
>>> saved in a bitmap.
>>>
>>> When sheep flushes a cache object, it does not flush the whole data
>>> again; instead, it reads the object's bitmap and sends only the blocks
>>> marked as dirty.
>>>
>>> In addition, I replaced flock with lockf so that we lock only the
>>> section of the object file we care about.
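
For scale: with CACHE_BLOCK_SIZE = 128 KB, a 4 MB data object splits into
32 blocks, so a single uint64_t bitmap covers it, and a one-byte write now
costs a 128 KB flush instead of a 4 MB one. Likewise, xlockf(fd, F_LOCK,
offset, count) locks only the [offset, offset + count) byte range of the
cache file rather than the whole file as flock did.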
>>>
>>> Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
>>> ---
>>> - rename BLOCK_SIZE to CACHE_BLOCK_SIZE
>>> - rename object_bmap_calc to calc_object_bmap
>>> - change the flush algorithm
>>> - fix bugs when pushing the vdi object
>>>  include/util.h       |   10 +++++
>>>  sheep/object_cache.c |   98 +++++++++++++++++++++++++++++++++++++++++---------
>>>  sheep/sheep_priv.h   |    3 ++
>>>  3 files changed, 94 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/include/util.h b/include/util.h
>>> index ca43ab9..1c32954 100644
>>> --- a/include/util.h
>>> +++ b/include/util.h
>>> @@ -4,6 +4,7 @@
>>>  #include <string.h>
>>>  #include <limits.h>
>>>  #include <stdint.h>
>>> +#include <unistd.h>
>>>
>>>  #include "bitops.h"
>>>  #include "list.h"
>>> @@ -58,6 +59,15 @@ static inline void *zalloc(size_t size)
>>>       return calloc(1, size);
>>>  }
>>>
>>> +static inline int xlockf(int fd, int cmd, off_t offset, off_t len)
>>> +{
>>> +     if (lseek(fd, offset, SEEK_SET) < 0)
>>> +             return -1;
>>> +
>>> +     return lockf(fd, cmd, len);
>>> +}
>>> +
>>> +
>>>  typedef void (*try_to_free_t)(size_t);
>>>  extern try_to_free_t set_try_to_free_routine(try_to_free_t);
>>>
>>> diff --git a/sheep/object_cache.c b/sheep/object_cache.c
>>> index 0b3c911..4d3864c 100644
>>> --- a/sheep/object_cache.c
>>> +++ b/sheep/object_cache.c
>>> @@ -41,6 +41,24 @@ static inline int hash(uint64_t vid)
>>>       return hash_64(vid, HASH_BITS);
>>>  }
>>>
>>> +static uint64_t calc_object_bmap(size_t len, off_t offset)
>>> +{
>>> +     int i, j;
>>> +     uint64_t bmap, shift = 1;
>>> +
>>> +     if (!len)
>>> +             return 0;
>>> +
>>> +     i = offset / CACHE_BLOCK_SIZE;
>>> +     j = (offset + len - 1) / CACHE_BLOCK_SIZE;
>>> +
>>> +     bmap = (shift <<= i);
>>> +     while (j - i++)
>>> +             bmap |= (shift <<= 1);
>>> +
>>> +     return bmap;
>>> +}
>>> +
>>
>>
>> I think this code is a little bit confusing: normally "shift" is
>> understood as the number of bits we shift by. But I am okay with this,
>> since it is a matter of personal taste.
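
For comparison, here is an equivalent form in which the loop variable is
the bit index itself (just a sketch against the same CACHE_BLOCK_SIZE;
the behavior should be unchanged):

static uint64_t calc_object_bmap(size_t len, off_t offset)
{
        int i, first, last;
        uint64_t bmap = 0;

        if (!len)
                return 0;

        /* first and last block touched by [offset, offset + len) */
        first = offset / CACHE_BLOCK_SIZE;
        last = (offset + len - 1) / CACHE_BLOCK_SIZE;

        for (i = first; i <= last; i++)
                bmap |= UINT64_C(1) << i;

        return bmap;
}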
>>
>>>  static struct object_cache_entry *dirty_tree_insert(struct rb_root *root,
>>>               struct object_cache_entry *new)
>>>  {
>>> @@ -56,8 +74,11 @@ static struct object_cache_entry *dirty_tree_insert(struct rb_root *root,
>>>                       p = &(*p)->rb_left;
>>>               else if (new->idx > entry->idx)
>>>                       p = &(*p)->rb_right;
>>> -             else
>>> -                     return entry; /* already has this entry */
>>> +             else {
>>> +                     /* already has this entry, merge bmap */
>>> +                     entry->bmap |= new->bmap;
>>> +                     return entry;
>>> +             }
>>>       }
>>>       rb_link_node(&new->rb, parent, p);
>>>       rb_insert_color(&new->rb, root);
>>> @@ -143,11 +164,12 @@ out:
>>>  }
>>>
>>>  static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
>>> -             struct object_cache_entry *entry, int create)
>>> +             uint64_t bmap, struct object_cache_entry *entry, int create)
>>>  {
>>>       if (!entry) {
>>>               entry = xzalloc(sizeof(*entry));
>>>               entry->idx = idx;
>>> +             entry->bmap = bmap;
>>>               entry->create = create;
>>>       }
>>>       if (!dirty_tree_insert(oc->active_dirty_tree, entry))
>>> @@ -194,7 +216,8 @@ static void merge_dirty_tree_and_list(struct object_cache *oc,
>>>
>>>       list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
>>>               del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
>>> -             add_to_dirty_tree_and_list(oc, entry->idx, entry, 0);
>>> +             add_to_dirty_tree_and_list(oc, entry->idx,
>>> +                             entry->bmap, entry, 0);
>>
>>
>> It seems that we don't need to pass entry->bmap to
>> add_to_dirty_tree_and_list().
>
> We need it! If there is a node with the same idx in active_dirty_tree,
> entry->bmap will be merged into that node so that the dirty-block info
> is not lost.
Oh, you are right: since entry is not NULL here, add_to_dirty_tree_and_list()
never reads the bmap argument, and dirty_tree_insert() merges entry->bmap
into the existing node by itself.
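
Concretely: if the active tree already holds the idx with bmap 0x3 (blocks
0 and 1 dirty) and the moved entry carries bmap 0x6 (blocks 1 and 2),
dirty_tree_insert() leaves the existing node with 0x3 | 0x6 = 0x7, so no
dirty block is lost.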
>
>>
>>
>>>       }
>>>
>>>       pthread_mutex_unlock(&oc->lock);
>>> @@ -230,7 +253,7 @@ int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
>>>                       ret = -1;
>>>               else {
>>>                       pthread_mutex_lock(&oc->lock);
>>> -                     add_to_dirty_tree_and_list(oc, idx, NULL, 1);
>>> +                     add_to_dirty_tree_and_list(oc, idx, 0, NULL, 1);
>>>                       pthread_mutex_unlock(&oc->lock);
>>>               }
>>>       }
>>> @@ -254,13 +277,13 @@ static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t coun
>>>               flags |= O_DIRECT;
>>>
>>>       fd = open(p.buf, flags, def_fmode);
>>> -     if (flock(fd, LOCK_EX) < 0) {
>>> +     if (xlockf(fd, F_LOCK, offset, count) < 0) {
>>>               ret = SD_RES_EIO;
>>>               eprintf("%m\n");
>>>               goto out;
>>>       }
>>>       size = xpwrite(fd, buf, count, offset);
>>> -     if (flock(fd, LOCK_UN) < 0) {
>>> +     if (xlockf(fd, F_UNLCK, offset, count) < 0) {
>>>               ret = SD_RES_EIO;
>>>               eprintf("%m\n");
>>>               goto out;
>>> @@ -287,14 +310,14 @@ static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count
>>>               flags |= O_DIRECT;
>>>
>>>       fd = open(p.buf, flags, def_fmode);
>>> -     if (flock(fd, LOCK_SH) < 0) {
>>> +     if (xlockf(fd, F_LOCK, offset, count) < 0) {
>>>               ret = SD_RES_EIO;
>>>               eprintf("%m\n");
>>>               goto out;
>>>       }
>>>
>>>       size = xpread(fd, buf, count, offset);
>>> -     if (flock(fd, LOCK_UN) < 0) {
>>> +     if (xlockf(fd, F_UNLCK, offset, count) < 0) {
>>>               ret = SD_RES_EIO;
>>>               eprintf("%m\n");
>>>               goto out;
>>> @@ -311,6 +334,7 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
>>>  {
>>>       struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
>>>       struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp;
>>> +     uint64_t bmap = 0;
>>>       int ret;
>>>
>>>       dprintf("%08"PRIx32", len %"PRIu32", off %"PRIu64"\n", idx, hdr->data_length, hdr->offset);
>>> @@ -319,7 +343,8 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
>>>               if (ret != SD_RES_SUCCESS)
>>>                       goto out;
>>>               pthread_mutex_lock(&oc->lock);
>>> -             add_to_dirty_tree_and_list(oc, idx, NULL, 0);
>>> +             bmap = calc_object_bmap(hdr->data_length, hdr->offset);
>>> +             add_to_dirty_tree_and_list(oc, idx, bmap, NULL, 0);
>>>               pthread_mutex_unlock(&oc->lock);
>>>       } else {
>>>               ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
>>> @@ -476,22 +501,59 @@ static uint64_t idx_to_oid(uint32_t vid, uint32_t idx)
>>>               return vid_to_data_oid(vid, idx);
>>>  }
>>>
>>> -static int push_cache_object(uint32_t vid, uint32_t idx, int create)
>>> +static int push_cache_object(uint32_t vid, uint32_t idx,
>>> +             uint64_t bmap, int create)
>>>  {
>>>       struct request fake_req;
>>>       struct sd_obj_req *hdr = (struct sd_obj_req *)&fake_req.rq;
>>>       void *buf;
>>> +     off_t offset;
>>>       unsigned data_length;
>>>       int ret = SD_RES_NO_MEM;
>>> -     uint64_t oid = idx_to_oid(vid, idx);
>>> +     uint64_t shift, oid = idx_to_oid(vid, idx);
>>> +     int i, nbits, first_dirty_bit, last_dirty_bit;
>>>
>>>       dprintf("%"PRIx64", create %d\n", oid, create);
>>>
>>> +     if (!bmap)
>>> +             return SD_RES_SUCCESS;
>>> +
>>>       memset(&fake_req, 0, sizeof(fake_req));
>>>       if (is_vdi_obj(oid))
>>> -             data_length = SD_INODE_SIZE;
>>> +             nbits = DIV_ROUND_UP(SD_INODE_SIZE, CACHE_BLOCK_SIZE);
>>>       else
>>> -             data_length = SD_DATA_OBJ_SIZE;
>>> +             nbits = DIV_ROUND_UP(SD_DATA_OBJ_SIZE, CACHE_BLOCK_SIZE);
>>> +
>>> +     shift = 1;
>>> +     first_dirty_bit = 0;
>>> +
>>> +     for (i = 0; i < nbits; i++) {
>>> +             if (bmap & (shift << i)) {
>>> +                     first_dirty_bit = i;
>>> +                     break;
>>> +             }
>>> +     }
>>> +
>>> +     shift = (UINT64_C(1) << (nbits - 1));
>>> +     last_dirty_bit = 0;
>>> +     for (i = 0; i < nbits; i++) {
>>> +             if (bmap & (shift >> i)) {
>>> +                     last_dirty_bit = nbits - i - 1;
>>> +                     break;
>>> +             }
>>> +     }
>>> +
>>> +     dprintf("bmap:0x%"PRIx64", first_dirty_bit:%d, last_dirty_bit:%d\n",
>>> +                     bmap, first_dirty_bit, last_dirty_bit);
>>> +     offset = first_dirty_bit * CACHE_BLOCK_SIZE;
>>> +     data_length = (last_dirty_bit - first_dirty_bit + 1) * CACHE_BLOCK_SIZE;
>>> +
>>> +     /*
>>> +      * SD_INODE_SIZE may not be a multiple of CACHE_BLOCK_SIZE,
>>> +      * so offset + data_length could run past SD_INODE_SIZE
>>> +      */
>>> +     if (is_vdi_obj(oid) && offset + data_length > SD_INODE_SIZE)
>>> +             data_length = SD_INODE_SIZE - offset;
>>>
>>>       buf = valloc(data_length);
>>>       if (buf == NULL) {
>>> @@ -499,7 +561,7 @@ static int push_cache_object(uint32_t vid, uint32_t idx, int create)
>>>               goto out;
>>>       }
>>>
>>> -     ret = read_cache_object(vid, idx, buf, data_length, 0);
>>> +     ret = read_cache_object(vid, idx, buf, data_length, offset);
>>>       if (ret != SD_RES_SUCCESS)
>>>               goto out;
>>>
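
As a sanity check of the flush span computed above (assuming the default
128 KB CACHE_BLOCK_SIZE): bmap = 0x14 sets bits 2 and 4, so first_dirty_bit
is 2 and last_dirty_bit is 4, giving offset 256 KB and data_length 384 KB.
Blocks 2 through 4 are pushed in one request, including the clean block 3
between them.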
>>> @@ -546,7 +608,8 @@ int object_cache_push(struct object_cache *oc)
>>>        * request is issued in one of gateway worker threads
>>>        * So we need not to protect inactive dirty tree and list */
>>>       list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
>>> -             ret = push_cache_object(oc->vid, entry->idx, entry->create);
>>> +             ret = push_cache_object(oc->vid, entry->idx,
>>> +                             entry->bmap, entry->create);
>>>               if (ret != SD_RES_SUCCESS)
>>>                       goto push_failed;
>>>               del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
>>> @@ -614,6 +677,7 @@ int object_cache_flush_and_delete(struct object_cache *oc)
>>>       struct dirent *d;
>>>       uint32_t vid = oc->vid;
>>>       uint32_t idx;
>>> +     uint64_t bmap = 0;
>>>       struct strbuf p;
>>>       int ret = 0;
>>>
>>> @@ -635,7 +699,7 @@ int object_cache_flush_and_delete(struct object_cache *oc)
>>>               idx = strtoul(d->d_name, NULL, 16);
>>>               if (idx == ULLONG_MAX)
>>>                       continue;
>>> -             if (push_cache_object(vid, idx, 1) != SD_RES_SUCCESS) {
>>> +             if (push_cache_object(vid, idx, ~bmap, 1) != SD_RES_SUCCESS) {
>>
>>
>> Better to use another variable (e.g. all = ~0) to mean flushing the whole
>
> No. There is no type information in ~0; its size depends on the system
> environment. We should keep it 64-bit, so I use ~bmap instead.
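
An explicitly 64-bit constant would also keep the width clear without
depending on int promotion. A sketch (the macro name is hypothetical):

#define CACHE_BMAP_ALL (~UINT64_C(0))   /* every block marked dirty */

        if (push_cache_object(vid, idx, CACHE_BMAP_ALL, 1) != SD_RES_SUCCESS) {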
>
>> object, instead of '~bmap'.
>>
>> Thanks,
>> Yuan
>
>
>
> --
> Yunkai Zhang
> Work at Taobao



-- 
Yunkai Zhang
Work at Taobao


