On Tue, May 15, 2012 at 4:33 PM, Yunkai Zhang <yunkai.me at gmail.com> wrote:
> On Tue, May 15, 2012 at 4:13 PM, Liu Yuan <namei.unix at gmail.com> wrote:
>> On 05/14/2012 09:56 PM, Yunkai Zhang wrote:
>>
>>> From: Yunkai Zhang <qiushu.zyk at taobao.com>
>>>
>>> Each time sheep flushes a cached object, it flushes the object's whole
>>> data (4 MB) to the cluster, even if only one byte of it is dirty.
>>>
>>> Now I split a cached object into multiple blocks; each block is 128 KB
>>> (this size can be switched within the range [64, 128, 256, 512] KB,
>>> defined by the CACHE_BLOCK_SIZE macro).
>>>
>>> Each time a client writes to a cached object, sheep calculates which
>>> blocks are modified and marks them as dirty. This dirty information is
>>> saved in a bitmap.
>>>
>>> When sheep flushes a cached object, it no longer flushes the whole data;
>>> instead, it reads the object's bitmap and only sends the blocks marked
>>> as dirty.
>>>
>>> In addition, I replaced flock with lockf so that we only lock the
>>> section of an object file that we care about.
>>>
>>> Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
>>> ---
>>> - BLOCK_SIZE to CACHE_BLOCK_SIZE
>>> - object_bmap_calc to calc_object_bmap
>>> - changes flush algo.
>>> - fix bugs when push vdi object
>>>
>>>  include/util.h       |   10 +++++
>>>  sheep/object_cache.c |   98 +++++++++++++++++++++++++++++++++++++++++---------
>>>  sheep/sheep_priv.h   |    3 ++
>>>  3 files changed, 94 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/include/util.h b/include/util.h
>>> index ca43ab9..1c32954 100644
>>> --- a/include/util.h
>>> +++ b/include/util.h
>>> @@ -4,6 +4,7 @@
>>>  #include <string.h>
>>>  #include <limits.h>
>>>  #include <stdint.h>
>>> +#include <unistd.h>
>>>
>>>  #include "bitops.h"
>>>  #include "list.h"
>>> @@ -58,6 +59,15 @@ static inline void *zalloc(size_t size)
>>>          return calloc(1, size);
>>>  }
>>>
>>> +static inline int xlockf(int fd, int cmd, off_t offset, off_t len)
>>> +{
>>> +        if (lseek(fd, offset, SEEK_SET) < 0)
>>> +                return -1;
>>> +
>>> +        return lockf(fd, cmd, len);
>>> +}
>>> +
>>> +
>>>  typedef void (*try_to_free_t)(size_t);
>>>  extern try_to_free_t set_try_to_free_routine(try_to_free_t);
>>>
>>> diff --git a/sheep/object_cache.c b/sheep/object_cache.c
>>> index 0b3c911..4d3864c 100644
>>> --- a/sheep/object_cache.c
>>> +++ b/sheep/object_cache.c
>>> @@ -41,6 +41,24 @@ static inline int hash(uint64_t vid)
>>>          return hash_64(vid, HASH_BITS);
>>>  }
>>>
>>> +static uint64_t calc_object_bmap(size_t len, off_t offset)
>>> +{
>>> +        int i, j;
>>> +        uint64_t bmap, shift = 1;
>>> +
>>> +        if (!len)
>>> +                return 0;
>>> +
>>> +        i = offset / CACHE_BLOCK_SIZE;
>>> +        j = (offset + len - 1) / CACHE_BLOCK_SIZE;
>>> +
>>> +        bmap = (shift <<= i);
>>> +        while (j - i++)
>>> +                bmap |= (shift <<= 1);
>>> +
>>> +        return bmap;
>>> +}
>>> +
>>
>>
>> I think this code is a little bit confusing: normally "shift" carries the
>> sense of how many bits we shift. But I am okay with this, since it is a
>> matter of personal taste.
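(Aside, purely for illustration: the same bitmap can also be built from an
explicit first/last block pair and a plain loop, which keeps "shift" out of
double duty. calc_object_bmap_loop is a hypothetical name, and it assumes the
same CACHE_BLOCK_SIZE macro; this is a sketch, not a requested change.)

static uint64_t calc_object_bmap_loop(size_t len, off_t offset)
{
        int first, last, i;
        uint64_t bmap = 0;

        if (!len)
                return 0;

        /* first and last block indexes touched by [offset, offset + len) */
        first = offset / CACHE_BLOCK_SIZE;
        last = (offset + len - 1) / CACHE_BLOCK_SIZE;

        for (i = first; i <= last; i++)
                bmap |= UINT64_C(1) << i;

        return bmap;
}

(For a write spanning blocks 2..4 both versions return 0x1c; for a one-byte
write at offset 0 both return 0x1.)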
>>
>>>  static struct object_cache_entry *dirty_tree_insert(struct rb_root *root,
>>>                  struct object_cache_entry *new)
>>>  {
>>> @@ -56,8 +74,11 @@ static struct object_cache_entry *dirty_tree_insert(struct rb_root *root,
>>>                          p = &(*p)->rb_left;
>>>                  else if (new->idx > entry->idx)
>>>                          p = &(*p)->rb_right;
>>> -                else
>>> -                        return entry; /* already has this entry */
>>> +                else {
>>> +                        /* already has this entry, merge bmap */
>>> +                        entry->bmap |= new->bmap;
>>> +                        return entry;
>>> +                }
>>>          }
>>>          rb_link_node(&new->rb, parent, p);
>>>          rb_insert_color(&new->rb, root);
>>> @@ -143,11 +164,12 @@ out:
>>>  }
>>>
>>>  static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
>>> -                struct object_cache_entry *entry, int create)
>>> +                uint64_t bmap, struct object_cache_entry *entry, int create)
>>>  {
>>>          if (!entry) {
>>>                  entry = xzalloc(sizeof(*entry));
>>>                  entry->idx = idx;
>>> +                entry->bmap = bmap;
>>>                  entry->create = create;
>>>          }
>>>          if (!dirty_tree_insert(oc->active_dirty_tree, entry))
>>> @@ -194,7 +216,8 @@ static void merge_dirty_tree_and_list(struct object_cache *oc,
>>>
>>>          list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
>>>                  del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
>>> -                add_to_dirty_tree_and_list(oc, entry->idx, entry, 0);
>>> +                add_to_dirty_tree_and_list(oc, entry->idx,
>>> +                                entry->bmap, entry, 0);
>>
>>
>> It seems that we don't need to pass entry->bmap to
>> add_to_dirty_tree_and_list().
>
> We need it! If there are nodes with the same idx in the active_dirty_tree,
> entry->bmap will be merged into those nodes so that the dirty block info
> is not lost.

Oh, you are right, because entry is not NULL.

>
>>
>>
>>>          }
>>>
>>>          pthread_mutex_unlock(&oc->lock);
>>> @@ -230,7 +253,7 @@ int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
>>>                  ret = -1;
>>>          else {
>>>                  pthread_mutex_lock(&oc->lock);
>>> -                add_to_dirty_tree_and_list(oc, idx, NULL, 1);
>>> +                add_to_dirty_tree_and_list(oc, idx, 0, NULL, 1);
>>>                  pthread_mutex_unlock(&oc->lock);
>>>          }
>>>  }
>>> @@ -254,13 +277,13 @@ static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t coun
>>>                  flags |= O_DIRECT;
>>>
>>>          fd = open(p.buf, flags, def_fmode);
>>> -        if (flock(fd, LOCK_EX) < 0) {
>>> +        if (xlockf(fd, F_LOCK, offset, count) < 0) {
>>>                  ret = SD_RES_EIO;
>>>                  eprintf("%m\n");
>>>                  goto out;
>>>          }
>>>          size = xpwrite(fd, buf, count, offset);
>>> -        if (flock(fd, LOCK_UN) < 0) {
>>> +        if (xlockf(fd, F_ULOCK, offset, count) < 0) {
>>>                  ret = SD_RES_EIO;
>>>                  eprintf("%m\n");
>>>                  goto out;
>>> @@ -287,14 +310,14 @@ static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count
>>>                  flags |= O_DIRECT;
>>>
>>>          fd = open(p.buf, flags, def_fmode);
>>> -        if (flock(fd, LOCK_SH) < 0) {
>>> +        if (xlockf(fd, F_LOCK, offset, count) < 0) {
>>>                  ret = SD_RES_EIO;
>>>                  eprintf("%m\n");
>>>                  goto out;
>>>          }
>>>
>>>          size = xpread(fd, buf, count, offset);
>>> -        if (flock(fd, LOCK_UN) < 0) {
>>> +        if (xlockf(fd, F_ULOCK, offset, count) < 0) {
>>>                  ret = SD_RES_EIO;
>>>                  eprintf("%m\n");
>>>                  goto out;
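(Aside on the locking change: unlike flock(), which locks the whole file,
lockf() operates on the byte range starting at the file's current offset,
which is why xlockf() seeks before locking. A minimal standalone sketch of
the same pattern, with a hypothetical range_lock() name:)

#include <fcntl.h>
#include <unistd.h>

/*
 * Lock or unlock [offset, offset + len) of fd.  lockf() acts on the range
 * beginning at the current file offset, so position the offset first.
 * cmd is one of F_LOCK, F_TLOCK, F_ULOCK or F_TEST.
 */
static int range_lock(int fd, int cmd, off_t offset, off_t len)
{
        if (lseek(fd, offset, SEEK_SET) < 0)
                return -1;
        return lockf(fd, cmd, len);
}

(Note that F_UNLCK is an fcntl() struct-flock constant, not a lockf() command;
lockf() unlocks with F_ULOCK.)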
>>> @@ -311,6 +334,7 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
>>>  {
>>>          struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
>>>          struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp;
>>> +        uint64_t bmap = 0;
>>>          int ret;
>>>
>>>          dprintf("%08"PRIx32", len %"PRIu32", off %"PRIu64"\n", idx, hdr->data_length, hdr->offset);
>>> @@ -319,7 +343,8 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
>>>                  if (ret != SD_RES_SUCCESS)
>>>                          goto out;
>>>                  pthread_mutex_lock(&oc->lock);
>>> -                add_to_dirty_tree_and_list(oc, idx, NULL, 0);
>>> +                bmap = calc_object_bmap(hdr->data_length, hdr->offset);
>>> +                add_to_dirty_tree_and_list(oc, idx, bmap, NULL, 0);
>>>                  pthread_mutex_unlock(&oc->lock);
>>>          } else {
>>>                  ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
>>> @@ -476,22 +501,59 @@ static uint64_t idx_to_oid(uint32_t vid, uint32_t idx)
>>>          return vid_to_data_oid(vid, idx);
>>>  }
>>>
>>> -static int push_cache_object(uint32_t vid, uint32_t idx, int create)
>>> +static int push_cache_object(uint32_t vid, uint32_t idx,
>>> +                uint64_t bmap, int create)
>>>  {
>>>          struct request fake_req;
>>>          struct sd_obj_req *hdr = (struct sd_obj_req *)&fake_req.rq;
>>>          void *buf;
>>> +        off_t offset;
>>>          unsigned data_length;
>>>          int ret = SD_RES_NO_MEM;
>>> -        uint64_t oid = idx_to_oid(vid, idx);
>>> +        uint64_t shift, oid = idx_to_oid(vid, idx);
>>> +        int i, nbits, first_dirty_bit, last_dirty_bit;
>>>
>>>          dprintf("%"PRIx64", create %d\n", oid, create);
>>>
>>> +        if (!bmap)
>>> +                return SD_RES_SUCCESS;
>>> +
>>>          memset(&fake_req, 0, sizeof(fake_req));
>>>          if (is_vdi_obj(oid))
>>> -                data_length = SD_INODE_SIZE;
>>> +                nbits = DIV_ROUND_UP(SD_INODE_SIZE, CACHE_BLOCK_SIZE);
>>>          else
>>> -                data_length = SD_DATA_OBJ_SIZE;
>>> +                nbits = DIV_ROUND_UP(SD_DATA_OBJ_SIZE, CACHE_BLOCK_SIZE);
>>> +
>>> +        shift = 1;
>>> +        first_dirty_bit = 0;
>>> +
>>> +        for (i = 0; i < nbits; i++) {
>>> +                if (bmap & (shift << i)) {
>>> +                        first_dirty_bit = i;
>>> +                        break;
>>> +                }
>>> +        }
>>> +
>>> +        shift = (UINT64_C(1) << (nbits - 1));
>>> +        last_dirty_bit = 0;
>>> +        for (i = 0; i < nbits; i++) {
>>> +                if (bmap & (shift >> i)) {
>>> +                        last_dirty_bit = nbits - i - 1;
>>> +                        break;
>>> +                }
>>> +        }
>>> +
>>> +        dprintf("bmap:0x%"PRIx64", first_dirty_bit:%d, last_dirty_bit:%d\n",
>>> +                bmap, first_dirty_bit, last_dirty_bit);
>>> +        offset = first_dirty_bit * CACHE_BLOCK_SIZE;
>>> +        data_length = (last_dirty_bit - first_dirty_bit + 1) * CACHE_BLOCK_SIZE;
>>> +
>>> +        /*
>>> +         * SD_INODE_SIZE may not be a multiple of CACHE_BLOCK_SIZE,
>>> +         * so data_length could be larger than SD_INODE_SIZE
>>> +         */
>>> +        if (is_vdi_obj(oid) && data_length > SD_INODE_SIZE)
>>> +                data_length = SD_INODE_SIZE;
>>>
>>>          buf = valloc(data_length);
>>>          if (buf == NULL) {
>>> @@ -499,7 +561,7 @@ static int push_cache_object(uint32_t vid, uint32_t idx, int create)
>>>                  goto out;
>>>          }
>>>
>>> -        ret = read_cache_object(vid, idx, buf, data_length, 0);
>>> +        ret = read_cache_object(vid, idx, buf, data_length, offset);
>>>          if (ret != SD_RES_SUCCESS)
>>>                  goto out;
>>>
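(Worth noting about the first/last scan above: the flush sends one contiguous
span from the first dirty block through the last, so any clean blocks sitting
between two dirty ones are pushed as well, in exchange for a single request.
The same computation as a compact sketch, with a hypothetical bmap_to_span()
helper; the caller must reject bmap == 0 first, as push_cache_object() does:)

/* map a dirty bitmap to the single [offset, offset + length) span to flush */
static void bmap_to_span(uint64_t bmap, int nbits, off_t *offset, size_t *length)
{
        int first = 0, last = nbits - 1;

        while (first < nbits && !(bmap & (UINT64_C(1) << first)))
                first++;
        while (last >= 0 && !(bmap & (UINT64_C(1) << last)))
                last--;

        *offset = (off_t)first * CACHE_BLOCK_SIZE;
        *length = (size_t)(last - first + 1) * CACHE_BLOCK_SIZE;
}

(For example, bmap 0x19 marks blocks 0, 3 and 4 dirty; with 128 KB blocks this
flushes blocks 0 through 4, 640 KB in total, including clean blocks 1 and 2.)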
>>> @@ -546,7 +608,8 @@ int object_cache_push(struct object_cache *oc)
>>>           * request is issued in one of gateway worker threads
>>>           * So we need not to protect inactive dirty tree and list */
>>>          list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
>>> -                ret = push_cache_object(oc->vid, entry->idx, entry->create);
>>> +                ret = push_cache_object(oc->vid, entry->idx,
>>> +                                entry->bmap, entry->create);
>>>                  if (ret != SD_RES_SUCCESS)
>>>                          goto push_failed;
>>>                  del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
>>> @@ -614,6 +677,7 @@ int object_cache_flush_and_delete(struct object_cache *oc)
>>>          struct dirent *d;
>>>          uint32_t vid = oc->vid;
>>>          uint32_t idx;
>>> +        uint64_t bmap = 0;
>>>          struct strbuf p;
>>>          int ret = 0;
>>>
>>> @@ -635,7 +699,7 @@ int object_cache_flush_and_delete(struct object_cache *oc)
>>>                  idx = strtoul(d->d_name, NULL, 16);
>>>                  if (idx == ULLONG_MAX)
>>>                          continue;
>>> -                if (push_cache_object(vid, idx, 1) != SD_RES_SUCCESS) {
>>> +                if (push_cache_object(vid, idx, ~bmap, 1) != SD_RES_SUCCESS) {
>>
>>
>> Better to use another variable (e.g., all = ~0) to mean flushing the
>> whole object, instead of '~bmap'.
>>
>> Thanks,
>> Yuan
>
> No, there is no type info with ~0; its size depends on the system
> environment. We should keep it 64-bit, so I use the ~bmap variable
> instead.
>
>
> --
> Yunkai Zhang
> Work at Taobao

--
Yunkai Zhang
Work at Taobao
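P.S. On the ~0 point, a standalone snippet (not part of the patch) showing
that the width of an all-ones constant follows its operand's type;
~UINT64_C(0) or UINT64_MAX states the 64-bit intent explicitly, which is the
same guarantee ~bmap gets from bmap being uint64_t:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t from_uint = ~0U;            /* all-ones unsigned int: high bits end up zero with 32-bit int */
        uint64_t explicit64 = ~UINT64_C(0);  /* all 64 bits set, no width guesswork */

        printf("~0U          -> %016llx\n", (unsigned long long)from_uint);
        printf("~UINT64_C(0) -> %016llx\n", (unsigned long long)explicit64);
        return 0;
}

(Plain ~0, signed int -1, also ends up as all 64 bits after conversion to
uint64_t, but only through the signed-to-unsigned conversion; ~0U does not.)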