On 05/08/2012 08:12 PM, Yunkai Zhang wrote:
> From: Yunkai Zhang <qiushu.zyk at taobao.com>
>
> Each time when sheep flushes an cache object, it will flush its whole
> data(4M) to the cluster, even if that object only contains one byte in
> dirty.
>
> Now I splits an cache object into multiple blocks, each block is 512KB
> (this size can be switch in range[64, 128, 256, 512]KB defined by BLOCK_SIZE
> macro).
>
> Each time when client writes an cache object, sheep will caculate which block
> would be modified and mark it as dirty block. Those dirty infomation are saved
> in a bitmap.
>
> When sheep flushes an canch object, it will not flush the whole data again,
> instead of, it will read the bitmap of this object and only send the data marked
> as dirty block.
>
> In addition, I replaced flock with lockf so that we only lock on the section we care
> about of an object file.
>
> Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
> ---
>  include/util.h       | 10 +++++
>  sheep/object_cache.c | 95 ++++++++++++++++++++++++++++++++++++++++----------
>  sheep/sdnet.c        |  2 +-
>  sheep/sheep_priv.h   |  2 +
>  4 files changed, 89 insertions(+), 20 deletions(-)
>
> diff --git a/include/util.h b/include/util.h
> index ca43ab9..1c32954 100644
> --- a/include/util.h
> +++ b/include/util.h
> @@ -4,6 +4,7 @@
>  #include <string.h>
>  #include <limits.h>
>  #include <stdint.h>
> +#include <unistd.h>
>
>  #include "bitops.h"
>  #include "list.h"
> @@ -58,6 +59,15 @@ static inline void *zalloc(size_t size)
>  	return calloc(1, size);
>  }
>
> +static inline int xlockf(int fd, int cmd, off_t offset, off_t len)
> +{
> +	if (lseek(fd, offset, SEEK_SET) < 0)
> +		return -1;
> +
> +	return lockf(fd, cmd, len);
> +}
> +
> +
>  typedef void (*try_to_free_t)(size_t);
>  extern try_to_free_t set_try_to_free_routine(try_to_free_t);
>
> diff --git a/sheep/object_cache.c b/sheep/object_cache.c
> index 0b3c911..43bf109 100644
> --- a/sheep/object_cache.c
> +++ b/sheep/object_cache.c
> @@ -41,6 +41,29 @@ static inline int hash(uint64_t vid)
>  	return hash_64(vid, HASH_BITS);
>  }
>
> +static inline uint32_t div_ceil(uint32_t numerator, uint32_t denominator)
> +{
> +	return (numerator / denominator) + ((numerator % denominator) > 0);
> +}
> +

Use DIV_ROUND_UP() instead of adding div_ceil().

> +static uint64_t object_bmap_calc(size_t len, off_t offset)
> +{
> +	int i, j;
> +	uint64_t bmap = 1;
> +
> +	if (!len)
> +		return 0;
> +
> +	i = offset / BLOCK_SIZE;
> +	j = (offset + len - 1) / BLOCK_SIZE;
> +
> +	bmap <<= i;
> +	while (j - i++)
> +		bmap = bmap | (bmap << 1);
> +
> +	return bmap;
> +}
> +

Better name it calc_object_bmap(): it is a static function, so we don't need the namespace prefix.
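Something like the following is what I mean (an untested sketch only; I keep the patch's BLOCK_SIZE name here, and if DIV_ROUND_UP() is not already in include/util.h it is the usual one-liner):

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Mark every block touched by [offset, offset + len) as dirty. */
static uint64_t calc_object_bmap(size_t len, off_t offset)
{
	uint64_t bmap = 0;
	int start, end;

	if (!len)
		return 0;

	start = offset / BLOCK_SIZE;
	end = (offset + len - 1) / BLOCK_SIZE;

	while (start <= end)
		bmap |= UINT64_C(1) << start++;

	return bmap;
}

Then the caller can simply do nbits = DIV_ROUND_UP(SD_DATA_OBJ_SIZE, BLOCK_SIZE).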
>  static struct object_cache_entry *dirty_tree_insert(struct rb_root *root,
>  		struct object_cache_entry *new)
>  {
> @@ -56,8 +79,11 @@ static struct object_cache_entry *dirty_tree_insert(struct rb_root *root,
>  			p = &(*p)->rb_left;
>  		else if (new->idx > entry->idx)
>  			p = &(*p)->rb_right;
> -		else
> -			return entry; /* already has this entry */
> +		else {
> +			/* already has this entry, merge bmap */
> +			entry->bmap |= new->bmap;
> +			return entry;
> +		}
>  	}
>  	rb_link_node(&new->rb, parent, p);
>  	rb_insert_color(&new->rb, root);
> @@ -143,11 +169,12 @@ out:
>  }
>
>  static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
> -		struct object_cache_entry *entry, int create)
> +		uint64_t bmap, struct object_cache_entry *entry, int create)
>  {
>  	if (!entry) {
>  		entry = xzalloc(sizeof(*entry));
>  		entry->idx = idx;
> +		entry->bmap = bmap;
>  		entry->create = create;
>  	}
>  	if (!dirty_tree_insert(oc->active_dirty_tree, entry))
> @@ -194,7 +221,7 @@ static void merge_dirty_tree_and_list(struct object_cache *oc,
>
>  	list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
>  		del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
> -		add_to_dirty_tree_and_list(oc, entry->idx, entry, 0);
> +		add_to_dirty_tree_and_list(oc, entry->idx, entry->bmap, entry, 0);
>  	}
>
>  	pthread_mutex_unlock(&oc->lock);
> @@ -204,6 +231,7 @@ int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
>  {
>  	struct strbuf buf;
>  	int fd, ret = 0, flags = def_open_flags;
> +	uint64_t bmap = 0;
>
>  	strbuf_init(&buf, PATH_MAX);
>  	strbuf_addstr(&buf, cache_dir);
> @@ -230,7 +258,7 @@ int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
>  			ret = -1;
>  		else {
>  			pthread_mutex_lock(&oc->lock);
> -			add_to_dirty_tree_and_list(oc, idx, NULL, 1);
> +			add_to_dirty_tree_and_list(oc, idx, ~bmap, NULL, 1);

We shouldn't set the bmap as full for object creation, or a newly created
object will always be flushed as a whole 4M.
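Roughly what I have in mind (illustration only; it assumes the lookup path is
given the length and offset of the write that triggers the creation, which the
current patch does not pass down, and it reuses the calc helper sketched above):

	/* Only mark the blocks actually covered by the initiating write,
	 * instead of ~0, so a new object isn't always pushed as a full 4M. */
	bmap = calc_object_bmap(data_length, offset);
	pthread_mutex_lock(&oc->lock);
	add_to_dirty_tree_and_list(oc, idx, bmap, NULL, 1);
	pthread_mutex_unlock(&oc->lock);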
>  			pthread_mutex_unlock(&oc->lock);
>  		}
>  	}
> @@ -254,13 +282,13 @@ static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t coun
>  		flags |= O_DIRECT;
>
>  	fd = open(p.buf, flags, def_fmode);
> -	if (flock(fd, LOCK_EX) < 0) {
> +	if (xlockf(fd, F_LOCK, offset, count) < 0) {
>  		ret = SD_RES_EIO;
>  		eprintf("%m\n");
>  		goto out;
>  	}
>  	size = xpwrite(fd, buf, count, offset);
> -	if (flock(fd, LOCK_UN) < 0) {
> +	if (xlockf(fd, F_UNLCK, offset, count) < 0) {
>  		ret = SD_RES_EIO;
>  		eprintf("%m\n");
>  		goto out;
> @@ -287,14 +315,14 @@ static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count
>  		flags |= O_DIRECT;
>
>  	fd = open(p.buf, flags, def_fmode);
> -	if (flock(fd, LOCK_SH) < 0) {
> +	if (xlockf(fd, F_LOCK, offset, count) < 0) {
>  		ret = SD_RES_EIO;
>  		eprintf("%m\n");
>  		goto out;
>  	}
>
>  	size = xpread(fd, buf, count, offset);
> -	if (flock(fd, LOCK_UN) < 0) {
> +	if (xlockf(fd, F_UNLCK, offset, count) < 0) {
>  		ret = SD_RES_EIO;
>  		eprintf("%m\n");
>  		goto out;
> @@ -311,6 +339,7 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
>  {
>  	struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
>  	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp;
> +	uint64_t bmap = 0;
>  	int ret;
>
>  	dprintf("%08"PRIx32", len %"PRIu32", off %"PRIu64"\n", idx, hdr->data_length, hdr->offset);
> @@ -319,7 +348,8 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
>  		if (ret != SD_RES_SUCCESS)
>  			goto out;
>  		pthread_mutex_lock(&oc->lock);
> -		add_to_dirty_tree_and_list(oc, idx, NULL, 0);
> +		bmap = object_bmap_calc(hdr->data_length, hdr->offset);
> +		add_to_dirty_tree_and_list(oc, idx, bmap, NULL, 0);
>  		pthread_mutex_unlock(&oc->lock);
>  	} else {
>  		ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
> @@ -476,22 +506,44 @@ static uint64_t idx_to_oid(uint32_t vid, uint32_t idx)
>  	return vid_to_data_oid(vid, idx);
>  }
>
> -static int push_cache_object(uint32_t vid, uint32_t idx, int create)
> +static int push_cache_object(uint32_t vid, uint32_t idx,
> +			     uint64_t bmap, int create)
>  {
>  	struct request fake_req;
>  	struct sd_obj_req *hdr = (struct sd_obj_req *)&fake_req.rq;
>  	void *buf;
> +	off_t offset;
>  	unsigned data_length;
>  	int ret = SD_RES_NO_MEM;
>  	uint64_t oid = idx_to_oid(vid, idx);
> +	int nbits, shift = 0;
> +	struct vnode_info *vnodes = NULL;
>
>  	dprintf("%"PRIx64", create %d\n", oid, create);
>
>  	memset(&fake_req, 0, sizeof(fake_req));
>  	if (is_vdi_obj(oid))
> -		data_length = SD_INODE_SIZE;
> +		nbits = div_ceil(SD_INODE_SIZE, BLOCK_SIZE);
>  	else
> -		data_length = SD_DATA_OBJ_SIZE;
> +		nbits = div_ceil(SD_DATA_OBJ_SIZE, BLOCK_SIZE);
> +
> +	vnodes = get_vnode_info();
> +loop:
> +	if (shift == nbits) {
> +		put_vnode_info(vnodes);
> +		return SD_RES_SUCCESS;
> +	}
> +
> +	data_length = 0;
> +	offset = shift * BLOCK_SIZE;
> +	while (shift < nbits) {
> +		if (bmap & (1 << shift++))
> +			data_length += BLOCK_SIZE;
> +		else if (data_length > 0)
> +			break;
> +		else
> +			goto loop;
> +	}
>
>  	buf = valloc(data_length);
>  	if (buf == NULL) {
> @@ -499,7 +551,7 @@ static int push_cache_object(uint32_t vid, uint32_t idx, int create)
>  		goto out;
>  	}
>
> -	ret = read_cache_object(vid, idx, buf, data_length, 0);
> +	ret = read_cache_object(vid, idx, buf, data_length, offset);
>  	if (ret != SD_RES_SUCCESS)
>  		goto out;
>
> @@ -512,15 +564,18 @@ static int push_cache_object(uint32_t vid, uint32_t idx, int create)
>  	hdr->epoch = sys->epoch;
>  	fake_req.data = buf;
>  	fake_req.op = get_sd_op(hdr->opcode);
> -	fake_req.vnodes = get_vnode_info();
> +	fake_req.vnodes = vnodes;
>
>  	ret = forward_write_obj_req(&fake_req);
>  	if (ret != SD_RES_SUCCESS)
>  		eprintf("failed to push object %x\n", ret);
> -
> -	put_vnode_info(fake_req.vnodes);
> +	else {
> +		free(buf);
> +		goto loop;
> +	}
>
>  out:
> +	put_vnode_info(vnodes);
>  	free(buf);
>  	return ret;
>  }
> @@ -546,7 +601,8 @@ int object_cache_push(struct object_cache *oc)
>  	 * request is issued in one of gateway worker threads
>  	 * So we need not to protect inactive dirty tree and list */
>  	list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
> -		ret = push_cache_object(oc->vid, entry->idx, entry->create);
> +		ret = push_cache_object(oc->vid, entry->idx,
> +					entry->bmap, entry->create);
>  		if (ret != SD_RES_SUCCESS)
>  			goto push_failed;
>  		del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
> @@ -614,6 +670,7 @@ int object_cache_flush_and_delete(struct object_cache *oc)
>  	struct dirent *d;
>  	uint32_t vid = oc->vid;
>  	uint32_t idx;
> +	uint64_t bmap = 0;
>  	struct strbuf p;
>  	int ret = 0;
>
> @@ -635,7 +692,7 @@ int object_cache_flush_and_delete(struct object_cache *oc)
>  		idx = strtoul(d->d_name, NULL, 16);
>  		if (idx == ULLONG_MAX)
>  			continue;
> -		if (push_cache_object(vid, idx, 1) != SD_RES_SUCCESS) {
> +		if (push_cache_object(vid, idx, ~bmap, 1) != SD_RES_SUCCESS) {

Better add a comment explaining why we need ~bmap here.

>  			dprintf("failed to push %"PRIx64"\n",
>  				idx_to_oid(vid, idx));
>  			ret = -1;
> diff --git a/sheep/sdnet.c b/sheep/sdnet.c
> index 10be245..aaad630 100644
> --- a/sheep/sdnet.c
> +++ b/sheep/sdnet.c
> @@ -256,7 +256,7 @@ static int check_request(struct request *req)
>  	return 0;
>  }
>
> -static do_nothing(struct work *work) {}
> +static void do_nothing(struct work *work) {}
>
>  static void queue_request(struct request *req)
>  {
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index 2275a93..dc28572 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -414,6 +414,7 @@ static inline int sys_can_halt(void)
>
>  #define CACHE_VDI_SHIFT       31
>  #define CACHE_VDI_BIT         (UINT32_C(1) << CACHE_VDI_SHIFT)
> +#define BLOCK_SIZE (UINT32_C(512) << 20) /* 512 KB */
>

We'd better name it CACHE_BLOCK_SIZE.

>  struct object_cache {
>  	uint32_t vid;
> @@ -430,6 +431,7 @@ struct object_cache {
>
>  struct object_cache_entry {
>  	uint32_t idx;
> +	uint64_t bmap;

We'd better add a comment describing what bmap means, e.g. "bitmap of dirty
blocks, the smallest flush unit".

>  	struct rb_node rb;
>  	struct list_head list;
>  	int create;

For the flush strategy, it might work better if we merged the flush requests
for one object into a single request, even though that brings some flush
amplification (there is a rough sketch of what I mean below my sign-off).

Thanks,
Yuan
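The sketch mentioned above (untested; bmap_to_flush_range() is a made-up helper
name and CACHE_BLOCK_SIZE is the rename suggested earlier):

/*
 * Collapse the dirty bitmap into one [offset, offset + len) range that covers
 * the first through the last dirty block, so push_cache_object() can send a
 * single request per object. Clean blocks inside the range get pushed again,
 * which is the flush amplification mentioned above.
 */
static void bmap_to_flush_range(uint64_t bmap, int nbits,
				off_t *offset, size_t *len)
{
	int first = 0, last = nbits - 1;

	while (first < nbits && !(bmap & (UINT64_C(1) << first)))
		first++;
	while (last >= 0 && !(bmap & (UINT64_C(1) << last)))
		last--;

	if (first > last) {
		/* nothing dirty */
		*offset = 0;
		*len = 0;
		return;
	}

	*offset = (off_t)first * CACHE_BLOCK_SIZE;
	*len = (size_t)(last - first + 1) * CACHE_BLOCK_SIZE;
}

push_cache_object() would then issue one read_cache_object() plus one
forward_write_obj_req() over that range instead of looping over the bitmap.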