[Sheepdog] [PATCH RFC] Make the flushing data more fine grained
Yunkai Zhang
yunkai.me at gmail.com
Tue May 8 14:12:38 CEST 2012
From: Yunkai Zhang <qiushu.zyk at taobao.com>
Each time sheep flushes a cache object, it pushes the object's whole
data (4MB) to the cluster, even if only a single byte of that object is
dirty.
Now I split a cache object into multiple blocks of 512KB each (this
size can be switched among 64, 128, 256 and 512KB via the BLOCK_SIZE
macro).
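With 4MB data objects and 512KB blocks an object consists of 8 blocks,
and even with the smallest 64KB setting the 64 blocks still fit in the
64-bit bitmap used below.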
Each time a client writes to a cache object, sheep calculates which
blocks are modified and marks them as dirty. This dirty information is
saved in a bitmap.
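For example, with 512KB blocks a 4KB write at offset 1MB touches only
block 2 (i = j = 2 in object_bmap_calc() below), so the object's bitmap
becomes 0x4.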
When sheep flushes a cache object, it no longer flushes the whole data;
instead, it reads the object's bitmap and only sends the blocks marked
as dirty.
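Continuing the example above, flushing that object sends a single 512KB
block at offset 1MB instead of the full 4MB.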
In addition, I replaced flock with lockf so that we only lock the
section of the object file that we care about.
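The difference in a nutshell (xlockf() is the helper added to util.h
below; fd, offset and count stand for the values at each call site, and
lockf() locks len bytes starting at the current file offset, which is
why the helper seeks first):

    /* old: serializes all I/O on the object file */
    flock(fd, LOCK_EX);

    /* new: only the section [offset, offset + count) is locked */
    xlockf(fd, F_LOCK, offset, count);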
Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
---
include/util.h | 10 +++++
sheep/object_cache.c | 95 ++++++++++++++++++++++++++++++++++++++++----------
sheep/sdnet.c | 2 +-
sheep/sheep_priv.h | 2 +
4 files changed, 89 insertions(+), 20 deletions(-)
diff --git a/include/util.h b/include/util.h
index ca43ab9..1c32954 100644
--- a/include/util.h
+++ b/include/util.h
@@ -4,6 +4,7 @@
#include <string.h>
#include <limits.h>
#include <stdint.h>
+#include <unistd.h>
#include "bitops.h"
#include "list.h"
@@ -58,6 +59,15 @@ static inline void *zalloc(size_t size)
return calloc(1, size);
}
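+/*
+ * lockf() operates on the section that starts at the current file
+ * offset, so seek to 'offset' first, then lock/unlock 'len' bytes.
+ */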
+static inline int xlockf(int fd, int cmd, off_t offset, off_t len)
+{
+ if (lseek(fd, offset, SEEK_SET) < 0)
+ return -1;
+
+ return lockf(fd, cmd, len);
+}
+
typedef void (*try_to_free_t)(size_t);
extern try_to_free_t set_try_to_free_routine(try_to_free_t);
diff --git a/sheep/object_cache.c b/sheep/object_cache.c
index 0b3c911..43bf109 100644
--- a/sheep/object_cache.c
+++ b/sheep/object_cache.c
@@ -41,6 +41,29 @@ static inline int hash(uint64_t vid)
return hash_64(vid, HASH_BITS);
}
+static inline uint32_t div_ceil(uint32_t numerator, uint32_t denominator)
+{
+ return (numerator / denominator) + ((numerator % denominator) > 0);
+}
+
+static uint64_t object_bmap_calc(size_t len, off_t offset)
+{
+ int i, j;
+ uint64_t bmap = 1;
+
+ if (!len)
+ return 0;
+
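+ /* blocks i..j are the ones touched by [offset, offset + len) */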
+ i = offset / BLOCK_SIZE;
+ j = (offset + len - 1) / BLOCK_SIZE;
+
+ bmap <<= i;
+ while (j - i++)
+ bmap = bmap | (bmap << 1);
+
+ return bmap;
+}
+
static struct object_cache_entry *dirty_tree_insert(struct rb_root *root,
struct object_cache_entry *new)
{
@@ -56,8 +79,11 @@ static struct object_cache_entry *dirty_tree_insert(struct rb_root *root,
p = &(*p)->rb_left;
else if (new->idx > entry->idx)
p = &(*p)->rb_right;
- else
- return entry; /* already has this entry */
+ else {
+ /* already has this entry, merge bmap */
+ entry->bmap |= new->bmap;
+ return entry;
+ }
}
rb_link_node(&new->rb, parent, p);
rb_insert_color(&new->rb, root);
@@ -143,11 +169,12 @@ out:
}
static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
- struct object_cache_entry *entry, int create)
+ uint64_t bmap, struct object_cache_entry *entry, int create)
{
if (!entry) {
entry = xzalloc(sizeof(*entry));
entry->idx = idx;
+ entry->bmap = bmap;
entry->create = create;
}
if (!dirty_tree_insert(oc->active_dirty_tree, entry))
@@ -194,7 +221,7 @@ static void merge_dirty_tree_and_list(struct object_cache *oc,
list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
- add_to_dirty_tree_and_list(oc, entry->idx, entry, 0);
+ add_to_dirty_tree_and_list(oc, entry->idx, entry->bmap, entry, 0);
}
pthread_mutex_unlock(&oc->lock);
@@ -204,6 +231,7 @@ int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
{
struct strbuf buf;
int fd, ret = 0, flags = def_open_flags;
+ uint64_t bmap = 0;
strbuf_init(&buf, PATH_MAX);
strbuf_addstr(&buf, cache_dir);
@@ -230,7 +258,7 @@ int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
ret = -1;
else {
pthread_mutex_lock(&oc->lock);
- add_to_dirty_tree_and_list(oc, idx, NULL, 1);
+ add_to_dirty_tree_and_list(oc, idx, ~bmap, NULL, 1);
pthread_mutex_unlock(&oc->lock);
}
}
@@ -254,13 +282,13 @@ static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t coun
flags |= O_DIRECT;
fd = open(p.buf, flags, def_fmode);
- if (flock(fd, LOCK_EX) < 0) {
+ if (xlockf(fd, F_LOCK, offset, count) < 0) {
ret = SD_RES_EIO;
eprintf("%m\n");
goto out;
}
size = xpwrite(fd, buf, count, offset);
- if (flock(fd, LOCK_UN) < 0) {
+ if (xlockf(fd, F_UNLCK, offset, count) < 0) {
ret = SD_RES_EIO;
eprintf("%m\n");
goto out;
@@ -287,14 +315,14 @@ static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count
flags |= O_DIRECT;
fd = open(p.buf, flags, def_fmode);
- if (flock(fd, LOCK_SH) < 0) {
+ if (xlockf(fd, F_LOCK, offset, count) < 0) {
ret = SD_RES_EIO;
eprintf("%m\n");
goto out;
}
size = xpread(fd, buf, count, offset);
- if (flock(fd, LOCK_UN) < 0) {
+ if (xlockf(fd, F_UNLCK, offset, count) < 0) {
ret = SD_RES_EIO;
eprintf("%m\n");
goto out;
@@ -311,6 +339,7 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
{
struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp;
+ uint64_t bmap = 0;
int ret;
dprintf("%08"PRIx32", len %"PRIu32", off %"PRIu64"\n", idx, hdr->data_length, hdr->offset);
@@ -319,7 +348,8 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
if (ret != SD_RES_SUCCESS)
goto out;
pthread_mutex_lock(&oc->lock);
- add_to_dirty_tree_and_list(oc, idx, NULL, 0);
+ bmap = object_bmap_calc(hdr->data_length, hdr->offset);
+ add_to_dirty_tree_and_list(oc, idx, bmap, NULL, 0);
pthread_mutex_unlock(&oc->lock);
} else {
ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
@@ -476,22 +506,44 @@ static uint64_t idx_to_oid(uint32_t vid, uint32_t idx)
return vid_to_data_oid(vid, idx);
}
-static int push_cache_object(uint32_t vid, uint32_t idx, int create)
+static int push_cache_object(uint32_t vid, uint32_t idx,
+ uint64_t bmap, int create)
{
struct request fake_req;
struct sd_obj_req *hdr = (struct sd_obj_req *)&fake_req.rq;
void *buf;
+ off_t offset;
unsigned data_length;
int ret = SD_RES_NO_MEM;
uint64_t oid = idx_to_oid(vid, idx);
+ int nbits, shift = 0;
+ struct vnode_info *vnodes = NULL;
dprintf("%"PRIx64", create %d\n", oid, create);
memset(&fake_req, 0, sizeof(fake_req));
if (is_vdi_obj(oid))
- data_length = SD_INODE_SIZE;
+ nbits = div_ceil(SD_INODE_SIZE, BLOCK_SIZE);
else
- data_length = SD_DATA_OBJ_SIZE;
+ nbits = div_ceil(SD_DATA_OBJ_SIZE, BLOCK_SIZE);
+
+ vnodes = get_vnode_info();
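+ /*
+  * Each pass below collects one contiguous run of dirty blocks from
+  * the bitmap and pushes it as a single write; clean blocks are
+  * skipped by restarting the scan at 'loop'.
+  */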
+loop:
+ if (shift == nbits) {
+ put_vnode_info(vnodes);
+ return SD_RES_SUCCESS;
+ }
+
+ data_length = 0;
+ offset = shift * BLOCK_SIZE;
+ while (shift < nbits) {
+ if (bmap & (UINT64_C(1) << shift++))
+ data_length += BLOCK_SIZE;
+ else if (data_length > 0)
+ break;
+ else
+ goto loop;
+ }
buf = valloc(data_length);
if (buf == NULL) {
@@ -499,7 +551,7 @@ static int push_cache_object(uint32_t vid, uint32_t idx, int create)
goto out;
}
- ret = read_cache_object(vid, idx, buf, data_length, 0);
+ ret = read_cache_object(vid, idx, buf, data_length, offset);
if (ret != SD_RES_SUCCESS)
goto out;
@@ -512,15 +564,18 @@ static int push_cache_object(uint32_t vid, uint32_t idx, int create)
hdr->epoch = sys->epoch;
fake_req.data = buf;
fake_req.op = get_sd_op(hdr->opcode);
- fake_req.vnodes = get_vnode_info();
+ fake_req.vnodes = vnodes;
ret = forward_write_obj_req(&fake_req);
if (ret != SD_RES_SUCCESS)
eprintf("failed to push object %x\n", ret);
-
- put_vnode_info(fake_req.vnodes);
+ else {
+ free(buf);
+ goto loop;
+ }
out:
+ put_vnode_info(vnodes);
free(buf);
return ret;
}
@@ -546,7 +601,8 @@ int object_cache_push(struct object_cache *oc)
* request is issued in one of gateway worker threads
* So we need not to protect inactive dirty tree and list */
list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
- ret = push_cache_object(oc->vid, entry->idx, entry->create);
+ ret = push_cache_object(oc->vid, entry->idx,
+ entry->bmap, entry->create);
if (ret != SD_RES_SUCCESS)
goto push_failed;
del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
@@ -614,6 +670,7 @@ int object_cache_flush_and_delete(struct object_cache *oc)
struct dirent *d;
uint32_t vid = oc->vid;
uint32_t idx;
+ uint64_t bmap = 0;
struct strbuf p;
int ret = 0;
@@ -635,7 +692,7 @@ int object_cache_flush_and_delete(struct object_cache *oc)
idx = strtoul(d->d_name, NULL, 16);
if (idx == ULLONG_MAX)
continue;
- if (push_cache_object(vid, idx, 1) != SD_RES_SUCCESS) {
+ if (push_cache_object(vid, idx, ~bmap, 1) != SD_RES_SUCCESS) {
dprintf("failed to push %"PRIx64"\n",
idx_to_oid(vid, idx));
ret = -1;
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index 10be245..aaad630 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -256,7 +256,7 @@ static int check_request(struct request *req)
return 0;
}
-static do_nothing(struct work *work) {}
+static void do_nothing(struct work *work) {}
static void queue_request(struct request *req)
{
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 2275a93..dc28572 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -414,6 +414,7 @@ static inline int sys_can_halt(void)
#define CACHE_VDI_SHIFT 31
#define CACHE_VDI_BIT (UINT32_C(1) << CACHE_VDI_SHIFT)
+#define BLOCK_SIZE (UINT32_C(512) << 10) /* 512 KB */
struct object_cache {
uint32_t vid;
@@ -430,6 +431,7 @@ struct object_cache {
struct object_cache_entry {
uint32_t idx;
+ uint64_t bmap;
struct rb_node rb;
struct list_head list;
int create;
--
1.7.7.6