[Sheepdog] [PATCH] object cache: make the flushing data more fine grained

Yunkai Zhang yunkai.me at gmail.com
Tue May 15 11:25:04 CEST 2012


From: Yunkai Zhang <qiushu.zyk at taobao.com>

Changes in V5:
- Correct data_length when pushing a vdi object
------------------------------------------ >8

Each time sheep flushes a cached object, it currently pushes the object's
whole data (4 MB) to the cluster, even if only a single byte of the
object is dirty.

This patch splits a cached object into multiple blocks of 128 KB each
(the block size can be switched among 64, 128, 256 and 512 KB via the
CACHE_BLOCK_SIZE macro).
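
For illustration, selecting 256 KB blocks would only require changing
the macro this patch adds to sheep/sheep_priv.h (the 256 KB value here
is just an example, not the default):

    #define CACHE_BLOCK_SIZE      (UINT32_C(256) << 10) /* 256 KB */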

Each time a client writes to a cached object, sheep calculates which
blocks are modified and marks them as dirty. This dirty information is
recorded in a per-object bitmap.
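
As a worked example with the default 128 KB blocks, a 4 KB write at
offset 300 KB falls entirely inside block 2, so calc_object_bmap() sets
a single bit (the numbers here are purely illustrative):

    i = 307200 / 131072;               /* = 2, first block touched */
    j = (307200 + 4096 - 1) / 131072;  /* = 2, last block touched  */
    bmap = UINT64_C(1) << 2;           /* 0x4: only block 2 dirty  */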

When sheep flushes a cached object, it no longer pushes the whole
object; instead it reads the object's bitmap and sends only the byte
range spanning the first and last dirty blocks.
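
Continuing the example above, flushing that object then covers a single
block instead of the full 4 MB: push_cache_object() scans the bitmap for
the first and last dirty bits and derives the range to send:

    first_dirty_bit = 2;                          /* lowest set bit in 0x4  */
    last_dirty_bit  = 2;                          /* highest set bit in 0x4 */
    offset = 2 * CACHE_BLOCK_SIZE;                /* 256 KB */
    data_length = (2 - 2 + 1) * CACHE_BLOCK_SIZE; /* 128 KB */

Note that clean blocks lying between two dirty blocks are still sent,
since the pushed range is contiguous.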

In addition, flock() is replaced with lockf() so that we lock only the
byte range of an object file that we actually access.
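
lockf() locks len bytes starting at the file's current position, which
is why the new xlockf() wrapper in include/util.h seeks first. A typical
lock/unlock pair around a cached write (abridged from
write_cache_object() below) looks like:

    if (xlockf(fd, F_LOCK, offset, count) < 0)  /* lock [offset, offset+count) */
            return SD_RES_EIO;
    size = xpwrite(fd, buf, count, offset);
    xlockf(fd, F_UNLCK, offset, count);         /* release the same range */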

Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
---
 include/util.h       |   10 +++++
 sheep/object_cache.c |   97 +++++++++++++++++++++++++++++++++++++++++---------
 sheep/sheep_priv.h   |    3 ++
 3 files changed, 93 insertions(+), 17 deletions(-)

diff --git a/include/util.h b/include/util.h
index ca43ab9..1c32954 100644
--- a/include/util.h
+++ b/include/util.h
@@ -4,6 +4,7 @@
 #include <string.h>
 #include <limits.h>
 #include <stdint.h>
+#include <unistd.h>
 
 #include "bitops.h"
 #include "list.h"
@@ -58,6 +59,15 @@ static inline void *zalloc(size_t size)
 	return calloc(1, size);
 }
 
+static inline int xlockf(int fd, int cmd, off_t offset, off_t len)
+{
+	if (lseek(fd, offset, SEEK_SET) < 0)
+		return -1;
+
+	return lockf(fd, cmd, len);
+}
+
+
 typedef void (*try_to_free_t)(size_t);
 extern try_to_free_t set_try_to_free_routine(try_to_free_t);
 
diff --git a/sheep/object_cache.c b/sheep/object_cache.c
index 0b3c911..24d2f0f 100644
--- a/sheep/object_cache.c
+++ b/sheep/object_cache.c
@@ -41,6 +41,24 @@ static inline int hash(uint64_t vid)
 	return hash_64(vid, HASH_BITS);
 }
 
+static uint64_t calc_object_bmap(size_t len, off_t offset)
+{
+	int i, j;
+	uint64_t bmap, shift = 1;
+
+	if (!len)
+		return 0;
+
+	i = offset / CACHE_BLOCK_SIZE;
+	j = (offset + len - 1) / CACHE_BLOCK_SIZE;
+
+	bmap = (shift <<= i);
+	while (j - i++)
+		bmap |= (shift <<= 1);
+
+	return bmap;
+}
+
 static struct object_cache_entry *dirty_tree_insert(struct rb_root *root,
 		struct object_cache_entry *new)
 {
@@ -56,8 +74,11 @@ static struct object_cache_entry *dirty_tree_insert(struct rb_root *root,
 			p = &(*p)->rb_left;
 		else if (new->idx > entry->idx)
 			p = &(*p)->rb_right;
-		else
-			return entry; /* already has this entry */
+		else {
+			/* already has this entry, merge bmap */
+			entry->bmap |= new->bmap;
+			return entry;
+		}
 	}
 	rb_link_node(&new->rb, parent, p);
 	rb_insert_color(&new->rb, root);
@@ -143,11 +164,12 @@ out:
 }
 
 static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx,
-		struct object_cache_entry *entry, int create)
+		uint64_t bmap, struct object_cache_entry *entry, int create)
 {
 	if (!entry) {
 		entry = xzalloc(sizeof(*entry));
 		entry->idx = idx;
+		entry->bmap = bmap;
 		entry->create = create;
 	}
 	if (!dirty_tree_insert(oc->active_dirty_tree, entry))
@@ -194,7 +216,7 @@ static void merge_dirty_tree_and_list(struct object_cache *oc,
 
 	list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
 		del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
-		add_to_dirty_tree_and_list(oc, entry->idx, entry, 0);
+		add_to_dirty_tree_and_list(oc, entry->idx, 0, entry, 0);
 	}
 
 	pthread_mutex_unlock(&oc->lock);
@@ -230,7 +252,7 @@ int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
 			ret = -1;
 		else {
 			pthread_mutex_lock(&oc->lock);
-			add_to_dirty_tree_and_list(oc, idx, NULL, 1);
+			add_to_dirty_tree_and_list(oc, idx, 0, NULL, 1);
 			pthread_mutex_unlock(&oc->lock);
 		}
 	}
@@ -254,13 +276,13 @@ static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t coun
 		flags |= O_DIRECT;
 
 	fd = open(p.buf, flags, def_fmode);
-	if (flock(fd, LOCK_EX) < 0) {
+	if (xlockf(fd, F_LOCK, offset, count) < 0) {
 		ret = SD_RES_EIO;
 		eprintf("%m\n");
 		goto out;
 	}
 	size = xpwrite(fd, buf, count, offset);
-	if (flock(fd, LOCK_UN) < 0) {
+	if (xlockf(fd, F_UNLCK, offset, count) < 0) {
 		ret = SD_RES_EIO;
 		eprintf("%m\n");
 		goto out;
@@ -287,14 +309,14 @@ static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count
 		flags |= O_DIRECT;
 
 	fd = open(p.buf, flags, def_fmode);
-	if (flock(fd, LOCK_SH) < 0) {
+	if (xlockf(fd, F_LOCK, offset, count) < 0) {
 		ret = SD_RES_EIO;
 		eprintf("%m\n");
 		goto out;
 	}
 
 	size = xpread(fd, buf, count, offset);
-	if (flock(fd, LOCK_UN) < 0) {
+	if (xlockf(fd, F_UNLCK, offset, count) < 0) {
 		ret = SD_RES_EIO;
 		eprintf("%m\n");
 		goto out;
@@ -311,6 +333,7 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
 {
 	struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
 	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp;
+	uint64_t bmap = 0;
 	int ret;
 
 	dprintf("%08"PRIx32", len %"PRIu32", off %"PRIu64"\n", idx, hdr->data_length, hdr->offset);
@@ -319,7 +342,8 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
 		if (ret != SD_RES_SUCCESS)
 			goto out;
 		pthread_mutex_lock(&oc->lock);
-		add_to_dirty_tree_and_list(oc, idx, NULL, 0);
+		bmap = calc_object_bmap(hdr->data_length, hdr->offset);
+		add_to_dirty_tree_and_list(oc, idx, bmap, NULL, 0);
 		pthread_mutex_unlock(&oc->lock);
 	} else {
 		ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
@@ -476,22 +500,59 @@ static uint64_t idx_to_oid(uint32_t vid, uint32_t idx)
 		return vid_to_data_oid(vid, idx);
 }
 
-static int push_cache_object(uint32_t vid, uint32_t idx, int create)
+static int push_cache_object(uint32_t vid, uint32_t idx,
+		uint64_t bmap, int create)
 {
 	struct request fake_req;
 	struct sd_obj_req *hdr = (struct sd_obj_req *)&fake_req.rq;
 	void *buf;
+	off_t offset;
 	unsigned data_length;
 	int ret = SD_RES_NO_MEM;
-	uint64_t oid = idx_to_oid(vid, idx);
+	uint64_t shift, oid = idx_to_oid(vid, idx);
+	int i, nbits, first_dirty_bit, last_dirty_bit;
 
 	dprintf("%"PRIx64", create %d\n", oid, create);
 
+	if (!bmap)
+		return SD_RES_SUCCESS;
+
 	memset(&fake_req, 0, sizeof(fake_req));
 	if (is_vdi_obj(oid))
-		data_length = SD_INODE_SIZE;
+		nbits = DIV_ROUND_UP(SD_INODE_SIZE, CACHE_BLOCK_SIZE);
 	else
-		data_length = SD_DATA_OBJ_SIZE;
+		nbits = DIV_ROUND_UP(SD_DATA_OBJ_SIZE, CACHE_BLOCK_SIZE);
+
+	shift = 1;
+	first_dirty_bit = 0;
+
+	for (i = 0; i < nbits; i++) {
+		if (bmap & (shift << i)) {
+			first_dirty_bit = i;
+			break;
+		}
+	}
+
+	shift = (UINT64_C(1) << (nbits - 1));
+	last_dirty_bit = 0;
+	for (i = 0; i < nbits; i++) {
+		if (bmap & (shift >> i)) {
+			last_dirty_bit = nbits - i - 1;
+			break;
+		}
+	}
+
+	dprintf("bmap:0x%"PRIx64", first_dirty_bit:%d, last_dirty_bit:%d\n",
+			bmap, first_dirty_bit, last_dirty_bit);
+	offset = first_dirty_bit * CACHE_BLOCK_SIZE;
+	data_length = (last_dirty_bit - first_dirty_bit + 1) * CACHE_BLOCK_SIZE;
+
+	/*
+	 * SD_INODE_SIZE may not be a multiple of CACHE_BLOCK_SIZE,
+	 * so (offset + data_length) could be larger than SD_INODE_SIZE.
+	 */
+	if (is_vdi_obj(oid) && (offset + data_length) > SD_INODE_SIZE)
+		data_length = SD_INODE_SIZE - offset;
 
 	buf = valloc(data_length);
 	if (buf == NULL) {
@@ -499,7 +560,7 @@ static int push_cache_object(uint32_t vid, uint32_t idx, int create)
 		goto out;
 	}
 
-	ret = read_cache_object(vid, idx, buf, data_length, 0);
+	ret = read_cache_object(vid, idx, buf, data_length, offset);
 	if (ret != SD_RES_SUCCESS)
 		goto out;
 
@@ -546,7 +607,8 @@ int object_cache_push(struct object_cache *oc)
 	 * request is issued in one of gateway worker threads
 	 * So we need not to protect inactive dirty tree and list */
 	list_for_each_entry_safe(entry, t, inactive_dirty_list, list) {
-		ret = push_cache_object(oc->vid, entry->idx, entry->create);
+		ret = push_cache_object(oc->vid, entry->idx,
+				entry->bmap, entry->create);
 		if (ret != SD_RES_SUCCESS)
 			goto push_failed;
 		del_from_dirty_tree_and_list(entry, inactive_dirty_tree);
@@ -614,6 +676,7 @@ int object_cache_flush_and_delete(struct object_cache *oc)
 	struct dirent *d;
 	uint32_t vid = oc->vid;
 	uint32_t idx;
+	uint64_t all = UINT64_MAX;
 	struct strbuf p;
 	int ret = 0;
 
@@ -635,7 +698,7 @@ int object_cache_flush_and_delete(struct object_cache *oc)
 		idx = strtoul(d->d_name, NULL, 16);
 		if (idx == ULLONG_MAX)
 			continue;
-		if (push_cache_object(vid, idx, 1) != SD_RES_SUCCESS) {
+		if (push_cache_object(vid, idx, all, 1) != SD_RES_SUCCESS) {
 			dprintf("failed to push %"PRIx64"\n",
 				idx_to_oid(vid, idx));
 			ret = -1;
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 2275a93..fef22f6 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -414,6 +414,7 @@ static inline int sys_can_halt(void)
 
 #define CACHE_VDI_SHIFT       31
 #define CACHE_VDI_BIT         (UINT32_C(1) << CACHE_VDI_SHIFT)
+#define CACHE_BLOCK_SIZE      (UINT32_C(128) << 10) /* 128 KB */
 
 struct object_cache {
 	uint32_t vid;
@@ -430,6 +431,8 @@ struct object_cache {
 
 struct object_cache_entry {
 	uint32_t idx;
+	uint64_t bmap; /* each bit represents one dirty
+			* block which should be flushed */
 	struct rb_node rb;
 	struct list_head list;
 	int create;
-- 
1.7.7.6