[sheepdog] [PATCH 5/5] object cache: implement writethrough mode
Liu Yuan
namei.unix at gmail.com
Sat Aug 4 10:11:32 CEST 2012
From: Liu Yuan <tailai.ly at taobao.com>
Object cache writethrough mode provides us with a read-only cache which is always
consistent with the backend store.
We can set the object cache mode with the 'w' option as follows:
sheep -w cache_size{,writethrough | writeback}
For example, we can set up a 1G writethrough cache:
$ sheep -w 1000,writethrough
$ sheep -w 1000
writethrough is the default object cache mode.
To set up a writeback cache instead:
$ sheep -w 1000,writeback
Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
sheep/object_cache.c | 125 +++++++++++++++++++++++++++++++++++++-------------
sheep/sheep.c | 22 ++++++---
sheep/sheep_priv.h | 1 +
3 files changed, 109 insertions(+), 39 deletions(-)
diff --git a/sheep/object_cache.c b/sheep/object_cache.c
index bb14fb8..6b102d4 100644
--- a/sheep/object_cache.c
+++ b/sheep/object_cache.c
@@ -53,6 +53,18 @@ struct global_cache {
struct cds_list_head cache_lru_list;
};
+struct object_cache_entry {
+ uint32_t idx;
+ int refcnt;
+ uint64_t bmap; /* each bit represents one dirty block in object */
+ struct object_cache *oc;
+ struct rb_node node;
+ struct rb_node dirty_node;
+ struct list_head dirty_list;
+ struct list_head object_list;
+ struct cds_list_head lru_list;
+};
+
struct object_cache {
uint32_t vid;
struct hlist_node hash;
@@ -63,18 +75,9 @@ struct object_cache {
struct rb_root object_tree;
pthread_rwlock_t lock;
-};
-struct object_cache_entry {
- uint32_t idx;
- int refcnt;
- uint64_t bmap; /* each bit represents one dirty block in object */
- struct object_cache *oc;
- struct rb_node node;
- struct rb_node dirty_node;
- struct list_head dirty_list;
- struct list_head object_list;
- struct cds_list_head lru_list;
+ int (*read)(struct object_cache_entry *, void *, size_t, off_t);
+ int (*write)(struct object_cache_entry *, void *, size_t, off_t, int);
};
static struct global_cache sys_cache;
@@ -280,11 +283,11 @@ static inline void lru_move_entry(struct object_cache_entry *entry)
static inline void update_cache_entry(struct object_cache_entry *entry,
uint32_t idx, size_t datalen,
- off_t offset, int wrt)
+ off_t offset, int dirty)
{
struct object_cache *oc = entry->oc;
- if (wrt) {
+ if (dirty) {
uint64_t bmap = calc_object_bmap(datalen, offset);
pthread_rwlock_wrlock(&oc->lock);
@@ -295,13 +298,12 @@ static inline void update_cache_entry(struct object_cache_entry *entry,
lru_move_entry(entry);
}
-static int write_cache_object(struct object_cache_entry *entry, void *buf,
- size_t count, off_t offset)
+static int read_cache_object_noupdate(uint32_t vid, uint32_t idx, void *buf,
+ size_t count, off_t offset)
{
size_t size;
int fd, flags = def_open_flags, ret = SD_RES_SUCCESS;
struct strbuf p;
- uint32_t vid = entry->oc->vid, idx = entry_idx(entry);
strbuf_init(&p, PATH_MAX);
strbuf_addstr(&p, cache_dir);
@@ -317,12 +319,12 @@ static int write_cache_object(struct object_cache_entry *entry, void *buf,
goto out;
}
- if (flock(fd, LOCK_EX) < 0) {
+ if (flock(fd, LOCK_SH) < 0) {
ret = SD_RES_EIO;
eprintf("%m\n");
goto out_close;
}
- size = xpwrite(fd, buf, count, offset);
+ size = xpread(fd, buf, count, offset);
if (flock(fd, LOCK_UN) < 0) {
ret = SD_RES_EIO;
eprintf("%m\n");
@@ -336,7 +338,6 @@ static int write_cache_object(struct object_cache_entry *entry, void *buf,
goto out_close;
}
- update_cache_entry(entry, idx, count, offset, 1);
out_close:
close(fd);
out:
@@ -344,8 +345,8 @@ out:
return ret;
}
-static int read_cache_object_noupdate(uint32_t vid, uint32_t idx, void *buf,
- size_t count, off_t offset)
+static int write_cache_object_noupdate(uint32_t vid, uint32_t idx, void *buf,
+ size_t count, off_t offset)
{
size_t size;
int fd, flags = def_open_flags, ret = SD_RES_SUCCESS;
@@ -365,12 +366,12 @@ static int read_cache_object_noupdate(uint32_t vid, uint32_t idx, void *buf,
goto out;
}
- if (flock(fd, LOCK_SH) < 0) {
+ if (flock(fd, LOCK_EX) < 0) {
ret = SD_RES_EIO;
eprintf("%m\n");
goto out_close;
}
- size = xpread(fd, buf, count, offset);
+ size = xpwrite(fd, buf, count, offset);
if (flock(fd, LOCK_UN) < 0) {
ret = SD_RES_EIO;
eprintf("%m\n");
@@ -390,6 +391,20 @@ out:
strbuf_release(&p);
return ret;
}
+
+static int write_cache_object(struct object_cache_entry *entry, void *buf,
+ size_t count, off_t offset, int create)
+{
+ uint32_t vid = entry->oc->vid, idx = entry_idx(entry);
+ int ret;
+
+ ret = write_cache_object_noupdate(vid, idx, buf, count, offset);
+
+ if (ret == SD_RES_SUCCESS)
+ update_cache_entry(entry, idx, count, offset, 1);
+ return ret;
+}
+
static int read_cache_object(struct object_cache_entry *entry, void *buf,
size_t count, off_t offset)
{
@@ -403,6 +418,40 @@ static int read_cache_object(struct object_cache_entry *entry, void *buf,
return ret;
}
+static int write_and_push_cache_object(struct object_cache_entry *entry,
+ void *buf, size_t count, off_t offset,
+ int create)
+{
+ uint32_t vid = entry->oc->vid, idx = entry_idx(entry);
+ uint64_t oid = idx_to_oid(vid, idx);
+ struct sd_req hdr;
+ int ret;
+
+ ret = write_cache_object_noupdate(vid, idx, buf, count, offset);
+
+ if (ret != SD_RES_SUCCESS)
+ return ret;
+
+ if (create)
+ sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ);
+ else
+ sd_init_req(&hdr, SD_OP_WRITE_OBJ);
+ hdr.flags = SD_FLAG_CMD_WRITE;
+ hdr.data_length = count;
+
+ hdr.obj.oid = oid;
+ hdr.obj.offset = offset;
+
+ ret = exec_local_req(&hdr, buf);
+ if (ret != SD_RES_SUCCESS) {
+ eprintf("failed to write object %" PRIx64 ", %x\n", oid, ret);
+ return ret;
+ }
+
+ update_cache_entry(entry, idx, count, offset, 0);
+ return ret;
+}
+
static int push_cache_object(uint32_t vid, uint32_t idx, uint64_t bmap,
int create)
{
@@ -589,8 +638,15 @@ not_found:
pthread_rwlock_init(&cache->lock, NULL);
hlist_add_head(&cache->hash, head);
- } else
+
+ cache->read = read_cache_object;
+ if (sys->writethrough)
+ cache->write = write_and_push_cache_object;
+ else
+ cache->write = write_cache_object;
+ } else {
cache = NULL;
+ }
out:
pthread_mutex_unlock(&hashtable_lock[h]);
return cache;
@@ -690,10 +746,14 @@ static int object_cache_lookup(struct object_cache *oc, uint32_t idx,
data_length = SD_DATA_OBJ_SIZE;
ret = prealloc(fd, data_length);
- if (ret != SD_RES_SUCCESS)
+ if (ret != SD_RES_SUCCESS) {
ret = SD_RES_EIO;
- else
- add_to_object_cache(oc, idx, 1);
+ } else {
+ if (sys->writethrough)
+ add_to_object_cache(oc, idx, 0);
+ else
+ add_to_object_cache(oc, idx, 1);
+ }
close(fd);
out:
strbuf_release(&buf);
@@ -990,17 +1050,16 @@ retry:
}
if (hdr->flags & SD_FLAG_CMD_WRITE) {
- ret = write_cache_object(entry, req->data, hdr->data_length,
- hdr->obj.offset);
+ ret = cache->write(entry, req->data, hdr->data_length,
+ hdr->obj.offset, create);
if (ret != SD_RES_SUCCESS)
goto err;
} else {
- ret = read_cache_object(entry, req->data, hdr->data_length,
- hdr->obj.offset);
+ ret = cache->read(entry, req->data, hdr->data_length,
+ hdr->obj.offset);
if (ret != SD_RES_SUCCESS)
goto err;
req->rp.data_length = hdr->data_length;
-
}
err:
put_cache_entry(entry);
@@ -1026,7 +1085,7 @@ int object_cache_write(uint64_t oid, char *data, unsigned int datalen,
return SD_RES_NO_CACHE;
}
- ret = write_cache_object(entry, data, datalen, offset);
+ ret = write_cache_object(entry, data, datalen, offset, create);
put_cache_entry(entry);
diff --git a/sheep/sheep.c b/sheep/sheep.c
index 52a294b..7f34a87 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -192,8 +192,9 @@ int main(int argc, char **argv)
int af;
char *p;
struct cluster_driver *cdrv;
- int enable_write_cache = 0; /* disabled by default */
+ int enable_object_cache = 0; /* disabled by default */
char *pid_file = NULL;
+ char *object_cache_size, *object_cache_mode;
signal(SIGPIPE, SIG_IGN);
@@ -261,8 +262,10 @@ int main(int argc, char **argv)
sys->this_node.zone = zone;
break;
case 'w':
- enable_write_cache = 1;
- cache_size = strtol(optarg, &p, 10);
+ enable_object_cache = 1;
+ object_cache_size = strtok(optarg, ",");
+ object_cache_mode = strtok(NULL, ",");
+ cache_size = strtol(object_cache_size, &p, 10);
if (optarg == p || cache_size < 0 ||
UINT64_MAX < cache_size) {
fprintf(stderr, "Invalid cache size '%s': "
@@ -270,9 +273,16 @@ int main(int argc, char **argv)
optarg, UINT64_MAX);
exit(1);
}
- vprintf(SDOG_INFO, "enable write cache, max cache size %" PRIu64 "M\n",
- cache_size);
sys->cache_size = cache_size * 1024 * 1024;
+
+ if (!object_cache_mode ||
+ strcmp(object_cache_mode, "writeback") != 0) {
+ sys->writethrough = 1;
+ }
+ vprintf(SDOG_INFO, "enable write cache, "
+ "max cache size %" PRIu64 "M, %s mode\n",
+ cache_size, sys->writethrough ?
+ "writethrough" : "writeback");
break;
case 'v':
nr_vnodes = strtol(optarg, &p, 10);
@@ -326,7 +336,7 @@ int main(int argc, char **argv)
if (ret)
exit(1);
- ret = init_store(dir, enable_write_cache);
+ ret = init_store(dir, enable_object_cache);
if (ret)
exit(1);
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 0c0e588..0c30851 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -119,6 +119,7 @@ struct cluster_info {
int use_directio;
uint8_t gateway_only;
uint8_t disable_recovery;
+ uint8_t writethrough;
struct work_queue *gateway_wqueue;
struct work_queue *io_wqueue;
--
1.7.10.2
More information about the sheepdog
mailing list