[Sheepdog] [PATCH 2/2] object cache: cache create operation
Liu Yuan
namei.unix at gmail.com
Fri Mar 30 05:18:31 CEST 2012
From: Liu Yuan <tailai.ly at taobao.com>
Currently, for the create operation, we write through the cache. This operation can be
slow and may return an error, especially while node membership is changing.
It is cleaner to let the gateway talk only to the object cache, and rely on the object
cache layer to pull and push objects from/to the sheep cluster.
With all gateway requests served by the object cache to boost IO performance,
we can do interesting tricks with Farm, such as data de-duplication, which trades CPU
cycles for disk space without hurting IO performance.
Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
sheep/farm/farm.c | 7 +++--
sheep/object_cache.c | 65 ++++++++++++++++++++++++++++++++-----------------
sheep/sheep_priv.h | 5 +++-
sheep/store.c | 25 ++++++++++--------
4 files changed, 64 insertions(+), 38 deletions(-)
diff --git a/sheep/farm/farm.c b/sheep/farm/farm.c
index 1e073f0..f1d87ad 100644
--- a/sheep/farm/farm.c
+++ b/sheep/farm/farm.c
@@ -127,13 +127,14 @@ static int err_to_sderr(uint64_t oid, int err)
/*
* Preallocate the whole object to get a better filesystem layout.
*/
-static int prealloc(int fd, uint32_t size)
+int prealloc(int fd, uint32_t size)
{
int ret = fallocate(fd, 0, 0, size);
if (ret < 0) {
- if (errno != ENOSYS && errno != EOPNOTSUPP)
+ if (errno != ENOSYS && errno != EOPNOTSUPP) {
+ dprintf("%m\n");
ret = SD_RES_SYSTEM_ERROR;
- else
+ } else
ret = write_last_sector(fd, size);
} else
ret = SD_RES_SUCCESS;
diff --git a/sheep/object_cache.c b/sheep/object_cache.c
index b1b30d7..ef2c05c 100644
--- a/sheep/object_cache.c
+++ b/sheep/object_cache.c
@@ -137,20 +137,52 @@ out:
return cache;
}
-int object_cache_lookup(struct object_cache *oc, uint32_t idx)
+static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx, int create)
+{
+ struct object_cache_entry *entry = xzalloc(sizeof(*entry));
+
+ entry->idx = idx;
+ pthread_mutex_lock(&oc->lock);
+ if (!dirty_tree_insert(&oc->dirty_rb, entry)) {
+ if (create)
+ entry->create = 1;
+ list_add(&entry->list, &oc->dirty_list);
+ } else
+ free(entry);
+ pthread_mutex_unlock(&oc->lock);
+}
+
+int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
{
struct strbuf buf;
- int fd, ret = 0;
+ int fd, ret = 0, flags = def_open_flags;
strbuf_init(&buf, PATH_MAX);
strbuf_addstr(&buf, cache_dir);
strbuf_addf(&buf, "/%06"PRIx32"/%08"PRIx32, oc->vid, idx);
- fd = open(buf.buf, def_open_flags, def_fmode);
+ if (create)
+ flags |= O_CREAT | O_TRUNC;
+
+ fd = open(buf.buf, flags, def_fmode);
if (fd < 0) {
ret = -1;
goto out;
}
+
+ if (create) {
+ unsigned data_length;
+ uint64_t oid = oc->oid;
+ if (is_vdi_obj(oid))
+ data_length = SD_INODE_SIZE;
+ else
+ data_length = SD_DATA_OBJ_SIZE;
+ ret = prealloc(fd, data_length);
+ if (ret != SD_RES_SUCCESS)
+ ret = -1;
+ else
+ add_to_dirty_tree_and_list(oc, idx, 1);
+ }
close(fd);
out:
strbuf_release(&buf);
@@ -195,19 +227,6 @@ static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count
return ret;
}
-static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx)
-{
- struct object_cache_entry *entry = xzalloc(sizeof(*entry));
-
- entry->idx = idx;
- pthread_mutex_lock(&oc->lock);
- if (!dirty_tree_insert(&oc->dirty_rb, entry))
- list_add(&entry->list, &oc->dirty_list);
- else
- free(entry);
- pthread_mutex_unlock(&oc->lock);
-}
-
int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
{
struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
@@ -219,7 +238,7 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
ret = write_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
if (ret != SD_RES_SUCCESS)
goto out;
- add_to_dirty_tree_and_list(oc, idx);
+ add_to_dirty_tree_and_list(oc, idx, 0);
} else {
ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
if (ret != SD_RES_SUCCESS)
@@ -353,7 +372,7 @@ static uint64_t idx_to_oid(uint32_t vid, uint32_t idx)
return vid_to_data_oid(vid, idx);
}
-static int push_cache_object(uint32_t vid, uint32_t idx)
+static int push_cache_object(uint32_t vid, uint32_t idx, int create)
{
struct request fake_req;
struct sd_obj_req *hdr = (struct sd_obj_req *)&fake_req.rq;
@@ -362,7 +381,7 @@ static int push_cache_object(uint32_t vid, uint32_t idx)
int ret = SD_RES_NO_MEM;
uint64_t oid = idx_to_oid(vid, idx);
- dprintf("%"PRIx64"\n", oid);
+ dprintf("%"PRIx64", create %d\n", oid, create);
memset(&fake_req, 0, sizeof(fake_req));
if (is_vdi_obj(oid))
@@ -382,13 +401,13 @@ static int push_cache_object(uint32_t vid, uint32_t idx)
hdr->offset = 0;
hdr->data_length = data_length;
- hdr->opcode = SD_OP_WRITE_OBJ;
+ hdr->opcode = create ? SD_OP_CREATE_AND_WRITE_OBJ : SD_OP_WRITE_OBJ;
hdr->flags = SD_FLAG_CMD_WRITE;
hdr->oid = oid;
hdr->copies = sys->nr_sobjs;
hdr->epoch = sys->epoch;
fake_req.data = buf;
- fake_req.op = get_sd_op(SD_OP_WRITE_OBJ);
+ fake_req.op = get_sd_op(hdr->opcode);
fake_req.entry = sys->vnodes;
fake_req.nr_vnodes = sys->nr_vnodes;
fake_req.nr_zones = get_zones_nr_from(sys->nodes, sys->nr_vnodes);
@@ -414,7 +433,7 @@ int object_cache_push(struct object_cache *oc)
return SD_RES_SUCCESS;
list_for_each_entry_safe(entry, t, &oc->dirty_list, list) {
- ret = push_cache_object(oc->vid, entry->idx);
+ ret = push_cache_object(oc->vid, entry->idx, entry->create);
if (ret != SD_RES_SUCCESS)
goto out;
pthread_mutex_lock(&oc->lock);
@@ -441,7 +460,7 @@ int object_is_cached(uint64_t oid)
return 0;
cache->oid = oid;
- if (object_cache_lookup(cache, idx) < 0)
+ if (object_cache_lookup(cache, idx, 0) < 0)
return 0;
else
return 1; /* found it */
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index f2ce87e..1b82389 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -305,6 +305,8 @@ int get_sheep_fd(uint8_t *addr, uint16_t port, int node_idx, uint32_t epoch);
int rmdir_r(char *dir_path);
+int prealloc(int fd, uint32_t size);
+
/* Operations */
struct sd_op_template *get_sd_op(uint8_t opcode);
@@ -414,10 +416,11 @@ struct object_cache_entry {
uint32_t idx;
struct rb_node rb;
struct list_head list;
+ int create;
};
struct object_cache *find_object_cache(uint32_t vid, int create);
-int object_cache_lookup(struct object_cache *oc, uint32_t index);
+int object_cache_lookup(struct object_cache *oc, uint32_t index, int create);
int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *);
int object_cache_pull(struct object_cache *oc, uint32_t index);
int object_cache_push(struct object_cache *oc);
diff --git a/sheep/store.c b/sheep/store.c
index 4d84350..548621a 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -765,14 +765,18 @@ static int handle_gateway_request(struct request *req)
uint32_t vid = oid_to_vid(oid);
uint32_t idx = data_oid_to_idx(oid);
struct object_cache *cache;
- int ret;
+ int ret, create = 0;
if (is_vdi_obj(oid))
idx |= 1 << CACHE_VDI_SHIFT;
cache = find_object_cache(vid, 1);
cache->oid = oid;
- if (object_cache_lookup(cache, idx) < 0) {
+
+ if (hdr->opcode == SD_OP_CREATE_AND_WRITE_OBJ)
+ create = 1;
+
+ if (object_cache_lookup(cache, idx, create) < 0) {
ret = object_cache_pull(cache, idx);
if (ret != SD_RES_SUCCESS)
return ret;
@@ -795,11 +799,10 @@ static int bypass_object_cache(struct sd_obj_req *hdr)
if (!(hdr->flags & SD_FLAG_CMD_CACHE))
return 1;
- /* For create, we skip the cache because of consistency check.
+ /*
* For vmstate && vdi_attr object, we don't do caching
*/
- if (hdr->opcode == SD_OP_CREATE_AND_WRITE_OBJ || is_vmstate_obj(oid)
- || is_vdi_attr_obj(oid))
+ if (is_vmstate_obj(oid) || is_vdi_attr_obj(oid))
return 1;
return 0;
}
@@ -822,13 +825,13 @@ void do_io_request(struct work *work)
if (hdr->flags & SD_FLAG_CMD_IO_LOCAL) {
ret = do_local_io(req, epoch);
} else {
- /* fix object consistency when we read the object for the first time */
- if (req->check_consistency) {
- ret = fix_object_consistency(req);
- if (ret != SD_RES_SUCCESS)
- goto out;
- }
if (bypass_object_cache(hdr)) {
+ /* fix object consistency when we read the object for the first time */
+ if (req->check_consistency) {
+ ret = fix_object_consistency(req);
+ if (ret != SD_RES_SUCCESS)
+ goto out;
+ }
if (hdr->flags & SD_FLAG_CMD_WRITE)
ret = forward_write_obj_req(req);
else
--
1.7.8.2
More information about the sheepdog
mailing list