[Sheepdog] [PATCH v5 3/8] sheep: object cache proper
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Sun Mar 25 22:35:12 CEST 2012
At Sat, 24 Mar 2012 16:47:13 +0800,
Liu Yuan wrote:
>
> From: Liu Yuan <tailai.ly at taobao.com>
>
> Object cache caches data and vdi objects on the local node. It sits at a
> higher level than the backend store. This extra cache layer translates gateway
> requests into local requests, largely reducing network traffic and greatly
> improving IO performance.
>
> Dirty objects will be flushed to cluster storage by a 'sync' request from
> the guest OS.
>
> - use red-black tree to track dirty objects
> - use file lock to avoid RW race on object granularity
> - use hash lists to maintain vdi space.
> - each vid has its own independent object cache
>
> Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
> ---
> sheep/object_cache.c | 421 ++++++++++++++++++++++++++++++++++++++++++++++++++
> sheep/sheep_priv.h | 34 ++++
> 2 files changed, 455 insertions(+), 0 deletions(-)
> create mode 100644 sheep/object_cache.c
>
> diff --git a/sheep/object_cache.c b/sheep/object_cache.c
> new file mode 100644
> index 0000000..929e28d
> --- /dev/null
> +++ b/sheep/object_cache.c
> @@ -0,0 +1,421 @@
> +/*
> + * Copyright (C) 2012 Taobao Inc.
> + *
> + * Liu Yuan <namei.unix at gmail.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License version
> + * 2 as published by the Free Software Foundation.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <stdlib.h>
> +#include <stdio.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <unistd.h>
> +#include <pthread.h>
> +#include <errno.h>
> +#include <sys/file.h>
> +
> +#include "sheep_priv.h"
> +#include "util.h"
> +#include "strbuf.h"
> +#include "rbtree.h"
> +
> +#define HASH_BITS 5
> +#define HASH_SIZE (1 << HASH_BITS)
> +
> +static char cache_dir[PATH_MAX];
> +static int def_open_flags = O_DSYNC | O_RDWR;
I think we don't need a sync flag (O_DSYNC) for a cache.
> +extern mode_t def_fmode;
> +extern mode_t def_dmode;
> +extern struct store_driver *sd_store;
> +
> +static pthread_mutex_t hashtable_lock[HASH_SIZE] = { [0 ... HASH_SIZE - 1] = PTHREAD_MUTEX_INITIALIZER };
> +static struct hlist_head cache_hashtable[HASH_SIZE];
> +
> +static inline int hash(uint64_t vid)
> +{
> + return hash_64(vid, HASH_BITS);
> +}
> +
> +static struct object_cache_entry *dirty_rb_insert(struct rb_root *root,
> + struct object_cache_entry *new)
> +{
> + struct rb_node **p = &root->rb_node;
> + struct rb_node *parent = NULL;
> + struct object_cache_entry *entry;
> +
> + while (*p) {
> + parent = *p;
> + entry = rb_entry(parent, struct object_cache_entry, rb);
> +
> + if (new->idx < entry->idx)
> + p = &(*p)->rb_left;
> + else if (new->idx > entry->idx)
> + p = &(*p)->rb_right;
> + else
> + return entry; /* already has this entry */
> + }
> + rb_link_node(&new->rb, parent, p);
> + rb_insert_color(&new->rb, root);
> +
> + return NULL; /* insert successfully */
> +}
> +
> +__attribute__ ((unused))
> +static struct object_cache_entry *dirty_rb_search(struct rb_root *root,
> + struct object_cache_entry *entry)
> +{
> + struct rb_node *n = root->rb_node;
> + struct object_cache_entry *t;
> +
> + while (n) {
> + t = rb_entry(n, struct object_cache_entry, rb);
> +
> + if (entry->idx < t->idx)
> + n = n->rb_left;
> + else if (entry->idx > t->idx)
> + n = n->rb_right;
> + else
> + return t; /* found it */
> + }
> +
> + return NULL;
> +}
> +
> +static int create_dir_for(uint32_t vid)
> +{
> + int ret = 0;
> + struct strbuf buf = STRBUF_INIT;
> +
> + strbuf_addstr(&buf, cache_dir);
> + strbuf_addf(&buf, "/%06"PRIx32, vid);
> + if (mkdir(buf.buf, def_dmode) < 0)
> + if (errno != EEXIST) {
> + eprintf("%m\n");
> + ret = -1;
> + goto err;
> + }
> +err:
> + strbuf_release(&buf);
> + return ret;
> +}
> +
> +static struct object_cache *lookup_object_cache(uint32_t vid, int create)
> +{
> + int h = hash(vid);
> + struct hlist_head *head = cache_hashtable + h;
> + struct object_cache *cache = NULL;
> + struct hlist_node *node;
> +
> + pthread_mutex_lock(&hashtable_lock[h]);
> + if (hlist_empty(head))
> + goto not_found;
> +
> + hlist_for_each_entry(cache, node, head, hash) {
> + if (cache->vid == vid)
> + goto out;
> + }
> +not_found:
> + if (create) {
> + cache = xzalloc(sizeof(*cache));
> + cache->vid = vid;
> + create_dir_for(vid);
> + cache->dirty_rb = RB_ROOT;
> + pthread_mutex_init(&cache->lock, NULL);
> + INIT_LIST_HEAD(&cache->dirty_list);
> + hlist_add_head(&cache->hash, head);
> + } else
> + cache = NULL;
> +out:
> + pthread_mutex_unlock(&hashtable_lock[h]);
> + return cache;
> +}
> +
> +struct object_cache *find_object_cache(uint32_t vid)
> +{
> + return lookup_object_cache(vid, 1);
> +}
> +
> +/* The caller is responsible to release fd */
> +int object_cache_lookup(struct object_cache *oc, uint32_t idx)
> +{
> + struct strbuf buf;
> + int fd, ret = 0;
> +
> + strbuf_init(&buf, PATH_MAX);
> + strbuf_addstr(&buf, cache_dir);
> + strbuf_addf(&buf, "/%06"PRIx32"/%08"PRIx32, oc->vid, idx);
> +
> + fd = open(buf.buf, def_open_flags, def_fmode);
> + if (fd < 0) {
> + ret = -1;
> + goto out;
> + }
> + close(fd);
> +out:
> + strbuf_release(&buf);
> + return ret;
> +}
> +
> +static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count, off_t offset)
> +{
> + size_t size;
> + int fd, ret = SD_RES_SUCCESS;
> + struct strbuf p;
> +
> + strbuf_init(&p, PATH_MAX);
> + strbuf_addstr(&p, cache_dir);
> + strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
> +
> + fd = open(p.buf, def_open_flags, def_fmode);
> + if (flock(fd, LOCK_EX) < 0) {
Do we need flock here? We don't assume that multiple clients open the
same VDI at the same time.
> + ret = SD_RES_EIO;
> + eprintf("%m\n");
> + goto out;
> + }
> + size = xpwrite(fd, buf, count, offset);
> + if (flock(fd, LOCK_UN) < 0) {
> + ret = SD_RES_EIO;
> + eprintf("%m\n");
> + goto out;
> + }
> +
> + if (size != count)
> + ret = SD_RES_EIO;
> +out:
> + close(fd);
> + strbuf_release(&p);
> + return ret;
> +}
> +
> +static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count, off_t offset)
> +{
> + size_t size;
> + int fd, ret = SD_RES_SUCCESS;
> + struct strbuf p;
> +
> + strbuf_init(&p, PATH_MAX);
> + strbuf_addstr(&p, cache_dir);
> + strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
> +
> + fd = open(p.buf, def_open_flags, def_fmode);
> + if (flock(fd, LOCK_SH) < 0) {
> + ret = SD_RES_EIO;
> + eprintf("%m\n");
> + goto out;
> + }
> + size = xpread(fd, buf, count, offset);
> + if (flock(fd, LOCK_UN) < 0) {
> + ret = SD_RES_EIO;
> + eprintf("%m\n");
> + goto out;
> + }
> + if (size != count)
> + ret = SD_RES_EIO;
> +out:
> + close(fd);
> + strbuf_release(&p);
> + return ret;
> +}
> +
> +static void add_to_dirty_rb_and_list(struct object_cache *oc, uint32_t idx)
> +{
> + struct object_cache_entry *entry = xzalloc(sizeof(*entry));
> +
> + entry->idx = idx;
> + pthread_mutex_lock(&oc->lock);
> + if (!dirty_rb_insert(&oc->dirty_rb, entry))
> + list_add(&entry->list, &oc->dirty_list);
> + else
> + free(entry);
> + pthread_mutex_unlock(&oc->lock);
> +}
> +
> +int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
> +{
> + struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
> + struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp;
> + int ret;
> +
> + dprintf("%"PRIx64", len %"PRIu32", off %"PRIu64"\n", oc->oid, hdr->data_length, hdr->offset);
> + if (hdr->flags & SD_FLAG_CMD_WRITE) {
> + ret = write_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
> + if (ret != SD_RES_SUCCESS)
> + goto out;
> + add_to_dirty_rb_and_list(oc, idx);
> + } else {
> + ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
> + if (ret != SD_RES_SUCCESS)
> + goto out;
We must call forward_read_obj when we miss the cache.
Thanks,
Kazutaka
More information about the sheepdog mailing list