[Sheepdog] [PATCH v5 3/8] sheep: object cache proper

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Mon Mar 26 05:30:17 CEST 2012


At Mon, 26 Mar 2012 10:03:39 +0800,
Liu Yuan wrote:
> 
> On 03/26/2012 04:35 AM, MORITA Kazutaka wrote:
> 
> > At Sat, 24 Mar 2012 16:47:13 +0800,
> > Liu Yuan wrote:
> >>
> >> From: Liu Yuan <tailai.ly at taobao.com>
> >>
> >> The object cache caches data and vdi objects on the local node. It sits
> >> at a higher level than the backend store. This extra cache layer
> >> translates gateway requests into local requests, largely reducing network
> >> traffic and greatly improving IO performance.
> >>
> >> Dirty objects will be flushed to the cluster storage when the guest OS
> >> issues a 'sync' request.
> >>
> >> - use a red-black tree to track dirty objects
> >> - use per-object file locks to avoid read/write races
> >> - use hash lists to maintain the vdi space
> >> - each vid has its own independent object cache
> >>
> >> Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
> >> ---
> >>  sheep/object_cache.c |  421 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  sheep/sheep_priv.h   |   34 ++++
> >>  2 files changed, 455 insertions(+), 0 deletions(-)
> >>  create mode 100644 sheep/object_cache.c
> >>
> >> diff --git a/sheep/object_cache.c b/sheep/object_cache.c
> >> new file mode 100644
> >> index 0000000..929e28d
> >> --- /dev/null
> >> +++ b/sheep/object_cache.c
> >> @@ -0,0 +1,421 @@
> >> +/*
> >> + * Copyright (C) 2012 Taobao Inc.
> >> + *
> >> + * Liu Yuan <namei.unix at gmail.com>
> >> + *
> >> + * This program is free software; you can redistribute it and/or
> >> + * modify it under the terms of the GNU General Public License version
> >> + * 2 as published by the Free Software Foundation.
> >> + *
> >> + * You should have received a copy of the GNU General Public License
> >> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> >> + */
> >> +
> >> +#include <stdlib.h>
> >> +#include <stdio.h>
> >> +#include <sys/types.h>
> >> +#include <sys/stat.h>
> >> +#include <fcntl.h>
> >> +#include <unistd.h>
> >> +#include <pthread.h>
> >> +#include <errno.h>
> >> +#include <sys/file.h>
> >> +
> >> +#include "sheep_priv.h"
> >> +#include "util.h"
> >> +#include "strbuf.h"
> >> +#include "rbtree.h"
> >> +
> >> +#define HASH_BITS	5
> >> +#define HASH_SIZE	(1 << HASH_BITS)
> >> +
> >> +static char cache_dir[PATH_MAX];
> >> +static int def_open_flags = O_DSYNC | O_RDWR;
> > 
> > I think we don't need a sync flag for a cache.
> > 
> 
> 
> It is a disk cache, not a memory cache; we don't want to consume more
> memory than the guest requested. The guest has its own page cache, and it
> sends IO requests to tell us it has something to store into the cluster;
> we simply cache the objects locally.
> 
> So I think we'd better preserve the sync flag.

In that sense, O_DIRECT looks better.

I want to see a memory cache mode in the future.  It would be useful when
we run VMs outside the Sheepdog cluster.
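
Note that O_DIRECT is not a drop-in replacement for O_DSYNC, though: the
buffer, length, and offset all have to be block aligned.  A rough sketch of
what the write path would need (hypothetical helper, not part of this
patch; CACHE_BLOCK_SIZE and the alignment handling are assumptions):

#include <string.h>	/* memcpy; other includes as in object_cache.c */

#define CACHE_BLOCK_SIZE 512	/* assumed logical block size */

/* Hypothetical O_DIRECT variant of write_cache_object().  Assumes count
 * and offset are already multiples of CACHE_BLOCK_SIZE, and that
 * _GNU_SOURCE is defined so O_DIRECT is visible. */
static int write_cache_object_direct(const char *path, const void *data,
				     size_t count, off_t offset)
{
	void *aligned;
	int fd, ret = SD_RES_SUCCESS;

	/* O_DIRECT also requires an aligned user buffer, so copy into one */
	if (posix_memalign(&aligned, CACHE_BLOCK_SIZE, count) != 0)
		return SD_RES_EIO;
	memcpy(aligned, data, count);

	fd = open(path, O_DIRECT | O_RDWR, def_fmode);
	if (fd < 0) {
		free(aligned);
		return SD_RES_EIO;
	}
	if (xpwrite(fd, aligned, count, offset) != count)
		ret = SD_RES_EIO;

	close(fd);
	free(aligned);
	return ret;
}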

> 
> > 
> >> +extern mode_t def_fmode;
> >> +extern mode_t def_dmode;
> >> +extern struct store_driver *sd_store;
> >> +
> >> +static pthread_mutex_t hashtable_lock[HASH_SIZE] = { [0 ... HASH_SIZE - 1] = PTHREAD_MUTEX_INITIALIZER };
> >> +static struct hlist_head cache_hashtable[HASH_SIZE];
> >> +
> >> +static inline int hash(uint64_t vid)
> >> +{
> >> +	return hash_64(vid, HASH_BITS);
> >> +}
> >> +
> >> +static struct object_cache_entry *dirty_rb_insert(struct rb_root *root,
> >> +		struct object_cache_entry *new)
> >> +{
> >> +	struct rb_node **p = &root->rb_node;
> >> +	struct rb_node *parent = NULL;
> >> +	struct object_cache_entry *entry;
> >> +
> >> +	while (*p) {
> >> +		parent = *p;
> >> +		entry = rb_entry(parent, struct object_cache_entry, rb);
> >> +
> >> +		if (new->idx < entry->idx)
> >> +			p = &(*p)->rb_left;
> >> +		else if (new->idx > entry->idx)
> >> +			p = &(*p)->rb_right;
> >> +		else
> >> +			return entry; /* already has this entry */
> >> +	}
> >> +	rb_link_node(&new->rb, parent, p);
> >> +	rb_insert_color(&new->rb, root);
> >> +
> >> +	return NULL; /* inserted successfully */
> >> +}
> >> +
> >> +__attribute__ ((unused))
> >> +static struct object_cache_entry *dirty_rb_search(struct rb_root *root,
> >> +		struct object_cache_entry *entry)
> >> +{
> >> +	struct rb_node *n = root->rb_node;
> >> +	struct object_cache_entry *t;
> >> +
> >> +	while (n) {
> >> +		t = rb_entry(n, struct object_cache_entry, rb);
> >> +
> >> +		if (entry->idx < t->idx)
> >> +			n = n->rb_left;
> >> +		else if (entry->idx > t->idx)
> >> +			n = n->rb_right;
> >> +		else
> >> +			return t; /* found it */
> >> +	}
> >> +
> >> +	return NULL;
> >> +}
> >> +
> >> +static int create_dir_for(uint32_t vid)
> >> +{
> >> +	int ret = 0;
> >> +	struct strbuf buf = STRBUF_INIT;
> >> +
> >> +	strbuf_addstr(&buf, cache_dir);
> >> +	strbuf_addf(&buf, "/%06"PRIx32, vid);
> >> +	if (mkdir(buf.buf, def_dmode) < 0)
> >> +		if (errno != EEXIST) {
> >> +			eprintf("%m\n");
> >> +			ret = -1;
> >> +			goto err;
> >> +		}
> >> +err:
> >> +	strbuf_release(&buf);
> >> +	return ret;
> >> +}
> >> +
> >> +static struct object_cache *lookup_object_cache(uint32_t vid, int create)
> >> +{
> >> +	int h = hash(vid);
> >> +	struct hlist_head *head = cache_hashtable + h;
> >> +	struct object_cache *cache = NULL;
> >> +	struct hlist_node *node;
> >> +
> >> +	pthread_mutex_lock(&hashtable_lock[h]);
> >> +	if (hlist_empty(head))
> >> +		goto not_found;
> >> +
> >> +	hlist_for_each_entry(cache, node, head, hash) {
> >> +		if (cache->vid == vid)
> >> +			goto out;
> >> +	}
> >> +not_found:
> >> +	if (create) {
> >> +		cache = xzalloc(sizeof(*cache));
> >> +		cache->vid = vid;
> >> +		create_dir_for(vid);
> >> +		cache->dirty_rb = RB_ROOT;
> >> +		pthread_mutex_init(&cache->lock, NULL);
> >> +		INIT_LIST_HEAD(&cache->dirty_list);
> >> +		hlist_add_head(&cache->hash, head);
> >> +	} else
> >> +		cache = NULL;
> >> +out:
> >> +	pthread_mutex_unlock(&hashtable_lock[h]);
> >> +	return cache;
> >> +}
> >> +
> >> +struct object_cache *find_object_cache(uint32_t vid)
> >> +{
> >> +	return lookup_object_cache(vid, 1);
> >> +}
> >> +
> >> +/* Return 0 if the object exists in the cache, -1 otherwise */
> >> +int object_cache_lookup(struct object_cache *oc, uint32_t idx)
> >> +{
> >> +	struct strbuf buf;
> >> +	int fd, ret = 0;
> >> +
> >> +	strbuf_init(&buf, PATH_MAX);
> >> +	strbuf_addstr(&buf, cache_dir);
> >> +	strbuf_addf(&buf, "/%06"PRIx32"/%08"PRIx32, oc->vid, idx);
> >> +
> >> +	fd = open(buf.buf, def_open_flags, def_fmode);
> >> +	if (fd < 0) {
> >> +		ret = -1;
> >> +		goto out;
> >> +	}
> >> +	close(fd);
> >> +out:
> >> +	strbuf_release(&buf);
> >> +	return ret;
> >> +}
> >> +
> >> +static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count, off_t offset)
> >> +{
> >> +	size_t size;
> >> +	int fd, ret = SD_RES_SUCCESS;
> >> +	struct strbuf p;
> >> +
> >> +	strbuf_init(&p, PATH_MAX);
> >> +	strbuf_addstr(&p, cache_dir);
> >> +	strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
> >> +
> >> +	fd = open(p.buf, def_open_flags, def_fmode);
> >> +	if (fd < 0) {
> >> +		eprintf("%m\n");
> >> +		strbuf_release(&p);
> >> +		return SD_RES_EIO;
> >> +	}
> >> +	if (flock(fd, LOCK_EX) < 0) {
> > 
> > Do we need flock here?  We don't assume that multiple clients open the
> > same VDI at the same time.
> > 
> 
> 
> In my original design, there will be an async cache reclaim mechanism
> that would race with the in-flight IO requests.

I see, but I think it's better to introduce these locks only when they
are actually needed.
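
Agreed that the reclaim path will need them, though.  When it lands, I'd
expect the reclaim side to take the exclusive lock before dropping an
object, roughly like this (hypothetical sketch, not in this series):

/* Hypothetical reclaim path: takes LOCK_EX so it cannot interleave with
 * a read or write that holds the flock on the same cached object. */
static int reclaim_cache_object(const char *path)
{
	int fd = open(path, O_RDWR);

	if (fd < 0)
		return -1;
	if (flock(fd, LOCK_EX) < 0) {
		close(fd);
		return -1;
	}
	unlink(path);	/* drop the object from the cache directory */
	flock(fd, LOCK_UN);
	close(fd);
	return 0;
}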

> 
> > 
> >> +		ret = SD_RES_EIO;
> >> +		eprintf("%m\n");
> >> +		goto out;
> >> +	}
> >> +	size = xpwrite(fd, buf, count, offset);
> >> +	if (flock(fd, LOCK_UN) < 0) {
> >> +		ret = SD_RES_EIO;
> >> +		eprintf("%m\n");
> >> +		goto out;
> >> +	}
> >> +
> >> +	if (size != count)
> >> +		ret = SD_RES_EIO;
> >> +out:
> >> +	close(fd);
> >> +	strbuf_release(&p);
> >> +	return ret;
> >> +}
> >> +
> >> +static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count, off_t offset)
> >> +{
> >> +	size_t size;
> >> +	int fd, ret = SD_RES_SUCCESS;
> >> +	struct strbuf p;
> >> +
> >> +	strbuf_init(&p, PATH_MAX);
> >> +	strbuf_addstr(&p, cache_dir);
> >> +	strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
> >> +
> >> +	fd = open(p.buf, def_open_flags, def_fmode);
> >> +	if (fd < 0) {
> >> +		eprintf("%m\n");
> >> +		strbuf_release(&p);
> >> +		return SD_RES_EIO;
> >> +	}
> >> +	if (flock(fd, LOCK_SH) < 0) {
> >> +		ret = SD_RES_EIO;
> >> +		eprintf("%m\n");
> >> +		goto out;
> >> +	}
> >> +	size = xpread(fd, buf, count, offset);
> >> +	if (flock(fd, LOCK_UN) < 0) {
> >> +		ret = SD_RES_EIO;
> >> +		eprintf("%m\n");
> >> +		goto out;
> >> +	}
> >> +	if (size != count)
> >> +		ret = SD_RES_EIO;
> >> +out:
> >> +	close(fd);
> >> +	strbuf_release(&p);
> >> +	return ret;
> >> +}
> >> +
> >> +static void add_to_dirty_rb_and_list(struct object_cache *oc, uint32_t idx)
> >> +{
> >> +	struct object_cache_entry *entry = xzalloc(sizeof(*entry));
> >> +
> >> +	entry->idx = idx;
> >> +	pthread_mutex_lock(&oc->lock);
> >> +	if (!dirty_rb_insert(&oc->dirty_rb, entry))
> >> +		list_add(&entry->list, &oc->dirty_list);
> >> +	else
> >> +		free(entry);
> >> +	pthread_mutex_unlock(&oc->lock);
> >> +}
> >> +
> >> +int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
> >> +{
> >> +	struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
> >> +	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp;
> >> +	int ret;
> >> +
> >> +	dprintf("%06"PRIx32"/%08"PRIx32", len %"PRIu32", off %"PRIu64"\n", oc->vid, idx, hdr->data_length, hdr->offset);
> >> +	if (hdr->flags & SD_FLAG_CMD_WRITE) {
> >> +		ret = write_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
> >> +		if (ret != SD_RES_SUCCESS)
> >> +			goto out;
> >> +		add_to_dirty_rb_and_list(oc, idx);
> >> +	} else {
> >> +		ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
> >> +		if (ret != SD_RES_SUCCESS)
> >> +			goto out;
> > 
> > We must call forward_read_obj when we miss the cache.
> > 
> 
> 
> No, we don't.  When object_cache_rw is called, we have already ensured
> that the targeted object was pulled into the cache.

Yes, sorry, you're right.
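
For other readers of the thread, the call path is then roughly as follows
(object_cache_pull() is my shorthand for the miss-handling step elsewhere
in the series; the other functions are from this patch):

static int gateway_cache_io(uint32_t vid, uint32_t idx, struct request *req)
{
	struct object_cache *cache = find_object_cache(vid);

	/* On a miss, pull the object into the local cache first */
	if (object_cache_lookup(cache, idx) < 0 &&
	    object_cache_pull(cache, idx) != SD_RES_SUCCESS)
		return SD_RES_EIO;

	/* By now the object is guaranteed to be cached locally */
	return object_cache_rw(cache, idx, req);
}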

Thanks,

Kazutaka


