[Sheepdog] [PATCH v5 3/8] sheep: object cache proper

Liu Yuan namei.unix at gmail.com
Mon Mar 26 04:03:39 CEST 2012


On 03/26/2012 04:35 AM, MORITA Kazutaka wrote:

> At Sat, 24 Mar 2012 16:47:13 +0800,
> Liu Yuan wrote:
>>
>> From: Liu Yuan <tailai.ly at taobao.com>
>>
>> Object cache caches data and vdi objects on the local node. It is at
>> higher level than backend store. This extra cache layer translates gateway
>> requests into local requests, largely reducing the network traffic and greatly
>> improving the IO performance.
>>
>> Dirty objects will be flushed to cluster storage by 'sync' request from
>> guest OS.
>>
>> - use red-black tree to track dirty objects
>> - use file lock to avoid RW race on object granularity
>> - use hash lists to maintain vdi space.
>> - each vid has its own independent object cache
>>
>> Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
>> ---
>>  sheep/object_cache.c |  421 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  sheep/sheep_priv.h   |   34 ++++
>>  2 files changed, 455 insertions(+), 0 deletions(-)
>>  create mode 100644 sheep/object_cache.c
>>
>> diff --git a/sheep/object_cache.c b/sheep/object_cache.c
>> new file mode 100644
>> index 0000000..929e28d
>> --- /dev/null
>> +++ b/sheep/object_cache.c
>> @@ -0,0 +1,421 @@
>> +/*
>> + * Copyright (C) 2012 Taobao Inc.
>> + *
>> + * Liu Yuan <namei.unix at gmail.com>
>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU General Public License version
>> + * 2 as published by the Free Software Foundation.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include <stdlib.h>
>> +#include <stdio.h>
>> +#include <sys/types.h>
>> +#include <sys/stat.h>
>> +#include <fcntl.h>
>> +#include <unistd.h>
>> +#include <pthread.h>
>> +#include <errno.h>
>> +#include <sys/file.h>
>> +
>> +#include "sheep_priv.h"
>> +#include "util.h"
>> +#include "strbuf.h"
>> +#include "rbtree.h"
>> +
>> +#define HASH_BITS	5
>> +#define HASH_SIZE	(1 << HASH_BITS)
>> +
>> +static char cache_dir[PATH_MAX];
>> +static int def_open_flags = O_DSYNC | O_RDWR;
> 
> I think we don't need a sync flag for a cache.
> 


It is a disk cache, not a memory cache; we don't want to consume more
memory than the guest requested. The guest has its own page cache, and it
sends IO requests to tell us it has something to store into the cluster;
we simply cache the objects locally.

So I think we'd better preserve the sync flag.

> 
>> +extern mode_t def_fmode;
>> +extern mode_t def_dmode;
>> +extern struct store_driver *sd_store;
>> +
>> +static pthread_mutex_t hashtable_lock[HASH_SIZE] = { [0 ... HASH_SIZE - 1] = PTHREAD_MUTEX_INITIALIZER };
>> +static struct hlist_head cache_hashtable[HASH_SIZE];
>> +
>> +static inline int hash(uint64_t vid)
>> +{
>> +	return hash_64(vid, HASH_BITS);
>> +}
>> +
>> +static struct object_cache_entry *dirty_rb_insert(struct rb_root *root,
>> +		struct object_cache_entry *new)
>> +{
>> +	struct rb_node **p = &root->rb_node;
>> +	struct rb_node *parent = NULL;
>> +	struct object_cache_entry *entry;
>> +
>> +	while (*p) {
>> +		parent = *p;
>> +		entry = rb_entry(parent, struct object_cache_entry, rb);
>> +
>> +		if (new->idx < entry->idx)
>> +			p = &(*p)->rb_left;
>> +		else if (new->idx > entry->idx)
>> +			p = &(*p)->rb_right;
>> +		else
>> +			return entry; /* already has this entry */
>> +	}
>> +	rb_link_node(&new->rb, parent, p);
>> +	rb_insert_color(&new->rb, root);
>> +
>> +	return NULL; /* insert successfully */
>> +}
>> +
>> +__attribute__ ((unused))
>> +static struct object_cache_entry *dirty_rb_search(struct rb_root *root,
>> +		struct object_cache_entry *entry)
>> +{
>> +	struct rb_node *n = root->rb_node;
>> +	struct object_cache_entry *t;
>> +
>> +	while (n) {
>> +		t = rb_entry(n, struct object_cache_entry, rb);
>> +
>> +		if (entry->idx < t->idx)
>> +			n = n->rb_left;
>> +		else if (entry->idx > t->idx)
>> +			n = n->rb_right;
>> +		else
>> +			return t; /* found it */
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>> +static int create_dir_for(uint32_t vid)
>> +{
>> +	int ret = 0;
>> +	struct strbuf buf = STRBUF_INIT;
>> +
>> +	strbuf_addstr(&buf, cache_dir);
>> +	strbuf_addf(&buf, "/%06"PRIx32, vid);
>> +	if (mkdir(buf.buf, def_dmode) < 0)
>> +		if (errno != EEXIST) {
>> +			eprintf("%m\n");
>> +			ret = -1;
>> +			goto err;
>> +		}
>> +err:
>> +	strbuf_release(&buf);
>> +	return ret;
>> +}
>> +
>> +static struct object_cache *lookup_object_cache(uint32_t vid, int create)
>> +{
>> +	int h = hash(vid);
>> +	struct hlist_head *head = cache_hashtable + h;
>> +	struct object_cache *cache = NULL;
>> +	struct hlist_node *node;
>> +
>> +	pthread_mutex_lock(&hashtable_lock[h]);
>> +	if (hlist_empty(head))
>> +		goto not_found;
>> +
>> +	hlist_for_each_entry(cache, node, head, hash) {
>> +		if (cache->vid == vid)
>> +			goto out;
>> +	}
>> +not_found:
>> +	if (create) {
>> +		cache = xzalloc(sizeof(*cache));
>> +		cache->vid = vid;
>> +		create_dir_for(vid);
>> +		cache->dirty_rb = RB_ROOT;
>> +		pthread_mutex_init(&cache->lock, NULL);
>> +		INIT_LIST_HEAD(&cache->dirty_list);
>> +		hlist_add_head(&cache->hash, head);
>> +	} else
>> +		cache = NULL;
>> +out:
>> +	pthread_mutex_unlock(&hashtable_lock[h]);
>> +	return cache;
>> +}
>> +
>> +struct object_cache *find_object_cache(uint32_t vid)
>> +{
>> +	return lookup_object_cache(vid, 1);
>> +}
>> +
>> +/* The caller is responsible to release fd */
>> +int object_cache_lookup(struct object_cache *oc, uint32_t idx)
>> +{
>> +	struct strbuf buf;
>> +	int fd, ret = 0;
>> +
>> +	strbuf_init(&buf, PATH_MAX);
>> +	strbuf_addstr(&buf, cache_dir);
>> +	strbuf_addf(&buf, "/%06"PRIx32"/%08"PRIx32, oc->vid, idx);
>> +
>> +	fd = open(buf.buf, def_open_flags, def_fmode);
>> +	if (fd < 0) {
>> +		ret = -1;
>> +		goto out;
>> +	}
>> +	close(fd);
>> +out:
>> +	strbuf_release(&buf);
>> +	return ret;
>> +}
>> +
>> +static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count, off_t offset)
>> +{
>> +	size_t size;
>> +	int fd, ret = SD_RES_SUCCESS;
>> +	struct strbuf p;
>> +
>> +	strbuf_init(&p, PATH_MAX);
>> +	strbuf_addstr(&p, cache_dir);
>> +	strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
>> +
>> +	fd = open(p.buf, def_open_flags, def_fmode);
>> +	if (flock(fd, LOCK_EX) < 0) {
> 
> Do we need flock here?  We don't assume that multiple clients open the
> same VDI at the same time.
> 


In my original design, there will be an async cache reclaim mechanism that
would race with the IO requests.

> 
>> +		ret = SD_RES_EIO;
>> +		eprintf("%m\n");
>> +		goto out;
>> +	}
>> +	size = xpwrite(fd, buf, count, offset);
>> +	if (flock(fd, LOCK_UN) < 0) {
>> +		ret = SD_RES_EIO;
>> +		eprintf("%m\n");
>> +		goto out;
>> +	}
>> +
>> +	if (size != count)
>> +		ret = SD_RES_EIO;
>> +out:
>> +	close(fd);
>> +	strbuf_release(&p);
>> +	return ret;
>> +}
>> +
>> +static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count, off_t offset)
>> +{
>> +	size_t size;
>> +	int fd, ret = SD_RES_SUCCESS;
>> +	struct strbuf p;
>> +
>> +	strbuf_init(&p, PATH_MAX);
>> +	strbuf_addstr(&p, cache_dir);
>> +	strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
>> +
>> +	fd = open(p.buf, def_open_flags, def_fmode);
>> +	if (flock(fd, LOCK_SH) < 0) {
>> +		ret = SD_RES_EIO;
>> +		eprintf("%m\n");
>> +		goto out;
>> +	}
>> +	size = xpread(fd, buf, count, offset);
>> +	if (flock(fd, LOCK_UN) < 0) {
>> +		ret = SD_RES_EIO;
>> +		eprintf("%m\n");
>> +		goto out;
>> +	}
>> +	if (size != count)
>> +		ret = SD_RES_EIO;
>> +out:
>> +	close(fd);
>> +	strbuf_release(&p);
>> +	return ret;
>> +}
>> +
>> +static void add_to_dirty_rb_and_list(struct object_cache *oc, uint32_t idx)
>> +{
>> +	struct object_cache_entry *entry = xzalloc(sizeof(*entry));
>> +
>> +	entry->idx = idx;
>> +	pthread_mutex_lock(&oc->lock);
>> +	if (!dirty_rb_insert(&oc->dirty_rb, entry))
>> +		list_add(&entry->list, &oc->dirty_list);
>> +	else
>> +		free(entry);
>> +	pthread_mutex_unlock(&oc->lock);
>> +}
>> +
>> +int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
>> +{
>> +	struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
>> +	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp;
>> +	int ret;
>> +
>> +	dprintf("%"PRIx64", len %"PRIu32", off %"PRIu64"\n", oc->oid, hdr->data_length, hdr->offset);
>> +	if (hdr->flags & SD_FLAG_CMD_WRITE) {
>> +		ret = write_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
>> +		if (ret != SD_RES_SUCCESS)
>> +			goto out;
>> +		add_to_dirty_rb_and_list(oc, idx);
>> +	} else {
>> +		ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
>> +		if (ret != SD_RES_SUCCESS)
>> +			goto out;
> 
> We must call forward_read_obj when we miss the cache.
> 


No, we don't. When object_cache_rw is called, we have already ensured that
the targeted object has been pulled into the cache.

> Thanks,
> 
> Kazutaka





More information about the sheepdog mailing list