[Sheepdog] [PATCH v5 3/8] sheep: object cache proper

Liu Yuan namei.unix at gmail.com
Mon Mar 26 06:05:19 CEST 2012


On 03/26/2012 11:30 AM, MORITA Kazutaka wrote:

>>>> +static int def_open_flags = O_DSYNC | O_RDWR;
>>>
>>> I think we don't need a sync flag for a cache.
>>>
>>
>>
>> It is a disk cache, not a memory cache; we don't want to use more memory
>> than the guest requested. The guest has its own page cache, and it sends IO
>> requests to tell us it has something to store into the cluster; we simply
>> cache the objects locally.
>>
>> So I think we'd better preserve the sync flag.
> 
> In that sense, O_DIRECT looks better.
> 
> I want to see a memory cache mode in future.  It is useful when we run
> VMs outside Sheepdog cluster.
> 


I guess you want the effect of cache=writeback in the qemu semantics,
which uses the host page cache (memory cache) for the guest. This is often
for desktop users who don't have a memory plan for the host and guests (so
there is often extra memory left over). In our production usage, we usually
allocate as much memory as possible to the guests, so there is actually no
extra memory left for the host page cache, and cache=writeback just causes
higher memory pressure and more swapping, increasing the latency very
much, and can even bring the machine to thrashing under some
heavy workloads.

I don't think you can run a VM outside the Sheepdog cluster with a memory
cache; that is, memory isn't big enough to cache all the objects of the VMs.
In contrast, the current disk cache implementation does have this feature in
mind: for most VMs running with the disk cache, the chances are that we
would cache all the objects that are needed to run the VMs without the
Sheepdog cluster.

>>
>>>
>>>> +extern mode_t def_fmode;
>>>> +extern mode_t def_dmode;
>>>> +extern struct store_driver *sd_store;
>>>> +
>>>> +static pthread_mutex_t hashtable_lock[HASH_SIZE] = { [0 ... HASH_SIZE - 1] = PTHREAD_MUTEX_INITIALIZER };
>>>> +static struct hlist_head cache_hashtable[HASH_SIZE];
>>>> +
>>>> +static inline int hash(uint64_t vid)
>>>> +{
>>>> +	return hash_64(vid, HASH_BITS);
>>>> +}
>>>> +
>>>> +static struct object_cache_entry *dirty_rb_insert(struct rb_root *root,
>>>> +		struct object_cache_entry *new)
>>>> +{
>>>> +	struct rb_node **p = &root->rb_node;
>>>> +	struct rb_node *parent = NULL;
>>>> +	struct object_cache_entry *entry;
>>>> +
>>>> +	while (*p) {
>>>> +		parent = *p;
>>>> +		entry = rb_entry(parent, struct object_cache_entry, rb);
>>>> +
>>>> +		if (new->idx < entry->idx)
>>>> +			p = &(*p)->rb_left;
>>>> +		else if (new->idx > entry->idx)
>>>> +			p = &(*p)->rb_right;
>>>> +		else
>>>> +			return entry; /* already has this entry */
>>>> +	}
>>>> +	rb_link_node(&new->rb, parent, p);
>>>> +	rb_insert_color(&new->rb, root);
>>>> +
>>>> +	return NULL; /* insert successfully */
>>>> +}
>>>> +
>>>> +__attribute__ ((unused))
>>>> +static struct object_cache_entry *dirty_rb_search(struct rb_root *root,
>>>> +		struct object_cache_entry *entry)
>>>> +{
>>>> +	struct rb_node *n = root->rb_node;
>>>> +	struct object_cache_entry *t;
>>>> +
>>>> +	while (n) {
>>>> +		t = rb_entry(n, struct object_cache_entry, rb);
>>>> +
>>>> +		if (entry->idx < t->idx)
>>>> +			n = n->rb_left;
>>>> +		else if (entry->idx > t->idx)
>>>> +			n = n->rb_right;
>>>> +		else
>>>> +			return t; /* found it */
>>>> +	}
>>>> +
>>>> +	return NULL;
>>>> +}
>>>> +
>>>> +static int create_dir_for(uint32_t vid)
>>>> +{
>>>> +	int ret = 0;
>>>> +	struct strbuf buf = STRBUF_INIT;
>>>> +
>>>> +	strbuf_addstr(&buf, cache_dir);
>>>> +	strbuf_addf(&buf, "/%06"PRIx32, vid);
>>>> +	if (mkdir(buf.buf, def_dmode) < 0)
>>>> +		if (errno != EEXIST) {
>>>> +			eprintf("%m\n");
>>>> +			ret = -1;
>>>> +			goto err;
>>>> +		}
>>>> +err:
>>>> +	strbuf_release(&buf);
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +static struct object_cache *lookup_object_cache(uint32_t vid, int create)
>>>> +{
>>>> +	int h = hash(vid);
>>>> +	struct hlist_head *head = cache_hashtable + h;
>>>> +	struct object_cache *cache = NULL;
>>>> +	struct hlist_node *node;
>>>> +
>>>> +	pthread_mutex_lock(&hashtable_lock[h]);
>>>> +	if (hlist_empty(head))
>>>> +		goto not_found;
>>>> +
>>>> +	hlist_for_each_entry(cache, node, head, hash) {
>>>> +		if (cache->vid == vid)
>>>> +			goto out;
>>>> +	}
>>>> +not_found:
>>>> +	if (create) {
>>>> +		cache = xzalloc(sizeof(*cache));
>>>> +		cache->vid = vid;
>>>> +		create_dir_for(vid);
>>>> +		cache->dirty_rb = RB_ROOT;
>>>> +		pthread_mutex_init(&cache->lock, NULL);
>>>> +		INIT_LIST_HEAD(&cache->dirty_list);
>>>> +		hlist_add_head(&cache->hash, head);
>>>> +	} else
>>>> +		cache = NULL;
>>>> +out:
>>>> +	pthread_mutex_unlock(&hashtable_lock[h]);
>>>> +	return cache;
>>>> +}
>>>> +
>>>> +struct object_cache *find_object_cache(uint32_t vid)
>>>> +{
>>>> +	return lookup_object_cache(vid, 1);
>>>> +}
>>>> +
>>>> +/* The caller is responsible to release fd */
>>>> +int object_cache_lookup(struct object_cache *oc, uint32_t idx)
>>>> +{
>>>> +	struct strbuf buf;
>>>> +	int fd, ret = 0;
>>>> +
>>>> +	strbuf_init(&buf, PATH_MAX);
>>>> +	strbuf_addstr(&buf, cache_dir);
>>>> +	strbuf_addf(&buf, "/%06"PRIx32"/%08"PRIx32, oc->vid, idx);
>>>> +
>>>> +	fd = open(buf.buf, def_open_flags, def_fmode);
>>>> +	if (fd < 0) {
>>>> +		ret = -1;
>>>> +		goto out;
>>>> +	}
>>>> +	close(fd);
>>>> +out:
>>>> +	strbuf_release(&buf);
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count, off_t offset)
>>>> +{
>>>> +	size_t size;
>>>> +	int fd, ret = SD_RES_SUCCESS;
>>>> +	struct strbuf p;
>>>> +
>>>> +	strbuf_init(&p, PATH_MAX);
>>>> +	strbuf_addstr(&p, cache_dir);
>>>> +	strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
>>>> +
>>>> +	fd = open(p.buf, def_open_flags, def_fmode);
>>>> +	if (flock(fd, LOCK_EX) < 0) {
>>>
>>> Do we need flock here?  We don't assume that multiple clients open the
>>> same VDI at the same time.
>>>
>>
>>
>> In my original design, there will be async cache reclaim mechanism that
>> would race with the IO requests.
> 
> I see, but I think it's better to introduce these locks just when it
> is needed.
> 


Okay, I'll remove it in the next version.

>>
>>>
>>>> +		ret = SD_RES_EIO;
>>>> +		eprintf("%m\n");
>>>> +		goto out;
>>>> +	}
>>>> +	size = xpwrite(fd, buf, count, offset);
>>>> +	if (flock(fd, LOCK_UN) < 0) {
>>>> +		ret = SD_RES_EIO;
>>>> +		eprintf("%m\n");
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	if (size != count)
>>>> +		ret = SD_RES_EIO;
>>>> +out:
>>>> +	close(fd);
>>>> +	strbuf_release(&p);
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count, off_t offset)
>>>> +{
>>>> +	size_t size;
>>>> +	int fd, ret = SD_RES_SUCCESS;
>>>> +	struct strbuf p;
>>>> +
>>>> +	strbuf_init(&p, PATH_MAX);
>>>> +	strbuf_addstr(&p, cache_dir);
>>>> +	strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
>>>> +
>>>> +	fd = open(p.buf, def_open_flags, def_fmode);
>>>> +	if (flock(fd, LOCK_SH) < 0) {
>>>> +		ret = SD_RES_EIO;
>>>> +		eprintf("%m\n");
>>>> +		goto out;
>>>> +	}
>>>> +	size = xpread(fd, buf, count, offset);
>>>> +	if (flock(fd, LOCK_UN) < 0) {
>>>> +		ret = SD_RES_EIO;
>>>> +		eprintf("%m\n");
>>>> +		goto out;
>>>> +	}
>>>> +	if (size != count)
>>>> +		ret = SD_RES_EIO;
>>>> +out:
>>>> +	close(fd);
>>>> +	strbuf_release(&p);
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +static void add_to_dirty_rb_and_list(struct object_cache *oc, uint32_t idx)
>>>> +{
>>>> +	struct object_cache_entry *entry = xzalloc(sizeof(*entry));
>>>> +
>>>> +	entry->idx = idx;
>>>> +	pthread_mutex_lock(&oc->lock);
>>>> +	if (!dirty_rb_insert(&oc->dirty_rb, entry))
>>>> +		list_add(&entry->list, &oc->dirty_list);
>>>> +	else
>>>> +		free(entry);
>>>> +	pthread_mutex_unlock(&oc->lock);
>>>> +}
>>>> +
>>>> +int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
>>>> +{
>>>> +	struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
>>>> +	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp;
>>>> +	int ret;
>>>> +
>>>> +	dprintf("%"PRIx64", len %"PRIu32", off %"PRIu64"\n", oc->oid, hdr->data_length, hdr->offset);
>>>> +	if (hdr->flags & SD_FLAG_CMD_WRITE) {
>>>> +		ret = write_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
>>>> +		if (ret != SD_RES_SUCCESS)
>>>> +			goto out;
>>>> +		add_to_dirty_rb_and_list(oc, idx);
>>>> +	} else {
>>>> +		ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
>>>> +		if (ret != SD_RES_SUCCESS)
>>>> +			goto out;
>>>
>>> We must call forward_read_obj when we miss the cache.
>>>
>>
>>
>> No, we don't. when object_cache_rw is called, we already assure that we
>> have pulled the targeted object into the cache.
> 
> Yes, sorry, you're right.
> 
> Thanks,
> 
> Kazutaka





More information about the sheepdog mailing list