[sheepdog] [PATCH v2 07/15] sheepfs: add 'volume' entry
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Sun May 20 12:03:27 CEST 2012
At Mon, 14 May 2012 17:47:32 +0800,
Liu Yuan wrote:
>
> From: Liu Yuan <tailai.ly at taobao.com>
>
> This is where we can access sheepdog's storage through the well-known 'file'
> abstraction. Every attached volume will be seen as a file in the volume
> directory.
>
> To attach a volume:
> echo test > sheepfs_dir/vdi/mount
>
> Then you will see a file entry in sheepfs_dir/volume/test, on which you can do
> the tricks you are fond of. For example, we can boot a sheepdog image just as
> we would an ordinary raw image:
>
> $ qemu-system-x86_64 --enable-kvm -m 1024 -drive \
> file=sheepfs_dir/volume/test,cache=writeback
>
> This file abstraction integrates well with other kernel components such as
> the page cache.
>
> Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
> ---
> sheepfs/Makefile.am | 2 +-
> sheepfs/core.c | 8 +-
> sheepfs/sheepfs.h | 8 ++
> sheepfs/volume.c | 299 +++++++++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 314 insertions(+), 3 deletions(-)
> create mode 100644 sheepfs/volume.c
>
> diff --git a/sheepfs/Makefile.am b/sheepfs/Makefile.am
> index f6201a6..c451b86 100644
> --- a/sheepfs/Makefile.am
> +++ b/sheepfs/Makefile.am
> @@ -23,7 +23,7 @@ INCLUDES = -I$(top_builddir)/include -I$(top_srcdir)/include
>
> sbin_PROGRAMS = sheepfs
>
> -sheepfs_SOURCES = core.c cluster.c vdi.c shadow_file.c
> +sheepfs_SOURCES = core.c cluster.c vdi.c shadow_file.c volume.c
>
> sheepfs_LDADD = ../lib/libsheepdog.a $(fuse_LIBS) $(LIBS)
> sheepfs_DEPENDENCIES = ../lib/libsheepdog.a
> diff --git a/sheepfs/core.c b/sheepfs/core.c
> index c042245..0f4f451 100644
> --- a/sheepfs/core.c
> +++ b/sheepfs/core.c
> @@ -48,10 +48,10 @@ static struct sheepfs_file_operation {
> size_t (*get_size)(const char *path);
> } sheepfs_file_ops[] = {
> [OP_NULL ] = { NULL, NULL, NULL },
> - [OP_CLUSTER_INFO] = { cluster_info_read, NULL,
> - cluster_info_get_size },
> + [OP_CLUSTER_INFO] = { cluster_info_read, NULL, cluster_info_get_size },
> [OP_VDI_LIST] = { vdi_list_read, NULL, vdi_list_get_size },
> [OP_VDI_MOUNT] = { NULL, vdi_mount_write, NULL },
> + [OP_VOLUME] = { volume_read, volume_write, volume_get_size },
> };
>
> int sheepfs_set_op(const char *path, unsigned opcode)
> @@ -184,6 +184,8 @@ static int sheepfs_main_loop(char *mountpoint)
> int ret = -1;
>
> fuse_opt_add_arg(&args, "sheepfs"); /* placeholder for argv[0] */
> + fuse_opt_add_arg(&args, "-oallow_root");
> + fuse_opt_add_arg(&args, "-obig_writes");
> fuse_opt_add_arg(&args, "-ofsname=sheepfs");
> fuse_opt_add_arg(&args, mountpoint);
> if (sheepfs_debug)
> @@ -204,6 +206,8 @@ static int create_sheepfs_layout(void)
> return -1;
> if (create_vdi_layout() < 0)
> return -1;
> + if (create_volume_layout() < 0)
> + return -1;
>
> return 0;
> }
> diff --git a/sheepfs/sheepfs.h b/sheepfs/sheepfs.h
> index 48f3dbc..cfb1402 100644
> --- a/sheepfs/sheepfs.h
> +++ b/sheepfs/sheepfs.h
> @@ -6,6 +6,7 @@ enum sheepfs_opcode {
> OP_CLUSTER_INFO,
> OP_VDI_LIST,
> OP_VDI_MOUNT,
> + OP_VOLUME,
> };
>
> extern char sheepfs_shadow[];
> @@ -25,6 +26,13 @@ extern int shadow_file_getxattr(const char *path, const char *name,
> extern int shadow_file_delete(const char *path);
> extern int shadow_file_exsit(const char *path);
>
> +/* volume.c */
> +extern int create_volume_layout(void);
> +extern int volume_read(const char *path, char *buf, size_t size, off_t offset);
> +extern int volume_write(const char *, const char *buf, size_t size, off_t);
> +extern size_t volume_get_size(const char *);
> +extern int volume_create_entry(const char *entry);
> +
> /* cluster.c */
> extern int cluster_info_read(const char *path, char *buf, size_t size, off_t);
> extern size_t cluster_info_get_size(const char *path);
> diff --git a/sheepfs/volume.c b/sheepfs/volume.c
> new file mode 100644
> index 0000000..438e9a2
> --- /dev/null
> +++ b/sheepfs/volume.c
> @@ -0,0 +1,299 @@
> +/*
> + * Copyright (C) 2012 Taobao Inc.
> + *
> + * Liu Yuan <namei.unix at gmail.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License version
> + * 2 as published by the Free Software Foundation.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +#include <unistd.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <errno.h>
> +#include <stdlib.h>
> +#include <stdio.h>
> +#include <time.h>
> +#include <assert.h>
> +#include <syslog.h>
> +
> +#include "strbuf.h"
> +#include "sheepfs.h"
> +#include "net.h"
> +#include "rbtree.h"
> +
> +#define PATH_VOLUME "/volume"
> +
> +#define SH_VID_NAME "user.volume.vid"
> +#define SH_VID_SIZE sizeof(uint32_t)
> +
> +#define SH_SIZE_NAME "user.volume.size"
> +#define SH_SIZE_SIZE sizeof(size_t)
> +
> +#define VOLUME_READ 0
> +#define VOLUME_WRITE 1
> +
> +struct vdi_inode {
> + struct rb_node rb;
> + uint32_t vid;
> + struct sheepdog_inode *inode;
> +};
> +
> +static struct rb_root vdi_inode_tree = RB_ROOT;
> +
> +static struct vdi_inode *vdi_inode_tree_insert(struct vdi_inode *new)
> +{
> + struct rb_node **p = &vdi_inode_tree.rb_node;
> + struct rb_node *parent = NULL;
> + struct vdi_inode *entry;
> +
> + while (*p) {
> + parent = *p;
> + entry = rb_entry(parent, struct vdi_inode, rb);
> +
> + if (new->vid < entry->vid)
> + p = &(*p)->rb_left;
> + else if (new->vid > entry->vid)
> + p = &(*p)->rb_right;
> + else
> + return entry; /* already has this entry */
> + }
> + rb_link_node(&new->rb, parent, p);
> + rb_insert_color(&new->rb, &vdi_inode_tree);
> +
> + return NULL; /* insert successfully */
> +}
> +
> +static struct vdi_inode *vdi_inode_tree_search(uint32_t vid)
> +{
> + struct rb_node *n = vdi_inode_tree.rb_node;
> + struct vdi_inode *t;
> +
> + while (n) {
> + t = rb_entry(n, struct vdi_inode, rb);
> +
> + if (vid < t->vid)
> + n = n->rb_left;
> + else if (vid > t->vid)
> + n = n->rb_right;
> + else
> + return t; /* found it */
> + }
> +
> + return NULL;
> +}
> +
> +int create_volume_layout(void)
> +{
> + if (shadow_dir_create(PATH_VOLUME) < 0)
> + return -1;
> + return 0;
> +}
> +
> +static int volume_rw_object(char *buf, uint64_t oid, size_t size,
> + off_t off, int rw)
> +{
> + struct sd_obj_req hdr = { 0 };
> + struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr;
> + int ret;
> + unsigned wlen = 0, rlen = 0;
> + int create = 0;
> + uint32_t vid = oid_to_vid(oid);
> + struct vdi_inode *vdi = vdi_inode_tree_search(vid);
> + unsigned long idx = 0;
> +
> + if (is_data_obj(oid)) {
> + idx = data_oid_to_idx(oid);
> + assert(vdi);
> + if (!vdi->inode->data_vdi_id[idx]) {
> + /* if object doesn't exist, we'er done */
> + if (rw == VOLUME_READ) {
> + memset(buf, 0, size);
> + goto done;
> + }
> + create = 1;
> + }
> + }
> +
> + if (rw == VOLUME_READ) {
> + rlen = size;
> + hdr.opcode = SD_OP_READ_OBJ;
> + } else {
> + wlen = size;
> + hdr.opcode = create ?
> + SD_OP_CREATE_AND_WRITE_OBJ : SD_OP_WRITE_OBJ;
> + hdr.flags |= SD_FLAG_CMD_WRITE;
> + }
> +
> + hdr.oid = oid;
> + hdr.data_length = size;
> + hdr.offset = off;
If we add '-D' to the sheep command line options, hdr.data_length and
hdr.offset must be aligned to 512 bytes. How about returning EINVAL
to users when off and size are not aligned?
> + hdr.flags |= SD_FLAG_CMD_CACHE;
> +
> + ret = exec_req(0, (struct sd_req *)&hdr, buf, &wlen, &rlen);
> +
> + if (ret || rsp->result != SD_RES_SUCCESS) {
> + syslog(LOG_ERR, "failed to %s object %" PRIx64 " ret %d, res %d\n",
> + rw == VOLUME_READ ? "read" : "write", oid, ret,
> + rsp->result);
> + return -1;
> + }
> +
> + if (create) {
> + vdi->inode->data_vdi_id[idx] = vid;
> + /* writeback inode update */
> + if (volume_rw_object((char *)&vid, vid_to_vdi_oid(vid),
> + sizeof(vid),
> + SD_INODE_HEADER_SIZE + sizeof(vid) * idx,
> + VOLUME_WRITE) < 0)
> + return -1;
Wrong indentation level.
> + }
> +done:
> + return size;
> +}
> +
> +/* Do sync read/write */
> +static int volume_do_rw(const char *path, char *buf, size_t size,
> + off_t offset, int rw)
> +{
> + uint32_t vid;
> + uint64_t oid;
> + unsigned long idx;
> + off_t start;
> + size_t len, ret;
> +
> + if (shadow_file_getxattr(path, SH_VID_NAME, &vid, SH_VID_SIZE) < 0)
> + return -1;
> +
> + idx = offset / SD_DATA_OBJ_SIZE;
> + oid = vid_to_data_oid(vid, idx);
> + start = offset % SD_DATA_OBJ_SIZE;
> +
> + len = SD_DATA_OBJ_SIZE - start;
> + if (size < len)
> + len = size;
> +
> + do {
> + syslog(LOG_ERR, "%s oid %"PRIx64", off %ju, len %zu,"
> + " size %zu\n",
> + rw == VOLUME_READ ? "read" : "write",
> + oid, start, len, size);
> + ret = volume_rw_object(buf, oid, len, start, rw);
> +
> + if (ret != len)
> + return -1;
> +
> + oid++;
> + size -= len;
> + start = (start + len) % SD_DATA_OBJ_SIZE;
> + buf += len;
> + len = size > SD_DATA_OBJ_SIZE ? SD_DATA_OBJ_SIZE : size;
> + } while (size > 0);
> +
> + return 0;
> +}
> +
> +int volume_read(const char *path, char *buf, size_t size, off_t offset)
> +{
> +
> + if (volume_do_rw(path, buf, size, offset, VOLUME_READ) < 0)
> + return -EIO;
> +
> + return size;
> +}
> +
> +int volume_write(const char *path, const char *buf, size_t size, off_t offset)
> +{
> + if (volume_do_rw(path, (char *)buf, size, offset, VOLUME_WRITE) < 0)
> + return -EIO;
> +
> + return size;
> +}
> +
> +size_t volume_get_size(const char *path)
> +{
> + size_t size = 0;
> +
> + shadow_file_getxattr(path, SH_SIZE_NAME, &size, SH_SIZE_SIZE);
> + return size;
> +}
> +
> +static int init_vdi_info(const char *entry, uint32_t *vid, size_t *size)
> +{
> + struct strbuf *buf;
> + void *inode_buf;
> + struct vdi_inode *inode;
> + char command[256] = { 0 };
> + int ret = -1;
> +
> + sprintf(command, "%s %s\n", "collie vdi list -r", entry);
> + buf = sheepfs_run_cmd(command);
> + if (!buf)
> + return -1;
> + if (sscanf(buf->buf, "%*s %*s %*d %zu %*s %*s %*s %"PRIx32,
> + size, vid) < 2) {
> + syslog(LOG_ERR, "[%s] %m\n", __func__);
It seems that sscanf() doesn't set errno, so we shouldn't use %m here.
> + goto out;
> + }
> +
> + inode_buf = malloc(SD_INODE_SIZE);
> + if (!inode_buf) {
> + syslog(LOG_ERR, "[%s] %m\n", __func__);
> + goto out;
> + }
> +
> + if (volume_rw_object(inode_buf, vid_to_vdi_oid(*vid), SD_INODE_SIZE,
> + 0, VOLUME_READ) < 0) {
> + free(inode_buf);
> + goto out;
> + }
> +
> + inode = xzalloc(sizeof(*inode));
> + inode->vid = *vid;
> + inode->inode = inode_buf;
> + if (vdi_inode_tree_insert(inode)) {
> + free(inode_buf);
> + free(inode);
> + }
> + ret = 0;
> +out:
> + strbuf_release(buf);
> + return ret;
> +}
> +
> +int volume_create_entry(const char *entry)
> +{
> + char path[PATH_MAX], *ch;
> + uint32_t vid;
> + size_t size;
> +
> + ch = strchr(entry, '\n');
> + if (ch != NULL)
> + *ch = '\0';
> +
> + sprintf(path, "%s/%s", PATH_VOLUME, entry);
> + if (shadow_file_exsit(path))
> + return 0;
> +
> + if (shadow_file_create(path) < 0)
> + return -1;
The requested vdi may not exist. We should call init_vdi_info()
before creating a volume file.
Thanks,
Kazutaka
> +
> + if (init_vdi_info(entry, &vid, &size) < 0)
> + return -1;
> + if (shadow_file_setxattr(path, SH_VID_NAME, &vid, SH_VID_SIZE) < 0) {
> + shadow_file_delete(path);
> + return -1;
> + }
> + if (shadow_file_setxattr(path, SH_SIZE_NAME, &size, SH_SIZE_SIZE) < 0) {
> + shadow_file_delete(path);
> + return -1;
> + }
> + if (sheepfs_set_op(path, OP_VOLUME) < 0)
> + return -1;
> +
> + return 0;
> +}
> --
> 1.7.8.2
>
More information about the sheepdog
mailing list