From: Liu Yuan <tailai.ly at taobao.com> This is where we can access sheepdog's storage through the well-received 'file' abstraction. Every attached volume will be seen as a file in the volume directory. To attach a volume: echo test > sheepfs_dir/vdi/mount Then you will see a file entry in sheepfs_dir/volume/test, on which you can do the tricks you are fond of. For example, we can boot a sheepdog image just as we would an ordinary raw image: $ qemu-system-x86_64 --enable-kvm -m 1024 -drive \ file=sheepfs_dir/volume/test,cache=writeback. This file abstraction integrates well with the kernel's other components, such as the page cache. Signed-off-by: Liu Yuan <tailai.ly at taobao.com> --- sheepfs/Makefile.am | 2 +- sheepfs/core.c | 8 +- sheepfs/sheepfs.h | 8 ++ sheepfs/volume.c | 305 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 320 insertions(+), 3 deletions(-) create mode 100644 sheepfs/volume.c diff --git a/sheepfs/Makefile.am b/sheepfs/Makefile.am index f6201a6..c451b86 100644 --- a/sheepfs/Makefile.am +++ b/sheepfs/Makefile.am @@ -23,7 +23,7 @@ INCLUDES = -I$(top_builddir)/include -I$(top_srcdir)/include sbin_PROGRAMS = sheepfs -sheepfs_SOURCES = core.c cluster.c vdi.c shadow_file.c +sheepfs_SOURCES = core.c cluster.c vdi.c shadow_file.c volume.c sheepfs_LDADD = ../lib/libsheepdog.a $(fuse_LIBS) $(LIBS) sheepfs_DEPENDENCIES = ../lib/libsheepdog.a diff --git a/sheepfs/core.c b/sheepfs/core.c index a045b4a..67a95c9 100644 --- a/sheepfs/core.c +++ b/sheepfs/core.c @@ -47,10 +47,10 @@ static struct sheepfs_file_operation { size_t (*get_size)(const char *path); } sheepfs_file_ops[] = { [OP_NULL ] = { NULL, NULL, NULL }, - [OP_CLUSTER_INFO] = { cluster_info_read, NULL, - cluster_info_get_size }, + [OP_CLUSTER_INFO] = { cluster_info_read, NULL, cluster_info_get_size }, [OP_VDI_LIST] = { vdi_list_read, NULL, vdi_list_get_size }, [OP_VDI_MOUNT] = { NULL, vdi_mount_write, NULL }, + [OP_VOLUME] = { volume_read, volume_write, volume_get_size }, }; int sheepfs_set_op(const
char *path, unsigned opcode) @@ -183,6 +183,8 @@ static int sheepfs_main_loop(char *mountpoint) int ret = -1; fuse_opt_add_arg(&args, "sheepfs"); /* placeholder for argv[0] */ + fuse_opt_add_arg(&args, "-oallow_root"); + fuse_opt_add_arg(&args, "-obig_writes"); fuse_opt_add_arg(&args, "-ofsname=sheepfs"); fuse_opt_add_arg(&args, mountpoint); if (sheepfs_debug) @@ -203,6 +205,8 @@ static int create_sheepfs_layout(void) return -1; if (create_vdi_layout() < 0) return -1; + if (create_volume_layout() < 0) + return -1; return 0; } diff --git a/sheepfs/sheepfs.h b/sheepfs/sheepfs.h index 48f3dbc..cfb1402 100644 --- a/sheepfs/sheepfs.h +++ b/sheepfs/sheepfs.h @@ -6,6 +6,7 @@ enum sheepfs_opcode { OP_CLUSTER_INFO, OP_VDI_LIST, OP_VDI_MOUNT, + OP_VOLUME, }; extern char sheepfs_shadow[]; @@ -25,6 +26,13 @@ extern int shadow_file_getxattr(const char *path, const char *name, extern int shadow_file_delete(const char *path); extern int shadow_file_exsit(const char *path); +/* volume.c */ +extern int create_volume_layout(void); +extern int volume_read(const char *path, char *buf, size_t size, off_t offset); +extern int volume_write(const char *, const char *buf, size_t size, off_t); +extern size_t volume_get_size(const char *); +extern int volume_create_entry(const char *entry); + /* cluster.c */ extern int cluster_info_read(const char *path, char *buf, size_t size, off_t); extern size_t cluster_info_get_size(const char *path); diff --git a/sheepfs/volume.c b/sheepfs/volume.c new file mode 100644 index 0000000..0bacb42 --- /dev/null +++ b/sheepfs/volume.c @@ -0,0 +1,305 @@ +/* + * Copyright (C) 2012 Taobao Inc. + * + * Liu Yuan <namei.unix at gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with this program. 
If not, see <http://www.gnu.org/licenses/>. + */ +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <time.h> +#include <assert.h> +#include <syslog.h> + +#include "strbuf.h" +#include "sheepfs.h" +#include "net.h" +#include "rbtree.h" + +#define PATH_VOLUME "/volume" + +#define SH_VID_NAME "user.volume.vid" +#define SH_VID_SIZE sizeof(uint32_t) + +#define SH_SIZE_NAME "user.volume.size" +#define SH_SIZE_SIZE sizeof(size_t) + +#define VOLUME_READ 0 +#define VOLUME_WRITE 1 + +struct vdi_inode { + struct rb_node rb; + uint32_t vid; + struct sheepdog_inode *inode; +}; + +static struct rb_root vdi_inode_tree = RB_ROOT; + +static struct vdi_inode *vdi_inode_tree_insert(struct vdi_inode *new) +{ + struct rb_node **p = &vdi_inode_tree.rb_node; + struct rb_node *parent = NULL; + struct vdi_inode *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct vdi_inode, rb); + + if (new->vid < entry->vid) + p = &(*p)->rb_left; + else if (new->vid > entry->vid) + p = &(*p)->rb_right; + else + return entry; /* already has this entry */ + } + rb_link_node(&new->rb, parent, p); + rb_insert_color(&new->rb, &vdi_inode_tree); + + return NULL; /* insert successfully */ +} + +static struct vdi_inode *vdi_inode_tree_search(uint32_t vid) +{ + struct rb_node *n = vdi_inode_tree.rb_node; + struct vdi_inode *t; + + while (n) { + t = rb_entry(n, struct vdi_inode, rb); + + if (vid < t->vid) + n = n->rb_left; + else if (vid > t->vid) + n = n->rb_right; + else + return t; /* found it */ + } + + return NULL; +} + +int create_volume_layout(void) +{ + if (shadow_dir_create(PATH_VOLUME) < 0) + return -1; + return 0; +} + +static int volume_rw_object(char *buf, uint64_t oid, size_t size, + off_t off, int rw) +{ + struct sd_req hdr = { 0 }; + struct sd_rsp *rsp = (struct sd_rsp *)&hdr; + int ret; + unsigned wlen = 0, rlen = 0; + int create = 0; + uint32_t vid = oid_to_vid(oid); + 
struct vdi_inode *vdi = vdi_inode_tree_search(vid); + unsigned long idx = 0; + + if (is_data_obj(oid)) { + if (off % SECTOR_SIZE || size % SECTOR_SIZE) { + syslog(LOG_ERR, "offset or size not aligned\n"); + return -1; + } + + idx = data_oid_to_idx(oid); + assert(vdi); + if (!vdi->inode->data_vdi_id[idx]) { + /* if object doesn't exist, we'er done */ + if (rw == VOLUME_READ) { + memset(buf, 0, size); + goto done; + } + create = 1; + } + } + + if (rw == VOLUME_READ) { + rlen = size; + hdr.opcode = SD_OP_READ_OBJ; + } else { + wlen = size; + hdr.opcode = create ? + SD_OP_CREATE_AND_WRITE_OBJ : SD_OP_WRITE_OBJ; + hdr.flags |= SD_FLAG_CMD_WRITE; + } + + hdr.obj.oid = oid; + hdr.obj.offset = off; + hdr.data_length = size; + hdr.flags |= SD_FLAG_CMD_CACHE; + + ret = exec_req(0, &hdr, buf, &wlen, &rlen); + + if (ret || rsp->result != SD_RES_SUCCESS) { + syslog(LOG_ERR, "failed to %s object %" PRIx64 " ret %d, res %d\n", + rw == VOLUME_READ ? "read" : "write", oid, ret, + rsp->result); + return -1; + } + + if (create) { + vdi->inode->data_vdi_id[idx] = vid; + /* writeback inode update */ + if (volume_rw_object((char *)&vid, vid_to_vdi_oid(vid), + sizeof(vid), + SD_INODE_HEADER_SIZE + sizeof(vid) * idx, + VOLUME_WRITE) < 0) + return -1; + } +done: + return size; +} + +/* Do sync read/write */ +static int volume_do_rw(const char *path, char *buf, size_t size, + off_t offset, int rw) +{ + uint32_t vid; + uint64_t oid; + unsigned long idx; + off_t start; + size_t len, ret; + + if (shadow_file_getxattr(path, SH_VID_NAME, &vid, SH_VID_SIZE) < 0) + return -1; + + idx = offset / SD_DATA_OBJ_SIZE; + oid = vid_to_data_oid(vid, idx); + start = offset % SD_DATA_OBJ_SIZE; + + len = SD_DATA_OBJ_SIZE - start; + if (size < len) + len = size; + + do { + syslog(LOG_ERR, "%s oid %"PRIx64", off %ju, len %zu," + " size %zu\n", + rw == VOLUME_READ ? 
"read" : "write", + oid, start, len, size); + ret = volume_rw_object(buf, oid, len, start, rw); + + if (ret != len) + return -1; + + oid++; + size -= len; + start = (start + len) % SD_DATA_OBJ_SIZE; + buf += len; + len = size > SD_DATA_OBJ_SIZE ? SD_DATA_OBJ_SIZE : size; + } while (size > 0); + + return 0; +} + +int volume_read(const char *path, char *buf, size_t size, off_t offset) +{ + + if (volume_do_rw(path, buf, size, offset, VOLUME_READ) < 0) + return -EIO; + + return size; +} + +int volume_write(const char *path, const char *buf, size_t size, off_t offset) +{ + if (volume_do_rw(path, (char *)buf, size, offset, VOLUME_WRITE) < 0) + return -EIO; + + return size; +} + +size_t volume_get_size(const char *path) +{ + size_t size = 0; + + shadow_file_getxattr(path, SH_SIZE_NAME, &size, SH_SIZE_SIZE); + return size; +} + +static int init_vdi_info(const char *entry, uint32_t *vid, size_t *size) +{ + struct strbuf *buf; + void *inode_buf; + struct vdi_inode *inode; + char command[256] = { 0 }; + int ret = -1; + + sprintf(command, "%s %s\n", "collie vdi list -r", entry); + buf = sheepfs_run_cmd(command); + if (!buf) + return -1; + if (sscanf(buf->buf, "%*s %*s %*d %zu %*s %*s %*s %"PRIx32, + size, vid) < 2) { + syslog(LOG_ERR, "[%s] failed to sscanf %s\n", __func__, entry); + goto out; + } + + inode_buf = malloc(SD_INODE_SIZE); + if (!inode_buf) { + syslog(LOG_ERR, "[%s] %m\n", __func__); + goto out; + } + + if (volume_rw_object(inode_buf, vid_to_vdi_oid(*vid), SD_INODE_SIZE, + 0, VOLUME_READ) < 0) { + free(inode_buf); + goto out; + } + + inode = xzalloc(sizeof(*inode)); + inode->vid = *vid; + inode->inode = inode_buf; + if (vdi_inode_tree_insert(inode)) { + free(inode_buf); + free(inode); + } + ret = 0; +out: + strbuf_release(buf); + return ret; +} + +int volume_create_entry(const char *entry) +{ + char path[PATH_MAX], *ch; + uint32_t vid; + size_t size; + + ch = strchr(entry, '\n'); + if (ch != NULL) + *ch = '\0'; + + sprintf(path, "%s/%s", PATH_VOLUME, entry); + if 
(shadow_file_exsit(path)) + return 0; + + if (init_vdi_info(entry, &vid, &size) < 0) + return -1; + + if (shadow_file_create(path) < 0) + return -1; + + if (shadow_file_setxattr(path, SH_VID_NAME, &vid, SH_VID_SIZE) < 0) { + shadow_file_delete(path); + return -1; + } + if (shadow_file_setxattr(path, SH_SIZE_NAME, &size, SH_SIZE_SIZE) < 0) { + shadow_file_delete(path); + return -1; + } + if (sheepfs_set_op(path, OP_VOLUME) < 0) + return -1; + + return 0; +} -- 1.7.10.2 |