At Mon, 14 May 2012 17:47:32 +0800, Liu Yuan wrote: > > From: Liu Yuan <tailai.ly at taobao.com> > > This is where we can access sheepdog's storage from well received 'file' > abstraction. All the attatched volume will be seen as a file in volume > directory. > > To attatch a volume: > echo test > sheepfs_dir/vdi/mount > > Then you will see a file entry in sheepfs_dir/volume/test, which you can do > the tricks you are fond of. For, e.g, we can boot an sheepdog image as normally > as for an ordinary raw image: > > $ qemu-system-x86_64 --enable-kvm -m 1024 -drive \ > file=sheepfs_dir/volume/test,cache=writeback. > > This file abstraction integrates well with kernel's other component such as > pagecache. > > Signed-off-by: Liu Yuan <tailai.ly at taobao.com> > --- > sheepfs/Makefile.am | 2 +- > sheepfs/core.c | 8 +- > sheepfs/sheepfs.h | 8 ++ > sheepfs/volume.c | 299 +++++++++++++++++++++++++++++++++++++++++++++++++++ > 4 files changed, 314 insertions(+), 3 deletions(-) > create mode 100644 sheepfs/volume.c > > diff --git a/sheepfs/Makefile.am b/sheepfs/Makefile.am > index f6201a6..c451b86 100644 > --- a/sheepfs/Makefile.am > +++ b/sheepfs/Makefile.am > @@ -23,7 +23,7 @@ INCLUDES = -I$(top_builddir)/include -I$(top_srcdir)/include > > sbin_PROGRAMS = sheepfs > > -sheepfs_SOURCES = core.c cluster.c vdi.c shadow_file.c > +sheepfs_SOURCES = core.c cluster.c vdi.c shadow_file.c volume.c > > sheepfs_LDADD = ../lib/libsheepdog.a $(fuse_LIBS) $(LIBS) > sheepfs_DEPENDENCIES = ../lib/libsheepdog.a > diff --git a/sheepfs/core.c b/sheepfs/core.c > index c042245..0f4f451 100644 > --- a/sheepfs/core.c > +++ b/sheepfs/core.c > @@ -48,10 +48,10 @@ static struct sheepfs_file_operation { > size_t (*get_size)(const char *path); > } sheepfs_file_ops[] = { > [OP_NULL ] = { NULL, NULL, NULL }, > - [OP_CLUSTER_INFO] = { cluster_info_read, NULL, > - cluster_info_get_size }, > + [OP_CLUSTER_INFO] = { cluster_info_read, NULL, cluster_info_get_size }, > [OP_VDI_LIST] = { vdi_list_read, NULL, 
vdi_list_get_size }, > [OP_VDI_MOUNT] = { NULL, vdi_mount_write, NULL }, > + [OP_VOLUME] = { volume_read, volume_write, volume_get_size }, > }; > > int sheepfs_set_op(const char *path, unsigned opcode) > @@ -184,6 +184,8 @@ static int sheepfs_main_loop(char *mountpoint) > int ret = -1; > > fuse_opt_add_arg(&args, "sheepfs"); /* placeholder for argv[0] */ > + fuse_opt_add_arg(&args, "-oallow_root"); > + fuse_opt_add_arg(&args, "-obig_writes"); > fuse_opt_add_arg(&args, "-ofsname=sheepfs"); > fuse_opt_add_arg(&args, mountpoint); > if (sheepfs_debug) > @@ -204,6 +206,8 @@ static int create_sheepfs_layout(void) > return -1; > if (create_vdi_layout() < 0) > return -1; > + if (create_volume_layout() < 0) > + return -1; > > return 0; > } > diff --git a/sheepfs/sheepfs.h b/sheepfs/sheepfs.h > index 48f3dbc..cfb1402 100644 > --- a/sheepfs/sheepfs.h > +++ b/sheepfs/sheepfs.h > @@ -6,6 +6,7 @@ enum sheepfs_opcode { > OP_CLUSTER_INFO, > OP_VDI_LIST, > OP_VDI_MOUNT, > + OP_VOLUME, > }; > > extern char sheepfs_shadow[]; > @@ -25,6 +26,13 @@ extern int shadow_file_getxattr(const char *path, const char *name, > extern int shadow_file_delete(const char *path); > extern int shadow_file_exsit(const char *path); > > +/* volume.c */ > +extern int create_volume_layout(void); > +extern int volume_read(const char *path, char *buf, size_t size, off_t offset); > +extern int volume_write(const char *, const char *buf, size_t size, off_t); > +extern size_t volume_get_size(const char *); > +extern int volume_create_entry(const char *entry); > + > /* cluster.c */ > extern int cluster_info_read(const char *path, char *buf, size_t size, off_t); > extern size_t cluster_info_get_size(const char *path); > diff --git a/sheepfs/volume.c b/sheepfs/volume.c > new file mode 100644 > index 0000000..438e9a2 > --- /dev/null > +++ b/sheepfs/volume.c > @@ -0,0 +1,299 @@ > +/* > + * Copyright (C) 2012 Taobao Inc. 
> + * > + * Liu Yuan <namei.unix at gmail.com> > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License version > + * 2 as published by the Free Software Foundation. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program. If not, see <http://www.gnu.org/licenses/>. > + */ > +#include <unistd.h> > +#include <sys/types.h> > +#include <sys/stat.h> > +#include <fcntl.h> > +#include <errno.h> > +#include <stdlib.h> > +#include <stdio.h> > +#include <time.h> > +#include <assert.h> > +#include <syslog.h> > + > +#include "strbuf.h" > +#include "sheepfs.h" > +#include "net.h" > +#include "rbtree.h" > + > +#define PATH_VOLUME "/volume" > + > +#define SH_VID_NAME "user.volume.vid" > +#define SH_VID_SIZE sizeof(uint32_t) > + > +#define SH_SIZE_NAME "user.volume.size" > +#define SH_SIZE_SIZE sizeof(size_t) > + > +#define VOLUME_READ 0 > +#define VOLUME_WRITE 1 > + > +struct vdi_inode { > + struct rb_node rb; > + uint32_t vid; > + struct sheepdog_inode *inode; > +}; > + > +static struct rb_root vdi_inode_tree = RB_ROOT; > + > +static struct vdi_inode *vdi_inode_tree_insert(struct vdi_inode *new) > +{ > + struct rb_node **p = &vdi_inode_tree.rb_node; > + struct rb_node *parent = NULL; > + struct vdi_inode *entry; > + > + while (*p) { > + parent = *p; > + entry = rb_entry(parent, struct vdi_inode, rb); > + > + if (new->vid < entry->vid) > + p = &(*p)->rb_left; > + else if (new->vid > entry->vid) > + p = &(*p)->rb_right; > + else > + return entry; /* already has this entry */ > + } > + rb_link_node(&new->rb, parent, p); > + rb_insert_color(&new->rb, &vdi_inode_tree); > + > + return NULL; /* insert successfully */ > +} > + > +static struct vdi_inode *vdi_inode_tree_search(uint32_t vid) > +{ > + struct rb_node *n = vdi_inode_tree.rb_node; > + struct vdi_inode *t; > + > + while (n) { > + t = rb_entry(n, struct vdi_inode, rb); > + > + if (vid < t->vid) 
> + n = n->rb_left; > + else if (vid > t->vid) > + n = n->rb_right; > + else > + return t; /* found it */ > + } > + > + return NULL; > +} > + > +int create_volume_layout(void) > +{ > + if (shadow_dir_create(PATH_VOLUME) < 0) > + return -1; > + return 0; > +} > + > +static int volume_rw_object(char *buf, uint64_t oid, size_t size, > + off_t off, int rw) > +{ > + struct sd_obj_req hdr = { 0 }; > + struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr; > + int ret; > + unsigned wlen = 0, rlen = 0; > + int create = 0; > + uint32_t vid = oid_to_vid(oid); > + struct vdi_inode *vdi = vdi_inode_tree_search(vid); > + unsigned long idx = 0; > + > + if (is_data_obj(oid)) { > + idx = data_oid_to_idx(oid); > + assert(vdi); > + if (!vdi->inode->data_vdi_id[idx]) { > + /* if object doesn't exist, we'er done */ > + if (rw == VOLUME_READ) { > + memset(buf, 0, size); > + goto done; > + } > + create = 1; > + } > + } > + > + if (rw == VOLUME_READ) { > + rlen = size; > + hdr.opcode = SD_OP_READ_OBJ; > + } else { > + wlen = size; > + hdr.opcode = create ? > + SD_OP_CREATE_AND_WRITE_OBJ : SD_OP_WRITE_OBJ; > + hdr.flags |= SD_FLAG_CMD_WRITE; > + } > + > + hdr.oid = oid; > + hdr.data_length = size; > + hdr.offset = off; If we add '-D' to the sheep command line options, hdr.data_length and hdr.offset must be aligned to 512 bytes. How about returning EINVAL to users when off or size is not aligned? > + hdr.flags |= SD_FLAG_CMD_CACHE; > + > + ret = exec_req(0, (struct sd_req *)&hdr, buf, &wlen, &rlen); > + > + if (ret || rsp->result != SD_RES_SUCCESS) { > + syslog(LOG_ERR, "failed to %s object %" PRIx64 " ret %d, res %d\n", > + rw == VOLUME_READ ? "read" : "write", oid, ret, > + rsp->result); > + return -1; > + } > + > + if (create) { > + vdi->inode->data_vdi_id[idx] = vid; > + /* writeback inode update */ > + if (volume_rw_object((char *)&vid, vid_to_vdi_oid(vid), > + sizeof(vid), > + SD_INODE_HEADER_SIZE + sizeof(vid) * idx, > + VOLUME_WRITE) < 0) > + return -1; Wrong indentation level. 
> + } > +done: > + return size; > +} > + > +/* Do sync read/write */ > +static int volume_do_rw(const char *path, char *buf, size_t size, > + off_t offset, int rw) > +{ > + uint32_t vid; > + uint64_t oid; > + unsigned long idx; > + off_t start; > + size_t len, ret; > + > + if (shadow_file_getxattr(path, SH_VID_NAME, &vid, SH_VID_SIZE) < 0) > + return -1; > + > + idx = offset / SD_DATA_OBJ_SIZE; > + oid = vid_to_data_oid(vid, idx); > + start = offset % SD_DATA_OBJ_SIZE; > + > + len = SD_DATA_OBJ_SIZE - start; > + if (size < len) > + len = size; > + > + do { > + syslog(LOG_ERR, "%s oid %"PRIx64", off %ju, len %zu," > + " size %zu\n", > + rw == VOLUME_READ ? "read" : "write", > + oid, start, len, size); > + ret = volume_rw_object(buf, oid, len, start, rw); > + > + if (ret != len) > + return -1; > + > + oid++; > + size -= len; > + start = (start + len) % SD_DATA_OBJ_SIZE; > + buf += len; > + len = size > SD_DATA_OBJ_SIZE ? SD_DATA_OBJ_SIZE : size; > + } while (size > 0); > + > + return 0; > +} > + > +int volume_read(const char *path, char *buf, size_t size, off_t offset) > +{ > + > + if (volume_do_rw(path, buf, size, offset, VOLUME_READ) < 0) > + return -EIO; > + > + return size; > +} > + > +int volume_write(const char *path, const char *buf, size_t size, off_t offset) > +{ > + if (volume_do_rw(path, (char *)buf, size, offset, VOLUME_WRITE) < 0) > + return -EIO; > + > + return size; > +} > + > +size_t volume_get_size(const char *path) > +{ > + size_t size = 0; > + > + shadow_file_getxattr(path, SH_SIZE_NAME, &size, SH_SIZE_SIZE); > + return size; > +} > + > +static int init_vdi_info(const char *entry, uint32_t *vid, size_t *size) > +{ > + struct strbuf *buf; > + void *inode_buf; > + struct vdi_inode *inode; > + char command[256] = { 0 }; > + int ret = -1; > + > + sprintf(command, "%s %s\n", "collie vdi list -r", entry); > + buf = sheepfs_run_cmd(command); > + if (!buf) > + return -1; > + if (sscanf(buf->buf, "%*s %*s %*d %zu %*s %*s %*s %"PRIx32, > + size, vid) < 2) { 
> + syslog(LOG_ERR, "[%s] %m\n", __func__); It seems that sscanf() doesn't set errno, so we shouldn't use %m here. > + goto out; > + } > + > + inode_buf = malloc(SD_INODE_SIZE); > + if (!inode_buf) { > + syslog(LOG_ERR, "[%s] %m\n", __func__); > + goto out; > + } > + > + if (volume_rw_object(inode_buf, vid_to_vdi_oid(*vid), SD_INODE_SIZE, > + 0, VOLUME_READ) < 0) { > + free(inode_buf); > + goto out; > + } > + > + inode = xzalloc(sizeof(*inode)); > + inode->vid = *vid; > + inode->inode = inode_buf; > + if (vdi_inode_tree_insert(inode)) { > + free(inode_buf); > + free(inode); > + } > + ret = 0; > +out: > + strbuf_release(buf); > + return ret; > +} > + > +int volume_create_entry(const char *entry) > +{ > + char path[PATH_MAX], *ch; > + uint32_t vid; > + size_t size; > + > + ch = strchr(entry, '\n'); > + if (ch != NULL) > + *ch = '\0'; > + > + sprintf(path, "%s/%s", PATH_VOLUME, entry); > + if (shadow_file_exsit(path)) > + return 0; > + > + if (shadow_file_create(path) < 0) > + return -1; The requested vdi may not exist. We should call init_vdi_info() before creating a volume file. Thanks, Kazutaka > + > + if (init_vdi_info(entry, &vid, &size) < 0) > + return -1; > + if (shadow_file_setxattr(path, SH_VID_NAME, &vid, SH_VID_SIZE) < 0) { > + shadow_file_delete(path); > + return -1; > + } > + if (shadow_file_setxattr(path, SH_SIZE_NAME, &size, SH_SIZE_SIZE) < 0) { > + shadow_file_delete(path); > + return -1; > + } > + if (sheepfs_set_op(path, OP_VOLUME) < 0) > + return -1; > + > + return 0; > +} > -- > 1.7.8.2 > |