[Sheepdog] [PATCH RFC 06/10] sheepfs: export 'volume' state

Liu Yuan namei.unix at gmail.com
Sat May 5 13:29:18 CEST 2012


From: Liu Yuan <tailai.ly at taobao.com>


Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 include/net.h           |    2 +
 sheep/Makefile.am       |    2 +-
 sheep/sheep.c           |    2 +-
 sheep/sheepfs/core.c    |   19 +++-
 sheep/sheepfs/sheepfs.h |   11 ++-
 sheep/sheepfs/volume.c  |  284 +++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 313 insertions(+), 7 deletions(-)
 create mode 100644 sheep/sheepfs/volume.c

diff --git a/include/net.h b/include/net.h
index 698b55e..0286ea5 100644
--- a/include/net.h
+++ b/include/net.h
@@ -4,6 +4,8 @@
 #include <sys/socket.h>
 #include <arpa/inet.h>
 
+#include "sheepdog_proto.h"
+
 #define DEFAULT_SOCKET_TIMEOUT 5 /* seconds */
 
 enum conn_state {
diff --git a/sheep/Makefile.am b/sheep/Makefile.am
index 09cb091..2e2e17a 100644
--- a/sheep/Makefile.am
+++ b/sheep/Makefile.am
@@ -48,7 +48,7 @@ endif
 
 if BUILD_SHEEPFS
 sheep_SOURCES		+= sheepfs/core.c sheepfs/cluster.c sheepfs/VDI.c \
-			   sheepfs/shadow_file.c
+			   sheepfs/shadow_file.c sheepfs/volume.c
 endif
 
 sheep_LDADD	  	= ../lib/libsheepdog.a -lpthread \
diff --git a/sheep/sheep.c b/sheep/sheep.c
index 3f7ef3d..b3118f0 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -233,7 +233,7 @@ int main(int argc, char **argv)
 	if (ret)
 		exit(1);
 
-	ret = sheepfs_init(dir);
+	ret = sheepfs_init(dir, port);
 	if (ret)
 		exit(1);
 
diff --git a/sheep/sheepfs/core.c b/sheep/sheepfs/core.c
index b07200c..a518e83 100644
--- a/sheep/sheepfs/core.c
+++ b/sheep/sheepfs/core.c
@@ -8,22 +8,24 @@
 #include "strbuf.h"
 #include "logger.h"
 #include "sheepfs.h"
+#include "net.h"
 
 #define SH_OP_NAME   "user.sheepfs.opcode"
 #define SH_OP_SIZE   sizeof(uint32_t)
 
 char sheepfs_shadow[PATH_MAX];
+int sheep_fd;
 
 static struct sheepfs_file_operation {
 	int (*read)(const char *path, char *buf, size_t size, off_t);
 	int (*write)(const char *path, const char *buf, size_t size, off_t);
 	size_t (*get_size)(const char *path);
 } sheepfs_file_ops[] = {
-	[OP_NULL]         = { NULL, NULL, NULL },
-	[OP_CLUSTER_INFO] = { cluster_info_read, NULL,
-				cluster_info_get_size },
+	[OP_NULL]         = { NULL, NULL, NULL },
+	[OP_CLUSTER_INFO] = { cluster_info_read, NULL, cluster_info_get_size },
 	[OP_VDI_LIST]     = { vdi_list_read, NULL, vdi_list_get_size },
 	[OP_VDI_MOUNT]    = { NULL, vdi_mount_write, NULL },
+	[OP_VOLUME]       = { volume_read, volume_write, volume_get_size },
 };
 
 int sheepfs_set_op(const char *path, unsigned opcode)
@@ -164,6 +166,8 @@ static void sheepfs_main_loop(char *root)
 	}
 
 	fuse_opt_add_arg(&args, "sheepfs"); /* placeholder for argv[0] */
+	fuse_opt_add_arg(&args, "-oallow_root");
+	fuse_opt_add_arg(&args, "-obig_writes");
 	fuse_opt_add_arg(&args, "-ofsname=sheepfs");
 	fuse_opt_add_arg(&args, root);
 	ret = fuse_main(args.argc, args.argv, &sheepfs_ops, NULL);
@@ -177,11 +181,13 @@ static int create_sheepfs_layout(void)
 		return -1;
 	if (create_vdi_layout() < 0)
 		return -1;
+	if (create_volume_layout() < 0)
+		return -1;
 
 	return 0;
 }
 
-int sheepfs_init(const char *dir)
+int sheepfs_init(const char *dir, int port)
 {
 	struct strbuf path = STRBUF_INIT;
 	pid_t pid;
@@ -207,6 +213,11 @@ int sheepfs_init(const char *dir)
 		strbuf_release(&path);
 		return 0;
 	} else /* child */ {
+		sheep_fd = connect_to("localhost", port);
+		if (sheep_fd < 0) {
+			eprintf("failed to connect sheep\n");
+			exit(-1);
+		}
 		sheepfs_main_loop(path.buf);
 		exit(0);
 	}
diff --git a/sheep/sheepfs/sheepfs.h b/sheep/sheepfs/sheepfs.h
index 85c95e3..f53b74c 100644
--- a/sheep/sheepfs/sheepfs.h
+++ b/sheep/sheepfs/sheepfs.h
@@ -6,12 +6,14 @@ enum sheepfs_opcode {
 	OP_CLUSTER_INFO,
 	OP_VDI_LIST,
 	OP_VDI_MOUNT,
+	OP_VOLUME,
 };
 
 extern char sheepfs_shadow[];
+extern int sheep_fd;
 
 extern struct strbuf *sheepfs_run_cmd(const char *command);
-extern int sheepfs_init(const char *dir);
+extern int sheepfs_init(const char *dir, int port);
 extern int sheepfs_set_op(const char *path, unsigned opcode);
 
 /* shadow_file.c */
@@ -26,6 +28,13 @@ extern int shadow_file_getxattr(const char *path, const char *name,
 extern int shadow_file_delete(const char *path);
 extern int shadow_file_exsit(const char *path);
 
+/* volume.c */
+extern int create_volume_layout(void);
+extern int volume_read(const char *path, char *buf, size_t size, off_t offset);
+extern int volume_write(const char *, const char *buf, size_t size, off_t);
+extern size_t volume_get_size(const char *);
+extern int volume_create_entry(const char *entry);
+
 /* cluster.c */
 extern int cluster_info_read(const char *path, char *buf, size_t size, off_t);
 extern size_t cluster_info_get_size(const char *path);
diff --git a/sheep/sheepfs/volume.c b/sheep/sheepfs/volume.c
new file mode 100644
index 0000000..d667952
--- /dev/null
+++ b/sheep/sheepfs/volume.c
@@ -0,0 +1,284 @@
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
+#include "../sheep_priv.h"
+#include "sheepdog_proto.h"
+#include "../strbuf.h"
+#include "sheepfs.h"
+#include "logger.h"
+#include "net.h"
+
+#define PATH_VOLUME	"/volume"
+
+#define SH_VID_NAME   "user.volume.vid"
+#define SH_VID_SIZE   sizeof(uint32_t)
+
+#define SH_SIZE_NAME   "user.volume.size"
+#define SH_SIZE_SIZE   sizeof(size_t)
+
+#define VOLUME_READ   0
+#define VOLUME_WRITE  1
+
+struct vdi_inode {
+	struct rb_node rb;
+	uint32_t vid;
+	struct sheepdog_inode *inode;
+};
+
+static struct rb_root vdi_inode_tree = RB_ROOT;
+
+static struct vdi_inode *vdi_inode_tree_insert(struct vdi_inode *new)
+{
+	struct rb_node **p = &vdi_inode_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct vdi_inode *entry;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct vdi_inode, rb);
+
+		if (new->vid < entry->vid)
+			p = &(*p)->rb_left;
+		else if (new->vid > entry->vid)
+			p = &(*p)->rb_right;
+		else
+			return entry; /* already has this entry */
+	}
+	rb_link_node(&new->rb, parent, p);
+	rb_insert_color(&new->rb, &vdi_inode_tree);
+
+	return NULL; /* inserted successfully */
+}
+
+static struct vdi_inode *vdi_inode_tree_search(uint32_t vid)
+{
+	struct rb_node *n = vdi_inode_tree.rb_node;
+	struct vdi_inode *t;
+
+	while (n) {
+		t = rb_entry(n, struct vdi_inode, rb);
+
+		if (vid < t->vid)
+			n = n->rb_left;
+		else if (vid > t->vid)
+			n = n->rb_right;
+		else
+			return t; /* found it */
+	}
+
+	return NULL;
+}
+
+int create_volume_layout(void)
+{
+	if (shadow_dir_create(PATH_VOLUME) < 0)
+		return -1;
+	return 0;
+}
+
+static int volume_rw_object(char *buf, uint64_t oid, size_t size,
+			     off_t off, int rw)
+{
+	struct sd_obj_req hdr = { 0 };
+	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr;
+	int ret;
+	unsigned wlen = 0, rlen = 0;
+	int create = 0;
+	uint32_t vid = oid_to_vid(oid);
+	struct vdi_inode *vdi = vdi_inode_tree_search(vid);
+	unsigned long idx = 0;
+
+	if (is_data_obj(oid)) {
+		idx = data_oid_to_idx(oid);
+		assert(vdi);
+		if (!vdi->inode->data_vdi_id[idx]) {
+			/* if object doesn't exist, we're done */
+			if (rw == VOLUME_READ)
+				goto done;
+			create = 1;
+		}
+	}
+
+	if (rw == VOLUME_READ) {
+		rlen = size;
+		hdr.opcode = SD_OP_READ_OBJ;
+	} else {
+		wlen = size;
+		hdr.opcode = create ?
+			SD_OP_CREATE_AND_WRITE_OBJ : SD_OP_WRITE_OBJ;
+		hdr.flags |= SD_FLAG_CMD_WRITE | SD_FLAG_CMD_CACHE;
+	}
+
+	hdr.oid = oid;
+	hdr.data_length = size;
+	hdr.offset = off;
+
+	ret = exec_req(sheep_fd, (struct sd_req *)&hdr, buf, &wlen, &rlen);
+
+	if (ret) {
+		eprintf("failed to operate object %" PRIx64 "\n", oid);
+		return -1;
+	}
+
+	if (rsp->result != SD_RES_SUCCESS) {
+		eprintf("failed to operate object %" PRIx64 " %s\n", oid,
+			sd_strerror(rsp->result));
+		return -1;
+	}
+
+	if (create) {
+		vdi->inode->data_vdi_id[idx] = vid;
+		/* writeback inode update */
+		volume_rw_object((char *)&vid, vid_to_vdi_oid(vid),
+				 sizeof(vid),
+				 SD_INODE_HEADER_SIZE + sizeof(vid) * idx,
+				 VOLUME_WRITE);
+	}
+done:
+	return size;
+}
+
+/* Do sync read/write */
+static int volume_do_rw(const char *path, char *buf, size_t size,
+			 off_t offset, int rw)
+{
+	uint32_t vid;
+	uint64_t oid;
+	unsigned long idx;
+	off_t start;
+	size_t len, ret;
+
+	if (shadow_file_getxattr(path, SH_VID_NAME, &vid, SH_VID_SIZE) < 0)
+		return -1;
+
+	idx = offset / SD_DATA_OBJ_SIZE;
+	oid = vid_to_data_oid(vid, idx);
+	start = offset % SD_DATA_OBJ_SIZE;
+
+	len = SD_DATA_OBJ_SIZE - start;
+	if (size < len)
+		len = size;
+
+	do {
+		ret = volume_rw_object(buf, oid, len, start, rw);
+		dprintf("%s oid %016"PRIx64", off %ju, len %zu,"
+			"ret %zu, size %zu\n",
+			rw == VOLUME_READ ? "read" : "write",
+			oid, start, len, ret, size);
+
+		if (ret != len)
+			return -1;
+
+		oid++;
+		size -= len;
+		start += len;
+		buf += len;
+		len = size > SD_DATA_OBJ_SIZE ? SD_DATA_OBJ_SIZE : size;
+	} while (size > 0);
+
+	return 0;
+}
+
+int volume_read(const char *path, char *buf, size_t size, off_t offset)
+{
+
+	if (volume_do_rw(path, buf, size, offset, VOLUME_READ) < 0)
+		return -EIO;
+
+	return size;
+}
+
+int volume_write(const char *path, const char *buf, size_t size, off_t offset)
+{
+	if (volume_do_rw(path, (char *)buf, size, offset, VOLUME_WRITE) < 0)
+		return -EIO;
+
+	return size;
+}
+
+size_t volume_get_size(const char *path)
+{
+	size_t size = 0;
+
+	shadow_file_getxattr(path, SH_SIZE_NAME, &size, SH_SIZE_SIZE);
+	return size;
+}
+
+static int init_vdi_info(const char *entry, uint32_t *vid, size_t *size)
+{
+	struct strbuf *buf;
+	void *inode_buf;
+	struct vdi_inode *inode;
+	char command[256] = { 0 };
+
+	sprintf(command, "%s %s\n", "collie vdi list -r", entry);
+	buf = sheepfs_run_cmd(command);
+	if (!buf)
+		return -1;
+	if (sscanf(buf->buf, "%*s %*s %*d %zu %*s %*s %*s %"PRIx32,
+	    size, vid) < 2) {
+		dprintf("%m\n");
+		return -1;
+	}
+
+	inode_buf = malloc(SD_INODE_SIZE);
+	if (!inode_buf) {
+		dprintf("%m\n");
+		return -1;
+	}
+
+	if (volume_rw_object(inode_buf, vid_to_vdi_oid(*vid), SD_INODE_SIZE,
+			     0, VOLUME_READ) < 0) {
+		free(inode_buf);
+		return -1;
+	}
+
+	inode = xzalloc(sizeof(*inode));
+	inode->vid = *vid;
+	inode->inode = inode_buf;
+	if (vdi_inode_tree_insert(inode))
+		free(inode);
+
+	strbuf_release(buf);
+	return 0;
+}
+
+int volume_create_entry(const char *entry)
+{
+	char path[PATH_MAX], *ch;
+	uint32_t vid;
+	size_t size;
+
+	ch = strchr(entry, '\n');
+	if (ch != NULL)
+		*ch = '\0';
+
+	sprintf(path, "%s/%s", PATH_VOLUME, entry);
+	if (shadow_file_exsit(path))
+		return 0;
+
+	if (shadow_file_create(path) < 0)
+		return -1;
+
+	if (init_vdi_info(entry, &vid, &size) < 0)
+		return -1;
+	if (shadow_file_setxattr(path, SH_VID_NAME, &vid, SH_VID_SIZE) < 0) {
+		shadow_file_delete(path);
+		return -1;
+	}
+	if (shadow_file_setxattr(path, SH_SIZE_NAME, &size, SH_SIZE_SIZE) < 0) {
+		shadow_file_delete(path);
+		return -1;
+	}
+	if (sheepfs_set_op(path, OP_VOLUME) < 0)
+		return -1;
+
+	return 0;
+}
-- 
1.7.8.2




More information about the sheepdog mailing list