[Sheepdog] [PATCH 1/2] manage VDIs without the super object

FUJITA Tomonori fujita.tomonori at lab.ntt.co.jp
Fri Mar 26 02:17:20 CET 2010


Signed-off-by: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>
---
 collie/collie.h          |   14 ++-
 collie/group.c           |   17 ++-
 collie/net.c             |    2 +-
 collie/vdi.c             |  278 ++++++++++++++++++++++++++++++++--------------
 include/bitops.h         |  132 ++++++++++++++++++++++
 include/meta.h           |   65 ++++++-----
 include/sheepdog_proto.h |    8 +-
 include/util.h           |    3 +-
 shepherd/shepherd.c      |   80 +++++++-------
 9 files changed, 428 insertions(+), 171 deletions(-)
 create mode 100644 include/bitops.h

diff --git a/collie/collie.h b/collie/collie.h
index 5cd2383..8829079 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -19,6 +19,7 @@
 #include "logger.h"
 #include "work.h"
 #include "net.h"
+#include "meta.h"
 
 #define SD_MSG_JOIN             0x01
 #define SD_MSG_VDI_OP           0x02
@@ -70,6 +71,8 @@ struct cluster_info {
 	struct list_head vm_list;
 	struct list_head pending_list;
 
+	DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
+
 	int nr_sobjs;
 };
 
@@ -79,12 +82,13 @@ int create_listen_port(int port, void *data);
 
 int init_store(char *dir);
 
-int add_vdi(char *buf, int len, uint64_t size,
-	    uint64_t *added_oid, uint64_t base_oid, uint32_t tag, int copies,
-	    uint16_t flags);
+int add_vdi(char *data, int data_len, uint64_t size,
+	    uint64_t *new_oid, uint64_t base_oid, uint32_t copies,
+	    int is_snapshot);
+
+int lookup_vdi(char *data, int data_len, uint64_t *oid, uint32_t snapid);
 
-int lookup_vdi(char *filename, uint64_t * oid,
-	       uint32_t tag, int do_lock, int *current);
+int read_vdis(char *data, int len, unsigned int *rsp_len);
 
 int make_super_object(struct sd_vdi_req *hdr);
 
diff --git a/collie/group.c b/collie/group.c
index a49c1be..4a2397b 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -199,6 +199,9 @@ void cluster_queue_request(struct work *work, int idx)
 
 		rsp->result = SD_RES_SUCCESS;
 		break;
+	case SD_OP_READ_VDIS:
+		rsp->result = read_vdis(req->data, hdr->data_length, &rsp->data_length);
+		break;
 	default:
 		/* forward request to group */
 		goto forward;
@@ -506,21 +509,20 @@ static void vdi_op(struct vdi_op_message *msg)
 	const struct sd_vdi_req *hdr = &msg->req;
 	struct sd_vdi_rsp *rsp = &msg->rsp;
 	void *data = msg->data;
-	int ret = SD_RES_SUCCESS, is_current;
+	int ret = SD_RES_SUCCESS;
 	uint64_t oid = 0;
 
 	switch (hdr->opcode) {
 	case SD_OP_NEW_VDI:
 		ret = add_vdi(data, hdr->data_length, hdr->vdi_size, &oid,
-			      hdr->base_oid, hdr->tag, hdr->copies, hdr->flags);
+			      hdr->base_oid, hdr->copies,
+			      hdr->snapid);
 		break;
 	case SD_OP_LOCK_VDI:
 	case SD_OP_GET_VDI_INFO:
-		ret = lookup_vdi(data, &oid, hdr->tag, 1, &is_current);
+		ret = lookup_vdi(data, hdr->data_length, &oid, hdr->snapid);
 		if (ret != SD_RES_SUCCESS)
 			break;
-		if (is_current)
-			rsp->flags = SD_VDI_RSP_FLAG_CURRENT;
 		break;
 	case SD_OP_RELEASE_VDI:
 		break;
@@ -556,7 +558,12 @@ static void vdi_op_done(struct vdi_op_message *msg)
 
 	switch (hdr->opcode) {
 	case SD_OP_NEW_VDI:
+	{
+		unsigned long nr = (rsp->oid & ~VDI_BIT) >> VDI_SPACE_SHIFT;
+		vprintf(SDOG_INFO "done %d %ld %" PRIx64 "\n", ret, nr, rsp->oid);
+		set_bit(nr, sys->vdi_inuse);
 		break;
+	}
 	case SD_OP_LOCK_VDI:
 		if (lookup_vm(&sys->vm_list, (char *)data)) {
 			ret = SD_RES_VDI_LOCKED;
diff --git a/collie/net.c b/collie/net.c
index 04f9547..749c33d 100644
--- a/collie/net.c
+++ b/collie/net.c
@@ -89,10 +89,10 @@ static void queue_request(struct request *req)
 	case SD_OP_MAKE_FS:
 	case SD_OP_SHUTDOWN:
 	case SD_OP_STAT_CLUSTER:
+	case SD_OP_READ_VDIS:
 		req->work.fn = cluster_queue_request;
 		break;
 	case SD_OP_SO:
-	case SD_OP_SO_NEW_VDI:
 	case SD_OP_SO_LOOKUP_VDI:
 	case SD_OP_SO_READ_VDIS:
 	case SD_OP_SO_STAT:
diff --git a/collie/vdi.c b/collie/vdi.c
index 5904488..25cc83c 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -16,17 +16,23 @@
 #include "meta.h"
 #include "collie.h"
 
+
 /* TODO: should be performed atomically */
-static int create_inode_obj(struct sheepdog_node_list_entry *entries,
-			    int nr_nodes, uint64_t epoch, int copies,
-			    uint64_t oid, uint64_t size, uint64_t base_oid)
+static int create_vdi_obj(char *name, uint64_t new_oid, uint64_t size,
+			  uint64_t base_oid, uint64_t cur_oid, uint32_t copies,
+			  uint32_t snapid, int is_snapshot)
 {
-	struct sheepdog_inode inode, base;
+	struct sheepdog_node_list_entry entries[SD_MAX_NODES];
+	/* we are not called concurrently */
+	static struct sheepdog_inode new, base, cur;
 	struct timeval tv;
-	int ret;
+	int ret, nr_nodes;
+	unsigned long block_size = SD_DATA_OBJ_SIZE;
+
+	nr_nodes = build_node_list(&sys->sd_node_list, entries);
 
 	if (base_oid) {
-		ret = read_object(entries, nr_nodes, epoch,
+		ret = read_object(entries, nr_nodes, sys->epoch,
 				  base_oid, (char *)&base, sizeof(base), 0,
 				  copies);
 		if (ret < 0)
@@ -35,26 +41,45 @@ static int create_inode_obj(struct sheepdog_node_list_entry *entries,
 
 	gettimeofday(&tv, NULL);
 
-	memset(&inode, 0, sizeof(inode));
+	if (is_snapshot) {
+		if (cur_oid != base_oid) {
+			vprintf(SDOG_INFO "tree snapshot %s %" PRIx64 " %" PRIx64 "\n",
+				name, cur_oid, base_oid);
+
+			ret = read_object(entries, nr_nodes, sys->epoch,
+					  cur_oid, (char *)&cur, sizeof(cur), 0,
+					  copies);
+			if (ret < 0) {
+				vprintf(SDOG_ERR "failed\n");
+				return SD_RES_BASE_VDI_READ;
+			}
+
+			cur.snap_ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
+		} else
+			base.snap_ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
+	}
+
+	memset(&new, 0, sizeof(new));
 
-	inode.oid = oid;
-	inode.vdi_size = size;
-	inode.block_size = SD_DATA_OBJ_SIZE;
-	inode.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
-	inode.nr_copies = copies;
+	strncpy(new.name, name, sizeof(new.name));
+	new.oid = new_oid;
+	new.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
+	new.vdi_size = size;
+	new.copy_policy = 0;
+	new.nr_copies = copies;
+	new.block_size_shift = find_next_bit(&block_size, BITS_PER_LONG, 0);
+	new.snap_id = snapid;
 
 	if (base_oid) {
 		int i;
 
-		eprintf("%zd %zd\n", sizeof(inode.data_oid),
-			ARRAY_SIZE(base.child_oid));
-		inode.parent_oid = base_oid;
-		memcpy(inode.data_oid, base.data_oid,
+		new.parent_oid = base_oid;
+		memcpy(new.data_oid, base.data_oid,
 		       MAX_DATA_OBJS * sizeof(uint64_t));
 
 		for (i = 0; i < ARRAY_SIZE(base.child_oid); i++) {
 			if (!base.child_oid[i]) {
-				base.child_oid[i] = oid;
+				base.child_oid[i] = new_oid;
 				break;
 			}
 		}
@@ -62,120 +87,203 @@ static int create_inode_obj(struct sheepdog_node_list_entry *entries,
 		if (i == ARRAY_SIZE(base.child_oid))
 			return SD_RES_NO_BASE_VDI;
 
+	}
+
+	if (is_snapshot && cur_oid != base_oid) {
+		ret = write_object(entries, nr_nodes, sys->epoch,
+				   cur_oid, (char *)&cur, sizeof(cur), 0,
+				   copies, 0);
+		if (ret < 0) {
+			vprintf(SDOG_ERR "failed\n");
+			return SD_RES_BASE_VDI_READ;
+		}
+	}
+
+	if (base_oid) {
 		ret = write_object(entries, nr_nodes,
-				   epoch, base_oid, (char *)&base,
+				   sys->epoch, base_oid, (char *)&base,
 				   sizeof(base), 0, copies, 0);
-		if (ret < 0)
+		if (ret < 0) {
+			vprintf(SDOG_ERR "failed\n");
 			return SD_RES_BASE_VDI_WRITE;
+		}
 	}
 
-	ret = write_object(entries, nr_nodes, epoch,
-			   oid, (char *)&inode, sizeof(inode), 0, copies, 1);
+	ret = write_object(entries, nr_nodes, sys->epoch,
+			   new_oid, (char *)&new, sizeof(new), 0, copies, 1);
 	if (ret < 0)
 		return SD_RES_VDI_WRITE;
 
 	return ret;
 }
 
-/*
- * TODO: handle larger buffer
- */
-int add_vdi(char *name, int len, uint64_t size,
-	    uint64_t *added_oid, uint64_t base_oid, uint32_t tag, int copies,
-	    uint16_t flags)
+static int find_first_vdi(unsigned long start, unsigned long end,
+			  char *name, int namelen, uint32_t snapid, uint64_t *oid,
+			  unsigned long *deleted_nr, uint32_t *next_snap)	/* scan vdi slots from start down to end (inclusive) for name; on a hit store its oid and snap_id + 1 */
 {
 	struct sheepdog_node_list_entry entries[SD_MAX_NODES];
+	static struct sheepdog_inode inode;
+	unsigned long i;
 	int nr_nodes, nr_reqs;
-	uint64_t oid = 0;
 	int ret;
-	struct sd_so_req req;
-	struct sd_so_rsp *rsp = (struct sd_so_rsp *)&req;
-
-	memset(&req, 0, sizeof(req));
 
 	nr_nodes = build_node_list(&sys->sd_node_list, entries);
 
-	dprintf("%s (%d) %" PRIu64 ", base: %" PRIu64 "\n", name, len, size,
-		base_oid);
-
 	nr_reqs = sys->nr_sobjs;
 	if (nr_reqs > nr_nodes)
 		nr_reqs = nr_nodes;
 
-	memset(&req, 0, sizeof(req));
-
-	eprintf("%d %d\n", copies, sys->nr_sobjs);
-	/* qemu doesn't specify the copies, then we use the default. */
-	if (!copies)
-		copies = sys->nr_sobjs;
-
-	req.opcode = SD_OP_SO_NEW_VDI;
-	req.copies = copies;
-	req.tag = tag;
-	req.flags |= flags;
+	for (i = start + 1; i-- > end; ) {	/* visits start..end inclusive; an unsigned i must not be tested with i >= 0 */
+		ret = read_object(entries, nr_nodes, sys->epoch,
+				  bit_to_oid(i), (char *)&inode, sizeof(inode), 0,
+				  nr_reqs);
+		if (ret < 0)
+			return SD_RES_EIO;
 
-	ret = exec_reqs(entries, nr_nodes, sys->epoch,
-			SD_DIR_OID, (struct sd_req *)&req, name, len, 0,
-			nr_reqs, nr_reqs);
+		if (inode.name[0] == '\0') {
+			*deleted_nr = i;
+			continue; /* deleted */
+		}
 
-	if (ret < 0)
-		return rsp->result;
+		if (!strncmp(inode.name, name, sizeof(inode.name))) {	/* whole-name match; bounding by strlen(inode.name) only matched a prefix */
+			if (snapid && snapid != inode.snap_id)
+				continue;
 
-	oid = rsp->oid;
-	*added_oid = oid;
+			*next_snap = inode.snap_id + 1;
+			*oid = inode.oid;
+			return SD_RES_SUCCESS;
+		}
+	}
+	return SD_RES_NO_VDI;
+}
 
-	dprintf("%s (%d) %" PRIu64 ", base: %" PRIu64 "\n", name, len, size,
-		oid);
 
-	ret = create_inode_obj(entries, nr_nodes, sys->epoch, copies,
-			       oid, size, base_oid);
+static int do_lookup_vdi(char *name, int namelen, uint64_t *oid, uint32_t snapid,
+			 uint32_t *next_snapid,
+			 unsigned long *right_nr,  unsigned long *deleted_nr)
+{	/* probe sys->vdi_inuse starting at name's hash slot; *right_nr receives the free slot found by the last bitmap search */
+	int ret;
+	unsigned long nr, start_nr;
+
+	start_nr = fnv_64a_buf(name, namelen, FNV1A_64_INIT) & (SD_NR_VDIS - 1);
+
+	vprintf(SDOG_INFO "looking for %s %d, %lx\n", name, namelen, start_nr);
+
+	/* bitmap search from the hash point */
+	nr = find_next_zero_bit(sys->vdi_inuse, SD_NR_VDIS, start_nr);
+	*right_nr = nr;
+	if (nr == start_nr) {	/* the hash slot itself is free */
+		return SD_RES_NO_VDI;
+	} else if (nr < SD_NR_VDIS) {
+	right_side:
+		/* look up on the right side of the hash point */
+		ret = find_first_vdi(nr - 1, start_nr, name, namelen, snapid, oid,
+				     deleted_nr, next_snapid);
+		return ret;
+	} else {
+		/* wrap around: no free slot from the hash point to the end, so search from the head of the bitmap */
+		nr = find_next_zero_bit(sys->vdi_inuse, SD_NR_VDIS, 0);
+		*right_nr = nr;
+		if (nr >= SD_NR_VDIS)
+			return SD_RES_FULL_VDI;
+		else if (nr) {
+			/* look up on the left side of the hash point */
+			ret = find_first_vdi(nr - 1, 0, name, namelen, snapid, oid,
+					     deleted_nr, next_snapid);
+			if (ret == SD_RES_NO_VDI)
+				; /* we need to go to the right side */
+			else
+				return ret;
+		}
 
-	return ret;
+		nr = SD_NR_VDIS;	/* makes right_side scan slots SD_NR_VDIS - 1 down to start_nr */
+		goto right_side;
+	}
+}
 
-int del_vdi(char *name, int len)
+int lookup_vdi(char *data, int data_len, uint64_t *oid, uint32_t snapid)	/* look up the vdi named by the request buffer; *oid is written only on success */
 {
-	return 0;
+	char *name = data;
+	uint32_t dummy0;
+	unsigned long dummy1, dummy2;
+
+	if (data_len != SD_MAX_VDI_LEN || !memchr(data, '\0', data_len))	/* reject an unterminated name before strlen() runs on it */
+		return SD_RES_INVALID_PARMS;
+
+	return do_lookup_vdi(name, strlen(name), oid, snapid,
+			     &dummy0, &dummy1, &dummy2);
 }
 
-int lookup_vdi(char *filename, uint64_t * oid, uint32_t tag, int do_lock,
-	       int *current)
+int add_vdi(char *data, int data_len, uint64_t size,
+	    uint64_t *new_oid, uint64_t base_oid, uint32_t copies, int is_snapshot)	/* pick a free vdi slot for the name in data and create its vdi object; *new_oid receives the new id */
 {
-	struct sheepdog_node_list_entry entries[SD_MAX_NODES];
-	int nr_nodes, nr_reqs;
+	uint64_t cur_oid = 0;	/* the lookup leaves it untouched when no vdi is found, yet it is logged and passed on below */
+	uint32_t next_snapid;
+	unsigned long nr, deleted_nr = SD_NR_VDIS, right_nr = SD_NR_VDIS;
 	int ret;
-	struct sd_so_req req;
-	struct sd_so_rsp *rsp = (struct sd_so_rsp *)&req;
+	char *name;
 
-	memset(&req, 0, sizeof(req));
+	if (data_len != SD_MAX_VDI_LEN || !memchr(data, '\0', data_len))	/* reject an unterminated name before strlen() runs on it */
+		return SD_RES_INVALID_PARMS;
 
-	nr_nodes = build_node_list(&sys->sd_node_list, entries);
+	name = data;
 
-	*current = 0;
+	ret = do_lookup_vdi(name, strlen(name), &cur_oid, 0, &next_snapid,
+			    &right_nr, &deleted_nr);
 
-	dprintf("looking for %s %zd\n", filename, strlen(filename));
+	if (is_snapshot) {
+		if (ret != SD_RES_SUCCESS) {
+			if (ret == SD_RES_NO_VDI)
+				vprintf(SDOG_CRIT "we dont's have %s\n", name);
+			return ret;
+		}
+		nr = right_nr;
+	} else {
+		/* we already have the same vdi or met other errors. */
+		if (ret != SD_RES_NO_VDI) {
+			if (ret == SD_RES_SUCCESS)
+				ret = SD_RES_VDI_EXIST;
+			return ret;
+		}
 
-	nr_reqs = sys->nr_sobjs;
-	if (nr_reqs > nr_nodes)
-		nr_reqs = nr_nodes;
+		if (deleted_nr == SD_NR_VDIS)
+			nr = right_nr;
+		else
+			nr = deleted_nr; /* we can recycle a deleted vdi */
 
-	memset(&req, 0, sizeof(req));
+		next_snapid = 1;
+	}
 
-	req.opcode = SD_OP_SO_LOOKUP_VDI;
-	req.tag = tag;
+	*new_oid = bit_to_oid(nr);
 
-	ret = exec_reqs(entries, nr_nodes, sys->epoch,
-			SD_DIR_OID, (struct sd_req *)&req, filename, strlen(filename), 0,
-			nr_reqs, 1);
+	vprintf(SDOG_INFO "we create a new vdi, %d %s (%zd) %" PRIu64 ", oid: %"
+		PRIx64 ", base %" PRIx64 ", cur %" PRIx64 " \n",
+		is_snapshot, name, strlen(name), size, *new_oid, base_oid, cur_oid);
 
-	*oid = rsp->oid;
-	if (rsp->flags & SD_VDI_RSP_FLAG_CURRENT)
-		*current = 1;
+	if (!copies) {
+		vprintf(SDOG_WARNING "qemu doesn't specify the copies... %d\n",
+			sys->nr_sobjs);
+		copies = sys->nr_sobjs;
+	}
 
-	dprintf("looking for %s %lx\n", filename, *oid);
+	ret = create_vdi_obj(name, *new_oid, size, base_oid, cur_oid, copies,
+			     next_snapid, is_snapshot);
 
-	if (ret < 0)
-		return rsp->result;
+	return ret;
+}
+
+int del_vdi(char *name, int len)
+{
+	return 0;
+}
+
+int read_vdis(char *data, int len, unsigned int *rsp_len)	/* copy the whole sys->vdi_inuse bitmap into data */
+{
+	if (len != sizeof(sys->vdi_inuse))	/* the caller's buffer must be exactly the bitmap size */
+		return SD_RES_INVALID_PARMS;
+
+	memcpy(data, sys->vdi_inuse, sizeof(sys->vdi_inuse));
+	*rsp_len = sizeof(sys->vdi_inuse);	/* number of bytes placed in data */
 
 	return SD_RES_SUCCESS;
 }
diff --git a/include/bitops.h b/include/bitops.h
new file mode 100644
index 0000000..e3191dd
--- /dev/null
+++ b/include/bitops.h
@@ -0,0 +1,132 @@
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#define BITS_PER_BYTE		8
+#define BITS_TO_LONGS(nr)	DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
+#define DECLARE_BITMAP(name,bits) \
+	unsigned long name[BITS_TO_LONGS(bits)]
+
+#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long))
+
+#define ffz(x)  __ffs(~(x))
+
+static inline unsigned long __ffs(unsigned long word)
+{	/* return the 0-based index of the lowest set bit; for word == 0 the result is BITS_PER_LONG - 1, so callers should pass a non-zero word */
+	int num = 0;
+
+	if (BITS_PER_LONG == 64) {
+		if ((word & 0xffffffff) == 0) {	/* low half empty: skip it */
+			num += 32;
+			word >>= 32;
+		}
+	}
+
+	if ((word & 0xffff) == 0) {
+		num += 16;
+		word >>= 16;
+	}
+	if ((word & 0xff) == 0) {
+		num += 8;
+		word >>= 8;
+	}
+	if ((word & 0xf) == 0) {
+		num += 4;
+		word >>= 4;
+	}
+	if ((word & 0x3) == 0) {
+		num += 2;
+		word >>= 2;
+	}
+	if ((word & 0x1) == 0)
+		num += 1;
+	return num;
+}
+
+#define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
+
+static inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
+				 unsigned long offset)
+{	/* return the index of the first clear bit at or after offset in a bitmap of size bits, or size when there is none */
+	const unsigned long *p = addr + BITOP_WORD(offset);
+	unsigned long result = offset & ~(BITS_PER_LONG-1);
+	unsigned long tmp;
+
+	if (offset >= size)
+		return size;
+	size -= result;
+	offset %= BITS_PER_LONG;
+	if (offset) {
+		tmp = *(p++);
+		tmp |= ~0UL >> (BITS_PER_LONG - offset);	/* treat the bits below offset as set so they are skipped */
+		if (size < BITS_PER_LONG)
+			goto found_first;
+		if (~tmp)
+			goto found_middle;
+		size -= BITS_PER_LONG;
+		result += BITS_PER_LONG;
+	}
+	while (size & ~(BITS_PER_LONG-1)) {	/* whole words remaining */
+		if (~(tmp = *(p++)))
+			goto found_middle;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+	tmp = *p;
+
+found_first:
+	tmp |= ~0UL << size;	/* treat the bits at or beyond the end of the bitmap as set */
+	if (tmp == ~0UL)	/* Are any bits zero? */
+		return result + size;	/* Nope. */
+found_middle:
+	return result + ffz(tmp);
+}
+
+static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+			    unsigned long offset)
+{	/* return the index of the first set bit at or after offset in a bitmap of size bits, or size when there is none */
+	const unsigned long *p = addr + BITOP_WORD(offset);
+	unsigned long result = offset & ~(BITS_PER_LONG-1);
+	unsigned long tmp;
+
+	if (offset >= size)
+		return size;
+	size -= result;
+	offset %= BITS_PER_LONG;
+	if (offset) {
+		tmp = *(p++);
+		tmp &= (~0UL << offset);	/* drop the bits below offset */
+		if (size < BITS_PER_LONG)
+			goto found_first;
+		if (tmp)
+			goto found_middle;
+		size -= BITS_PER_LONG;
+		result += BITS_PER_LONG;
+	}
+	while (size & ~(BITS_PER_LONG-1)) {	/* whole words remaining */
+		if ((tmp = *(p++)))
+			goto found_middle;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+	tmp = *p;
+
+found_first:
+	tmp &= (~0UL >> (BITS_PER_LONG - size));	/* drop the bits at or beyond the end of the bitmap */
+	if (tmp == 0UL)		/* Are any bits set? */
+		return result + size;	/* Nope. */
+found_middle:
+	return result + __ffs(tmp);
+}
+
+static inline void set_bit(int nr, unsigned long *addr)
+{	/* set bit nr in the bitmap at addr with a plain |= on the containing word */
+	addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
+}
+
+static inline int test_bit(unsigned int nr, const unsigned long *addr)
+{	/* return 1 when bit nr is set in the bitmap at addr, 0 otherwise */
+	return ((1UL << (nr % BITS_PER_LONG)) &
+		(((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
+}
diff --git a/include/meta.h b/include/meta.h
index 67d2b11..5b296b2 100644
--- a/include/meta.h
+++ b/include/meta.h
@@ -21,33 +21,22 @@
 /*
  * Object ID rules
  *
- *  0 - 17 (18 bits): data object
- * 17 - 55 (37 bits): inode object
- * 56 - 63 ( 8 bits): PGID
- *
- * each VDI can use 2^18 data objects.
+ *  0 - 19 (20 bits): data object space
+ * 20 - 31 (12 bits): reserved data object space
+ * 32 - 55 (24 bits): vdi object space
+ * 56 - 62 ( 7 bits): reserved vdi object space
+ * 63 - 63 ( 1 bit ): set if vdi
  */
 
-#define DATA_SPACE_SHIFT 18
-
+#define VDI_SPACE   24
+#define VDI_SPACE_SHIFT   32
+#define VDI_BIT (UINT64_C(1) << 63)
 #define DEAFAULT_NR_COPIES 1
+#define SD_MAX_VDI_LEN 256
+#define MAX_DATA_OBJS (1ULL << 20)
+#define MAX_CHILDREN 1024
 
-static inline uint64_t oid_to_ino(uint64_t inode_oid)
-{
-	return (inode_oid >> DATA_SPACE_SHIFT) & ((UINT64_C(1) << 37) - 1);
-}
-
-static inline int is_data_obj_writeable(uint64_t inode_oid, uint64_t data_oid)
-{
-	return oid_to_ino(inode_oid) == oid_to_ino(data_oid);
-}
-
-static inline int is_data_obj(uint64_t oid)
-{
-	return oid & ((UINT64_C(1) << DATA_SPACE_SHIFT) - 1);
-}
-
-#define SHEEPDOG_SUPER_OBJ_SIZE (UINT64_C(1) << 12)
+#define SD_NR_VDIS   (1U << 24)
 
 #define FLAG_CURRENT 1
 
@@ -63,19 +52,37 @@ struct sheepdog_vdi_info {
 	char tag[SD_MAX_VDI_LEN];
 };
 
-#define MAX_DATA_OBJS (1 << 18)
-#define MAX_CHILDREN 1024
-
 struct sheepdog_inode {
+	char name[SD_MAX_VDI_LEN];
 	uint64_t oid;
 	uint64_t ctime;
+	uint64_t snap_ctime;
 	uint64_t vdi_size;
-	uint64_t block_size;
-	uint32_t copy_policy;
-	uint32_t nr_copies;
+	uint16_t copy_policy;
+	uint8_t  nr_copies;
+	uint8_t  block_size_shift;
+	uint32_t snap_id;
 	uint64_t parent_oid;
 	uint64_t child_oid[MAX_CHILDREN];
 	uint64_t data_oid[MAX_DATA_OBJS];
 };
 
+static inline int is_data_obj_writeable(struct sheepdog_inode *inode, int idx)
+{	/* non-zero when data_oid[idx] and the inode's own oid agree in every bit from VDI_SPACE_SHIFT upward */
+	return (inode->oid >> VDI_SPACE_SHIFT) ==	/* NOTE(review): the compared bits include VDI_BIT (bit 63); confirm data object ids carry it too, otherwise this can never be true */
+		(inode->data_oid[idx] >> VDI_SPACE_SHIFT);
+}
+
+static inline int is_data_obj(uint64_t oid)
+{	/* an oid is a data object id when VDI_BIT (bit 63) is clear */
+	return !(VDI_BIT & oid);
+}
+
+#define NR_VDIS (1U << VDI_SPACE)	/* same value as SD_NR_VDIS; the old DATA_SPECE_SHIFT operand was not defined anywhere */
+
+static inline uint64_t bit_to_oid(unsigned long nr)
+{	/* build a vdi object id: nr shifted up by VDI_SPACE_SHIFT, with VDI_BIT set */
+	return ((unsigned long long)nr << VDI_SPACE_SHIFT) | VDI_BIT;
+}
+
 #endif
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 9863aa3..b6afbe1 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -20,8 +20,6 @@
 #define SD_MAX_NODES 1024
 #define SD_MAX_VMS   4096
 
-#define SD_MAX_VDI_LEN 256
-
 /* -> vmon */
 
 #define SD_OP_NEW_VDI        0x11
@@ -36,6 +34,7 @@
 #define SD_OP_GET_EPOCH      0x23
 #define SD_OP_SHUTDOWN       0x24
 #define SD_OP_READ_EPOCH     0x25
+#define SD_OP_READ_VDIS      0x26
 
 #define SD_OP_DEBUG_INC_NVER 0xA0
 #define SD_OP_DEBUG_SET_NODE 0xA1
@@ -96,6 +95,7 @@
 #define SD_RES_SHUTDOWN      0x18 /* Sheepdog is shutting down */
 #define SD_RES_NO_MEM        0x19 /* Cannot allocate memory */
 #define SD_RES_INCONSISTENT_EPOCHS  0x1A /* There is inconsistency between epochs */
+#define SD_RES_FULL_VDI      0x1B /* we already have the maximum vdis */
 
 #define SD_VDI_RSP_FLAG_CURRENT 0x01
 
@@ -206,10 +206,10 @@ struct sd_vdi_req {
 	uint32_t        id;
 	uint32_t        data_length;
 	uint64_t        base_oid;
-	uint64_t        tag;
 	uint64_t	vdi_size;
 	uint32_t	copies;
-	uint32_t        pad[1];
+	uint32_t        snapid;
+	uint32_t        pad[2];
 };
 
 struct sd_vdi_rsp {
diff --git a/include/util.h b/include/util.h
index 4c10670..b107e30 100644
--- a/include/util.h
+++ b/include/util.h
@@ -3,9 +3,10 @@
 
 #include <string.h>
 
+#include "bitops.h"
+
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
-#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
 
 #if __BYTE_ORDER == __LITTLE_ENDIAN
 #define __cpu_to_be16(x) bswap_16(x)
diff --git a/shepherd/shepherd.c b/shepherd/shepherd.c
index 5d89710..0d7cecb 100644
--- a/shepherd/shepherd.c
+++ b/shepherd/shepherd.c
@@ -100,6 +100,11 @@ static struct sheepdog_node_list_entry *node_list_entries;
 static int nr_nodes;
 static unsigned master_idx;
 
+static int is_current(struct sheepdog_inode *i)
+{	/* an inode counts as current while its snap_ctime is still zero */
+	return !i->snap_ctime;
+}
+
 static char *size_to_str(uint64_t size, char *str, int str_size)
 {
 	char *units[] = {"MB", "GB", "TB", "PB", "EB", "ZB", "YB"};
@@ -407,53 +412,46 @@ static int shutdown_sheepdog(void)
 typedef void (*vdi_parser_func_t)(uint64_t oid, char *name, uint32_t tag, uint32_t flags,
 				  struct sheepdog_inode *i, void *data);
 
-/*
- * TODO: handle larger buffer
- */
+
+
 int parse_vdi(vdi_parser_func_t func, void *data)
 {
-	struct sheepdog_vdi_info *ent;
-	char *buf;
-	int rest, ret;
-	struct sheepdog_inode i;
-	struct sd_so_req req;
+	int ret, fd;
+	unsigned long nr;
+	static struct sheepdog_inode i;
+	struct sd_req req;
+	static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);	/* local copy of the in-use bitmap, filled by the SD_OP_READ_VDIS reply */
+	unsigned int rlen, wlen = 0;
 
-	memset(&req, 0, sizeof(req));
+	fd = connect_to("localhost", sdport);
+	if (fd < 0)
+		return fd;
 
-	buf = zalloc(DIR_BUF_LEN);
-	if (!buf)
-		return 1;
+	memset(&req, 0, sizeof(req));
 
-	req.opcode = SD_OP_SO_READ_VDIS;
+	req.opcode = SD_OP_READ_VDIS;
+	req.data_length = sizeof(vdi_inuse);
+	req.epoch = node_list_version;
 
-	ret = exec_reqs(node_list_entries, nr_nodes, node_list_version,
-			SD_DIR_OID, (struct sd_req *)&req, buf, 0, DIR_BUF_LEN,
-			nr_nodes, 1);
+	rlen = sizeof(vdi_inuse);
+	ret = exec_req(fd, &req, vdi_inuse, &wlen, &rlen);
+	close(fd);
 
-	if (ret < 0) {
-		ret = 1;
-		goto out;
-	}
+	if (ret < 0)
+		return ret;
 
-	ent = (struct sheepdog_vdi_info *)buf;
-	rest = ret;
-	while (rest > 0) {
-		if (!ent->name_len)
-			break;
+	for (nr = 0; nr < SD_NR_VDIS; nr++) {	/* visit every slot marked in use */
+		if (!test_bit(nr, vdi_inuse))
+			continue;
 
 		ret = read_object(node_list_entries, nr_nodes, node_list_version,
-				  ent->oid, (void *)&i, sizeof(i), 0, nr_nodes);
+				  bit_to_oid(nr), (void *)&i, sizeof(i), 0, nr_nodes);
 
 		if (ret == sizeof(i))
-			func(ent->oid, ent->name, ent->id, ent->flags, &i, data);
+			func(i.oid, i.name, i.snap_id, 0, &i, data);	/* flags is always 0 here; the callbacks in this patch use is_current() on the inode instead */
 
-		ent++;
-		rest -= sizeof(*ent);
 	}
 
-out:
-	free(buf);
-
 	return 0;
 }
 
@@ -499,7 +497,7 @@ static void print_graph_tree(uint64_t oid, char *name, uint32_t tag,
 	       "time: %8s",
 	       name, tag, size_str, date, time);
 
-	if (info->highlight && (flags & FLAG_CURRENT))
+	if (info->highlight && is_current(i))
 		printf("\", color=\"red\"];\n");
 	else
 		printf("\"];\n");
@@ -548,9 +546,9 @@ static void print_vdi_tree(uint64_t oid, char *name, uint32_t tag,
 	if (info->name && strcmp(name, info->name))
 		return;
 
-	if (flags & FLAG_CURRENT) {
+	if (is_current(i))
 		strcpy(buf, "(You Are Here)");
-	} else {
+	else {
 		ti = i->ctime >> 32;
 		localtime_r(&ti, &tm);
 
@@ -559,7 +557,7 @@ static void print_vdi_tree(uint64_t oid, char *name, uint32_t tag,
 	}
 
 	add_vdi_tree(name, buf, oid, i->parent_oid,
-		 info->highlight && (flags & FLAG_CURRENT));
+		     info->highlight && is_current(i));
 }
 
 static int treeview_vdi(char *vdiname, int highlight)
@@ -599,7 +597,7 @@ static void print_vdi_list(uint64_t oid, char *name, uint32_t tag,
 	for (idx = 0; idx < MAX_DATA_OBJS; idx++) {
 		if (!i->data_oid[idx])
 			continue;
-		if (is_data_obj_writeable(i->data_oid[idx], oid))
+		if (is_data_obj_writeable(i, idx))
 			my_objs++;
 		else
 			cow_objs++;
@@ -611,7 +609,7 @@ static void print_vdi_list(uint64_t oid, char *name, uint32_t tag,
 
 	if (!data || strcmp(name, data) == 0) {
 		printf("%c %-8s %5d %7s %7s %7s %s  %9" PRIx64 "\n",
-		       flags & FLAG_CURRENT ? ' ' : 's', name, tag,
+		       is_current(i) ? ' ' : 's', name, tag,
 		       vdi_size_str, my_objs_str, cow_objs_str, dbuf, oid);
 	}
 }
@@ -630,7 +628,7 @@ static void print_vm_list(uint64_t oid, char *name, uint32_t tag,
 	struct vm_list_info *vli = (struct vm_list_info *)data;
 	char vdi_size_str[8], my_objs_str[8], cow_objs_str[8];
 
-	if (!(flags & FLAG_CURRENT))
+	if (!is_current(inode))
 		return;
 
 	for (i = 0; i < vli->nr_vms; i++) {
@@ -643,7 +641,7 @@ static void print_vm_list(uint64_t oid, char *name, uint32_t tag,
 	for (j = 0; j < MAX_DATA_OBJS; j++) {
 		if (!inode->data_oid[j])
 			continue;
-		if (is_data_obj_writeable(inode->data_oid[j], oid))
+		if (is_data_obj_writeable(inode, j))
 			my_objs++;
 		else
 			cow_objs++;
@@ -676,7 +674,7 @@ static void cal_total_vdi_size(uint64_t oid, char *name, uint32_t tag,
 {
 	uint64_t *size = data;
 
-	if (flags & FLAG_CURRENT)
+	if (is_current(i))
 		*size += i->vdi_size;
 }
 
-- 
1.7.0




More information about the sheepdog mailing list