[Sheepdog] [PATCH 1/2] manage VDIs without the super object
FUJITA Tomonori
fujita.tomonori at lab.ntt.co.jp
Fri Mar 26 02:17:20 CET 2010
Signed-off-by: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>
---
collie/collie.h | 14 ++-
collie/group.c | 17 ++-
collie/net.c | 2 +-
collie/vdi.c | 278 ++++++++++++++++++++++++++++++++--------------
include/bitops.h | 132 ++++++++++++++++++++++
include/meta.h | 65 ++++++-----
include/sheepdog_proto.h | 8 +-
include/util.h | 3 +-
shepherd/shepherd.c | 80 +++++++-------
9 files changed, 428 insertions(+), 171 deletions(-)
create mode 100644 include/bitops.h
diff --git a/collie/collie.h b/collie/collie.h
index 5cd2383..8829079 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -19,6 +19,7 @@
#include "logger.h"
#include "work.h"
#include "net.h"
+#include "meta.h"
#define SD_MSG_JOIN 0x01
#define SD_MSG_VDI_OP 0x02
@@ -70,6 +71,8 @@ struct cluster_info {
struct list_head vm_list;
struct list_head pending_list;
+ DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
+
int nr_sobjs;
};
@@ -79,12 +82,13 @@ int create_listen_port(int port, void *data);
int init_store(char *dir);
-int add_vdi(char *buf, int len, uint64_t size,
- uint64_t *added_oid, uint64_t base_oid, uint32_t tag, int copies,
- uint16_t flags);
+int add_vdi(char *data, int data_len, uint64_t size,
+ uint64_t *new_oid, uint64_t base_oid, uint32_t copies,
+ int is_snapshot);
+
+int lookup_vdi(char *data, int data_len, uint64_t *oid, uint32_t snapid);
-int lookup_vdi(char *filename, uint64_t * oid,
- uint32_t tag, int do_lock, int *current);
+int read_vdis(char *data, int len, unsigned int *rsp_len);
int make_super_object(struct sd_vdi_req *hdr);
diff --git a/collie/group.c b/collie/group.c
index a49c1be..4a2397b 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -199,6 +199,9 @@ void cluster_queue_request(struct work *work, int idx)
rsp->result = SD_RES_SUCCESS;
break;
+ case SD_OP_READ_VDIS:
+ rsp->result = read_vdis(req->data, hdr->data_length, &rsp->data_length);
+ break;
default:
/* forward request to group */
goto forward;
@@ -506,21 +509,20 @@ static void vdi_op(struct vdi_op_message *msg)
const struct sd_vdi_req *hdr = &msg->req;
struct sd_vdi_rsp *rsp = &msg->rsp;
void *data = msg->data;
- int ret = SD_RES_SUCCESS, is_current;
+ int ret = SD_RES_SUCCESS;
uint64_t oid = 0;
switch (hdr->opcode) {
case SD_OP_NEW_VDI:
ret = add_vdi(data, hdr->data_length, hdr->vdi_size, &oid,
- hdr->base_oid, hdr->tag, hdr->copies, hdr->flags);
+ hdr->base_oid, hdr->copies,
+ hdr->snapid);
break;
case SD_OP_LOCK_VDI:
case SD_OP_GET_VDI_INFO:
- ret = lookup_vdi(data, &oid, hdr->tag, 1, &is_current);
+ ret = lookup_vdi(data, hdr->data_length, &oid, hdr->snapid);
if (ret != SD_RES_SUCCESS)
break;
- if (is_current)
- rsp->flags = SD_VDI_RSP_FLAG_CURRENT;
break;
case SD_OP_RELEASE_VDI:
break;
@@ -556,7 +558,12 @@ static void vdi_op_done(struct vdi_op_message *msg)
switch (hdr->opcode) {
case SD_OP_NEW_VDI:
+ {
+ unsigned long nr = (rsp->oid & ~VDI_BIT) >> VDI_SPACE_SHIFT;
+ vprintf(SDOG_INFO "done %d %ld %" PRIx64 "\n", ret, nr, rsp->oid);
+ set_bit(nr, sys->vdi_inuse);
break;
+ }
case SD_OP_LOCK_VDI:
if (lookup_vm(&sys->vm_list, (char *)data)) {
ret = SD_RES_VDI_LOCKED;
diff --git a/collie/net.c b/collie/net.c
index 04f9547..749c33d 100644
--- a/collie/net.c
+++ b/collie/net.c
@@ -89,10 +89,10 @@ static void queue_request(struct request *req)
case SD_OP_MAKE_FS:
case SD_OP_SHUTDOWN:
case SD_OP_STAT_CLUSTER:
+ case SD_OP_READ_VDIS:
req->work.fn = cluster_queue_request;
break;
case SD_OP_SO:
- case SD_OP_SO_NEW_VDI:
case SD_OP_SO_LOOKUP_VDI:
case SD_OP_SO_READ_VDIS:
case SD_OP_SO_STAT:
diff --git a/collie/vdi.c b/collie/vdi.c
index 5904488..25cc83c 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -16,17 +16,23 @@
#include "meta.h"
#include "collie.h"
+
/* TODO: should be performed atomically */
-static int create_inode_obj(struct sheepdog_node_list_entry *entries,
- int nr_nodes, uint64_t epoch, int copies,
- uint64_t oid, uint64_t size, uint64_t base_oid)
+static int create_vdi_obj(char *name, uint64_t new_oid, uint64_t size,
+ uint64_t base_oid, uint64_t cur_oid, uint32_t copies,
+ uint32_t snapid, int is_snapshot)
{
- struct sheepdog_inode inode, base;
+ struct sheepdog_node_list_entry entries[SD_MAX_NODES];
+ /* we are not called concurrently */
+ static struct sheepdog_inode new, base, cur;
struct timeval tv;
- int ret;
+ int ret, nr_nodes;
+ unsigned long block_size = SD_DATA_OBJ_SIZE;
+
+ nr_nodes = build_node_list(&sys->sd_node_list, entries);
if (base_oid) {
- ret = read_object(entries, nr_nodes, epoch,
+ ret = read_object(entries, nr_nodes, sys->epoch,
base_oid, (char *)&base, sizeof(base), 0,
copies);
if (ret < 0)
@@ -35,26 +41,45 @@ static int create_inode_obj(struct sheepdog_node_list_entry *entries,
gettimeofday(&tv, NULL);
- memset(&inode, 0, sizeof(inode));
+ if (is_snapshot) {
+ if (cur_oid != base_oid) {
+ vprintf(SDOG_INFO "tree snapshot %s %" PRIx64 " %" PRIx64 "\n",
+ name, cur_oid, base_oid);
+
+ ret = read_object(entries, nr_nodes, sys->epoch,
+ cur_oid, (char *)&cur, sizeof(cur), 0,
+ copies);
+ if (ret < 0) {
+ vprintf(SDOG_ERR "failed\n");
+ return SD_RES_BASE_VDI_READ;
+ }
+
+ cur.snap_ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
+ } else
+ base.snap_ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
+ }
+
+ memset(&new, 0, sizeof(new));
- inode.oid = oid;
- inode.vdi_size = size;
- inode.block_size = SD_DATA_OBJ_SIZE;
- inode.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
- inode.nr_copies = copies;
+ strncpy(new.name, name, sizeof(new.name));
+ new.oid = new_oid;
+ new.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
+ new.vdi_size = size;
+ new.copy_policy = 0;
+ new.nr_copies = copies;
+ new.block_size_shift = find_next_bit(&block_size, BITS_PER_LONG, 0);
+ new.snap_id = snapid;
if (base_oid) {
int i;
- eprintf("%zd %zd\n", sizeof(inode.data_oid),
- ARRAY_SIZE(base.child_oid));
- inode.parent_oid = base_oid;
- memcpy(inode.data_oid, base.data_oid,
+ new.parent_oid = base_oid;
+ memcpy(new.data_oid, base.data_oid,
MAX_DATA_OBJS * sizeof(uint64_t));
for (i = 0; i < ARRAY_SIZE(base.child_oid); i++) {
if (!base.child_oid[i]) {
- base.child_oid[i] = oid;
+ base.child_oid[i] = new_oid;
break;
}
}
@@ -62,120 +87,203 @@ static int create_inode_obj(struct sheepdog_node_list_entry *entries,
if (i == ARRAY_SIZE(base.child_oid))
return SD_RES_NO_BASE_VDI;
+ }
+
+ if (is_snapshot && cur_oid != base_oid) {
+ ret = write_object(entries, nr_nodes, sys->epoch,
+ cur_oid, (char *)&cur, sizeof(cur), 0,
+ copies, 0);
+ if (ret < 0) {
+ vprintf(SDOG_ERR "failed\n");
+ return SD_RES_BASE_VDI_READ;
+ }
+ }
+
+ if (base_oid) {
ret = write_object(entries, nr_nodes,
- epoch, base_oid, (char *)&base,
+ sys->epoch, base_oid, (char *)&base,
sizeof(base), 0, copies, 0);
- if (ret < 0)
+ if (ret < 0) {
+ vprintf(SDOG_ERR "failed\n");
return SD_RES_BASE_VDI_WRITE;
+ }
}
- ret = write_object(entries, nr_nodes, epoch,
- oid, (char *)&inode, sizeof(inode), 0, copies, 1);
+ ret = write_object(entries, nr_nodes, sys->epoch,
+ new_oid, (char *)&new, sizeof(new), 0, copies, 1);
if (ret < 0)
return SD_RES_VDI_WRITE;
return ret;
}
-/*
- * TODO: handle larger buffer
- */
-int add_vdi(char *name, int len, uint64_t size,
- uint64_t *added_oid, uint64_t base_oid, uint32_t tag, int copies,
- uint16_t flags)
+static int find_first_vdi(unsigned long start, unsigned long end,
+ char *name, int namelen, uint32_t snapid, uint64_t *oid,
+ unsigned long *deleted_nr, uint32_t *next_snap)
{
struct sheepdog_node_list_entry entries[SD_MAX_NODES];
+ static struct sheepdog_inode inode;
+ unsigned long i;
int nr_nodes, nr_reqs;
- uint64_t oid = 0;
int ret;
- struct sd_so_req req;
- struct sd_so_rsp *rsp = (struct sd_so_rsp *)&req;
-
- memset(&req, 0, sizeof(req));
nr_nodes = build_node_list(&sys->sd_node_list, entries);
- dprintf("%s (%d) %" PRIu64 ", base: %" PRIu64 "\n", name, len, size,
- base_oid);
-
nr_reqs = sys->nr_sobjs;
if (nr_reqs > nr_nodes)
nr_reqs = nr_nodes;
- memset(&req, 0, sizeof(req));
-
- eprintf("%d %d\n", copies, sys->nr_sobjs);
- /* qemu doesn't specify the copies, then we use the default. */
- if (!copies)
- copies = sys->nr_sobjs;
-
- req.opcode = SD_OP_SO_NEW_VDI;
- req.copies = copies;
- req.tag = tag;
- req.flags |= flags;
+ for (i = start; i >= end; i--) {
+ ret = read_object(entries, nr_nodes, sys->epoch,
+ bit_to_oid(i), (char *)&inode, sizeof(inode), 0,
+ nr_reqs);
+ if (ret < 0)
+ return SD_RES_EIO;
- ret = exec_reqs(entries, nr_nodes, sys->epoch,
- SD_DIR_OID, (struct sd_req *)&req, name, len, 0,
- nr_reqs, nr_reqs);
+ if (inode.name[0] == '\0') {
+ *deleted_nr = i;
+ continue; /* deleted */
+ }
- if (ret < 0)
- return rsp->result;
+ if (!strncmp(inode.name, name, strlen(inode.name))) {
+ if (snapid && snapid != inode.snap_id)
+ continue;
- oid = rsp->oid;
- *added_oid = oid;
+ *next_snap = inode.snap_id + 1;
+ *oid = inode.oid;
+ return SD_RES_SUCCESS;
+ }
+ }
+ return SD_RES_NO_VDI;
+}
- dprintf("%s (%d) %" PRIu64 ", base: %" PRIu64 "\n", name, len, size,
- oid);
- ret = create_inode_obj(entries, nr_nodes, sys->epoch, copies,
- oid, size, base_oid);
+static int do_lookup_vdi(char *name, int namelen, uint64_t *oid, uint32_t snapid,
+ uint32_t *next_snapid,
+ unsigned long *right_nr, unsigned long *deleted_nr)
+{
+ int ret;
+ unsigned long nr, start_nr;
+
+ start_nr = fnv_64a_buf(name, namelen, FNV1A_64_INIT) & (SD_NR_VDIS - 1);
+
+ vprintf(SDOG_INFO "looking for %s %d, %lx\n", name, namelen, start_nr);
+
+ /* bitmap search from the hash point */
+ nr = find_next_zero_bit(sys->vdi_inuse, SD_NR_VDIS, start_nr);
+ *right_nr = nr;
+ if (nr == start_nr) {
+ return SD_RES_NO_VDI;
+ } else if (nr < SD_NR_VDIS) {
+ right_side:
+ /* look up on the right side of the hash point */
+ ret = find_first_vdi(nr - 1, start_nr, name, namelen, snapid, oid,
+ deleted_nr, next_snapid);
+ return ret;
+ } else {
+ /* round up... bitmap search from the head of the bitmap */
+ nr = find_next_zero_bit(sys->vdi_inuse, SD_NR_VDIS, 0);
+ *right_nr = nr;
+ if (nr >= SD_NR_VDIS)
+ return SD_RES_FULL_VDI;
+ else if (nr) {
+ /* look up on the left side of the hash point */
+ ret = find_first_vdi(nr - 1, 0, name, namelen, snapid, oid,
+ deleted_nr, next_snapid);
+ if (ret == SD_RES_NO_VDI)
+ ; /* we need to go to the right side */
+ else
+ return ret;
+ }
- return ret;
+ nr = SD_NR_VDIS;
+ goto right_side;
+ }
}
-int del_vdi(char *name, int len)
+int lookup_vdi(char *data, int data_len, uint64_t *oid, uint32_t snapid)
{
- return 0;
+ char *name = data;
+ uint32_t dummy0;
+ unsigned long dummy1, dummy2;
+
+ if (data_len != SD_MAX_VDI_LEN)
+ return SD_RES_INVALID_PARMS;
+
+ return do_lookup_vdi(name, strlen(name), oid, snapid,
+ &dummy0, &dummy1, &dummy2);
}
-int lookup_vdi(char *filename, uint64_t * oid, uint32_t tag, int do_lock,
- int *current)
+int add_vdi(char *data, int data_len, uint64_t size,
+ uint64_t *new_oid, uint64_t base_oid, uint32_t copies, int is_snapshot)
{
- struct sheepdog_node_list_entry entries[SD_MAX_NODES];
- int nr_nodes, nr_reqs;
+ uint64_t cur_oid;
+ uint32_t next_snapid;
+ unsigned long nr, deleted_nr = SD_NR_VDIS, right_nr = SD_NR_VDIS;
int ret;
- struct sd_so_req req;
- struct sd_so_rsp *rsp = (struct sd_so_rsp *)&req;
+ char *name;
- memset(&req, 0, sizeof(req));
+ if (data_len != SD_MAX_VDI_LEN)
+ return SD_RES_INVALID_PARMS;
- nr_nodes = build_node_list(&sys->sd_node_list, entries);
+ name = data;
- *current = 0;
+ ret = do_lookup_vdi(name, strlen(name), &cur_oid, 0, &next_snapid,
+ &right_nr, &deleted_nr);
- dprintf("looking for %s %zd\n", filename, strlen(filename));
+ if (is_snapshot) {
+ if (ret != SD_RES_SUCCESS) {
+ if (ret == SD_RES_NO_VDI)
+ vprintf(SDOG_CRIT "we dont's have %s\n", name);
+ return ret;
+ }
+ nr = right_nr;
+ } else {
+ /* we already have the same vdi or met other errors. */
+ if (ret != SD_RES_NO_VDI) {
+ if (ret == SD_RES_SUCCESS)
+ ret = SD_RES_VDI_EXIST;
+ return ret;
+ }
- nr_reqs = sys->nr_sobjs;
- if (nr_reqs > nr_nodes)
- nr_reqs = nr_nodes;
+ if (deleted_nr == SD_NR_VDIS)
+ nr = right_nr;
+ else
+ nr = deleted_nr; /* we can recycle a deleted vdi */
- memset(&req, 0, sizeof(req));
+ next_snapid = 1;
+ }
- req.opcode = SD_OP_SO_LOOKUP_VDI;
- req.tag = tag;
+ *new_oid = bit_to_oid(nr);
- ret = exec_reqs(entries, nr_nodes, sys->epoch,
- SD_DIR_OID, (struct sd_req *)&req, filename, strlen(filename), 0,
- nr_reqs, 1);
+ vprintf(SDOG_INFO "we create a new vdi, %d %s (%zd) %" PRIu64 ", oid: %"
+ PRIx64 ", base %" PRIx64 ", cur %" PRIx64 " \n",
+ is_snapshot, name, strlen(name), size, *new_oid, base_oid, cur_oid);
- *oid = rsp->oid;
- if (rsp->flags & SD_VDI_RSP_FLAG_CURRENT)
- *current = 1;
+ if (!copies) {
+ vprintf(SDOG_WARNING "qemu doesn't specify the copies... %d\n",
+ sys->nr_sobjs);
+ copies = sys->nr_sobjs;
+ }
- dprintf("looking for %s %lx\n", filename, *oid);
+ ret = create_vdi_obj(name, *new_oid, size, base_oid, cur_oid, copies,
+ next_snapid, is_snapshot);
- if (ret < 0)
- return rsp->result;
+ return ret;
+}
+
+int del_vdi(char *name, int len)
+{
+ return 0;
+}
+
+int read_vdis(char *data, int len, unsigned int *rsp_len)
+{
+ if (len != sizeof(sys->vdi_inuse))
+ return SD_RES_INVALID_PARMS;
+
+ memcpy(data, sys->vdi_inuse, sizeof(sys->vdi_inuse));
+ *rsp_len = sizeof(sys->vdi_inuse);
return SD_RES_SUCCESS;
}
diff --git a/include/bitops.h b/include/bitops.h
new file mode 100644
index 0000000..e3191dd
--- /dev/null
+++ b/include/bitops.h
@@ -0,0 +1,132 @@
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#define BITS_PER_BYTE 8
+#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
+#define DECLARE_BITMAP(name,bits) \
+ unsigned long name[BITS_TO_LONGS(bits)]
+
+#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long))
+
+#define ffz(x) __ffs(~(x))
+
+static inline unsigned long __ffs(unsigned long word)
+{
+ int num = 0;
+
+ if (BITS_PER_LONG == 64) {
+ if ((word & 0xffffffff) == 0) {
+ num += 32;
+ word >>= 32;
+ }
+ }
+
+ if ((word & 0xffff) == 0) {
+ num += 16;
+ word >>= 16;
+ }
+ if ((word & 0xff) == 0) {
+ num += 8;
+ word >>= 8;
+ }
+ if ((word & 0xf) == 0) {
+ num += 4;
+ word >>= 4;
+ }
+ if ((word & 0x3) == 0) {
+ num += 2;
+ word >>= 2;
+ }
+ if ((word & 0x1) == 0)
+ num += 1;
+ return num;
+}
+
+#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+
+static inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ const unsigned long *p = addr + BITOP_WORD(offset);
+ unsigned long result = offset & ~(BITS_PER_LONG-1);
+ unsigned long tmp;
+
+ if (offset >= size)
+ return size;
+ size -= result;
+ offset %= BITS_PER_LONG;
+ if (offset) {
+ tmp = *(p++);
+ tmp |= ~0UL >> (BITS_PER_LONG - offset);
+ if (size < BITS_PER_LONG)
+ goto found_first;
+ if (~tmp)
+ goto found_middle;
+ size -= BITS_PER_LONG;
+ result += BITS_PER_LONG;
+ }
+ while (size & ~(BITS_PER_LONG-1)) {
+ if (~(tmp = *(p++)))
+ goto found_middle;
+ result += BITS_PER_LONG;
+ size -= BITS_PER_LONG;
+ }
+ if (!size)
+ return result;
+ tmp = *p;
+
+found_first:
+ tmp |= ~0UL << size;
+ if (tmp == ~0UL) /* Are any bits zero? */
+ return result + size; /* Nope. */
+found_middle:
+ return result + ffz(tmp);
+}
+
+static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ const unsigned long *p = addr + BITOP_WORD(offset);
+ unsigned long result = offset & ~(BITS_PER_LONG-1);
+ unsigned long tmp;
+
+ if (offset >= size)
+ return size;
+ size -= result;
+ offset %= BITS_PER_LONG;
+ if (offset) {
+ tmp = *(p++);
+ tmp &= (~0UL << offset);
+ if (size < BITS_PER_LONG)
+ goto found_first;
+ if (tmp)
+ goto found_middle;
+ size -= BITS_PER_LONG;
+ result += BITS_PER_LONG;
+ }
+ while (size & ~(BITS_PER_LONG-1)) {
+ if ((tmp = *(p++)))
+ goto found_middle;
+ result += BITS_PER_LONG;
+ size -= BITS_PER_LONG;
+ }
+ if (!size)
+ return result;
+ tmp = *p;
+
+found_first:
+ tmp &= (~0UL >> (BITS_PER_LONG - size));
+ if (tmp == 0UL) /* Are any bits set? */
+ return result + size; /* Nope. */
+found_middle:
+ return result + __ffs(tmp);
+}
+
+static inline void set_bit(int nr, unsigned long *addr)
+{
+ addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
+}
+
+static inline int test_bit(unsigned int nr, const unsigned long *addr)
+{
+ return ((1UL << (nr % BITS_PER_LONG)) &
+ (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
+}
diff --git a/include/meta.h b/include/meta.h
index 67d2b11..5b296b2 100644
--- a/include/meta.h
+++ b/include/meta.h
@@ -21,33 +21,22 @@
/*
* Object ID rules
*
- * 0 - 17 (18 bits): data object
- * 17 - 55 (37 bits): inode object
- * 56 - 63 ( 8 bits): PGID
- *
- * each VDI can use 2^18 data objects.
+ * 0 - 19 (20 bits): data object space
+ * 20 - 31 (12 bits): reserved data object space
+ * 32 - 55 (24 bits): vdi object space
+ * 56 - 62 (17 bits): reserved vdi object space
+ * 63 - 63 ( 1 bit ): set if vdi
*/
-#define DATA_SPACE_SHIFT 18
-
+#define VDI_SPACE 24
+#define VDI_SPACE_SHIFT 32
+#define VDI_BIT (UINT64_C(1) << 63)
#define DEAFAULT_NR_COPIES 1
+#define SD_MAX_VDI_LEN 256
+#define MAX_DATA_OBJS (1ULL << 20)
+#define MAX_CHILDREN 1024
-static inline uint64_t oid_to_ino(uint64_t inode_oid)
-{
- return (inode_oid >> DATA_SPACE_SHIFT) & ((UINT64_C(1) << 37) - 1);
-}
-
-static inline int is_data_obj_writeable(uint64_t inode_oid, uint64_t data_oid)
-{
- return oid_to_ino(inode_oid) == oid_to_ino(data_oid);
-}
-
-static inline int is_data_obj(uint64_t oid)
-{
- return oid & ((UINT64_C(1) << DATA_SPACE_SHIFT) - 1);
-}
-
-#define SHEEPDOG_SUPER_OBJ_SIZE (UINT64_C(1) << 12)
+#define SD_NR_VDIS (1U << 24)
#define FLAG_CURRENT 1
@@ -63,19 +52,37 @@ struct sheepdog_vdi_info {
char tag[SD_MAX_VDI_LEN];
};
-#define MAX_DATA_OBJS (1 << 18)
-#define MAX_CHILDREN 1024
-
struct sheepdog_inode {
+ char name[SD_MAX_VDI_LEN];
uint64_t oid;
uint64_t ctime;
+ uint64_t snap_ctime;
uint64_t vdi_size;
- uint64_t block_size;
- uint32_t copy_policy;
- uint32_t nr_copies;
+ uint16_t copy_policy;
+ uint8_t nr_copies;
+ uint8_t block_size_shift;
+ uint32_t snap_id;
uint64_t parent_oid;
uint64_t child_oid[MAX_CHILDREN];
uint64_t data_oid[MAX_DATA_OBJS];
};
+static inline int is_data_obj_writeable(struct sheepdog_inode *inode, int idx)
+{
+ return (inode->oid >> VDI_SPACE_SHIFT) ==
+ (inode->data_oid[idx] >> VDI_SPACE_SHIFT);
+}
+
+static inline int is_data_obj(uint64_t oid)
+{
+ return !(VDI_BIT & oid);
+}
+
+#define NR_VDIS (1U << DATA_SPECE_SHIFT)
+
+static inline uint64_t bit_to_oid(unsigned long nr)
+{
+ return ((unsigned long long)nr << VDI_SPACE_SHIFT) | VDI_BIT;
+}
+
#endif
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 9863aa3..b6afbe1 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -20,8 +20,6 @@
#define SD_MAX_NODES 1024
#define SD_MAX_VMS 4096
-#define SD_MAX_VDI_LEN 256
-
/* -> vmon */
#define SD_OP_NEW_VDI 0x11
@@ -36,6 +34,7 @@
#define SD_OP_GET_EPOCH 0x23
#define SD_OP_SHUTDOWN 0x24
#define SD_OP_READ_EPOCH 0x25
+#define SD_OP_READ_VDIS 0x26
#define SD_OP_DEBUG_INC_NVER 0xA0
#define SD_OP_DEBUG_SET_NODE 0xA1
@@ -96,6 +95,7 @@
#define SD_RES_SHUTDOWN 0x18 /* Sheepdog is shutting down */
#define SD_RES_NO_MEM 0x19 /* Cannot allocate memory */
#define SD_RES_INCONSISTENT_EPOCHS 0x1A /* There is inconsistency between epochs */
+#define SD_RES_FULL_VDI 0x1B /* we already have the maximum vdis */
#define SD_VDI_RSP_FLAG_CURRENT 0x01
@@ -206,10 +206,10 @@ struct sd_vdi_req {
uint32_t id;
uint32_t data_length;
uint64_t base_oid;
- uint64_t tag;
uint64_t vdi_size;
uint32_t copies;
- uint32_t pad[1];
+ uint32_t snapid;
+ uint32_t pad[2];
};
struct sd_vdi_rsp {
diff --git a/include/util.h b/include/util.h
index 4c10670..b107e30 100644
--- a/include/util.h
+++ b/include/util.h
@@ -3,9 +3,10 @@
#include <string.h>
+#include "bitops.h"
+
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
-#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define __cpu_to_be16(x) bswap_16(x)
diff --git a/shepherd/shepherd.c b/shepherd/shepherd.c
index 5d89710..0d7cecb 100644
--- a/shepherd/shepherd.c
+++ b/shepherd/shepherd.c
@@ -100,6 +100,11 @@ static struct sheepdog_node_list_entry *node_list_entries;
static int nr_nodes;
static unsigned master_idx;
+static int is_current(struct sheepdog_inode *i)
+{
+ return !i->snap_ctime;
+}
+
static char *size_to_str(uint64_t size, char *str, int str_size)
{
char *units[] = {"MB", "GB", "TB", "PB", "EB", "ZB", "YB"};
@@ -407,53 +412,46 @@ static int shutdown_sheepdog(void)
typedef void (*vdi_parser_func_t)(uint64_t oid, char *name, uint32_t tag, uint32_t flags,
struct sheepdog_inode *i, void *data);
-/*
- * TODO: handle larger buffer
- */
+
+
int parse_vdi(vdi_parser_func_t func, void *data)
{
- struct sheepdog_vdi_info *ent;
- char *buf;
- int rest, ret;
- struct sheepdog_inode i;
- struct sd_so_req req;
+ int ret, fd;
+ unsigned long nr;
+ static struct sheepdog_inode i;
+ struct sd_req req;
+ static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
+ unsigned int rlen, wlen = 0;
- memset(&req, 0, sizeof(req));
+ fd = connect_to("localhost", sdport);
+ if (fd < 0)
+ return fd;
- buf = zalloc(DIR_BUF_LEN);
- if (!buf)
- return 1;
+ memset(&req, 0, sizeof(req));
- req.opcode = SD_OP_SO_READ_VDIS;
+ req.opcode = SD_OP_READ_VDIS;
+ req.data_length = sizeof(vdi_inuse);
+ req.epoch = node_list_version;
- ret = exec_reqs(node_list_entries, nr_nodes, node_list_version,
- SD_DIR_OID, (struct sd_req *)&req, buf, 0, DIR_BUF_LEN,
- nr_nodes, 1);
+ rlen = sizeof(vdi_inuse);
+ ret = exec_req(fd, &req, vdi_inuse, &wlen, &rlen);
+ close(fd);
- if (ret < 0) {
- ret = 1;
- goto out;
- }
+ if (ret < 0)
+ return ret;
- ent = (struct sheepdog_vdi_info *)buf;
- rest = ret;
- while (rest > 0) {
- if (!ent->name_len)
- break;
+ for (nr = 0; nr < SD_NR_VDIS; nr++) {
+ if (!test_bit(nr, vdi_inuse))
+ continue;
ret = read_object(node_list_entries, nr_nodes, node_list_version,
- ent->oid, (void *)&i, sizeof(i), 0, nr_nodes);
+ bit_to_oid(nr), (void *)&i, sizeof(i), 0, nr_nodes);
if (ret == sizeof(i))
- func(ent->oid, ent->name, ent->id, ent->flags, &i, data);
+ func(i.oid, i.name, i.snap_id, 0, &i, data);
- ent++;
- rest -= sizeof(*ent);
}
-out:
- free(buf);
-
return 0;
}
@@ -499,7 +497,7 @@ static void print_graph_tree(uint64_t oid, char *name, uint32_t tag,
"time: %8s",
name, tag, size_str, date, time);
- if (info->highlight && (flags & FLAG_CURRENT))
+ if (info->highlight && is_current(i))
printf("\", color=\"red\"];\n");
else
printf("\"];\n");
@@ -548,9 +546,9 @@ static void print_vdi_tree(uint64_t oid, char *name, uint32_t tag,
if (info->name && strcmp(name, info->name))
return;
- if (flags & FLAG_CURRENT) {
+ if (is_current(i))
strcpy(buf, "(You Are Here)");
- } else {
+ else {
ti = i->ctime >> 32;
localtime_r(&ti, &tm);
@@ -559,7 +557,7 @@ static void print_vdi_tree(uint64_t oid, char *name, uint32_t tag,
}
add_vdi_tree(name, buf, oid, i->parent_oid,
- info->highlight && (flags & FLAG_CURRENT));
+ info->highlight && is_current(i));
}
static int treeview_vdi(char *vdiname, int highlight)
@@ -599,7 +597,7 @@ static void print_vdi_list(uint64_t oid, char *name, uint32_t tag,
for (idx = 0; idx < MAX_DATA_OBJS; idx++) {
if (!i->data_oid[idx])
continue;
- if (is_data_obj_writeable(i->data_oid[idx], oid))
+ if (is_data_obj_writeable(i, idx))
my_objs++;
else
cow_objs++;
@@ -611,7 +609,7 @@ static void print_vdi_list(uint64_t oid, char *name, uint32_t tag,
if (!data || strcmp(name, data) == 0) {
printf("%c %-8s %5d %7s %7s %7s %s %9" PRIx64 "\n",
- flags & FLAG_CURRENT ? ' ' : 's', name, tag,
+ is_current(i) ? ' ' : 's', name, tag,
vdi_size_str, my_objs_str, cow_objs_str, dbuf, oid);
}
}
@@ -630,7 +628,7 @@ static void print_vm_list(uint64_t oid, char *name, uint32_t tag,
struct vm_list_info *vli = (struct vm_list_info *)data;
char vdi_size_str[8], my_objs_str[8], cow_objs_str[8];
- if (!(flags & FLAG_CURRENT))
+ if (!is_current(inode))
return;
for (i = 0; i < vli->nr_vms; i++) {
@@ -643,7 +641,7 @@ static void print_vm_list(uint64_t oid, char *name, uint32_t tag,
for (j = 0; j < MAX_DATA_OBJS; j++) {
if (!inode->data_oid[j])
continue;
- if (is_data_obj_writeable(inode->data_oid[j], oid))
+ if (is_data_obj_writeable(inode, j))
my_objs++;
else
cow_objs++;
@@ -676,7 +674,7 @@ static void cal_total_vdi_size(uint64_t oid, char *name, uint32_t tag,
{
uint64_t *size = data;
- if (flags & FLAG_CURRENT)
+ if (is_current(i))
*size += i->vdi_size;
}
--
1.7.0
More information about the sheepdog
mailing list