[Sheepdog] [PATCH] sheepdog: changes for the superblock removal
FUJITA Tomonori
fujita.tomonori at lab.ntt.co.jp
Fri Mar 26 02:17:50 CET 2010
Signed-off-by: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>
---
block/sheepdog.c | 210 ++++++++++++++++++++++++++---------------------------
1 files changed, 103 insertions(+), 107 deletions(-)
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 6a45cfa..e049463 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -36,6 +36,7 @@
#define SD_OP_GET_NODE_LIST 0x19
#define SD_OP_GET_VM_LIST 0x20
#define SD_OP_MAKE_FS 0x21
+#define SD_OP_READ_VDIS 0x26
#define SD_OP_DEBUG_INC_NVER 0xA0
#define SD_OP_DEBUG_SET_NODE 0xA1
@@ -49,8 +50,6 @@
#define SD_FLAG_CMD_WRITE 0x01
#define SD_FLAG_CMD_COW 0x02
-#define SD_FLAG_CMD_SNAPSHOT (1U << 8)
-
#define SD_RES_SUCCESS 0x00 /* Success */
#define SD_RES_UNKNOWN 0x01 /* Unknown error */
#define SD_RES_NO_OBJ 0x02 /* No object found */
@@ -77,21 +76,21 @@
#define SD_RES_VDI_NOT_LOCKED 0x17 /* Vdi is not locked */
#define SD_RES_SHUTDOWN 0x18 /* Sheepdog is shutting down */
-#define MAX_DATA_OBJS (1ULL << 18)
-#define MAX_CHILDREN 1024
-#define MAX_AIO_REQS 4096
-
/* should be configurable? */
#define MAX_RETRIES 6
+#define SD_NR_VDIS (1U << 24)
+#define VDI_SPACE_SHIFT 32
+#define VDI_BIT (UINT64_C(1) << 63)
+#define DEAFAULT_NR_COPIES 1
+#define SD_MAX_VDI_LEN 256
+#define MAX_DATA_OBJS (1ULL << 20)
+#define MAX_CHILDREN 1024
#define SD_DATA_OBJ_SIZE (1UL << 22)
#define SD_INODE_SIZE (sizeof(struct sd_inode))
-
#define CURRENT_VDI_ID 0
-#define SD_MAX_VDI_LEN 256
-
#undef eprintf
#define eprintf(fmt, args...) \
do { \
@@ -162,13 +161,12 @@ struct sd_vdi_req {
uint32_t id;
uint32_t data_length;
uint64_t base_oid;
- uint64_t tag;
uint64_t vdi_size;
+ uint32_t copies;
+ uint32_t snapid;
uint32_t pad[2];
};
-#define SD_VDI_RSP_FLAG_CURRENT 0x01;
-
struct sd_vdi_rsp {
uint8_t proto_ver;
uint8_t opcode;
@@ -207,12 +205,15 @@ struct sd_node_rsp {
};
struct sd_inode {
+ char name[SD_MAX_VDI_LEN];
uint64_t oid;
uint64_t ctime;
+ uint64_t snap_ctime;
uint64_t vdi_size;
- uint64_t block_size;
- uint32_t copy_policy;
- uint32_t nr_copies;
+ uint16_t copy_policy;
+ uint8_t nr_copies;
+ uint8_t block_size_shift;
+ uint32_t snap_id;
uint64_t parent_oid;
uint64_t child_oid[MAX_CHILDREN];
uint64_t data_oid[MAX_DATA_OBJS];
@@ -252,6 +253,8 @@ struct sd_aiocb {
QLIST_HEAD(aioreq_head, aio_req) aioreq_head;
};
+#define MAX_AIO_REQS 4096
+
struct sd_aiostate {
struct bdrv_sd_state *s;
int fd;
@@ -322,19 +325,15 @@ static inline int after(uint32_t seq1, uint32_t seq2)
return (int32_t)(seq2 - seq1) < 0;
}
-static inline uint64_t oid_to_ino(uint64_t inode_oid)
-{
- return (inode_oid >> 18) & ((1ULL << 37) - 1);
-}
-
-static inline int is_data_obj_writeable(uint64_t inode_oid, uint64_t data_oid)
+static inline int is_data_obj_writeable(struct sd_inode *inode, unsigned int idx)
{
- return oid_to_ino(inode_oid) == oid_to_ino(data_oid);
+ return (inode->oid >> VDI_SPACE_SHIFT) ==
+ (inode->data_oid[idx] >> VDI_SPACE_SHIFT);
}
static inline int is_data_obj(uint64_t oid)
{
- return oid & ((1ULL << 18) - 1);
+ return !(VDI_BIT & oid);
}
/*
@@ -872,7 +871,7 @@ static int get_sheep_fd(struct bdrv_sd_state *s)
}
static int parse_vdiname(const char *filename, char *vdi, int vdi_len,
- uint64_t *tag)
+ uint32_t *snapid)
{
char *p, *q;
@@ -888,35 +887,37 @@ static int parse_vdiname(const char *filename, char *vdi, int vdi_len,
p = strchr(vdi, ':');
if (p) {
*p++ = '\0';
- *tag = strtol(p, NULL, 16);
+ *snapid = strtol(p, NULL, 16);
} else
- *tag = CURRENT_VDI_ID; /* search current vdi */
+ *snapid = CURRENT_VDI_ID; /* search current vdi */
free(q);
return 0;
}
-static int find_vdi_name(struct bdrv_sd_state *s, char *filename, uint64_t tag,
- uint64_t *oid, int for_snapshot, int *current)
+static int find_vdi_name(struct bdrv_sd_state *s, char *filename, uint32_t snapid,
+ uint64_t *oid)
{
int ret, fd;
struct sd_vdi_req hdr;
struct sd_vdi_rsp *rsp = (struct sd_vdi_rsp *)&hdr;
unsigned int wlen, rlen = 0;
+ char buf[SD_MAX_VDI_LEN];
fd = connect_to_vost();
if (fd < 0)
return -1;
memset(&hdr, 0, sizeof(hdr));
+ snprintf(buf, sizeof(buf), "%s", filename);
hdr.opcode = SD_OP_GET_VDI_INFO;
- wlen = strlen(filename) + 1;
- hdr.data_length = wlen;
- hdr.tag = tag;
+ wlen = SD_MAX_VDI_LEN;
+ hdr.data_length = SD_MAX_VDI_LEN;
+ hdr.snapid = snapid;
hdr.flags = SD_FLAG_CMD_WRITE;
- ret = do_req(fd, (struct sd_req *)&hdr, filename, &wlen, &rlen);
+ ret = do_req(fd, (struct sd_req *)&hdr, buf, &wlen, &rlen);
if (ret) {
ret = -1;
goto out;
@@ -927,9 +928,8 @@ static int find_vdi_name(struct bdrv_sd_state *s, char *filename, uint64_t tag,
ret = -1;
goto out;
}
-
*oid = rsp->oid;
- s->is_current = rsp->flags & SD_VDI_RSP_FLAG_CURRENT;
+
ret = 0;
out:
close(fd);
@@ -1045,7 +1045,7 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
uint64_t oid = 0;
struct bdrv_sd_state *s = bs->opaque;
char vdi[256];
- uint64_t tag;
+ uint32_t snapid;
int for_snapshot = 0, dummy;
char *buf;
@@ -1070,15 +1070,17 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
for_snapshot = 1;
memset(vdi, 0, sizeof(vdi));
- if (parse_vdiname(filename, vdi, sizeof(vdi), &tag) < 0)
+ if (parse_vdiname(filename, vdi, sizeof(vdi), &snapid) < 0)
goto out;
- ret = find_vdi_name(s, vdi, tag, &oid, for_snapshot, &s->is_current);
+ ret = find_vdi_name(s, vdi, snapid, &oid);
if (ret)
goto out;
- if (!s->is_current)
+ if (snapid)
eprintf("%" PRIx64 " non current inode was open.\n", oid);
+ else
+ s->is_current = 1;
ret = read_vdi_obj(buf, oid, &dummy);
if (ret)
@@ -1105,28 +1107,23 @@ static int do_sd_create(char *filename, char *tag, int64_t total_sectors,
struct sd_vdi_rsp *rsp = (struct sd_vdi_rsp *)&hdr;
int fd, ret;
unsigned int wlen, rlen = 0;
- char buf[SD_MAX_VDI_LEN * 2];
+ char buf[SD_MAX_VDI_LEN];
fd = connect_to_vost();
if (fd < 0)
return -1;
- memset(buf, 0, sizeof(buf));
strncpy(buf, filename, SD_MAX_VDI_LEN);
- if (tag)
- strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_LEN);
memset(&hdr, 0, sizeof(hdr));
hdr.opcode = SD_OP_NEW_VDI;
hdr.base_oid = base_oid;
wlen = SD_MAX_VDI_LEN;
- if (tag)
- wlen += SD_MAX_VDI_LEN;
hdr.flags = SD_FLAG_CMD_WRITE;
- if (snapshot)
- hdr.flags |= SD_FLAG_CMD_SNAPSHOT;
+ hdr.snapid = snapshot;
+
hdr.data_length = wlen;
hdr.vdi_size = total_sectors * 512;
@@ -1166,8 +1163,8 @@ static int sd_create(const char *filename, QEMUOptionParameter *options)
if (backing_file) {
BlockDriverState bs;
- char vdi[256];
- uint64_t tag;
+ char vdi[SD_MAX_VDI_LEN];
+ uint32_t snapid;
memset(&bs, 0, sizeof(bs));
@@ -1179,16 +1176,15 @@ static int sd_create(const char *filename, QEMUOptionParameter *options)
if (ret < 0)
return -1;
- if (parse_vdiname(backing_file, vdi, sizeof(vdi), &tag) < 0)
+ if (parse_vdiname(backing_file, vdi, sizeof(vdi), &snapid) < 0)
return -1;
/* cannot clone from a current inode */
- if (tag == CURRENT_VDI_ID)
+ if (snapid == CURRENT_VDI_ID)
return -1;
- ret = find_vdi_name(bs.opaque, vdi, tag, &oid, 1, NULL);
- struct bdrv_sd_state *s = bs.opaque;
- if (ret || s->is_current)
+ ret = find_vdi_name(bs.opaque, vdi, snapid, &oid);
+ if (ret)
return -1;
}
@@ -1199,8 +1195,6 @@ static void sd_close(BlockDriverState *bs)
{
struct bdrv_sd_state *s = bs->opaque;
- eprintf("%s\n", s->name);
-
free(s->name);
}
@@ -1211,6 +1205,7 @@ static int sd_claim(BlockDriverState *bs)
struct sd_vdi_req hdr;
struct sd_vdi_rsp *rsp = (struct sd_vdi_rsp *)&hdr;
unsigned int wlen, rlen = 0;
+ char buf[SD_MAX_VDI_LEN];
eprintf("%s\n", s->name);
@@ -1218,14 +1213,16 @@ static int sd_claim(BlockDriverState *bs)
if (fd < 0)
return -1;
+ memset(buf, 0, sizeof(buf));
+ strncpy(buf, s->name, SD_MAX_VDI_LEN);
memset(&hdr, 0, sizeof(hdr));
hdr.opcode = SD_OP_LOCK_VDI;
- wlen = strlen(s->name) + 1;
- hdr.data_length = wlen;
- hdr.tag = CURRENT_VDI_ID;
+ wlen = SD_MAX_VDI_LEN;
+ hdr.data_length = SD_MAX_VDI_LEN;
+ hdr.snapid = CURRENT_VDI_ID;
hdr.flags = SD_FLAG_CMD_WRITE;
- ret = do_req(fd, (struct sd_req *)&hdr, s->name, &wlen, &rlen);
+ ret = do_req(fd, (struct sd_req *)&hdr, buf, &wlen, &rlen);
if (ret) {
ret = -1;
goto out;
@@ -1407,19 +1404,19 @@ static void sd_write_bh_cb(void *p)
len = min_t(unsigned long, total - done, CHUNK_SIZE - offset);
- if (!oid || !is_data_obj_writeable(inode->oid, oid)) {
+ if (!oid || !is_data_obj_writeable(inode, idx)) {
if (!acb->write)
goto done;
create = 1;
dprintf("update ino (%" PRIu64") %"
PRIu64 " %" PRIu64 " %" PRIu64 "\n",
- inode->oid, oid, inode->oid + (idx + 1), idx);
- if (oid && !is_data_obj_writeable(inode->oid, oid)) {
+ inode->oid, oid, inode->oid + idx, idx);
+ if (oid && !is_data_obj_writeable(inode, idx)) {
old_oid = oid;
flags = SD_FLAG_CMD_COW;
}
- oid = inode->oid + (idx + 1);
+ oid = inode->oid + idx;
acb->oid[i] = oid;
dprintf("new oid %lx\n", acb->oid[i]);
}
@@ -1619,96 +1616,95 @@ struct sd_so_rsp {
uint32_t opcode_specific[2];
};
-struct sheepdog_vdi_info {
- uint64_t oid;
- uint16_t id;
- uint16_t name_len;
- uint16_t tag_len;
- uint8_t type;
- uint8_t flags;
- uint32_t epoch;
- char name[SD_MAX_VDI_LEN];
- char tag[SD_MAX_VDI_LEN];
-};
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#define BITS_PER_BYTE 8
+#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
+#define DECLARE_BITMAP(name,bits) \
+ unsigned long name[BITS_TO_LONGS(bits)]
+
+#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long))
+
+static inline int test_bit(unsigned int nr, const unsigned long *addr)
+{
+ return ((1UL << (nr % BITS_PER_LONG)) &
+ (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
+}
-#define SD_OP_SO_READ_VDIS 0x64
+static inline uint64_t bit_to_oid(unsigned long nr)
+{
+ return ((unsigned long long)nr << VDI_SPACE_SHIFT) | VDI_BIT;
+}
static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
{
struct bdrv_sd_state *s = bs->opaque;
- struct sd_so_req req;
- struct sd_rsp *rsp;
- struct sheepdog_vdi_info *vi;
- int i, fd, nr = 0, ret, max = 1024; /* FIXME */
- char name[SD_MAX_VDI_LEN];
+ struct sd_req req;
+ int i, fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
QEMUSnapshotInfo *sn_tab = NULL;
unsigned wlen, rlen;
int found = 0;
- struct sd_inode inode;
+ static struct sd_inode inode;
+ unsigned long *vdi_inuse;
+ unsigned int start_nr;
- vi = malloc(max * sizeof(*vi));
- if (!vi)
+ vdi_inuse = malloc(max);
+ if (!vdi_inuse)
return 0;
- memset(name, 0, sizeof(name));
- snprintf(name, sizeof(name), "%s", s->name);
-
fd = connect_to_vost();
if (fd < 0)
goto out;
+ rlen = max;
wlen = 0;
- rlen = max * sizeof(*vi);
memset(&req, 0, sizeof(req));
- req.opcode = SD_OP_SO_READ_VDIS;
- req.data_length = rlen;
+ req.opcode = SD_OP_READ_VDIS;
+ req.data_length = max;
- ret = do_req(fd, (struct sd_req *)&req, vi, &wlen, &rlen);
+ ret = do_req(fd, (struct sd_req *)&req, vdi_inuse, &wlen, &rlen);
close(fd);
if (ret)
goto out;
- rsp = (struct sd_rsp *)&req;
- if (rsp->result != SD_RES_SUCCESS)
- goto out;
-
- nr = rsp->data_length / sizeof(*vi);
sn_tab = malloc(nr * sizeof(*sn_tab));
if (!sn_tab)
goto out;
memset(sn_tab, 0, nr * sizeof(*sn_tab));
- for (i = 0; i < nr; i++) {
+ start_nr = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT) & (SD_NR_VDIS - 1);
+
+ /* TODO: round up */
+ for (i = start_nr; i < SD_NR_VDIS && found < nr; i++) {
int copies;
- if (strcmp(vi[i].name, s->name) || !vi[i].id)
- continue;
+ if (!test_bit(i, vdi_inuse))
+ break;
- ret = read_vdi_obj((char *)&inode, vi[i].oid, &copies);
+ ret = read_vdi_obj((char *)&inode, bit_to_oid(i), &copies);
if (ret)
continue;
- sn_tab[found].date_sec = inode.ctime >> 32;
- sn_tab[found].date_nsec = inode.ctime & 0xffffffff;
+ if (!strcmp(inode.name, s->name) && inode.snap_ctime) {
+ sn_tab[found].date_sec = inode.snap_ctime >> 32;
+ sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
- snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
- vi[i].id);
- strncpy(sn_tab[found].name, vi[i].tag, sizeof(sn_tab[found].name));
- found++;
+ snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
+ inode.snap_id);
+ found++;
+ }
}
out:
*psn_tab = sn_tab;
- free(vi);
+ free(vdi_inuse);
return found;
}
-
static QEMUOptionParameter sd_create_options[] = {
{
.name = BLOCK_OPT_SIZE,
--
1.7.0
More information about the sheepdog mailing list