Signed-off-by: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp> --- block/sheepdog.c | 210 ++++++++++++++++++++++++++--------------------------- 1 files changed, 103 insertions(+), 107 deletions(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index 6a45cfa..e049463 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -36,6 +36,7 @@ #define SD_OP_GET_NODE_LIST 0x19 #define SD_OP_GET_VM_LIST 0x20 #define SD_OP_MAKE_FS 0x21 +#define SD_OP_READ_VDIS 0x26 #define SD_OP_DEBUG_INC_NVER 0xA0 #define SD_OP_DEBUG_SET_NODE 0xA1 @@ -49,8 +50,6 @@ #define SD_FLAG_CMD_WRITE 0x01 #define SD_FLAG_CMD_COW 0x02 -#define SD_FLAG_CMD_SNAPSHOT (1U << 8) - #define SD_RES_SUCCESS 0x00 /* Success */ #define SD_RES_UNKNOWN 0x01 /* Unknown error */ #define SD_RES_NO_OBJ 0x02 /* No object found */ @@ -77,21 +76,21 @@ #define SD_RES_VDI_NOT_LOCKED 0x17 /* Vdi is not locked */ #define SD_RES_SHUTDOWN 0x18 /* Sheepdog is shutting down */ -#define MAX_DATA_OBJS (1ULL << 18) -#define MAX_CHILDREN 1024 -#define MAX_AIO_REQS 4096 - /* should be configurable? */ #define MAX_RETRIES 6 +#define SD_NR_VDIS (1U << 24) +#define VDI_SPACE_SHIFT 32 +#define VDI_BIT (UINT64_C(1) << 63) +#define DEAFAULT_NR_COPIES 1 +#define SD_MAX_VDI_LEN 256 +#define MAX_DATA_OBJS (1ULL << 20) +#define MAX_CHILDREN 1024 #define SD_DATA_OBJ_SIZE (1UL << 22) #define SD_INODE_SIZE (sizeof(struct sd_inode)) - #define CURRENT_VDI_ID 0 -#define SD_MAX_VDI_LEN 256 - #undef eprintf #define eprintf(fmt, args...) \ do { \ @@ -162,13 +161,12 @@ struct sd_vdi_req { uint32_t id; uint32_t data_length; uint64_t base_oid; - uint64_t tag; uint64_t vdi_size; + uint32_t copies; + uint32_t snapid; uint32_t pad[2]; }; -#define SD_VDI_RSP_FLAG_CURRENT 0x01; - struct sd_vdi_rsp { uint8_t proto_ver; uint8_t opcode; @@ -207,12 +205,15 @@ struct sd_node_rsp { }; struct sd_inode { + char name[SD_MAX_VDI_LEN]; uint64_t oid; uint64_t ctime; + uint64_t snap_ctime; uint64_t vdi_size; - uint64_t block_size; - uint32_t copy_policy; - uint32_t nr_copies; + uint16_t copy_policy; + uint8_t nr_copies; + uint8_t block_size_shift; + uint32_t snap_id; uint64_t parent_oid; uint64_t child_oid[MAX_CHILDREN]; uint64_t data_oid[MAX_DATA_OBJS]; @@ -252,6 +253,8 @@ struct sd_aiocb { QLIST_HEAD(aioreq_head, aio_req) aioreq_head; }; +#define MAX_AIO_REQS 4096 + struct sd_aiostate { struct bdrv_sd_state *s; int fd; @@ -322,19 +325,15 @@ static inline int after(uint32_t seq1, uint32_t seq2) return (int32_t)(seq2 - seq1) < 0; } -static inline uint64_t oid_to_ino(uint64_t inode_oid) -{ - return (inode_oid >> 18) & ((1ULL << 37) - 1); -} - -static inline int is_data_obj_writeable(uint64_t inode_oid, uint64_t data_oid) +static inline int is_data_obj_writeable(struct sd_inode *inode, unsigned int idx) { - return oid_to_ino(inode_oid) == oid_to_ino(data_oid); + return (inode->oid >> VDI_SPACE_SHIFT) == + (inode->data_oid[idx] >> VDI_SPACE_SHIFT); } static inline int is_data_obj(uint64_t oid) { - return oid & ((1ULL << 18) - 1); + return !(VDI_BIT & oid); } /* @@ -872,7 +871,7 @@ static int get_sheep_fd(struct bdrv_sd_state *s) } static int parse_vdiname(const char *filename, char *vdi, int vdi_len, - uint64_t *tag) + uint32_t *snapid) { char *p, *q; @@ -888,35 +887,37 @@ static int parse_vdiname(const char *filename, char *vdi, int vdi_len, p = strchr(vdi, ':'); if (p) { *p++ = '\0'; - *tag = strtol(p, NULL, 16); + *snapid = strtol(p, NULL, 16); } else - *tag = CURRENT_VDI_ID; /* search current vdi */ + *snapid = CURRENT_VDI_ID; /* search current vdi */ free(q); return 0; } -static int find_vdi_name(struct bdrv_sd_state *s, char *filename, uint64_t tag, - uint64_t *oid, int for_snapshot, int *current) +static int find_vdi_name(struct bdrv_sd_state *s, char *filename, uint32_t snapid, + uint64_t *oid) { int ret, fd; struct sd_vdi_req hdr; struct sd_vdi_rsp *rsp = (struct sd_vdi_rsp *)&hdr; unsigned int wlen, rlen = 0; + char buf[SD_MAX_VDI_LEN]; fd = connect_to_vost(); if (fd < 0) return -1; memset(&hdr, 0, sizeof(hdr)); + snprintf(buf, sizeof(buf), "%s", filename); hdr.opcode = SD_OP_GET_VDI_INFO; - wlen = strlen(filename) + 1; - hdr.data_length = wlen; - hdr.tag = tag; + wlen = SD_MAX_VDI_LEN; + hdr.data_length = SD_MAX_VDI_LEN; + hdr.snapid = snapid; hdr.flags = SD_FLAG_CMD_WRITE; - ret = do_req(fd, (struct sd_req *)&hdr, filename, &wlen, &rlen); + ret = do_req(fd, (struct sd_req *)&hdr, buf, &wlen, &rlen); if (ret) { ret = -1; goto out; @@ -927,9 +928,8 @@ static int find_vdi_name(struct bdrv_sd_state *s, char *filename, uint64_t tag, ret = -1; goto out; } - *oid = rsp->oid; - s->is_current = rsp->flags & SD_VDI_RSP_FLAG_CURRENT; + ret = 0; out: close(fd); @@ -1045,7 +1045,7 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags) uint64_t oid = 0; struct bdrv_sd_state *s = bs->opaque; char vdi[256]; - uint64_t tag; + uint32_t snapid; int for_snapshot = 0, dummy; char *buf; @@ -1070,15 +1070,17 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags) for_snapshot = 1; memset(vdi, 0, sizeof(vdi)); - if (parse_vdiname(filename, vdi, sizeof(vdi), &tag) < 0) + if (parse_vdiname(filename, vdi, sizeof(vdi), &snapid) < 0) goto out; - ret = find_vdi_name(s, vdi, tag, &oid, for_snapshot, &s->is_current); + ret = find_vdi_name(s, vdi, snapid, &oid); if (ret) goto out; - if (!s->is_current) + if (snapid) eprintf("%" PRIx64 " non current inode was open.\n", oid); + else + s->is_current = 1; ret = read_vdi_obj(buf, oid, &dummy); if (ret) @@ -1105,28 +1107,23 @@ static int do_sd_create(char *filename, char *tag, int64_t total_sectors, struct sd_vdi_rsp *rsp = (struct sd_vdi_rsp *)&hdr; int fd, ret; unsigned int wlen, rlen = 0; - char buf[SD_MAX_VDI_LEN * 2]; + char buf[SD_MAX_VDI_LEN]; fd = connect_to_vost(); if (fd < 0) return -1; - memset(buf, 0, sizeof(buf)); strncpy(buf, filename, SD_MAX_VDI_LEN); - if (tag) - strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_LEN); memset(&hdr, 0, sizeof(hdr)); hdr.opcode = SD_OP_NEW_VDI; hdr.base_oid = base_oid; wlen = SD_MAX_VDI_LEN; - if (tag) - wlen += SD_MAX_VDI_LEN; hdr.flags = SD_FLAG_CMD_WRITE; - if (snapshot) - hdr.flags |= SD_FLAG_CMD_SNAPSHOT; + hdr.snapid = snapshot; + hdr.data_length = wlen; hdr.vdi_size = total_sectors * 512; @@ -1166,8 +1163,8 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) if (backing_file) { BlockDriverState bs; - char vdi[256]; - uint64_t tag; + char vdi[SD_MAX_VDI_LEN]; + uint32_t snapid; memset(&bs, 0, sizeof(bs)); @@ -1179,16 +1176,15 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) if (ret < 0) return -1; - if (parse_vdiname(backing_file, vdi, sizeof(vdi), &tag) < 0) + if (parse_vdiname(backing_file, vdi, sizeof(vdi), &snapid) < 0) return -1; /* cannot clone from a current inode */ - if (tag == CURRENT_VDI_ID) + if (snapid == CURRENT_VDI_ID) return -1; - ret = find_vdi_name(bs.opaque, vdi, tag, &oid, 1, NULL); - struct bdrv_sd_state *s = bs.opaque; - if (ret || s->is_current) + ret = find_vdi_name(bs.opaque, vdi, snapid, &oid); + if (ret) return -1; } @@ -1199,8 +1195,6 @@ static void sd_close(BlockDriverState *bs) { struct bdrv_sd_state *s = bs->opaque; - eprintf("%s\n", s->name); - free(s->name); } @@ -1211,6 +1205,7 @@ static int sd_claim(BlockDriverState *bs) struct sd_vdi_req hdr; struct sd_vdi_rsp *rsp = (struct sd_vdi_rsp *)&hdr; unsigned int wlen, rlen = 0; + char buf[SD_MAX_VDI_LEN]; eprintf("%s\n", s->name); @@ -1218,14 +1213,16 @@ static int sd_claim(BlockDriverState *bs) if (fd < 0) return -1; + memset(buf, 0, sizeof(buf)); + strncpy(buf, s->name, SD_MAX_VDI_LEN); memset(&hdr, 0, sizeof(hdr)); hdr.opcode = SD_OP_LOCK_VDI; - wlen = strlen(s->name) + 1; - hdr.data_length = wlen; - hdr.tag = CURRENT_VDI_ID; + wlen = SD_MAX_VDI_LEN; + hdr.data_length = SD_MAX_VDI_LEN; + hdr.snapid = CURRENT_VDI_ID; hdr.flags = SD_FLAG_CMD_WRITE; - ret = do_req(fd, (struct sd_req *)&hdr, s->name, &wlen, &rlen); + ret = do_req(fd, (struct sd_req *)&hdr, buf, &wlen, &rlen); if (ret) { ret = -1; goto out; @@ -1407,19 +1404,19 @@ static void sd_write_bh_cb(void *p) len = min_t(unsigned long, total - done, CHUNK_SIZE - offset); - if (!oid || !is_data_obj_writeable(inode->oid, oid)) { + if (!oid || !is_data_obj_writeable(inode, idx)) { if (!acb->write) goto done; create = 1; dprintf("update ino (%" PRIu64") %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", - inode->oid, oid, inode->oid + (idx + 1), idx); - if (oid && !is_data_obj_writeable(inode->oid, oid)) { + inode->oid, oid, inode->oid + idx, idx); + if (oid && !is_data_obj_writeable(inode, idx)) { old_oid = oid; flags = SD_FLAG_CMD_COW; } - oid = inode->oid + (idx + 1); + oid = inode->oid + idx; acb->oid[i] = oid; dprintf("new oid %lx\n", acb->oid[i]); } @@ -1619,96 +1616,95 @@ struct sd_so_rsp { uint32_t opcode_specific[2]; }; -struct sheepdog_vdi_info { - uint64_t oid; - uint16_t id; - uint16_t name_len; - uint16_t tag_len; - uint8_t type; - uint8_t flags; - uint32_t epoch; - char name[SD_MAX_VDI_LEN]; - char tag[SD_MAX_VDI_LEN]; -}; +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) +#define BITS_PER_BYTE 8 +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) +#define DECLARE_BITMAP(name,bits) \ + unsigned long name[BITS_TO_LONGS(bits)] + +#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long)) + +static inline int test_bit(unsigned int nr, const unsigned long *addr) +{ + return ((1UL << (nr % BITS_PER_LONG)) & + (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0; +} -#define SD_OP_SO_READ_VDIS 0x64 +static inline uint64_t bit_to_oid(unsigned long nr) +{ + return ((unsigned long long)nr << VDI_SPACE_SHIFT) | VDI_BIT; +} static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) { struct bdrv_sd_state *s = bs->opaque; - struct sd_so_req req; - struct sd_rsp *rsp; - struct sheepdog_vdi_info *vi; - int i, fd, nr = 0, ret, max = 1024; /* FIXME */ - char name[SD_MAX_VDI_LEN]; + struct sd_req req; + int i, fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long); QEMUSnapshotInfo *sn_tab = NULL; unsigned wlen, rlen; int found = 0; - struct sd_inode inode; + static struct sd_inode inode; + unsigned long *vdi_inuse; + unsigned int start_nr; - vi = malloc(max * sizeof(*vi)); - if (!vi) + vdi_inuse = malloc(max); + if (!vdi_inuse) return 0; - memset(name, 0, sizeof(name)); - snprintf(name, sizeof(name), "%s", s->name); - fd = connect_to_vost(); if (fd < 0) goto out; + rlen = max; wlen = 0; - rlen = max * sizeof(*vi); memset(&req, 0, sizeof(req)); - req.opcode = SD_OP_SO_READ_VDIS; - req.data_length = rlen; + req.opcode = SD_OP_READ_VDIS; + req.data_length = max; - ret = do_req(fd, (struct sd_req *)&req, vi, &wlen, &rlen); + ret = do_req(fd, (struct sd_req *)&req, vdi_inuse, &wlen, &rlen); close(fd); if (ret) goto out; - rsp = (struct sd_rsp *)&req; - if (rsp->result != SD_RES_SUCCESS) - goto out; - - nr = rsp->data_length / sizeof(*vi); sn_tab = malloc(nr * sizeof(*sn_tab)); if (!sn_tab) goto out; memset(sn_tab, 0, nr * sizeof(*sn_tab)); - for (i = 0; i < nr; i++) { + start_nr = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT) & (SD_NR_VDIS - 1); + + /* TODO: round up */ + for (i = start_nr; i < SD_NR_VDIS && found < nr; i++) { int copies; - if (strcmp(vi[i].name, s->name) || !vi[i].id) - continue; + if (!test_bit(i, vdi_inuse)) + break; - ret = read_vdi_obj((char *)&inode, vi[i].oid, &copies); + ret = read_vdi_obj((char *)&inode, bit_to_oid(i), &copies); if (ret) continue; - sn_tab[found].date_sec = inode.ctime >> 32; - sn_tab[found].date_nsec = inode.ctime & 0xffffffff; + if (!strcmp(inode.name, s->name) && inode.snap_ctime) { + sn_tab[found].date_sec = inode.snap_ctime >> 32; + sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff; - snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u", - vi[i].id); - strncpy(sn_tab[found].name, vi[i].tag, sizeof(sn_tab[found].name)); - found++; + snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u", + inode.snap_id); + found++; + } } out: *psn_tab = sn_tab; - free(vi); + free(vdi_inuse); return found; } - static QEMUOptionParameter sd_create_options[] = { { .name = BLOCK_OPT_SIZE, -- 1.7.0 |