[sheepdog] [PATCH v4 2/7] replace structure of inode->data_vdi_id[] from array to btree
Liu Yuan
namei.unix at gmail.com
Fri Nov 1 11:55:38 CET 2013
On Fri, Nov 01, 2013 at 06:03:37PM +0800, Robin Dong wrote:
> 1. add basic {dog,sheep,sheepfs}_bnode_{reader,writer} function for b-tree to read/write node
> 2. add sd_extent_header to manage meta-data in data_vdi_id[] or middle-node
> 3. add new type of object: B-tree object as middle-node
>
> Signed-off-by: Robin Dong <sanbai at taobao.com>
> ---
> dog/cluster.c | 2 +-
> dog/dog.h | 9 +
> dog/vdi.c | 51 +++--
> include/sheepdog_proto.h | 61 ++++--
> lib/sd_inode.c | 567 +++++++++++++++++++++++++++++++++++++++++++++-
> sheep/sheep_priv.h | 9 +
> sheep/vdi.c | 13 +-
> sheepfs/volume.c | 43 ++++-
> 8 files changed, 716 insertions(+), 39 deletions(-)
>
> diff --git a/dog/cluster.c b/dog/cluster.c
> index 2e77407..78586a5 100644
> --- a/dog/cluster.c
> +++ b/dog/cluster.c
> @@ -259,7 +259,7 @@ static void fill_object_tree(uint32_t vid, const char *name, const char *tag,
> /* fill data object id */
> nr_objs = count_data_objs(i);
> for (uint64_t idx = 0; idx < nr_objs; idx++) {
> - vdi_id = sd_inode_get_vid(i, idx);
> + vdi_id = INODE_GET_VID(i, idx);
> if (vdi_id) {
> uint64_t oid = vid_to_data_oid(vdi_id, idx);
> object_tree_insert(oid, i->nr_copies, i->copy_policy);
> diff --git a/dog/dog.h b/dog/dog.h
> index 28c36a1..53b387e 100644
> --- a/dog/dog.h
> +++ b/dog/dog.h
> @@ -85,6 +85,15 @@ void show_progress(uint64_t done, uint64_t total, bool raw);
> size_t get_store_objsize(uint8_t copy_policy, uint64_t oid);
> bool is_erasure_oid(uint64_t oid, uint8_t policy);
>
> +int dog_bnode_writer(uint64_t oid, void *mem, unsigned int len,
> + int copies, int copy_policy, int create);
> +int dog_bnode_reader(uint64_t oid, void **mem, unsigned int len);
> +
> +#define INODE_GET_VID(inode, idx) (sd_inode_get_vid(dog_bnode_reader, \
> + inode, idx))
> +#define INODE_SET_VID(inode, idx, vdi_id) (sd_inode_set_vid(dog_bnode_writer, \
> + dog_bnode_reader, inode, idx, vdi_id))
> +
> extern struct command vdi_command;
> extern struct command node_command;
> extern struct command cluster_command;
> diff --git a/dog/vdi.c b/dog/vdi.c
> index efbf9f4..894ff75 100644
> --- a/dog/vdi.c
> +++ b/dog/vdi.c
> @@ -58,6 +58,24 @@ struct get_vdi_info {
> uint8_t copy_policy;
> };
>
> +int dog_bnode_writer(uint64_t oid, void *mem, unsigned int len,
> + int copies, int copy_policy, int create)
> +{
> + return sd_write_object(oid, 0, mem, len, 0, 0, copies, copy_policy,
> + true, true);
> +}
> +
> +int dog_bnode_reader(uint64_t oid, void **mem, unsigned int len)
> +{
> + return sd_read_object(oid, *mem, len, 0, true);
> +}
> +
> +static inline bool is_data_obj_writeable(const struct sd_inode *inode,
> + uint32_t idx)
> +{
> + return inode->vdi_id == INODE_GET_VID(inode, idx);
> +}
> +
> static void vdi_show_progress(uint64_t done, uint64_t total)
> {
> return show_progress(done, total, false);
> @@ -300,7 +318,7 @@ static int obj_info_filler(const char *sheep, uint64_t oid, struct sd_rsp *rsp,
> if (info->success)
> break;
> info->success = true;
> - vdi_id = sd_inode_get_vid(inode, info->idx);
> + vdi_id = INODE_GET_VID(inode, info->idx);
> if (vdi_id) {
> info->data_oid = vid_to_data_oid(vdi_id, info->idx);
> return 1;
> @@ -516,7 +534,7 @@ static int vdi_create(int argc, char **argv)
> uint64_t size;
> uint32_t vid;
> uint64_t oid;
> - int idx, max_idx, ret, nr_copies = vdi_cmd_data.nr_copies;
> + uint32_t idx, max_idx, ret, nr_copies = vdi_cmd_data.nr_copies;
> struct sd_inode *inode = NULL;
>
> if (!argv[optind]) {
> @@ -563,7 +581,7 @@ static int vdi_create(int argc, char **argv)
> goto out;
> }
>
> - sd_inode_set_vid(inode, idx, vid);
> + INODE_SET_VID(inode, idx, vid);
> ret = sd_write_object(vid_to_vdi_oid(vid), 0, &vid, sizeof(vid),
> SD_INODE_HEADER_SIZE + sizeof(vid) * idx,
> 0, inode->nr_copies, inode->copy_policy,
> @@ -632,7 +650,7 @@ static int vdi_clone(int argc, char **argv)
> const char *src_vdi = argv[optind++], *dst_vdi;
> uint32_t base_vid, new_vid, vdi_id;
> uint64_t oid;
> - int idx, max_idx, ret;
> + uint32_t idx, max_idx, ret;
> struct sd_inode *inode = NULL;
> char *buf = NULL;
>
> @@ -670,7 +688,7 @@ static int vdi_clone(int argc, char **argv)
> size_t size;
>
> vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size);
> - vdi_id = sd_inode_get_vid(inode, idx);
> + vdi_id = INODE_GET_VID(inode, idx);
> if (vdi_id) {
> oid = vid_to_data_oid(vdi_id, idx);
> ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0, true);
> @@ -1230,7 +1248,7 @@ static int vdi_read(int argc, char **argv)
> offset %= SD_DATA_OBJ_SIZE;
> while (done < total) {
> len = min(total - done, SD_DATA_OBJ_SIZE - offset);
> - vdi_id = sd_inode_get_vid(inode, idx);
> + vdi_id = INODE_GET_VID(inode, idx);
> if (vdi_id) {
> oid = vid_to_data_oid(vdi_id, idx);
> ret = sd_read_object(oid, buf, len, offset, false);
> @@ -1306,7 +1324,7 @@ static int vdi_write(int argc, char **argv)
> flags = 0;
> len = min(total - done, SD_DATA_OBJ_SIZE - offset);
>
> - vdi_id = sd_inode_get_vid(inode, idx);
> + vdi_id = INODE_GET_VID(inode, idx);
> if (!vdi_id)
> create = true;
> else if (!is_data_obj_writeable(inode, idx)) {
> @@ -1328,7 +1346,7 @@ static int vdi_write(int argc, char **argv)
> total = done + len;
> }
>
> - sd_inode_set_vid(inode, idx, inode->vdi_id);
> + INODE_SET_VID(inode, idx, inode->vdi_id);
> oid = vid_to_data_oid(inode->vdi_id, idx);
> ret = sd_write_object(oid, old_oid, buf, len, offset, flags,
> inode->nr_copies, inode->copy_policy,
> @@ -1668,7 +1686,7 @@ static void queue_vdi_check_work(const struct sd_inode *inode, uint64_t oid,
>
> int do_vdi_check(const struct sd_inode *inode)
> {
> - int max_idx;
> + uint32_t max_idx;
> uint64_t done = 0, oid;
> uint32_t vid;
> struct work_queue *wq;
> @@ -1686,8 +1704,8 @@ int do_vdi_check(const struct sd_inode *inode)
>
> max_idx = count_data_objs(inode);
> vdi_show_progress(done, inode->vdi_size);
> - for (int idx = 0; idx < max_idx; idx++) {
> - vid = sd_inode_get_vid(inode, idx);
> + for (uint32_t idx = 0; idx < max_idx; idx++) {
> + vid = INODE_GET_VID(inode, idx);
> if (vid) {
> oid = vid_to_data_oid(vid, idx);
> queue_vdi_check_work(inode, oid, &done, wq);
> @@ -1762,7 +1780,7 @@ static void compact_obj_backup(struct obj_backup *backup, uint8_t *from_data)
> }
> }
>
> -static int get_obj_backup(int idx, uint32_t from_vid, uint32_t to_vid,
> +static int get_obj_backup(uint32_t idx, uint32_t from_vid, uint32_t to_vid,
> struct obj_backup *backup)
> {
> int ret;
> @@ -1803,7 +1821,8 @@ static int get_obj_backup(int idx, uint32_t from_vid, uint32_t to_vid,
> static int vdi_backup(int argc, char **argv)
> {
> const char *vdiname = argv[optind++];
> - int ret = EXIT_SUCCESS, idx, nr_objs;
> + int ret = EXIT_SUCCESS;
> + uint32_t idx, nr_objs;
> struct sd_inode *from_inode = xzalloc(sizeof(*from_inode));
> struct sd_inode *to_inode = xzalloc(sizeof(*to_inode));
> struct backup_hdr hdr = {
> @@ -1842,8 +1861,8 @@ static int vdi_backup(int argc, char **argv)
> }
>
> for (idx = 0; idx < nr_objs; idx++) {
> - uint32_t from_vid = sd_inode_get_vid(from_inode, idx);
> - uint32_t to_vid = sd_inode_get_vid(to_inode, idx);
> + uint32_t from_vid = INODE_GET_VID(from_inode, idx);
> + uint32_t to_vid = INODE_GET_VID(to_inode, idx);
>
> if (to_vid == 0 && from_vid == 0)
> continue;
> @@ -1896,7 +1915,7 @@ static int restore_obj(struct obj_backup *backup, uint32_t vid,
> struct sd_inode *parent_inode)
> {
> int ret;
> - uint32_t parent_vid = sd_inode_get_vid(parent_inode, backup->idx);
> + uint32_t parent_vid = INODE_GET_VID(parent_inode, backup->idx);
> uint64_t parent_oid = 0;
>
> if (parent_vid)
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index 36d5701..67c8d6e 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -74,6 +74,9 @@
> #define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */
> #define SD_RES_HALT 0x19 /* Sheepdog is stopped doing IO */
> #define SD_RES_READONLY 0x1A /* Object is read-only */
> +#define SD_RES_BTREE_NOT_FOUND 0x1B /* Cannot found node in btree */
> +#define SD_RES_BTREE_FOUND 0x1C /* Found node in btree */
> +#define SD_RES_BTREE_REPEAT 0x1D /* Should repeat op in btree */
>
> /* errors above 0x80 are sheepdog-internal */
>
> @@ -92,8 +95,9 @@
> #define VDI_BIT (UINT64_C(1) << 63)
> #define VMSTATE_BIT (UINT64_C(1) << 62)
> #define VDI_ATTR_BIT (UINT64_C(1) << 61)
> +#define VDI_BTREE_BIT (UINT64_C(1) << 60)
> #define MAX_DATA_OBJS (1ULL << 20)
> -#define MAX_CHILDREN 1024U
> +#define MAX_CHILDREN (1024U - 1) /* we use the last uint32_t as btree_counter */
This modification should be removed.
> #define SD_MAX_VDI_LEN 256U
> #define SD_MAX_VDI_TAG_LEN 256U
> #define SD_MAX_VDI_ATTR_KEY_LEN 256U
> @@ -104,8 +108,8 @@
> #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
>
> #define SD_INODE_SIZE (sizeof(struct sd_inode))
> -#define SD_INODE_HEADER_SIZE (sizeof(struct sd_inode) - \
> - sizeof(uint32_t) * MAX_DATA_OBJS)
> +#define SD_INODE_INDEX_SIZE (sizeof(uint32_t) * MAX_DATA_OBJS)
> +#define SD_INODE_HEADER_SIZE (sizeof(struct sd_inode) - SD_INODE_INDEX_SIZE)
> #define SD_ATTR_OBJ_SIZE (sizeof(struct sheepdog_vdi_attr))
> #define CURRENT_VDI_ID 0
>
> @@ -215,16 +219,39 @@ struct sd_inode {
> uint64_t vdi_size;
> uint64_t vm_state_size;
> uint8_t copy_policy;
> - uint8_t reserved;
> + uint8_t store_policy;
> uint8_t nr_copies;
> uint8_t block_size_shift;
> uint32_t snap_id;
> uint32_t vdi_id;
> uint32_t parent_vdi_id;
> uint32_t child_vdi_id[MAX_CHILDREN];
> + uint32_t btree_counter;
> uint32_t data_vdi_id[MAX_DATA_OBJS];
I think you need to add btree_counter after data_vdi_id for complete
compatibility, as Kazutaka commented.
Thanks
Yuan
More information about the sheepdog
mailing list