[sheepdog] [PATCH v4 2/7] replace structure of inode->data_vdi_id[] from array to btree

Liu Yuan namei.unix at gmail.com
Fri Nov 1 11:55:38 CET 2013


On Fri, Nov 01, 2013 at 06:03:37PM +0800, Robin Dong wrote:
> 1. add basic {dog,sheep,sheepfs}_bnode_{reader,writer} function for b-tree to read/write node
> 2. add sd_extent_header to manage meta-data in data_vdi_id[] or middle-node
> 3. add new type of object: B-tree object as middle-node
> 
> Signed-off-by: Robin Dong <sanbai at taobao.com>
> ---
>  dog/cluster.c            |    2 +-
>  dog/dog.h                |    9 +
>  dog/vdi.c                |   51 +++--
>  include/sheepdog_proto.h |   61 ++++--
>  lib/sd_inode.c           |  567 +++++++++++++++++++++++++++++++++++++++++++++-
>  sheep/sheep_priv.h       |    9 +
>  sheep/vdi.c              |   13 +-
>  sheepfs/volume.c         |   43 ++++-
>  8 files changed, 716 insertions(+), 39 deletions(-)
> 
> diff --git a/dog/cluster.c b/dog/cluster.c
> index 2e77407..78586a5 100644
> --- a/dog/cluster.c
> +++ b/dog/cluster.c
> @@ -259,7 +259,7 @@ static void fill_object_tree(uint32_t vid, const char *name, const char *tag,
>  	/* fill data object id */
>  	nr_objs = count_data_objs(i);
>  	for (uint64_t idx = 0; idx < nr_objs; idx++) {
> -		vdi_id = sd_inode_get_vid(i, idx);
> +		vdi_id = INODE_GET_VID(i, idx);
>  		if (vdi_id) {
>  			uint64_t oid = vid_to_data_oid(vdi_id, idx);
>  			object_tree_insert(oid, i->nr_copies, i->copy_policy);
> diff --git a/dog/dog.h b/dog/dog.h
> index 28c36a1..53b387e 100644
> --- a/dog/dog.h
> +++ b/dog/dog.h
> @@ -85,6 +85,15 @@ void show_progress(uint64_t done, uint64_t total, bool raw);
>  size_t get_store_objsize(uint8_t copy_policy, uint64_t oid);
>  bool is_erasure_oid(uint64_t oid, uint8_t policy);
>  
> +int dog_bnode_writer(uint64_t oid, void *mem, unsigned int len,
> +		     int copies, int copy_policy, int create);
> +int dog_bnode_reader(uint64_t oid, void **mem, unsigned int len);
> +
> +#define INODE_GET_VID(inode, idx) (sd_inode_get_vid(dog_bnode_reader, \
> +							inode, idx))
> +#define INODE_SET_VID(inode, idx, vdi_id) (sd_inode_set_vid(dog_bnode_writer, \
> +					dog_bnode_reader, inode, idx, vdi_id))
> +
>  extern struct command vdi_command;
>  extern struct command node_command;
>  extern struct command cluster_command;
> diff --git a/dog/vdi.c b/dog/vdi.c
> index efbf9f4..894ff75 100644
> --- a/dog/vdi.c
> +++ b/dog/vdi.c
> @@ -58,6 +58,24 @@ struct get_vdi_info {
>  	uint8_t copy_policy;
>  };
>  
> +int dog_bnode_writer(uint64_t oid, void *mem, unsigned int len,
> +		     int copies, int copy_policy, int create)
> +{
> +	return sd_write_object(oid, 0, mem, len, 0, 0, copies, copy_policy,
> +			true, true);
> +}
> +
> +int dog_bnode_reader(uint64_t oid, void **mem, unsigned int len)
> +{
> +	return sd_read_object(oid, *mem, len, 0, true);
> +}
> +
> +static inline bool is_data_obj_writeable(const struct sd_inode *inode,
> +					 uint32_t idx)
> +{
> +	return inode->vdi_id == INODE_GET_VID(inode, idx);
> +}
> +
>  static void vdi_show_progress(uint64_t done, uint64_t total)
>  {
>  	return show_progress(done, total, false);
> @@ -300,7 +318,7 @@ static int obj_info_filler(const char *sheep, uint64_t oid, struct sd_rsp *rsp,
>  		if (info->success)
>  			break;
>  		info->success = true;
> -		vdi_id = sd_inode_get_vid(inode, info->idx);
> +		vdi_id = INODE_GET_VID(inode, info->idx);
>  		if (vdi_id) {
>  			info->data_oid = vid_to_data_oid(vdi_id, info->idx);
>  			return 1;
> @@ -516,7 +534,7 @@ static int vdi_create(int argc, char **argv)
>  	uint64_t size;
>  	uint32_t vid;
>  	uint64_t oid;
> -	int idx, max_idx, ret, nr_copies = vdi_cmd_data.nr_copies;
> +	uint32_t idx, max_idx, ret, nr_copies = vdi_cmd_data.nr_copies;
>  	struct sd_inode *inode = NULL;
>  
>  	if (!argv[optind]) {
> @@ -563,7 +581,7 @@ static int vdi_create(int argc, char **argv)
>  			goto out;
>  		}
>  
> -		sd_inode_set_vid(inode, idx, vid);
> +		INODE_SET_VID(inode, idx, vid);
>  		ret = sd_write_object(vid_to_vdi_oid(vid), 0, &vid, sizeof(vid),
>  				      SD_INODE_HEADER_SIZE + sizeof(vid) * idx,
>  				      0, inode->nr_copies, inode->copy_policy,
> @@ -632,7 +650,7 @@ static int vdi_clone(int argc, char **argv)
>  	const char *src_vdi = argv[optind++], *dst_vdi;
>  	uint32_t base_vid, new_vid, vdi_id;
>  	uint64_t oid;
> -	int idx, max_idx, ret;
> +	uint32_t idx, max_idx, ret;
>  	struct sd_inode *inode = NULL;
>  	char *buf = NULL;
>  
> @@ -670,7 +688,7 @@ static int vdi_clone(int argc, char **argv)
>  		size_t size;
>  
>  		vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size);
> -		vdi_id = sd_inode_get_vid(inode, idx);
> +		vdi_id = INODE_GET_VID(inode, idx);
>  		if (vdi_id) {
>  			oid = vid_to_data_oid(vdi_id, idx);
>  			ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0, true);
> @@ -1230,7 +1248,7 @@ static int vdi_read(int argc, char **argv)
>  	offset %= SD_DATA_OBJ_SIZE;
>  	while (done < total) {
>  		len = min(total - done, SD_DATA_OBJ_SIZE - offset);
> -		vdi_id = sd_inode_get_vid(inode, idx);
> +		vdi_id = INODE_GET_VID(inode, idx);
>  		if (vdi_id) {
>  			oid = vid_to_data_oid(vdi_id, idx);
>  			ret = sd_read_object(oid, buf, len, offset, false);
> @@ -1306,7 +1324,7 @@ static int vdi_write(int argc, char **argv)
>  		flags = 0;
>  		len = min(total - done, SD_DATA_OBJ_SIZE - offset);
>  
> -		vdi_id = sd_inode_get_vid(inode, idx);
> +		vdi_id = INODE_GET_VID(inode, idx);
>  		if (!vdi_id)
>  			create = true;
>  		else if (!is_data_obj_writeable(inode, idx)) {
> @@ -1328,7 +1346,7 @@ static int vdi_write(int argc, char **argv)
>  			total = done + len;
>  		}
>  
> -		sd_inode_set_vid(inode, idx, inode->vdi_id);
> +		INODE_SET_VID(inode, idx, inode->vdi_id);
>  		oid = vid_to_data_oid(inode->vdi_id, idx);
>  		ret = sd_write_object(oid, old_oid, buf, len, offset, flags,
>  				      inode->nr_copies, inode->copy_policy,
> @@ -1668,7 +1686,7 @@ static void queue_vdi_check_work(const struct sd_inode *inode, uint64_t oid,
>  
>  int do_vdi_check(const struct sd_inode *inode)
>  {
> -	int max_idx;
> +	uint32_t max_idx;
>  	uint64_t done = 0, oid;
>  	uint32_t vid;
>  	struct work_queue *wq;
> @@ -1686,8 +1704,8 @@ int do_vdi_check(const struct sd_inode *inode)
>  
>  	max_idx = count_data_objs(inode);
>  	vdi_show_progress(done, inode->vdi_size);
> -	for (int idx = 0; idx < max_idx; idx++) {
> -		vid = sd_inode_get_vid(inode, idx);
> +	for (uint32_t idx = 0; idx < max_idx; idx++) {
> +		vid = INODE_GET_VID(inode, idx);
>  		if (vid) {
>  			oid = vid_to_data_oid(vid, idx);
>  			queue_vdi_check_work(inode, oid, &done, wq);
> @@ -1762,7 +1780,7 @@ static void compact_obj_backup(struct obj_backup *backup, uint8_t *from_data)
>  	}
>  }
>  
> -static int get_obj_backup(int idx, uint32_t from_vid, uint32_t to_vid,
> +static int get_obj_backup(uint32_t idx, uint32_t from_vid, uint32_t to_vid,
>  			  struct obj_backup *backup)
>  {
>  	int ret;
> @@ -1803,7 +1821,8 @@ static int get_obj_backup(int idx, uint32_t from_vid, uint32_t to_vid,
>  static int vdi_backup(int argc, char **argv)
>  {
>  	const char *vdiname = argv[optind++];
> -	int ret = EXIT_SUCCESS, idx, nr_objs;
> +	int ret = EXIT_SUCCESS;
> +	uint32_t idx, nr_objs;
>  	struct sd_inode *from_inode = xzalloc(sizeof(*from_inode));
>  	struct sd_inode *to_inode = xzalloc(sizeof(*to_inode));
>  	struct backup_hdr hdr = {
> @@ -1842,8 +1861,8 @@ static int vdi_backup(int argc, char **argv)
>  	}
>  
>  	for (idx = 0; idx < nr_objs; idx++) {
> -		uint32_t from_vid = sd_inode_get_vid(from_inode, idx);
> -		uint32_t to_vid = sd_inode_get_vid(to_inode, idx);
> +		uint32_t from_vid = INODE_GET_VID(from_inode, idx);
> +		uint32_t to_vid = INODE_GET_VID(to_inode, idx);
>  
>  		if (to_vid == 0 && from_vid == 0)
>  			continue;
> @@ -1896,7 +1915,7 @@ static int restore_obj(struct obj_backup *backup, uint32_t vid,
>  		       struct sd_inode *parent_inode)
>  {
>  	int ret;
> -	uint32_t parent_vid = sd_inode_get_vid(parent_inode, backup->idx);
> +	uint32_t parent_vid = INODE_GET_VID(parent_inode, backup->idx);
>  	uint64_t parent_oid = 0;
>  
>  	if (parent_vid)
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index 36d5701..67c8d6e 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -74,6 +74,9 @@
>  #define SD_RES_JOIN_FAILED   0x18 /* Target node had failed to join sheepdog */
>  #define SD_RES_HALT          0x19 /* Sheepdog is stopped doing IO */
>  #define SD_RES_READONLY      0x1A /* Object is read-only */
> +#define SD_RES_BTREE_NOT_FOUND	0x1B /* Cannot found node in btree */
> +#define SD_RES_BTREE_FOUND   0x1C /* Found node in btree */
> +#define SD_RES_BTREE_REPEAT  0x1D /* Should repeat op in btree */
>  
>  /* errors above 0x80 are sheepdog-internal */
>  
> @@ -92,8 +95,9 @@
>  #define VDI_BIT (UINT64_C(1) << 63)
>  #define VMSTATE_BIT (UINT64_C(1) << 62)
>  #define VDI_ATTR_BIT (UINT64_C(1) << 61)
> +#define VDI_BTREE_BIT (UINT64_C(1) << 60)
>  #define MAX_DATA_OBJS (1ULL << 20)
> -#define MAX_CHILDREN 1024U
> +#define MAX_CHILDREN (1024U - 1) /* we use the last uint32_t as btree_counter */

This modification (the change to MAX_CHILDREN) should be removed.

>  #define SD_MAX_VDI_LEN 256U
>  #define SD_MAX_VDI_TAG_LEN 256U
>  #define SD_MAX_VDI_ATTR_KEY_LEN 256U
> @@ -104,8 +108,8 @@
>  #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
>  
>  #define SD_INODE_SIZE (sizeof(struct sd_inode))
> -#define SD_INODE_HEADER_SIZE (sizeof(struct sd_inode) - \
> -			      sizeof(uint32_t) * MAX_DATA_OBJS)
> +#define SD_INODE_INDEX_SIZE (sizeof(uint32_t) * MAX_DATA_OBJS)
> +#define SD_INODE_HEADER_SIZE (sizeof(struct sd_inode) - SD_INODE_INDEX_SIZE)
>  #define SD_ATTR_OBJ_SIZE (sizeof(struct sheepdog_vdi_attr))
>  #define CURRENT_VDI_ID 0
>  
> @@ -215,16 +219,39 @@ struct sd_inode {
>  	uint64_t vdi_size;
>  	uint64_t vm_state_size;
>  	uint8_t  copy_policy;
> -	uint8_t  reserved;
> +	uint8_t  store_policy;
>  	uint8_t  nr_copies;
>  	uint8_t  block_size_shift;
>  	uint32_t snap_id;
>  	uint32_t vdi_id;
>  	uint32_t parent_vdi_id;
>  	uint32_t child_vdi_id[MAX_CHILDREN];
> +	uint32_t btree_counter;
>  	uint32_t data_vdi_id[MAX_DATA_OBJS];

I think you need to add btree_counter after data_vdi_id for complete
compatibility, as Kazutaka commented.

Thanks
Yuan



More information about the sheepdog mailing list