[sheepdog] [PATCH v4 1/3] sheep/http: store accounts and containers into hyper volume for object-storage

Liu Yuan namei.unix at gmail.com
Thu Dec 12 13:12:31 CET 2013


On Thu, Dec 12, 2013 at 06:15:55PM +0800, Robin Dong wrote:
> From: Robin Dong <sanbai at taobao.com>
> 
> Use a hyper volume (size up to 16PB) to store a large number of accounts
> and containers.
> 
> Signed-off-by: Robin Dong <sanbai at taobao.com>
> ---
>  sheep/http/http.c  |   5 +
>  sheep/http/http.h  |   1 +
>  sheep/http/kv.c    | 646 +++++++++++++++++++++++++++++++++++++++++++++++------
>  sheep/http/kv.h    |  22 +-
>  sheep/http/s3.c    |   6 +-
>  sheep/http/swift.c | 108 ++++++---
>  6 files changed, 676 insertions(+), 112 deletions(-)
> 
> diff --git a/sheep/http/http.c b/sheep/http/http.c
> index 04ef364..0081707 100644
> --- a/sheep/http/http.c
> +++ b/sheep/http/http.c
> @@ -52,6 +52,7 @@ static inline const char *strstatus(enum http_status status)
>  		[NO_CONTENT] = "204 No Content",
>  		[PARTIAL_CONTENT] = "206 Partial Content",
>  		[BAD_REQUEST] = "400 Bad Request",
> +		[UNAUTHORIZED] = "401 Unauthorized",
>  		[NOT_FOUND] = "404 Not Found",
>  		[METHOD_NOT_ALLOWED] = "405 Method Not Allowed",
>  		[CONFLICT] = "409 Conflict",
> @@ -192,6 +193,9 @@ void http_response_header(struct http_request *req, enum http_status status)
>  
>  	req->status = status;
>  	http_request_writef(req, "Status: %s\r\n", strstatus(status));
> +	if (req->opcode == HTTP_GET && req->data_length > 0)
> +		http_request_writef(req, "Content-Length: %lu\r\n",
> +				    req->data_length);
>  	http_request_writes(req, "Content-type: text/plain;\r\n\r\n");
>  }
>  
> @@ -233,6 +237,7 @@ static void http_run_request(struct work *work)
>  
>  		if (method != NULL) {
>  			method(req);
> +			sd_debug("req->status %d", req->status);
>  			if (req->status != UNKNOWN)
>  				goto out;
>  		}
> diff --git a/sheep/http/http.h b/sheep/http/http.h
> index 046d412..a8527d1 100644
> --- a/sheep/http/http.h
> +++ b/sheep/http/http.h
> @@ -32,6 +32,7 @@ enum http_status {
>  	NO_CONTENT,                     /* 204 */
>  	PARTIAL_CONTENT,                /* 206 */
>  	BAD_REQUEST,                    /* 400 */
> +	UNAUTHORIZED,			/* 401 */
>  	NOT_FOUND,                      /* 404 */
>  	METHOD_NOT_ALLOWED,             /* 405 */
>  	CONFLICT,                       /* 409 */
> diff --git a/sheep/http/kv.c b/sheep/http/kv.c
> index 8113389..8d33e37 100644
> --- a/sheep/http/kv.c
> +++ b/sheep/http/kv.c
> @@ -16,14 +16,30 @@
>  
>  #define FOR_EACH_VDI(nr, vdis) FOR_EACH_BIT(nr, vdis, SD_NR_VDIS)
>  
> -static int lookup_bucket(struct http_request *req, const char *bucket,
> -			 uint32_t *vid)
> +struct bucket_inode_hdr {
> +	char bucket_name[SD_MAX_BUCKET_NAME];
> +	uint64_t obj_count;
> +	uint64_t bytes_used;
> +	uint32_t onode_vid;
> +};
> +
> +struct bucket_inode {
> +	union {
> +		struct bucket_inode_hdr hdr;
> +		uint8_t data[SD_MAX_BUCKET_NAME << 1];
> +	};
> +};
> +
> +#define MAX_BUCKETS (SD_MAX_VDI_SIZE / sizeof(struct bucket_inode))
> +#define BUCKETS_PER_SD_OBJ (SD_DATA_OBJ_SIZE / sizeof(struct bucket_inode))
> +
> +static int lookup_vdi(const char *name, uint32_t *vid)
>  {
>  	int ret;
>  	struct vdi_info info = {};
>  	struct vdi_iocb iocb = {
> -		.name = bucket,
> -		.data_len = strlen(bucket),
> +		.name = name,
> +		.data_len = strlen(name),
>  	};
>  
>  	ret = vdi_lookup(&iocb, &info);
> @@ -32,27 +48,23 @@ static int lookup_bucket(struct http_request *req, const char *bucket,
>  		*vid = info.vid;
>  		break;
>  	case SD_RES_NO_VDI:
> -		sd_info("no such bucket %s", bucket);
> -		http_response_header(req, NOT_FOUND);
> -		return -1;
> +		sd_info("no such vdi %s", name);
> +		break;
>  	default:
> -		sd_err("%s: bucket %s", sd_strerror(ret), bucket);
> -		http_response_header(req, INTERNAL_SERVER_ERROR);
> -		return -1;
> +		sd_err("Failed to find vdi %s %s", name, sd_strerror(ret));
>  	}
>  
> -	return 0;
> +	return ret;
>  }
>  
> -/* Bucket operations */
> -
> -int kv_create_bucket(struct http_request *req, const char *bucket)
> +static int kv_create_hyper_volume(const char *name, uint32_t *vdi_id)
>  {
>  	struct sd_req hdr;
> +	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
>  	int ret;
>  	char buf[SD_MAX_VDI_LEN] = {0};
>  
> -	pstrcpy(buf, SD_MAX_VDI_LEN, bucket);
> +	pstrcpy(buf, SD_MAX_VDI_LEN, name);
>  
>  	sd_init_req(&hdr, SD_OP_NEW_VDI);
>  	hdr.flags = SD_FLAG_CMD_WRITE;
> @@ -64,104 +76,592 @@ int kv_create_bucket(struct http_request *req, const char *bucket)
>  	hdr.vdi.store_policy = 1;
>  
>  	ret = exec_local_req(&hdr, buf);
> +	if (rsp->result != SD_RES_SUCCESS)
> +		sd_err("Failed to create VDI %s: %s", name,
> +		       sd_strerror(rsp->result));
> +
> +	if (vdi_id)
> +		*vdi_id = rsp->vdi.vdi_id;
> +
> +	return ret;
> +}
> +
> +static int discard_data_obj(uint64_t oid)
> +{
> +	int ret;
> +	struct sd_req hdr;
> +
> +	sd_init_req(&hdr, SD_OP_DISCARD_OBJ);
> +	hdr.obj.oid = oid;
> +
> +	ret = exec_local_req(&hdr, NULL);
> +	if (ret != SD_RES_SUCCESS)
> +		sd_err("Failed to discard data obj %lu %s", oid,
> +		       sd_strerror(ret));
> +
> +	return ret;
> +}
> +
> +static int kv_delete_vdi(const char *name)
> +{
> +	int ret;
> +	struct sd_req hdr;
> +	char data[SD_MAX_VDI_LEN] = {0};
> +	uint32_t vid;
> +
> +	ret = lookup_vdi(name, &vid);
> +	if (ret != SD_RES_SUCCESS)
> +		return ret;
> +
> +	sd_init_req(&hdr, SD_OP_DEL_VDI);
> +	hdr.flags = SD_FLAG_CMD_WRITE;
> +	hdr.data_length = sizeof(data);
> +	pstrcpy(data, SD_MAX_VDI_LEN, name);
> +
> +	ret = exec_local_req(&hdr, data);
> +	if (ret != SD_RES_SUCCESS)
> +		sd_err("Failed to delete vdi %s %s", name, sd_strerror(ret));
> +
> +	return ret;
> +}
> +
> +/*
> + * An account is actually a hyper volume vdi (up to 16PB),
> + * all the buckets (or containers, identified by 'struct bucket_inode') are
> + * stored in this hyper vdi using a hashing algorithm.
> + * The bucket also has a hyper vdi named "account/bucket" which stores
> + * 'struct kv_onodes'.
> + *
> + * For example: account "coly" has two buckets "jetta" and "volvo"
> + *
> + *
> + * account vdi
> + * +-----------+---+--------------------------+---+--------------------------+--
> + * |name: coly |...|bucket_inode (name: jetta)|...|bucket_inode (name: volvo)|..
> + * +-----------+---+--------------------------+---+--------------------------+--
> + *                                  |                             |
> + *                                 /                              |
> + * bucket vdi                     /                               |
> + * +-----------------+-------+ <--                                |
> + * |name: coly/jetta |.......|                                    |
> + * +-----------------+-------+                                   /
> + *                              bucket vdi                      /
> + *                              +-----------------+------+ <----
> + *                              | name: coly/volvo|......|
> + *                              +-----------------+------+
> + */
> +
> +/* Account operations */
> +
> +int kv_create_account(const char *account)
> +{
> +	uint32_t vdi_id;
> +	return kv_create_hyper_volume(account, &vdi_id);
> +}
> +
> +typedef void (*list_cb)(struct http_request *req, const char *bucket,
> +			void *opaque);
> +
> +struct list_buckets_arg {
> +	struct http_request *req;
> +	void *opaque;
> +	list_cb cb;
> +	uint32_t bucket_counter;
> +};
> +
> +static void list_buckets_cb(void *data, enum btree_node_type type, void *arg)
> +{
> +	struct sd_extent *ext;
> +	struct list_buckets_arg *lbarg = arg;
> +	struct bucket_inode *bnode;
> +	uint64_t oid;
> +	char *buf = NULL;
> +	int ret;
> +
> +	if (type == BTREE_EXT) {
> +		ext = (struct sd_extent *)data;
> +		if (!ext->vdi_id)
> +			return;
> +
> +		buf = xzalloc(SD_DATA_OBJ_SIZE);
> +
> +		oid = vid_to_data_oid(ext->vdi_id, ext->idx);
> +		ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0);
> +		if (ret != SD_RES_SUCCESS) {
> +			sd_err("Failed to read data object %lx", oid);
> +			goto out;
> +		}
> +		/* loop all bucket_inodes in this data-object */
> +		for (int i = 0; i < BUCKETS_PER_SD_OBJ; i++) {
> +			bnode = (struct bucket_inode *)
> +				(buf + i * sizeof(struct bucket_inode));
> +			if (bnode->hdr.onode_vid == 0)
> +				continue;
> +			if (lbarg->cb)
> +				lbarg->cb(lbarg->req, bnode->hdr.bucket_name,
> +					  (void *)lbarg->opaque);
> +			lbarg->bucket_counter++;
> +		}
> +	}
> +out:
> +	free(buf);
> +}
> +
> +/* get number of buckets in this account */
> +static int kv_get_account(const char *account, uint32_t *nr_buckets)
> +{
> +	struct sd_inode inode;
> +	uint64_t oid;
> +	uint32_t account_vid;
> +	int ret;
> +
> +	ret = lookup_vdi(account, &account_vid);
> +	if (ret != SD_RES_SUCCESS)
> +		return ret;
> +
> +	/* read account vdi out */
> +	oid = vid_to_vdi_oid(account_vid);
> +	ret = sd_read_object(oid, (char *)&inode, sizeof(struct sd_inode), 0);
> +	if (ret != SD_RES_SUCCESS) {
> +		sd_err("Failed to read inode header %lx", oid);
> +		return ret;
> +	}
> +
> +	struct list_buckets_arg arg = {NULL, NULL, NULL, 0};
> +	traverse_btree(sheep_bnode_reader, &inode, list_buckets_cb, &arg);
> +	if (nr_buckets)
> +		*nr_buckets = arg.bucket_counter;
> +
> +	return SD_RES_SUCCESS;
> +}
> +
> +int kv_read_account(const char *account, uint32_t *nr_buckets)
> +{
> +	int ret;
> +
> +	ret = kv_get_account(account, nr_buckets);
> +	if (ret != SD_RES_SUCCESS)
> +		sd_err("Failed to get number of buckets in %s", account);
> +	return ret;
> +}
> +
> +int kv_update_account(const char *account)
> +{
> +	/* TODO: update metadata of the account */
> +	return -1;
> +}
> +
> +int kv_delete_account(const char *account)
> +{
> +	int ret;
> +
> +	ret = kv_delete_vdi(account);
> +	if (ret != SD_RES_SUCCESS)
> +		sd_err("Failed to delete vdi %s", account);
> +
> +	return ret;
> +}
> +
> +/* Bucket operations */
> +
> +static int lookup_bucket(struct http_request *req, const char *bucket,
> +			 uint32_t *vid)
> +{
> +	int ret;
> +	struct vdi_info info = {};
> +	struct vdi_iocb iocb = {
> +		.name = bucket,
> +		.data_len = strlen(bucket),
> +	};
> +
> +	ret = vdi_lookup(&iocb, &info);
>  	switch (ret) {
>  	case SD_RES_SUCCESS:
> -		http_response_header(req, CREATED);
> +		*vid = info.vid;
>  		break;
> -	case SD_RES_VDI_EXIST:
> -		http_response_header(req, ACCEPTED);
> +	case SD_RES_NO_VDI:
> +		sd_info("no such bucket %s", bucket);
> +		http_response_header(req, NOT_FOUND);
>  		break;
>  	default:
> -		sd_err("%s: bucket %s", sd_strerror(ret), bucket);
> +		sd_err("Failed to find bucket %s %s", bucket, sd_strerror(ret));
>  		http_response_header(req, INTERNAL_SERVER_ERROR);
> -		return -1;
>  	}
>  
> -	return 0;
> +	return ret;
>  }
>  
> -int kv_read_bucket(struct http_request *req, const char *bucket)
> +/*
> + * Delete bucket(container) inode in account vdi.
> + * idx: the target hash position of the bucket
> + * Return the position of bucket_inode in sd-data-object on success
> + * Return BUCKETS_PER_SD_OBJ if bucket_inode is not found
> + * Return -1 if an error happened
> + */
> +static int delete_bucket(struct sd_inode *account_inode, uint64_t idx,
> +			 const char *bucket)
>  {
> -	/* TODO: read metadata of the bucket */
> -	return -1;
> +	struct bucket_inode *bnode;
> +	char *buf = NULL;
> +	uint32_t vdi_id;
> +	uint64_t oid;
> +	uint64_t data_index = idx / BUCKETS_PER_SD_OBJ;
> +	int offset = idx % BUCKETS_PER_SD_OBJ;
> +	int ret, i, empty_buckets = 0, found = 0;
> +
> +	vdi_id = INODE_GET_VID(account_inode, data_index);
> +	if (!vdi_id) {
> +		sd_err("the %lu in vdi %s is not exists", data_index,
> +		       account_inode->name);
> +		ret = -1;
> +		goto out;
> +	}
> +
> +	oid = vid_to_data_oid(account_inode->vdi_id, data_index);
> +	buf = xzalloc(SD_DATA_OBJ_SIZE);
> +	ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0);
> +	if (ret != SD_RES_SUCCESS) {
> +		sd_err("Failed to read inode header %lx", oid);
> +		ret = -1;
> +		goto out;
> +	}
> +
> +	for (i = 0; i < BUCKETS_PER_SD_OBJ; i++) {
> +		char vdi_name[SD_MAX_VDI_LEN];
> +		bnode = (struct bucket_inode *)
> +			(buf + i * sizeof(struct bucket_inode));
> +		/* count all empty buckets in this sd-data-obj */
> +		if (bnode->hdr.onode_vid == 0) {
> +			empty_buckets++;
> +			continue;
> +		}
> +		if (strncmp(bnode->hdr.bucket_name, bucket, SD_MAX_BUCKET_NAME))
> +			continue;
> +
> +		if (i < offset)
> +			panic("postion of bucket inode %d is smaller than %d",
> +			      i, offset);
> +
> +		found = i;
> +		/* find the bnode */
> +		bnode->hdr.onode_vid = 0;
> +		snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s",
> +			 account_inode->name, bucket);
> +
> +		ret = kv_delete_vdi(vdi_name);
> +		if (ret != SD_RES_SUCCESS) {
> +			sd_err("Failed to delete vdi %s", vdi_name);
> +			ret = -1;
> +			goto out;
> +		}
> +		sd_debug("delete vdi %s success", vdi_name);
> +	}
> +
> +	if (!found) {
> +		ret = BUCKETS_PER_SD_OBJ;
> +		goto out;
> +	}
> +
> +	/*
> +	 * if only this bucket_inode is in the sd-data-obj,
> +	 * then delete this sd-data-obj
> +	 */
> +	if (empty_buckets == BUCKETS_PER_SD_OBJ - 1) {
> +		ret = discard_data_obj(oid);
> +		if (ret != SD_RES_SUCCESS) {
> +			ret = -1;
> +			goto out;
> +		}
> +		INODE_SET_VID(account_inode, data_index, 0);
> +		ret = sd_inode_write_vid(sheep_bnode_writer, account_inode,
> +					 data_index, vdi_id, vdi_id, 0, false,
> +					 false);
> +		if (ret != SD_RES_SUCCESS) {
> +			sd_err("Failed to write inode %x", vdi_id);
> +			ret = -1;
> +			goto out;
> +		}
> +		sd_debug("discard obj %lx and update vdi %x success",
> +			 oid, vdi_id);
> +	} else {
> +		ret = sd_write_object(oid, buf, sizeof(struct bucket_inode),
> +				   i * sizeof(struct bucket_inode), false);
> +		if (ret != SD_RES_SUCCESS) {
> +			sd_err("Failed to write object %lx", oid);
> +			ret = -1;
> +			goto out;
> +		}
> +	}
> +
> +	sd_debug("write object oid %lx success", oid);
> +	ret = found;
> +out:
> +	free(buf);
> +	return ret;
>  }
>  
> -int kv_update_bucket(struct http_request *req, const char *bucket)
> +/*
> + * Add bucket(container) inode into account vdi.
> + * idx: the target hash position of the bucket
> + * Return the position of bucket_inode in sd-data-object on success
> + * Return BUCKETS_PER_SD_OBJ if the data-object is full of bucket_inode
> + * Return -1 if an error happened
> + */
> +static int add_bucket(struct sd_inode *account_inode, uint64_t idx,
> +		      const char *bucket)
>  {
> -	/* TODO: update metadata of the bucket */
> -	return -1;
> +	struct bucket_inode *bnode;
> +	char *buf = NULL;
> +	uint32_t vdi_id;
> +	uint64_t oid;
> +	uint64_t data_index = idx / BUCKETS_PER_SD_OBJ;
> +	int offset = idx % BUCKETS_PER_SD_OBJ;
> +	int ret, i;
> +	bool create = false;
> +
> +	buf = xzalloc(SD_DATA_OBJ_SIZE);
> +
> +	vdi_id = INODE_GET_VID(account_inode, data_index);
> +	oid = vid_to_data_oid(account_inode->vdi_id, data_index);
> +	sd_debug("oid %x %lx %lx", account_inode->vdi_id, data_index, oid);
> +	/* the data object exists */
> +	if (vdi_id) {
> +		ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0);
> +		if (ret != SD_RES_SUCCESS) {
> +			sd_err("Failed to read inode header %lx", oid);
> +			ret = -1;
> +			goto out;
> +		}
> +	} else
> +		create = true;
> +
> +	sd_debug("bucket_inode offset %d %lu", offset, BUCKETS_PER_SD_OBJ);
> +	for (i = offset; i < BUCKETS_PER_SD_OBJ; i++) {
> +		char vdi_name[SD_MAX_VDI_LEN];
> +		bnode = (struct bucket_inode *)
> +			(buf + i * sizeof(struct bucket_inode));
> +		if (bnode->hdr.onode_vid != 0)
> +			continue;
> +
> +		/* the bnode not used */
> +		strncpy(bnode->hdr.bucket_name, bucket, SD_MAX_BUCKET_NAME);
> +		bnode->hdr.obj_count = 0;
> +		bnode->hdr.bytes_used = 0;
> +		snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s",
> +			 account_inode->name, bucket);
> +		ret = kv_create_hyper_volume(vdi_name, &(bnode->hdr.onode_vid));
> +		if (ret != SD_RES_SUCCESS) {
> +			sd_err("Failed to create hyper volume %d", ret);
> +			ret = -1;
> +			goto out;
> +		}
> +		sd_debug("create hyper volume %s success", vdi_name);
> +		break;
> +	}
> +
> +	if (i >= BUCKETS_PER_SD_OBJ) {
> +		ret = BUCKETS_PER_SD_OBJ;
> +		goto out;
> +	}
> +
> +	/* write bnode back to account-vdi */
> +	if (create)
> +		ret = sd_write_object(oid, buf, SD_DATA_OBJ_SIZE, 0, create);
> +	else
> +		ret = sd_write_object(oid, buf, sizeof(struct bucket_inode),
> +				   i * sizeof(struct bucket_inode), create);
> +
> +	if (ret != SD_RES_SUCCESS) {
> +		sd_err("Failed to write object %lx", oid);
> +		ret = -1;
> +		goto out;
> +	}
> +
> +	sd_debug("write object oid %lx success", oid);
> +
> +	/* update index of vdi */
> +	if (create) {
> +		vdi_id = account_inode->vdi_id;
> +		INODE_SET_VID(account_inode, data_index, vdi_id);
> +		ret = sd_inode_write_vid(sheep_bnode_writer, account_inode,
> +					 data_index, vdi_id, vdi_id, 0, false,
> +					 false);
> +		if (ret != SD_RES_SUCCESS) {
> +			sd_err("Failed to write inode %x", vdi_id);
> +			ret = -1;
> +			goto out;
> +		}
> +		sd_debug("write account inode success");
> +	}
> +
> +	ret = i;
> +out:
> +	free(buf);
> +	return ret;
>  }
>  
> -/* TODO: return HTTP_CONFLICT when the bucket is not empty */
> -int kv_delete_bucket(struct http_request *req, const char *bucket)
> +static int kv_get_bucket(struct sd_inode *account_inode, uint32_t account_vid,
> +			 const char *account, const char *bucket)

account_inode already has the vid, so we can remove the extra account_vid parameter.

Thanks
Yuan



More information about the sheepdog mailing list