[sheepdog] [PATCH v4 1/3] sheep/http: store accounts and containers into hyper volume for object-storage
Liu Yuan
namei.unix at gmail.com
Thu Dec 12 13:12:31 CET 2013
On Thu, Dec 12, 2013 at 06:15:55PM +0800, Robin Dong wrote:
> From: Robin Dong <sanbai at taobao.com>
>
> Using hyper volume (size up to 16PB) to store large number of accounts
> and containers.
>
> Signed-off-by: Robin Dong <sanbai at taobao.com>
> ---
> sheep/http/http.c | 5 +
> sheep/http/http.h | 1 +
> sheep/http/kv.c | 646 +++++++++++++++++++++++++++++++++++++++++++++++------
> sheep/http/kv.h | 22 +-
> sheep/http/s3.c | 6 +-
> sheep/http/swift.c | 108 ++++++---
> 6 files changed, 676 insertions(+), 112 deletions(-)
>
> diff --git a/sheep/http/http.c b/sheep/http/http.c
> index 04ef364..0081707 100644
> --- a/sheep/http/http.c
> +++ b/sheep/http/http.c
> @@ -52,6 +52,7 @@ static inline const char *strstatus(enum http_status status)
> [NO_CONTENT] = "204 No Content",
> [PARTIAL_CONTENT] = "206 Partial Content",
> [BAD_REQUEST] = "400 Bad Request",
> + [UNAUTHORIZED] = "401 Unauthorized",
> [NOT_FOUND] = "404 Not Found",
> [METHOD_NOT_ALLOWED] = "405 Method Not Allowed",
> [CONFLICT] = "409 Conflict",
> @@ -192,6 +193,9 @@ void http_response_header(struct http_request *req, enum http_status status)
>
> req->status = status;
> http_request_writef(req, "Status: %s\r\n", strstatus(status));
> + if (req->opcode == HTTP_GET && req->data_length > 0)
> + http_request_writef(req, "Content-Length: %lu\r\n",
> + req->data_length);
> http_request_writes(req, "Content-type: text/plain;\r\n\r\n");
> }
>
> @@ -233,6 +237,7 @@ static void http_run_request(struct work *work)
>
> if (method != NULL) {
> method(req);
> + sd_debug("req->status %d", req->status);
> if (req->status != UNKNOWN)
> goto out;
> }
> diff --git a/sheep/http/http.h b/sheep/http/http.h
> index 046d412..a8527d1 100644
> --- a/sheep/http/http.h
> +++ b/sheep/http/http.h
> @@ -32,6 +32,7 @@ enum http_status {
> NO_CONTENT, /* 204 */
> PARTIAL_CONTENT, /* 206 */
> BAD_REQUEST, /* 400 */
> + UNAUTHORIZED, /* 401 */
> NOT_FOUND, /* 404 */
> METHOD_NOT_ALLOWED, /* 405 */
> CONFLICT, /* 409 */
> diff --git a/sheep/http/kv.c b/sheep/http/kv.c
> index 8113389..8d33e37 100644
> --- a/sheep/http/kv.c
> +++ b/sheep/http/kv.c
> @@ -16,14 +16,30 @@
>
> #define FOR_EACH_VDI(nr, vdis) FOR_EACH_BIT(nr, vdis, SD_NR_VDIS)
>
> -static int lookup_bucket(struct http_request *req, const char *bucket,
> - uint32_t *vid)
> +struct bucket_inode_hdr {
> + char bucket_name[SD_MAX_BUCKET_NAME];
> + uint64_t obj_count;
> + uint64_t bytes_used;
> + uint32_t onode_vid;
> +};
> +
> +struct bucket_inode {
> + union {
> + struct bucket_inode_hdr hdr;
> + uint8_t data[SD_MAX_BUCKET_NAME << 1];
> + };
> +};
> +
> +#define MAX_BUCKETS (SD_MAX_VDI_SIZE / sizeof(struct bucket_inode))
> +#define BUCKETS_PER_SD_OBJ (SD_DATA_OBJ_SIZE / sizeof(struct bucket_inode))
> +
> +static int lookup_vdi(const char *name, uint32_t *vid)
> {
> int ret;
> struct vdi_info info = {};
> struct vdi_iocb iocb = {
> - .name = bucket,
> - .data_len = strlen(bucket),
> + .name = name,
> + .data_len = strlen(name),
> };
>
> ret = vdi_lookup(&iocb, &info);
> @@ -32,27 +48,23 @@ static int lookup_bucket(struct http_request *req, const char *bucket,
> *vid = info.vid;
> break;
> case SD_RES_NO_VDI:
> - sd_info("no such bucket %s", bucket);
> - http_response_header(req, NOT_FOUND);
> - return -1;
> + sd_info("no such vdi %s", name);
> + break;
> default:
> - sd_err("%s: bucket %s", sd_strerror(ret), bucket);
> - http_response_header(req, INTERNAL_SERVER_ERROR);
> - return -1;
> + sd_err("Failed to find vdi %s %s", name, sd_strerror(ret));
> }
>
> - return 0;
> + return ret;
> }
>
> -/* Bucket operations */
> -
> -int kv_create_bucket(struct http_request *req, const char *bucket)
> +static int kv_create_hyper_volume(const char *name, uint32_t *vdi_id)
> {
> struct sd_req hdr;
> + struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> int ret;
> char buf[SD_MAX_VDI_LEN] = {0};
>
> - pstrcpy(buf, SD_MAX_VDI_LEN, bucket);
> + pstrcpy(buf, SD_MAX_VDI_LEN, name);
>
> sd_init_req(&hdr, SD_OP_NEW_VDI);
> hdr.flags = SD_FLAG_CMD_WRITE;
> @@ -64,104 +76,592 @@ int kv_create_bucket(struct http_request *req, const char *bucket)
> hdr.vdi.store_policy = 1;
>
> ret = exec_local_req(&hdr, buf);
> + if (rsp->result != SD_RES_SUCCESS)
> + sd_err("Failed to create VDI %s: %s", name,
> + sd_strerror(rsp->result));
> +
> + if (vdi_id)
> + *vdi_id = rsp->vdi.vdi_id;
> +
> + return ret;
> +}
> +
> +static int discard_data_obj(uint64_t oid)
> +{
> + int ret;
> + struct sd_req hdr;
> +
> + sd_init_req(&hdr, SD_OP_DISCARD_OBJ);
> + hdr.obj.oid = oid;
> +
> + ret = exec_local_req(&hdr, NULL);
> + if (ret != SD_RES_SUCCESS)
> + sd_err("Failed to discard data obj %lu %s", oid,
> + sd_strerror(ret));
> +
> + return ret;
> +}
> +
> +static int kv_delete_vdi(const char *name)
> +{
> + int ret;
> + struct sd_req hdr;
> + char data[SD_MAX_VDI_LEN] = {0};
> + uint32_t vid;
> +
> + ret = lookup_vdi(name, &vid);
> + if (ret != SD_RES_SUCCESS)
> + return ret;
> +
> + sd_init_req(&hdr, SD_OP_DEL_VDI);
> + hdr.flags = SD_FLAG_CMD_WRITE;
> + hdr.data_length = sizeof(data);
> + pstrcpy(data, SD_MAX_VDI_LEN, name);
> +
> + ret = exec_local_req(&hdr, data);
> + if (ret != SD_RES_SUCCESS)
> + sd_err("Failed to delete vdi %s %s", name, sd_strerror(ret));
> +
> + return ret;
> +}
> +
> +/*
> + * An account is actually a hyper volume vdi (up to 16PB),
> + * all the buckets (or containers, identified by 'struct bucket_inode') are
> + * stores in this hyper vdi using hashing algorithm.
> + * The bucket also has a hyper vdi named "account/bucket" which stores
> + * 'struct kv_onodes'.
> + *
> + * For example: account "coly" has two buckets "jetta" and "volvo"
> + *
> + *
> + * account vdi
> + * +-----------+---+--------------------------+---+--------------------------+--
> + * |name: coly |...|bucket_inode (name: jetta)|...|bucket_inode (name: volvo)|..
> + * +-----------+---+--------------------------+---+--------------------------+--
> + * | |
> + * / |
> + * bucket vdi / |
> + * +-----------------+-------+ <-- |
> + * |name: coly/jetta |.......| |
> + * +-----------------+-------+ /
> + * bucket vdi /
> + * +-----------------+------+ <----
> + * | name: coly/volvo|......|
> + * +-----------------+------+
> + */
> +
> +/* Account operations */
> +
> +int kv_create_account(const char *account)
> +{
> + uint32_t vdi_id;
> + return kv_create_hyper_volume(account, &vdi_id);
> +}
> +
> +typedef void (*list_cb)(struct http_request *req, const char *bucket,
> + void *opaque);
> +
> +struct list_buckets_arg {
> + struct http_request *req;
> + void *opaque;
> + list_cb cb;
> + uint32_t bucket_counter;
> +};
> +
> +static void list_buckets_cb(void *data, enum btree_node_type type, void *arg)
> +{
> + struct sd_extent *ext;
> + struct list_buckets_arg *lbarg = arg;
> + struct bucket_inode *bnode;
> + uint64_t oid;
> + char *buf = NULL;
> + int ret;
> +
> + if (type == BTREE_EXT) {
> + ext = (struct sd_extent *)data;
> + if (!ext->vdi_id)
> + return;
> +
> + buf = xzalloc(SD_DATA_OBJ_SIZE);
> +
> + oid = vid_to_data_oid(ext->vdi_id, ext->idx);
> + ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to read data object %lx", oid);
> + goto out;
> + }
> + /* loop all bucket_inodes in this data-object */
> + for (int i = 0; i < BUCKETS_PER_SD_OBJ; i++) {
> + bnode = (struct bucket_inode *)
> + (buf + i * sizeof(struct bucket_inode));
> + if (bnode->hdr.onode_vid == 0)
> + continue;
> + if (lbarg->cb)
> + lbarg->cb(lbarg->req, bnode->hdr.bucket_name,
> + (void *)lbarg->opaque);
> + lbarg->bucket_counter++;
> + }
> + }
> +out:
> + free(buf);
> +}
> +
> +/* get number of buckets in this account */
> +static int kv_get_account(const char *account, uint32_t *nr_buckets)
> +{
> + struct sd_inode inode;
> + uint64_t oid;
> + uint32_t account_vid;
> + int ret;
> +
> + ret = lookup_vdi(account, &account_vid);
> + if (ret != SD_RES_SUCCESS)
> + return ret;
> +
> + /* read account vdi out */
> + oid = vid_to_vdi_oid(account_vid);
> + ret = sd_read_object(oid, (char *)&inode, sizeof(struct sd_inode), 0);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to read inode header %lx", oid);
> + return ret;
> + }
> +
> + struct list_buckets_arg arg = {NULL, NULL, NULL, 0};
> + traverse_btree(sheep_bnode_reader, &inode, list_buckets_cb, &arg);
> + if (nr_buckets)
> + *nr_buckets = arg.bucket_counter;
> +
> + return SD_RES_SUCCESS;
> +}
> +
> +int kv_read_account(const char *account, uint32_t *nr_buckets)
> +{
> + int ret;
> +
> + ret = kv_get_account(account, nr_buckets);
> + if (ret != SD_RES_SUCCESS)
> + sd_err("Failed to get number of buckets in %s", account);
> + return ret;
> +}
> +
> +int kv_update_account(const char *account)
> +{
> + /* TODO: update metadata of the account */
> + return -1;
> +}
> +
> +int kv_delete_account(const char *account)
> +{
> + int ret;
> +
> + ret = kv_delete_vdi(account);
> + if (ret != SD_RES_SUCCESS)
> + sd_err("Failed to delete vdi %s", account);
> +
> + return ret;
> +}
> +
> +/* Bucket operations */
> +
> +static int lookup_bucket(struct http_request *req, const char *bucket,
> + uint32_t *vid)
> +{
> + int ret;
> + struct vdi_info info = {};
> + struct vdi_iocb iocb = {
> + .name = bucket,
> + .data_len = strlen(bucket),
> + };
> +
> + ret = vdi_lookup(&iocb, &info);
> switch (ret) {
> case SD_RES_SUCCESS:
> - http_response_header(req, CREATED);
> + *vid = info.vid;
> break;
> - case SD_RES_VDI_EXIST:
> - http_response_header(req, ACCEPTED);
> + case SD_RES_NO_VDI:
> + sd_info("no such bucket %s", bucket);
> + http_response_header(req, NOT_FOUND);
> break;
> default:
> - sd_err("%s: bucket %s", sd_strerror(ret), bucket);
> + sd_err("Failed to find bucket %s %s", bucket, sd_strerror(ret));
> http_response_header(req, INTERNAL_SERVER_ERROR);
> - return -1;
> }
>
> - return 0;
> + return ret;
> }
>
> -int kv_read_bucket(struct http_request *req, const char *bucket)
> +/*
> + * Delete bucket(container) inode in account vdi.
> + * idx: the target hash positon of bucket
> + * Return the position of bucket_inode in sd-data-object if success
> + * Return BUCKETS_PER_SD_OBJ if bucket_inode is not found
> + * Return -1 if some errors happend
> + */
> +static int delete_bucket(struct sd_inode *account_inode, uint64_t idx,
> + const char *bucket)
> {
> - /* TODO: read metadata of the bucket */
> - return -1;
> + struct bucket_inode *bnode;
> + char *buf = NULL;
> + uint32_t vdi_id;
> + uint64_t oid;
> + uint64_t data_index = idx / BUCKETS_PER_SD_OBJ;
> + int offset = idx % BUCKETS_PER_SD_OBJ;
> + int ret, i, empty_buckets = 0, found = 0;
> +
> + vdi_id = INODE_GET_VID(account_inode, data_index);
> + if (!vdi_id) {
> + sd_err("the %lu in vdi %s is not exists", data_index,
> + account_inode->name);
> + ret = -1;
> + goto out;
> + }
> +
> + oid = vid_to_data_oid(account_inode->vdi_id, data_index);
> + buf = xzalloc(SD_DATA_OBJ_SIZE);
> + ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to read inode header %lx", oid);
> + ret = -1;
> + goto out;
> + }
> +
> + for (i = 0; i < BUCKETS_PER_SD_OBJ; i++) {
> + char vdi_name[SD_MAX_VDI_LEN];
> + bnode = (struct bucket_inode *)
> + (buf + i * sizeof(struct bucket_inode));
> + /* count all empty buckets in this sd-data-obj */
> + if (bnode->hdr.onode_vid == 0) {
> + empty_buckets++;
> + continue;
> + }
> + if (strncmp(bnode->hdr.bucket_name, bucket, SD_MAX_BUCKET_NAME))
> + continue;
> +
> + if (i < offset)
> + panic("postion of bucket inode %d is smaller than %d",
> + i, offset);
> +
> + found = i;
> + /* find the bnode */
> + bnode->hdr.onode_vid = 0;
> + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s",
> + account_inode->name, bucket);
> +
> + ret = kv_delete_vdi(vdi_name);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to delete vdi %s", vdi_name);
> + ret = -1;
> + goto out;
> + }
> + sd_debug("delete vdi %s success", vdi_name);
> + }
> +
> + if (!found) {
> + ret = BUCKETS_PER_SD_OBJ;
> + goto out;
> + }
> +
> + /*
> + * if only this bucket_inode is in the sd-data-obj,
> + * then delete this sd-data-obj
> + */
> + if (empty_buckets == BUCKETS_PER_SD_OBJ - 1) {
> + ret = discard_data_obj(oid);
> + if (ret != SD_RES_SUCCESS) {
> + ret = -1;
> + goto out;
> + }
> + INODE_SET_VID(account_inode, data_index, 0);
> + ret = sd_inode_write_vid(sheep_bnode_writer, account_inode,
> + data_index, vdi_id, vdi_id, 0, false,
> + false);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to write inode %x", vdi_id);
> + ret = -1;
> + goto out;
> + }
> + sd_debug("discard obj %lx and update vdi %x success",
> + oid, vdi_id);
> + } else {
> + ret = sd_write_object(oid, buf, sizeof(struct bucket_inode),
> + i * sizeof(struct bucket_inode), false);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to write object %lx", oid);
> + ret = -1;
> + goto out;
> + }
> + }
> +
> + sd_debug("write object oid %lx success", oid);
> + ret = found;
> +out:
> + free(buf);
> + return ret;
> }
>
> -int kv_update_bucket(struct http_request *req, const char *bucket)
> +/*
> + * Add bucket(container) inode into account vdi.
> + * idx: the target hash positon of bucket
> + * Return the position of bucket_inode in sd-data-object if success
> + * Return BUCKETS_PER_SD_OBJ if the data-object is full of bucket_inode
> + * Return -1 if some error happend
> + */
> +static int add_bucket(struct sd_inode *account_inode, uint64_t idx,
> + const char *bucket)
> {
> - /* TODO: update metadata of the bucket */
> - return -1;
> + struct bucket_inode *bnode;
> + char *buf = NULL;
> + uint32_t vdi_id;
> + uint64_t oid;
> + uint64_t data_index = idx / BUCKETS_PER_SD_OBJ;
> + int offset = idx % BUCKETS_PER_SD_OBJ;
> + int ret, i;
> + bool create = false;
> +
> + buf = xzalloc(SD_DATA_OBJ_SIZE);
> +
> + vdi_id = INODE_GET_VID(account_inode, data_index);
> + oid = vid_to_data_oid(account_inode->vdi_id, data_index);
> + sd_debug("oid %x %lx %lx", account_inode->vdi_id, data_index, oid);
> + /* the data object is exists */
> + if (vdi_id) {
> + ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to read inode header %lx", oid);
> + ret = -1;
> + goto out;
> + }
> + } else
> + create = true;
> +
> + sd_debug("bucket_inode offset %d %lu", offset, BUCKETS_PER_SD_OBJ);
> + for (i = offset; i < BUCKETS_PER_SD_OBJ; i++) {
> + char vdi_name[SD_MAX_VDI_LEN];
> + bnode = (struct bucket_inode *)
> + (buf + i * sizeof(struct bucket_inode));
> + if (bnode->hdr.onode_vid != 0)
> + continue;
> +
> + /* the bnode not used */
> + strncpy(bnode->hdr.bucket_name, bucket, SD_MAX_BUCKET_NAME);
> + bnode->hdr.obj_count = 0;
> + bnode->hdr.bytes_used = 0;
> + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s",
> + account_inode->name, bucket);
> + ret = kv_create_hyper_volume(vdi_name, &(bnode->hdr.onode_vid));
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to create hyper volume %d", ret);
> + ret = -1;
> + goto out;
> + }
> + sd_debug("create hyper volume %s success", vdi_name);
> + break;
> + }
> +
> + if (i >= BUCKETS_PER_SD_OBJ) {
> + ret = BUCKETS_PER_SD_OBJ;
> + goto out;
> + }
> +
> + /* write bnode back to account-vdi */
> + if (create)
> + ret = sd_write_object(oid, buf, SD_DATA_OBJ_SIZE, 0, create);
> + else
> + ret = sd_write_object(oid, buf, sizeof(struct bucket_inode),
> + i * sizeof(struct bucket_inode), create);
> +
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to write object %lx", oid);
> + ret = -1;
> + goto out;
> + }
> +
> + sd_debug("write object oid %lx success", oid);
> +
> + /* update index of vdi */
> + if (create) {
> + vdi_id = account_inode->vdi_id;
> + INODE_SET_VID(account_inode, data_index, vdi_id);
> + ret = sd_inode_write_vid(sheep_bnode_writer, account_inode,
> + data_index, vdi_id, vdi_id, 0, false,
> + false);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("Failed to write inode %x", vdi_id);
> + ret = -1;
> + goto out;
> + }
> + sd_debug("write account inode success");
> + }
> +
> + ret = i;
> +out:
> + free(buf);
> + return ret;
> }
>
> -/* TODO: return HTTP_CONFLICT when the bucket is not empty */
> -int kv_delete_bucket(struct http_request *req, const char *bucket)
> +static int kv_get_bucket(struct sd_inode *account_inode, uint32_t account_vid,
> + const char *account, const char *bucket)
account_inode already has the vid (account_inode->vdi_id), so we can remove the extra account_vid parameter.
Thanks
Yuan
More information about the sheepdog mailing list