[sheepdog] [PATCH v4 1/3] sheep/http: store accounts and containers into hyper volume for object-storage
Robin Dong
robin.k.dong at gmail.com
Fri Dec 13 11:49:30 CET 2013
2013/12/12 Liu Yuan <namei.unix at gmail.com>
> On Thu, Dec 12, 2013 at 06:15:55PM +0800, Robin Dong wrote:
> > From: Robin Dong <sanbai at taobao.com>
> >
> > Using hyper volume (size up to 16PB) to store large number of accounts
> > and containers.
> >
> > Signed-off-by: Robin Dong <sanbai at taobao.com>
> > ---
> > sheep/http/http.c | 5 +
> > sheep/http/http.h | 1 +
> > sheep/http/kv.c | 646
> +++++++++++++++++++++++++++++++++++++++++++++++------
> > sheep/http/kv.h | 22 +-
> > sheep/http/s3.c | 6 +-
> > sheep/http/swift.c | 108 ++++++---
> > 6 files changed, 676 insertions(+), 112 deletions(-)
> >
> > diff --git a/sheep/http/http.c b/sheep/http/http.c
> > index 04ef364..0081707 100644
> > --- a/sheep/http/http.c
> > +++ b/sheep/http/http.c
> > @@ -52,6 +52,7 @@ static inline const char *strstatus(enum http_status
> status)
> > [NO_CONTENT] = "204 No Content",
> > [PARTIAL_CONTENT] = "206 Partial Content",
> > [BAD_REQUEST] = "400 Bad Request",
> > + [UNAUTHORIZED] = "401 Unauthorized",
> > [NOT_FOUND] = "404 Not Found",
> > [METHOD_NOT_ALLOWED] = "405 Method Not Allowed",
> > [CONFLICT] = "409 Conflict",
> > @@ -192,6 +193,9 @@ void http_response_header(struct http_request *req,
> enum http_status status)
> >
> > req->status = status;
> > http_request_writef(req, "Status: %s\r\n", strstatus(status));
> > + if (req->opcode == HTTP_GET && req->data_length > 0)
> > + http_request_writef(req, "Content-Length: %lu\r\n",
> > + req->data_length);
> > http_request_writes(req, "Content-type: text/plain;\r\n\r\n");
> > }
> >
> > @@ -233,6 +237,7 @@ static void http_run_request(struct work *work)
> >
> > if (method != NULL) {
> > method(req);
> > + sd_debug("req->status %d", req->status);
> > if (req->status != UNKNOWN)
> > goto out;
> > }
> > diff --git a/sheep/http/http.h b/sheep/http/http.h
> > index 046d412..a8527d1 100644
> > --- a/sheep/http/http.h
> > +++ b/sheep/http/http.h
> > @@ -32,6 +32,7 @@ enum http_status {
> > NO_CONTENT, /* 204 */
> > PARTIAL_CONTENT, /* 206 */
> > BAD_REQUEST, /* 400 */
> > + UNAUTHORIZED, /* 401 */
> > NOT_FOUND, /* 404 */
> > METHOD_NOT_ALLOWED, /* 405 */
> > CONFLICT, /* 409 */
> > diff --git a/sheep/http/kv.c b/sheep/http/kv.c
> > index 8113389..8d33e37 100644
> > --- a/sheep/http/kv.c
> > +++ b/sheep/http/kv.c
> > @@ -16,14 +16,30 @@
> >
> > #define FOR_EACH_VDI(nr, vdis) FOR_EACH_BIT(nr, vdis, SD_NR_VDIS)
> >
> > -static int lookup_bucket(struct http_request *req, const char *bucket,
> > - uint32_t *vid)
> > +struct bucket_inode_hdr {
> > + char bucket_name[SD_MAX_BUCKET_NAME];
> > + uint64_t obj_count;
> > + uint64_t bytes_used;
> > + uint32_t onode_vid;
> > +};
> > +
> > +struct bucket_inode {
> > + union {
> > + struct bucket_inode_hdr hdr;
> > + uint8_t data[SD_MAX_BUCKET_NAME << 1];
> > + };
> > +};
> > +
> > +#define MAX_BUCKETS (SD_MAX_VDI_SIZE / sizeof(struct bucket_inode))
> > +#define BUCKETS_PER_SD_OBJ (SD_DATA_OBJ_SIZE / sizeof(struct
> bucket_inode))
> > +
> > +static int lookup_vdi(const char *name, uint32_t *vid)
> > {
> > int ret;
> > struct vdi_info info = {};
> > struct vdi_iocb iocb = {
> > - .name = bucket,
> > - .data_len = strlen(bucket),
> > + .name = name,
> > + .data_len = strlen(name),
> > };
> >
> > ret = vdi_lookup(&iocb, &info);
> > @@ -32,27 +48,23 @@ static int lookup_bucket(struct http_request *req,
> const char *bucket,
> > *vid = info.vid;
> > break;
> > case SD_RES_NO_VDI:
> > - sd_info("no such bucket %s", bucket);
> > - http_response_header(req, NOT_FOUND);
> > - return -1;
> > + sd_info("no such vdi %s", name);
> > + break;
> > default:
> > - sd_err("%s: bucket %s", sd_strerror(ret), bucket);
> > - http_response_header(req, INTERNAL_SERVER_ERROR);
> > - return -1;
> > + sd_err("Failed to find vdi %s %s", name, sd_strerror(ret));
> > }
> >
> > - return 0;
> > + return ret;
> > }
> >
> > -/* Bucket operations */
> > -
> > -int kv_create_bucket(struct http_request *req, const char *bucket)
> > +static int kv_create_hyper_volume(const char *name, uint32_t *vdi_id)
> > {
> > struct sd_req hdr;
> > + struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> > int ret;
> > char buf[SD_MAX_VDI_LEN] = {0};
> >
> > - pstrcpy(buf, SD_MAX_VDI_LEN, bucket);
> > + pstrcpy(buf, SD_MAX_VDI_LEN, name);
> >
> > sd_init_req(&hdr, SD_OP_NEW_VDI);
> > hdr.flags = SD_FLAG_CMD_WRITE;
> > @@ -64,104 +76,592 @@ int kv_create_bucket(struct http_request *req,
> const char *bucket)
> > hdr.vdi.store_policy = 1;
> >
> > ret = exec_local_req(&hdr, buf);
> > + if (rsp->result != SD_RES_SUCCESS)
> > + sd_err("Failed to create VDI %s: %s", name,
> > + sd_strerror(rsp->result));
> > +
> > + if (vdi_id)
> > + *vdi_id = rsp->vdi.vdi_id;
> > +
> > + return ret;
> > +}
> > +
> > +static int discard_data_obj(uint64_t oid)
> > +{
> > + int ret;
> > + struct sd_req hdr;
> > +
> > + sd_init_req(&hdr, SD_OP_DISCARD_OBJ);
> > + hdr.obj.oid = oid;
> > +
> > + ret = exec_local_req(&hdr, NULL);
> > + if (ret != SD_RES_SUCCESS)
> > + sd_err("Failed to discard data obj %lu %s", oid,
> > + sd_strerror(ret));
> > +
> > + return ret;
> > +}
> > +
> > +static int kv_delete_vdi(const char *name)
> > +{
> > + int ret;
> > + struct sd_req hdr;
> > + char data[SD_MAX_VDI_LEN] = {0};
> > + uint32_t vid;
> > +
> > + ret = lookup_vdi(name, &vid);
> > + if (ret != SD_RES_SUCCESS)
> > + return ret;
> > +
> > + sd_init_req(&hdr, SD_OP_DEL_VDI);
> > + hdr.flags = SD_FLAG_CMD_WRITE;
> > + hdr.data_length = sizeof(data);
> > + pstrcpy(data, SD_MAX_VDI_LEN, name);
> > +
> > + ret = exec_local_req(&hdr, data);
> > + if (ret != SD_RES_SUCCESS)
> > + sd_err("Failed to delete vdi %s %s", name,
> sd_strerror(ret));
> > +
> > + return ret;
> > +}
> > +
> > +/*
> > + * An account is actually a hyper volume vdi (up to 16PB),
> > + * all the buckets (or containers, identified by 'struct bucket_inode') are
> > + * stored in this hyper vdi using a hashing algorithm.
> > + * The bucket also has a hyper vdi named "account/bucket" which stores
> > + * 'struct kv_onodes'.
> > + *
> > + * For example: account "coly" has two buckets "jetta" and "volvo"
> > + *
> > + *
> > + * account vdi
> > + *
> +-----------+---+--------------------------+---+--------------------------+--
> > + * |name: coly |...|bucket_inode (name: jetta)|...|bucket_inode (name:
> volvo)|..
> > + *
> +-----------+---+--------------------------+---+--------------------------+--
> > + * | |
> > + * / |
> > + * bucket vdi / |
> > + * +-----------------+-------+ <-- |
> > + * |name: coly/jetta |.......| |
> > + * +-----------------+-------+ /
> > + * bucket vdi /
> > + * +-----------------+------+ <----
> > + * | name: coly/volvo|......|
> > + * +-----------------+------+
> > + */
> > +
> > +/* Account operations */
> > +
> > +int kv_create_account(const char *account)
> > +{
> > + uint32_t vdi_id;
> > + return kv_create_hyper_volume(account, &vdi_id);
> > +}
> > +
> > +typedef void (*list_cb)(struct http_request *req, const char *bucket,
> > + void *opaque);
> > +
> > +struct list_buckets_arg {
> > + struct http_request *req;
> > + void *opaque;
> > + list_cb cb;
> > + uint32_t bucket_counter;
> > +};
> > +
> > +static void list_buckets_cb(void *data, enum btree_node_type type, void
> *arg)
> > +{
> > + struct sd_extent *ext;
> > + struct list_buckets_arg *lbarg = arg;
> > + struct bucket_inode *bnode;
> > + uint64_t oid;
> > + char *buf = NULL;
> > + int ret;
> > +
> > + if (type == BTREE_EXT) {
> > + ext = (struct sd_extent *)data;
> > + if (!ext->vdi_id)
> > + return;
> > +
> > + buf = xzalloc(SD_DATA_OBJ_SIZE);
> > +
> > + oid = vid_to_data_oid(ext->vdi_id, ext->idx);
> > + ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0);
> > + if (ret != SD_RES_SUCCESS) {
> > + sd_err("Failed to read data object %lx", oid);
> > + goto out;
> > + }
> > + /* loop all bucket_inodes in this data-object */
> > + for (int i = 0; i < BUCKETS_PER_SD_OBJ; i++) {
> > + bnode = (struct bucket_inode *)
> > + (buf + i * sizeof(struct bucket_inode));
> > + if (bnode->hdr.onode_vid == 0)
> > + continue;
> > + if (lbarg->cb)
> > + lbarg->cb(lbarg->req,
> bnode->hdr.bucket_name,
> > + (void *)lbarg->opaque);
> > + lbarg->bucket_counter++;
> > + }
> > + }
> > +out:
> > + free(buf);
> > +}
> > +
> > +/* get number of buckets in this account */
> > +static int kv_get_account(const char *account, uint32_t *nr_buckets)
> > +{
> > + struct sd_inode inode;
> > + uint64_t oid;
> > + uint32_t account_vid;
> > + int ret;
> > +
> > + ret = lookup_vdi(account, &account_vid);
> > + if (ret != SD_RES_SUCCESS)
> > + return ret;
> > +
> > + /* read account vdi out */
> > + oid = vid_to_vdi_oid(account_vid);
> > + ret = sd_read_object(oid, (char *)&inode, sizeof(struct sd_inode),
> 0);
> > + if (ret != SD_RES_SUCCESS) {
> > + sd_err("Failed to read inode header %lx", oid);
> > + return ret;
> > + }
> > +
> > + struct list_buckets_arg arg = {NULL, NULL, NULL, 0};
> > + traverse_btree(sheep_bnode_reader, &inode, list_buckets_cb, &arg);
> > + if (nr_buckets)
> > + *nr_buckets = arg.bucket_counter;
> > +
> > + return SD_RES_SUCCESS;
> > +}
> > +
> > +int kv_read_account(const char *account, uint32_t *nr_buckets)
> > +{
> > + int ret;
> > +
> > + ret = kv_get_account(account, nr_buckets);
> > + if (ret != SD_RES_SUCCESS)
> > + sd_err("Failed to get number of buckets in %s", account);
> > + return ret;
> > +}
> > +
> > +int kv_update_account(const char *account)
> > +{
> > + /* TODO: update metadata of the account */
> > + return -1;
> > +}
> > +
> > +int kv_delete_account(const char *account)
> > +{
> > + int ret;
> > +
> > + ret = kv_delete_vdi(account);
> > + if (ret != SD_RES_SUCCESS)
> > + sd_err("Failed to delete vdi %s", account);
> > +
> > + return ret;
> > +}
> > +
> > +/* Bucket operations */
> > +
> > +static int lookup_bucket(struct http_request *req, const char *bucket,
> > + uint32_t *vid)
> > +{
> > + int ret;
> > + struct vdi_info info = {};
> > + struct vdi_iocb iocb = {
> > + .name = bucket,
> > + .data_len = strlen(bucket),
> > + };
> > +
> > + ret = vdi_lookup(&iocb, &info);
> > switch (ret) {
> > case SD_RES_SUCCESS:
> > - http_response_header(req, CREATED);
> > + *vid = info.vid;
> > break;
> > - case SD_RES_VDI_EXIST:
> > - http_response_header(req, ACCEPTED);
> > + case SD_RES_NO_VDI:
> > + sd_info("no such bucket %s", bucket);
> > + http_response_header(req, NOT_FOUND);
> > break;
> > default:
> > - sd_err("%s: bucket %s", sd_strerror(ret), bucket);
> > + sd_err("Failed to find bucket %s %s", bucket,
> sd_strerror(ret));
> > http_response_header(req, INTERNAL_SERVER_ERROR);
> > - return -1;
> > }
> >
> > - return 0;
> > + return ret;
> > }
> >
> > -int kv_read_bucket(struct http_request *req, const char *bucket)
> > +/*
> > + * Delete bucket(container) inode in account vdi.
> > + * idx: the target hash position of bucket
> > + * Return the position of bucket_inode in sd-data-object if success
> > + * Return BUCKETS_PER_SD_OBJ if bucket_inode is not found
> > + * Return -1 if some error happened
> > + */
> > +static int delete_bucket(struct sd_inode *account_inode, uint64_t idx,
> > + const char *bucket)
> > {
> > - /* TODO: read metadata of the bucket */
> > - return -1;
> > + struct bucket_inode *bnode;
> > + char *buf = NULL;
> > + uint32_t vdi_id;
> > + uint64_t oid;
> > + uint64_t data_index = idx / BUCKETS_PER_SD_OBJ;
> > + int offset = idx % BUCKETS_PER_SD_OBJ;
> > + int ret, i, empty_buckets = 0, found = 0;
> > +
> > + vdi_id = INODE_GET_VID(account_inode, data_index);
> > + if (!vdi_id) {
> > + sd_err("the %lu in vdi %s is not exists", data_index,
> > + account_inode->name);
> > + ret = -1;
> > + goto out;
> > + }
> > +
> > + oid = vid_to_data_oid(account_inode->vdi_id, data_index);
> > + buf = xzalloc(SD_DATA_OBJ_SIZE);
> > + ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0);
> > + if (ret != SD_RES_SUCCESS) {
> > + sd_err("Failed to read inode header %lx", oid);
> > + ret = -1;
> > + goto out;
> > + }
> > +
> > + for (i = 0; i < BUCKETS_PER_SD_OBJ; i++) {
> > + char vdi_name[SD_MAX_VDI_LEN];
> > + bnode = (struct bucket_inode *)
> > + (buf + i * sizeof(struct bucket_inode));
> > + /* count all empty buckets in this sd-data-obj */
> > + if (bnode->hdr.onode_vid == 0) {
> > + empty_buckets++;
> > + continue;
> > + }
> > + if (strncmp(bnode->hdr.bucket_name, bucket,
> SD_MAX_BUCKET_NAME))
> > + continue;
> > +
> > + if (i < offset)
> > + panic("postion of bucket inode %d is smaller than
> %d",
> > + i, offset);
> > +
> > + found = i;
> > + /* find the bnode */
> > + bnode->hdr.onode_vid = 0;
> > + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s",
> > + account_inode->name, bucket);
> > +
> > + ret = kv_delete_vdi(vdi_name);
> > + if (ret != SD_RES_SUCCESS) {
> > + sd_err("Failed to delete vdi %s", vdi_name);
> > + ret = -1;
> > + goto out;
> > + }
> > + sd_debug("delete vdi %s success", vdi_name);
> > + }
> > +
> > + if (!found) {
> > + ret = BUCKETS_PER_SD_OBJ;
> > + goto out;
> > + }
> > +
> > + /*
> > + * if only this bucket_inode is in the sd-data-obj,
> > + * then delete this sd-data-obj
> > + */
> > + if (empty_buckets == BUCKETS_PER_SD_OBJ - 1) {
> > + ret = discard_data_obj(oid);
> > + if (ret != SD_RES_SUCCESS) {
> > + ret = -1;
> > + goto out;
> > + }
> > + INODE_SET_VID(account_inode, data_index, 0);
> > + ret = sd_inode_write_vid(sheep_bnode_writer, account_inode,
> > + data_index, vdi_id, vdi_id, 0,
> false,
> > + false);
> > + if (ret != SD_RES_SUCCESS) {
> > + sd_err("Failed to write inode %x", vdi_id);
> > + ret = -1;
> > + goto out;
> > + }
> > + sd_debug("discard obj %lx and update vdi %x success",
> > + oid, vdi_id);
> > + } else {
> > + ret = sd_write_object(oid, buf, sizeof(struct
> bucket_inode),
> > + i * sizeof(struct bucket_inode), false);
> > + if (ret != SD_RES_SUCCESS) {
> > + sd_err("Failed to write object %lx", oid);
> > + ret = -1;
> > + goto out;
> > + }
> > + }
> > +
> > + sd_debug("write object oid %lx success", oid);
> > + ret = found;
> > +out:
> > + free(buf);
> > + return ret;
> > }
> >
> > -int kv_update_bucket(struct http_request *req, const char *bucket)
> > +/*
> > + * Add bucket(container) inode into account vdi.
> > + * idx: the target hash position of bucket
> > + * Return the position of bucket_inode in sd-data-object if success
> > + * Return BUCKETS_PER_SD_OBJ if the data-object is full of bucket_inode
> > + * Return -1 if some error happened
> > + */
> > +static int add_bucket(struct sd_inode *account_inode, uint64_t idx,
> > + const char *bucket)
> > {
> > - /* TODO: update metadata of the bucket */
> > - return -1;
> > + struct bucket_inode *bnode;
> > + char *buf = NULL;
> > + uint32_t vdi_id;
> > + uint64_t oid;
> > + uint64_t data_index = idx / BUCKETS_PER_SD_OBJ;
> > + int offset = idx % BUCKETS_PER_SD_OBJ;
> > + int ret, i;
> > + bool create = false;
> > +
> > + buf = xzalloc(SD_DATA_OBJ_SIZE);
> > +
> > + vdi_id = INODE_GET_VID(account_inode, data_index);
> > + oid = vid_to_data_oid(account_inode->vdi_id, data_index);
> > + sd_debug("oid %x %lx %lx", account_inode->vdi_id, data_index, oid);
> > + /* the data object is exists */
> > + if (vdi_id) {
> > + ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0);
> > + if (ret != SD_RES_SUCCESS) {
> > + sd_err("Failed to read inode header %lx", oid);
> > + ret = -1;
> > + goto out;
> > + }
> > + } else
> > + create = true;
> > +
> > + sd_debug("bucket_inode offset %d %lu", offset, BUCKETS_PER_SD_OBJ);
> > + for (i = offset; i < BUCKETS_PER_SD_OBJ; i++) {
> > + char vdi_name[SD_MAX_VDI_LEN];
> > + bnode = (struct bucket_inode *)
> > + (buf + i * sizeof(struct bucket_inode));
> > + if (bnode->hdr.onode_vid != 0)
> > + continue;
> > +
> > + /* the bnode not used */
> > + strncpy(bnode->hdr.bucket_name, bucket,
> SD_MAX_BUCKET_NAME);
> > + bnode->hdr.obj_count = 0;
> > + bnode->hdr.bytes_used = 0;
> > + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s",
> > + account_inode->name, bucket);
> > + ret = kv_create_hyper_volume(vdi_name,
> &(bnode->hdr.onode_vid));
> > + if (ret != SD_RES_SUCCESS) {
> > + sd_err("Failed to create hyper volume %d", ret);
> > + ret = -1;
> > + goto out;
> > + }
> > + sd_debug("create hyper volume %s success", vdi_name);
> > + break;
> > + }
> > +
> > + if (i >= BUCKETS_PER_SD_OBJ) {
> > + ret = BUCKETS_PER_SD_OBJ;
> > + goto out;
> > + }
> > +
> > + /* write bnode back to account-vdi */
> > + if (create)
> > + ret = sd_write_object(oid, buf, SD_DATA_OBJ_SIZE, 0,
> create);
> > + else
> > + ret = sd_write_object(oid, buf, sizeof(struct
> bucket_inode),
> > + i * sizeof(struct bucket_inode),
> create);
> > +
> > + if (ret != SD_RES_SUCCESS) {
> > + sd_err("Failed to write object %lx", oid);
> > + ret = -1;
> > + goto out;
> > + }
> > +
> > + sd_debug("write object oid %lx success", oid);
> > +
> > + /* update index of vdi */
> > + if (create) {
> > + vdi_id = account_inode->vdi_id;
> > + INODE_SET_VID(account_inode, data_index, vdi_id);
> > + ret = sd_inode_write_vid(sheep_bnode_writer, account_inode,
> > + data_index, vdi_id, vdi_id, 0,
> false,
> > + false);
> > + if (ret != SD_RES_SUCCESS) {
> > + sd_err("Failed to write inode %x", vdi_id);
> > + ret = -1;
> > + goto out;
> > + }
> > + sd_debug("write account inode success");
> > + }
> > +
> > + ret = i;
> > +out:
> > + free(buf);
> > + return ret;
> > }
> >
> > -/* TODO: return HTTP_CONFLICT when the bucket is not empty */
> > -int kv_delete_bucket(struct http_request *req, const char *bucket)
> > +static int kv_get_bucket(struct sd_inode *account_inode, uint32_t
> account_vid,
> > + const char *account, const char *bucket)
>
> account_inode has the vid, so we can remove the extra account_vid
> parameter
>
No, we can't: the account_inode passed in is empty at that point, so it does not carry the vid yet.
>
> Thanks
> Yuan
>
--
--
Best Regards
Robin Dong
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.wpkg.org/pipermail/sheepdog/attachments/20131213/ee2d1277/attachment-0004.html>
More information about the sheepdog
mailing list