[sheepdog] [PATCH v4 1/3] sheep/http: store accounts and containers into hyper volume for object-storage

Robin Dong robin.k.dong at gmail.com
Fri Dec 13 11:49:30 CET 2013


2013/12/12 Liu Yuan <namei.unix at gmail.com>

> On Thu, Dec 12, 2013 at 06:15:55PM +0800, Robin Dong wrote:
> > From: Robin Dong <sanbai at taobao.com>
> >
> > Using hyper volume (size up to 16PB) to store large number of accounts
> > and containers.
> >
> > Signed-off-by: Robin Dong <sanbai at taobao.com>
> > ---
> >  sheep/http/http.c  |   5 +
> >  sheep/http/http.h  |   1 +
> >  sheep/http/kv.c    | 646
> +++++++++++++++++++++++++++++++++++++++++++++++------
> >  sheep/http/kv.h    |  22 +-
> >  sheep/http/s3.c    |   6 +-
> >  sheep/http/swift.c | 108 ++++++---
> >  6 files changed, 676 insertions(+), 112 deletions(-)
> >
> > diff --git a/sheep/http/http.c b/sheep/http/http.c
> > index 04ef364..0081707 100644
> > --- a/sheep/http/http.c
> > +++ b/sheep/http/http.c
> > @@ -52,6 +52,7 @@ static inline const char *strstatus(enum http_status
> status)
> >               [NO_CONTENT] = "204 No Content",
> >               [PARTIAL_CONTENT] = "206 Partial Content",
> >               [BAD_REQUEST] = "400 Bad Request",
> > +             [UNAUTHORIZED] = "401 Unauthorized",
> >               [NOT_FOUND] = "404 Not Found",
> >               [METHOD_NOT_ALLOWED] = "405 Method Not Allowed",
> >               [CONFLICT] = "409 Conflict",
> > @@ -192,6 +193,9 @@ void http_response_header(struct http_request *req,
> enum http_status status)
> >
> >       req->status = status;
> >       http_request_writef(req, "Status: %s\r\n", strstatus(status));
> > +     if (req->opcode == HTTP_GET && req->data_length > 0)
> > +             http_request_writef(req, "Content-Length: %lu\r\n",
> > +                                 req->data_length);
> >       http_request_writes(req, "Content-type: text/plain;\r\n\r\n");
> >  }
> >
> > @@ -233,6 +237,7 @@ static void http_run_request(struct work *work)
> >
> >               if (method != NULL) {
> >                       method(req);
> > +                     sd_debug("req->status %d", req->status);
> >                       if (req->status != UNKNOWN)
> >                               goto out;
> >               }
> > diff --git a/sheep/http/http.h b/sheep/http/http.h
> > index 046d412..a8527d1 100644
> > --- a/sheep/http/http.h
> > +++ b/sheep/http/http.h
> > @@ -32,6 +32,7 @@ enum http_status {
> >       NO_CONTENT,                     /* 204 */
> >       PARTIAL_CONTENT,                /* 206 */
> >       BAD_REQUEST,                    /* 400 */
> > +     UNAUTHORIZED,                   /* 401 */
> >       NOT_FOUND,                      /* 404 */
> >       METHOD_NOT_ALLOWED,             /* 405 */
> >       CONFLICT,                       /* 409 */
> > diff --git a/sheep/http/kv.c b/sheep/http/kv.c
> > index 8113389..8d33e37 100644
> > --- a/sheep/http/kv.c
> > +++ b/sheep/http/kv.c
> > @@ -16,14 +16,30 @@
> >
> >  #define FOR_EACH_VDI(nr, vdis) FOR_EACH_BIT(nr, vdis, SD_NR_VDIS)
> >
> > -static int lookup_bucket(struct http_request *req, const char *bucket,
> > -                      uint32_t *vid)
> > +struct bucket_inode_hdr {
> > +     char bucket_name[SD_MAX_BUCKET_NAME];
> > +     uint64_t obj_count;
> > +     uint64_t bytes_used;
> > +     uint32_t onode_vid;
> > +};
> > +
> > +struct bucket_inode {
> > +     union {
> > +             struct bucket_inode_hdr hdr;
> > +             uint8_t data[SD_MAX_BUCKET_NAME << 1];
> > +     };
> > +};
> > +
> > +#define MAX_BUCKETS (SD_MAX_VDI_SIZE / sizeof(struct bucket_inode))
> > +#define BUCKETS_PER_SD_OBJ (SD_DATA_OBJ_SIZE / sizeof(struct
> bucket_inode))
> > +
> > +static int lookup_vdi(const char *name, uint32_t *vid)
> >  {
> >       int ret;
> >       struct vdi_info info = {};
> >       struct vdi_iocb iocb = {
> > -             .name = bucket,
> > -             .data_len = strlen(bucket),
> > +             .name = name,
> > +             .data_len = strlen(name),
> >       };
> >
> >       ret = vdi_lookup(&iocb, &info);
> > @@ -32,27 +48,23 @@ static int lookup_bucket(struct http_request *req,
> const char *bucket,
> >               *vid = info.vid;
> >               break;
> >       case SD_RES_NO_VDI:
> > -             sd_info("no such bucket %s", bucket);
> > -             http_response_header(req, NOT_FOUND);
> > -             return -1;
> > +             sd_info("no such vdi %s", name);
> > +             break;
> >       default:
> > -             sd_err("%s: bucket %s", sd_strerror(ret), bucket);
> > -             http_response_header(req, INTERNAL_SERVER_ERROR);
> > -             return -1;
> > +             sd_err("Failed to find vdi %s %s", name, sd_strerror(ret));
> >       }
> >
> > -     return 0;
> > +     return ret;
> >  }
> >
> > -/* Bucket operations */
> > -
> > -int kv_create_bucket(struct http_request *req, const char *bucket)
> > +static int kv_create_hyper_volume(const char *name, uint32_t *vdi_id)
> >  {
> >       struct sd_req hdr;
> > +     struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> >       int ret;
> >       char buf[SD_MAX_VDI_LEN] = {0};
> >
> > -     pstrcpy(buf, SD_MAX_VDI_LEN, bucket);
> > +     pstrcpy(buf, SD_MAX_VDI_LEN, name);
> >
> >       sd_init_req(&hdr, SD_OP_NEW_VDI);
> >       hdr.flags = SD_FLAG_CMD_WRITE;
> > @@ -64,104 +76,592 @@ int kv_create_bucket(struct http_request *req,
> const char *bucket)
> >       hdr.vdi.store_policy = 1;
> >
> >       ret = exec_local_req(&hdr, buf);
> > +     if (rsp->result != SD_RES_SUCCESS)
> > +             sd_err("Failed to create VDI %s: %s", name,
> > +                    sd_strerror(rsp->result));
> > +
> > +     if (vdi_id)
> > +             *vdi_id = rsp->vdi.vdi_id;
> > +
> > +     return ret;
> > +}
> > +
> > +static int discard_data_obj(uint64_t oid)
> > +{
> > +     int ret;
> > +     struct sd_req hdr;
> > +
> > +     sd_init_req(&hdr, SD_OP_DISCARD_OBJ);
> > +     hdr.obj.oid = oid;
> > +
> > +     ret = exec_local_req(&hdr, NULL);
> > +     if (ret != SD_RES_SUCCESS)
> > +             sd_err("Failed to discard data obj %lu %s", oid,
> > +                    sd_strerror(ret));
> > +
> > +     return ret;
> > +}
> > +
> > +static int kv_delete_vdi(const char *name)
> > +{
> > +     int ret;
> > +     struct sd_req hdr;
> > +     char data[SD_MAX_VDI_LEN] = {0};
> > +     uint32_t vid;
> > +
> > +     ret = lookup_vdi(name, &vid);
> > +     if (ret != SD_RES_SUCCESS)
> > +             return ret;
> > +
> > +     sd_init_req(&hdr, SD_OP_DEL_VDI);
> > +     hdr.flags = SD_FLAG_CMD_WRITE;
> > +     hdr.data_length = sizeof(data);
> > +     pstrcpy(data, SD_MAX_VDI_LEN, name);
> > +
> > +     ret = exec_local_req(&hdr, data);
> > +     if (ret != SD_RES_SUCCESS)
> > +             sd_err("Failed to delete vdi %s %s", name,
> sd_strerror(ret));
> > +
> > +     return ret;
> > +}
> > +
> > +/*
> > + * An account is actually a hyper volume vdi (up to 16PB),
> > + * all the buckets (or containers, identified by 'struct bucket_inode')
> are
> > + * stores in this hyper vdi using hashing algorithm.
> > + * The bucket also has a hyper vdi named "account/bucket" which stores
> > + * 'struct kv_onodes'.
> > + *
> > + * For example: account "coly" has two buckets "jetta" and "volvo"
> > + *
> > + *
> > + * account vdi
> > + *
> +-----------+---+--------------------------+---+--------------------------+--
> > + * |name: coly |...|bucket_inode (name: jetta)|...|bucket_inode (name:
> volvo)|..
> > + *
> +-----------+---+--------------------------+---+--------------------------+--
> > + *                                  |                             |
> > + *                                 /                              |
> > + * bucket vdi                     /                               |
> > + * +-----------------+-------+ <--                                |
> > + * |name: coly/jetta |.......|                                    |
> > + * +-----------------+-------+                                   /
> > + *                              bucket vdi                      /
> > + *                              +-----------------+------+ <----
> > + *                              | name: coly/volvo|......|
> > + *                              +-----------------+------+
> > + */
> > +
> > +/* Account operations */
> > +
> > +int kv_create_account(const char *account)
> > +{
> > +     uint32_t vdi_id;
> > +     return kv_create_hyper_volume(account, &vdi_id);
> > +}
> > +
> > +typedef void (*list_cb)(struct http_request *req, const char *bucket,
> > +                     void *opaque);
> > +
> > +struct list_buckets_arg {
> > +     struct http_request *req;
> > +     void *opaque;
> > +     list_cb cb;
> > +     uint32_t bucket_counter;
> > +};
> > +
> > +static void list_buckets_cb(void *data, enum btree_node_type type, void
> *arg)
> > +{
> > +     struct sd_extent *ext;
> > +     struct list_buckets_arg *lbarg = arg;
> > +     struct bucket_inode *bnode;
> > +     uint64_t oid;
> > +     char *buf = NULL;
> > +     int ret;
> > +
> > +     if (type == BTREE_EXT) {
> > +             ext = (struct sd_extent *)data;
> > +             if (!ext->vdi_id)
> > +                     return;
> > +
> > +             buf = xzalloc(SD_DATA_OBJ_SIZE);
> > +
> > +             oid = vid_to_data_oid(ext->vdi_id, ext->idx);
> > +             ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0);
> > +             if (ret != SD_RES_SUCCESS) {
> > +                     sd_err("Failed to read data object %lx", oid);
> > +                     goto out;
> > +             }
> > +             /* loop all bucket_inodes in this data-object */
> > +             for (int i = 0; i < BUCKETS_PER_SD_OBJ; i++) {
> > +                     bnode = (struct bucket_inode *)
> > +                             (buf + i * sizeof(struct bucket_inode));
> > +                     if (bnode->hdr.onode_vid == 0)
> > +                             continue;
> > +                     if (lbarg->cb)
> > +                             lbarg->cb(lbarg->req,
> bnode->hdr.bucket_name,
> > +                                       (void *)lbarg->opaque);
> > +                     lbarg->bucket_counter++;
> > +             }
> > +     }
> > +out:
> > +     free(buf);
> > +}
> > +
> > +/* get number of buckets in this account */
> > +static int kv_get_account(const char *account, uint32_t *nr_buckets)
> > +{
> > +     struct sd_inode inode;
> > +     uint64_t oid;
> > +     uint32_t account_vid;
> > +     int ret;
> > +
> > +     ret = lookup_vdi(account, &account_vid);
> > +     if (ret != SD_RES_SUCCESS)
> > +             return ret;
> > +
> > +     /* read account vdi out */
> > +     oid = vid_to_vdi_oid(account_vid);
> > +     ret = sd_read_object(oid, (char *)&inode, sizeof(struct sd_inode),
> 0);
> > +     if (ret != SD_RES_SUCCESS) {
> > +             sd_err("Failed to read inode header %lx", oid);
> > +             return ret;
> > +     }
> > +
> > +     struct list_buckets_arg arg = {NULL, NULL, NULL, 0};
> > +     traverse_btree(sheep_bnode_reader, &inode, list_buckets_cb, &arg);
> > +     if (nr_buckets)
> > +             *nr_buckets = arg.bucket_counter;
> > +
> > +     return SD_RES_SUCCESS;
> > +}
> > +
> > +int kv_read_account(const char *account, uint32_t *nr_buckets)
> > +{
> > +     int ret;
> > +
> > +     ret = kv_get_account(account, nr_buckets);
> > +     if (ret != SD_RES_SUCCESS)
> > +             sd_err("Failed to get number of buckets in %s", account);
> > +     return ret;
> > +}
> > +
> > +int kv_update_account(const char *account)
> > +{
> > +     /* TODO: update metadata of the account */
> > +     return -1;
> > +}
> > +
> > +int kv_delete_account(const char *account)
> > +{
> > +     int ret;
> > +
> > +     ret = kv_delete_vdi(account);
> > +     if (ret != SD_RES_SUCCESS)
> > +             sd_err("Failed to delete vdi %s", account);
> > +
> > +     return ret;
> > +}
> > +
> > +/* Bucket operations */
> > +
> > +static int lookup_bucket(struct http_request *req, const char *bucket,
> > +                      uint32_t *vid)
> > +{
> > +     int ret;
> > +     struct vdi_info info = {};
> > +     struct vdi_iocb iocb = {
> > +             .name = bucket,
> > +             .data_len = strlen(bucket),
> > +     };
> > +
> > +     ret = vdi_lookup(&iocb, &info);
> >       switch (ret) {
> >       case SD_RES_SUCCESS:
> > -             http_response_header(req, CREATED);
> > +             *vid = info.vid;
> >               break;
> > -     case SD_RES_VDI_EXIST:
> > -             http_response_header(req, ACCEPTED);
> > +     case SD_RES_NO_VDI:
> > +             sd_info("no such bucket %s", bucket);
> > +             http_response_header(req, NOT_FOUND);
> >               break;
> >       default:
> > -             sd_err("%s: bucket %s", sd_strerror(ret), bucket);
> > +             sd_err("Failed to find bucket %s %s", bucket,
> sd_strerror(ret));
> >               http_response_header(req, INTERNAL_SERVER_ERROR);
> > -             return -1;
> >       }
> >
> > -     return 0;
> > +     return ret;
> >  }
> >
> > -int kv_read_bucket(struct http_request *req, const char *bucket)
> > +/*
> > + * Delete bucket(container) inode in account vdi.
> > + * idx: the target hash positon of bucket
> > + * Return the position of bucket_inode in sd-data-object if success
> > + * Return BUCKETS_PER_SD_OBJ if bucket_inode is not found
> > + * Return -1 if some errors happend
> > + */
> > +static int delete_bucket(struct sd_inode *account_inode, uint64_t idx,
> > +                      const char *bucket)
> >  {
> > -     /* TODO: read metadata of the bucket */
> > -     return -1;
> > +     struct bucket_inode *bnode;
> > +     char *buf = NULL;
> > +     uint32_t vdi_id;
> > +     uint64_t oid;
> > +     uint64_t data_index = idx / BUCKETS_PER_SD_OBJ;
> > +     int offset = idx % BUCKETS_PER_SD_OBJ;
> > +     int ret, i, empty_buckets = 0, found = 0;
> > +
> > +     vdi_id = INODE_GET_VID(account_inode, data_index);
> > +     if (!vdi_id) {
> > +             sd_err("the %lu in vdi %s is not exists", data_index,
> > +                    account_inode->name);
> > +             ret = -1;
> > +             goto out;
> > +     }
> > +
> > +     oid = vid_to_data_oid(account_inode->vdi_id, data_index);
> > +     buf = xzalloc(SD_DATA_OBJ_SIZE);
> > +     ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0);
> > +     if (ret != SD_RES_SUCCESS) {
> > +             sd_err("Failed to read inode header %lx", oid);
> > +             ret = -1;
> > +             goto out;
> > +     }
> > +
> > +     for (i = 0; i < BUCKETS_PER_SD_OBJ; i++) {
> > +             char vdi_name[SD_MAX_VDI_LEN];
> > +             bnode = (struct bucket_inode *)
> > +                     (buf + i * sizeof(struct bucket_inode));
> > +             /* count all empty buckets in this sd-data-obj */
> > +             if (bnode->hdr.onode_vid == 0) {
> > +                     empty_buckets++;
> > +                     continue;
> > +             }
> > +             if (strncmp(bnode->hdr.bucket_name, bucket,
> SD_MAX_BUCKET_NAME))
> > +                     continue;
> > +
> > +             if (i < offset)
> > +                     panic("postion of bucket inode %d is smaller than
> %d",
> > +                           i, offset);
> > +
> > +             found = i;
> > +             /* find the bnode */
> > +             bnode->hdr.onode_vid = 0;
> > +             snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s",
> > +                      account_inode->name, bucket);
> > +
> > +             ret = kv_delete_vdi(vdi_name);
> > +             if (ret != SD_RES_SUCCESS) {
> > +                     sd_err("Failed to delete vdi %s", vdi_name);
> > +                     ret = -1;
> > +                     goto out;
> > +             }
> > +             sd_debug("delete vdi %s success", vdi_name);
> > +     }
> > +
> > +     if (!found) {
> > +             ret = BUCKETS_PER_SD_OBJ;
> > +             goto out;
> > +     }
> > +
> > +     /*
> > +      * if only this bucket_inode is in the sd-data-obj,
> > +      * then delete this sd-data-obj
> > +      */
> > +     if (empty_buckets == BUCKETS_PER_SD_OBJ - 1) {
> > +             ret = discard_data_obj(oid);
> > +             if (ret != SD_RES_SUCCESS) {
> > +                     ret = -1;
> > +                     goto out;
> > +             }
> > +             INODE_SET_VID(account_inode, data_index, 0);
> > +             ret = sd_inode_write_vid(sheep_bnode_writer, account_inode,
> > +                                      data_index, vdi_id, vdi_id, 0,
> false,
> > +                                      false);
> > +             if (ret != SD_RES_SUCCESS) {
> > +                     sd_err("Failed to write inode %x", vdi_id);
> > +                     ret = -1;
> > +                     goto out;
> > +             }
> > +             sd_debug("discard obj %lx and update vdi %x success",
> > +                      oid, vdi_id);
> > +     } else {
> > +             ret = sd_write_object(oid, buf, sizeof(struct
> bucket_inode),
> > +                                i * sizeof(struct bucket_inode), false);
> > +             if (ret != SD_RES_SUCCESS) {
> > +                     sd_err("Failed to write object %lx", oid);
> > +                     ret = -1;
> > +                     goto out;
> > +             }
> > +     }
> > +
> > +     sd_debug("write object oid %lx success", oid);
> > +     ret = found;
> > +out:
> > +     free(buf);
> > +     return ret;
> >  }
> >
> > -int kv_update_bucket(struct http_request *req, const char *bucket)
> > +/*
> > + * Add bucket(container) inode into account vdi.
> > + * idx: the target hash positon of bucket
> > + * Return the position of bucket_inode in sd-data-object if success
> > + * Return BUCKETS_PER_SD_OBJ if the data-object is full of bucket_inode
> > + * Return -1 if some error happend
> > + */
> > +static int add_bucket(struct sd_inode *account_inode, uint64_t idx,
> > +                   const char *bucket)
> >  {
> > -     /* TODO: update metadata of the bucket */
> > -     return -1;
> > +     struct bucket_inode *bnode;
> > +     char *buf = NULL;
> > +     uint32_t vdi_id;
> > +     uint64_t oid;
> > +     uint64_t data_index = idx / BUCKETS_PER_SD_OBJ;
> > +     int offset = idx % BUCKETS_PER_SD_OBJ;
> > +     int ret, i;
> > +     bool create = false;
> > +
> > +     buf = xzalloc(SD_DATA_OBJ_SIZE);
> > +
> > +     vdi_id = INODE_GET_VID(account_inode, data_index);
> > +     oid = vid_to_data_oid(account_inode->vdi_id, data_index);
> > +     sd_debug("oid %x %lx %lx", account_inode->vdi_id, data_index, oid);
> > +     /* the data object is exists */
> > +     if (vdi_id) {
> > +             ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0);
> > +             if (ret != SD_RES_SUCCESS) {
> > +                     sd_err("Failed to read inode header %lx", oid);
> > +                     ret = -1;
> > +                     goto out;
> > +             }
> > +     } else
> > +             create = true;
> > +
> > +     sd_debug("bucket_inode offset %d %lu", offset, BUCKETS_PER_SD_OBJ);
> > +     for (i = offset; i < BUCKETS_PER_SD_OBJ; i++) {
> > +             char vdi_name[SD_MAX_VDI_LEN];
> > +             bnode = (struct bucket_inode *)
> > +                     (buf + i * sizeof(struct bucket_inode));
> > +             if (bnode->hdr.onode_vid != 0)
> > +                     continue;
> > +
> > +             /* the bnode not used */
> > +             strncpy(bnode->hdr.bucket_name, bucket,
> SD_MAX_BUCKET_NAME);
> > +             bnode->hdr.obj_count = 0;
> > +             bnode->hdr.bytes_used = 0;
> > +             snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s",
> > +                      account_inode->name, bucket);
> > +             ret = kv_create_hyper_volume(vdi_name,
> &(bnode->hdr.onode_vid));
> > +             if (ret != SD_RES_SUCCESS) {
> > +                     sd_err("Failed to create hyper volume %d", ret);
> > +                     ret = -1;
> > +                     goto out;
> > +             }
> > +             sd_debug("create hyper volume %s success", vdi_name);
> > +             break;
> > +     }
> > +
> > +     if (i >= BUCKETS_PER_SD_OBJ) {
> > +             ret = BUCKETS_PER_SD_OBJ;
> > +             goto out;
> > +     }
> > +
> > +     /* write bnode back to account-vdi */
> > +     if (create)
> > +             ret = sd_write_object(oid, buf, SD_DATA_OBJ_SIZE, 0,
> create);
> > +     else
> > +             ret = sd_write_object(oid, buf, sizeof(struct
> bucket_inode),
> > +                                i * sizeof(struct bucket_inode),
> create);
> > +
> > +     if (ret != SD_RES_SUCCESS) {
> > +             sd_err("Failed to write object %lx", oid);
> > +             ret = -1;
> > +             goto out;
> > +     }
> > +
> > +     sd_debug("write object oid %lx success", oid);
> > +
> > +     /* update index of vdi */
> > +     if (create) {
> > +             vdi_id = account_inode->vdi_id;
> > +             INODE_SET_VID(account_inode, data_index, vdi_id);
> > +             ret = sd_inode_write_vid(sheep_bnode_writer, account_inode,
> > +                                      data_index, vdi_id, vdi_id, 0,
> false,
> > +                                      false);
> > +             if (ret != SD_RES_SUCCESS) {
> > +                     sd_err("Failed to write inode %x", vdi_id);
> > +                     ret = -1;
> > +                     goto out;
> > +             }
> > +             sd_debug("write account inode success");
> > +     }
> > +
> > +     ret = i;
> > +out:
> > +     free(buf);
> > +     return ret;
> >  }
> >
> > -/* TODO: return HTTP_CONFLICT when the bucket is not empty */
> > -int kv_delete_bucket(struct http_request *req, const char *bucket)
> > +static int kv_get_bucket(struct sd_inode *account_inode, uint32_t
> account_vid,
> > +                      const char *account, const char *bucket)
>
> account_inode has the vid, so we can remove extra account_vid from
> parameter
>

No, we can't: the account_inode passed in is empty, so it does not yet hold the vid.


>
> Thanks
> Yuan
>



-- 
--
Best Regards
Robin Dong
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.wpkg.org/pipermail/sheepdog/attachments/20131213/ee2d1277/attachment-0004.html>


More information about the sheepdog mailing list