On Fri, Dec 06, 2013 at 05:04:19PM +0800, Robin Dong wrote: > From: Robin Dong <sanbai at taobao.com> > > Using hyper volume and extent structure of onode to store large number of > objects size of which exceed SD_DATA_OBJ_SIZE. > > Signed-off-by: Robin Dong <sanbai at taobao.com> > --- > sheep/http/kv.c | 422 ++++++++++++++++++++++++++++++++++++++++------------- > sheep/http/kv.h | 15 +- > sheep/http/s3.c | 8 +- > sheep/http/swift.c | 30 +--- > 4 files changed, 335 insertions(+), 140 deletions(-) > > diff --git a/sheep/http/kv.c b/sheep/http/kv.c > index 55a7e24..68f0759 100644 > --- a/sheep/http/kv.c > +++ b/sheep/http/kv.c > @@ -21,6 +21,7 @@ struct bucket_inode { > uint64_t obj_count; > uint64_t bytes_used; > uint32_t vdi_id; /* kv_onode stores in this vdi */ > + uint32_t data_vid; /* data of objects store in this vdi */ seems that this data_vid isn't used. > uint32_t pad; > uint64_t reserved[SD_MAX_BUCKET_NAME/sizeof(uint64_t) - 3]; > }; > @@ -172,13 +173,13 @@ int kv_create_account(const char *account) > return kv_create_hyper_volume(account, &vdi_id); > } > > -typedef void (*list_cb)(struct http_request *req, const char *bucket, > +typedef void (*list_bucket_cb)(struct http_request *req, const char *bucket, > void *opaque); > > struct list_buckets_arg { > struct http_request *req; > const char *account; > - list_cb cb; > + list_bucket_cb cb; > uint32_t bucket_counter; > }; > > @@ -360,7 +361,16 @@ static int delete_bucket(struct sd_inode *account_inode, uint64_t idx, > bnode->vdi_id = 0; > snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s", > account_inode->name, bucket); > - > + /* delete vdi which store kv_onode */ > + ret = kv_delete_vdi(vdi_name); > + if (ret != SD_RES_SUCCESS) { > + sd_err("Failed to delete vdi %s", vdi_name); > + ret = -1; > + goto out; > + } > + /* delete vdi which store object data */ > + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator", > + account_inode->name, bucket); > ret = kv_delete_vdi(vdi_name); > if (ret != SD_RES_SUCCESS) { > 
sd_err("Failed to delete vdi %s", vdi_name); > @@ -462,12 +472,29 @@ static int add_bucket(struct sd_inode *account_inode, uint64_t idx, > bnode->bytes_used = 0; > snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s", > account_inode->name, bucket); > + /* create vdi to store kv_onode */ > ret = kv_create_hyper_volume(vdi_name, &(bnode->vdi_id)); > if (ret != SD_RES_SUCCESS) { > sd_err("Failed to create hyper volume %d", ret); > ret = -1; > goto out; > } > + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator", > + account_inode->name, bucket); > + /* create vdi to store objects */ > + ret = kv_create_hyper_volume(vdi_name, &(bnode->data_vid)); > + if (ret != SD_RES_SUCCESS) { > + sd_err("Failed to create hyper volume %d", ret); > + ret = -1; > + goto out; > + } > + ret = oalloc_init(bnode->data_vid); > + if (ret != SD_RES_SUCCESS) { > + sd_err("Failed to init allocator on %x", > + bnode->data_vid); > + ret = -1; > + goto out; > + } > sd_debug("create hyper volume %s success", vdi_name); > break; > } > @@ -629,7 +656,7 @@ int kv_delete_bucket(const char *account, const char *bucket) > return SD_RES_SUCCESS; > } > > -int kv_list_buckets(struct http_request *req, list_cb cb, void *opaque) > +int kv_list_buckets(struct http_request *req, list_bucket_cb cb, void *opaque) > { > struct sd_inode account_inode; > const char *account = (const char *)opaque; > @@ -658,6 +685,34 @@ int kv_list_buckets(struct http_request *req, list_cb cb, void *opaque) > return SD_RES_SUCCESS; > } > > +/* > + * A bucket contains two vdi: one (vdi_id) stores 'struct kv_onode' by hash > + * algorithm and another one (data_vid) stores data of objects. > + * The first vdi names "account/bucket" and the second vdi names > + * "account/bucket/allocator". > + * > + * It manage space in data vdi by algorithm in oalloc.c. 
> + * > + * For example: bucket "fruit" with account 'coly' has two objects "banana" > + * and "apple" > + * > + * > + * --------------------- kv_onode ----------------------- > + * | | > + * bucket vdi v v > + * +-----------------+--+---------------------------+--------------------------+ > + * |name: coly/fruit |..|kv_onode_hdr (name: banana)|onode_extent: start, count| > + * +-----------------+--+---------------------------+--------------------------+ > + * / > + * / > + * ------------ > + * / > + * data_vid v > + * +---------------------------+---+-----------------+ > + * |name: coly/fruit/allocator |...| data | > + * +---------------------------+---+-----------------+ > + */ > + > /* Object operations */ > > /* 4 KB header of kv object index node */ > @@ -691,24 +746,71 @@ struct kv_onode { > struct kv_onode_hdr hdr; > union { > uint8_t data[SD_DATA_OBJ_SIZE - sizeof(struct kv_onode_hdr)]; > - struct onode_extent *o_extent; > + struct onode_extent o_extent[0]; > }; > }; > > +typedef void (*list_object_cb)(struct http_request *req, const char *bucket, > + const char *object, void *opaque); > + > +struct list_objects_arg { > + struct http_request *req; > + void *opaque; > + const char *bucket; > + list_object_cb cb; > + uint32_t object_counter; > +}; > + > +static void list_objects_cb(void *data, enum btree_node_type type, void *arg) > +{ > + struct sd_extent *ext; > + struct list_objects_arg *loarg = arg; > + struct kv_onode *onode = NULL; > + uint64_t oid; > + int ret; > + > + if (type == BTREE_EXT) { > + ext = (struct sd_extent *)data; > + if (!ext->vdi_id) > + goto out; > + > + onode = xmalloc(SD_DATA_OBJ_SIZE); > + > + oid = vid_to_data_oid(ext->vdi_id, ext->idx); > + ret = sd_read_object(oid, (char *)onode, SD_DATA_OBJ_SIZE, 0); > + if (ret != SD_RES_SUCCESS) { > + sd_err("Failed to read data object %lx", oid); > + goto out; > + } > + > + if (onode->hdr.name[0] == '\0') > + goto out; > + if (loarg->cb) > + loarg->cb(loarg->req, loarg->bucket, 
onode->hdr.name, > + loarg->opaque); > + loarg->object_counter++; > + } > +out: > + free(onode); > +} > + > #define KV_ONODE_INLINE_SIZE (SD_DATA_OBJ_SIZE - sizeof(struct kv_onode_hdr)) > > -static int kv_create_inlined_object(struct sd_inode *inode, > - struct kv_onode *onode, > - uint32_t vid, uint32_t idx, > - bool overwrite) > +static int kv_write_onode(struct sd_inode *inode, struct kv_onode *onode, > + uint32_t vid, uint32_t idx, bool overwrite) > { > - uint64_t oid = vid_to_data_oid(vid, idx); > + uint64_t oid = vid_to_data_oid(vid, idx), len; > int ret; > > + if (onode->hdr.inlined) > + len = onode->hdr.size; > + else > + len = sizeof(struct onode_extent) * onode->hdr.nr_extent; > + > if (overwrite) { > sd_info("overwrite object %s", onode->hdr.name); > ret = sd_write_object(oid, (char *)onode, > - sizeof(onode->hdr) + onode->hdr.size, > + sizeof(onode->hdr) + len, > 0, false); > if (ret != SD_RES_SUCCESS) { > sd_err("failed to write object, %" PRIx64, oid); > @@ -716,7 +818,7 @@ static int kv_create_inlined_object(struct sd_inode *inode, > } > } else { > ret = sd_write_object(oid, (char *)onode, > - sizeof(onode->hdr) + onode->hdr.size, > + sizeof(onode->hdr) + len, > 0, true); > if (ret != SD_RES_SUCCESS) { > sd_err("failed to create object, %" PRIx64, oid); > @@ -735,13 +837,6 @@ out: > return ret; > } > > -static int kv_create_extented_object(struct sd_inode *inode, > - struct kv_onode *onode, > - uint32_t vid, uint32_t idx) > -{ > - return SD_RES_SUCCESS; > -} > - > /* > * Create the object if the index isn't taken. Overwrite the object if it exists > * Return SD_RES_OBJ_TAKEN if the index is taken by other object. 
> @@ -778,48 +873,112 @@ static int do_kv_create_object(struct http_request *req, > goto out; > } > } > - if (onode->hdr.inlined) > - ret = kv_create_inlined_object(inode, onode, vid, idx, > - !!tmp_vid); > - else > - ret = kv_create_extented_object(inode, onode, vid, idx); > + > + ret = kv_write_onode(inode, onode, vid, idx, !!tmp_vid); > + if (ret != SD_RES_SUCCESS) > + sd_err("Failed to write onode"); > out: > free(inode); > return ret; > } > > -int kv_create_object(struct http_request *req, const char *bucket, > - const char *name) > +int kv_create_object(struct http_request *req, const char *account, > + const char *bucket, const char *name) > { > struct kv_onode *onode; > - ssize_t size; > + ssize_t size, total_size = 0; > int ret; > - uint64_t hval; > - uint32_t vid; > + uint64_t hval, start = 0, count, block, limit; > + uint32_t vid, data_vid; > struct timeval tv; > + char vdi_name[SD_MAX_VDI_LEN]; > + char *data_buf = NULL; > > - ret = lookup_bucket(req, bucket, &vid); > + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s", account, bucket); > + ret = lookup_bucket(req, vdi_name, &vid); > + if (ret < 0) > + return ret; > + > + snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator", account, bucket); > + ret = lookup_bucket(req, vdi_name, &data_vid); > if (ret < 0) > return ret; > > onode = xzalloc(sizeof(*onode)); > > + /* for inlined onode */ > + if (req->data_length <= KV_ONODE_INLINE_SIZE) { > + onode->hdr.inlined = 1; > + size = http_request_read(req, onode->data, sizeof(onode->data)); > + if (size < 0) { > + sd_err("%s: bucket %s, object %s", sd_strerror(ret), > + bucket, name); > + http_response_header(req, INTERNAL_SERVER_ERROR); > + ret = -1; > + goto out; > + } > + total_size = size; > + } else { > + sd_debug("data_length: %lu, %lu", req->data_length, > + SD_DATA_OBJ_SIZE); > + count = (req->data_length + SD_DATA_OBJ_SIZE + 1) / > + SD_DATA_OBJ_SIZE; > + ret = oalloc_new_prepare(data_vid, &start, count); > + if (ret != SD_RES_SUCCESS) { > + sd_err("Failed 
to prepare allocation of %lu bytes!", > + req->data_length); > + ret = -1; > + goto out; > + } > + > + /* receive and write data at first, then write onode */ > + data_buf = xmalloc(SD_DATA_OBJ_SIZE); > + > + sd_debug("start: %lu, count: %lu", start, count); > + for (block = start, limit = start + count; > + block < limit; block++) { > + sd_debug("block: %lu, limit: %lu", block, limit); > + size = http_request_read(req, data_buf, > + SD_DATA_OBJ_SIZE); > + total_size += size; > + ret = sd_write_object(vid_to_data_oid(data_vid, block), > + data_buf, size, 0, true); > + if (ret != SD_RES_SUCCESS) { > + sd_err("Failed to write data object for %" > + PRIx32" %s", data_vid, sd_strerror(ret)); > + goto out; > + } > + if (size < SD_DATA_OBJ_SIZE) > + break; > + } > + > + sd_debug("DATA_LENGTH: %lu, total size: %lu, last blocks: %lu", > + req->data_length, total_size, start); > + > + sd_debug("finish start: %lu, count: %lu", start, count); > + ret = oalloc_new_finish(data_vid, start, count); > + if (ret != SD_RES_SUCCESS) { > + sd_err("Failed to finish allocation of %lu bytes!", > + req->data_length); > + ret = -1; > + goto out; > + } > + > + onode->o_extent[0].vdi = data_vid; It seems that onode_extent.vdi can be removed because of node->hdr.data_vid. I also get a compile warning when building: CC corosync.o CC zookeeper.o Built sheep http/kv.c: In function ‘kv_delete_bucket’: http/kv.c:422:6: warning: ‘buf’ may be used uninitialized in this function [-Wmaybe-uninitialized] http/kv.c:319:8: note: ‘buf’ was declared here char *buf; ^ CCLD sheep Thanks, Yuan |