[sheepdog] [PATCH] sheep/http: unify onode lookup/find operation
Robin Dong
robin.k.dong at gmail.com
Wed Dec 25 06:11:13 CET 2013
2013/12/24 Liu Yuan <namei.unix at gmail.com>
> Previously we discard the onode object, so can't check if object exists by
> loop
> around the adjacent objects one by one becaue there is might be hole
> between
> collisioned hash chains. For this reason, we have to iterate all the
> objects
> in a container to check if some one is in. This is obviously:
>
> - time consuming because CREATE/DELETE/GET operations will firstly call
> onode_find() to iterate the container.
> - can't scale at all. Every object operation will read the whole onodes
> in the
> bucket.
>
> This patch reworks the onode lookup operation:
>
> - We check adjacent objects one by one once we get a start index by hashing
> name. Unallocated slot marks the end of the check window.
>
> For e.g, if we are going to check if fish in the following bucket, assume
> fish hashes to 'sheep', so we compare the name one by one from 'sheep' to
> 'fish'. '\0' indicates that object was deleted before checking.
>
> [ sheep, dog, wolve, '\0', fish, {unallocated}, tiger, ]
>
> With this lookup mechanism, we can safely remove onode_find(). With this
> patch
> we can observe a huge time reduction even in tests/func/081, 082:
>
> before:
> 081 Last Used:20s. Test http service with a single server
> 082 Last Used:21s. Test http service with multiple servers
>
> after:
> 081 Last Used:11s. Test http service with a single server
> 082 Last Used:12s. Test http service with multiple servers
>
> Signed-off-by: Liu Yuan <namei.unix at gmail.com>
> ---
> sheep/http/kv.c | 274
> +++++++++++++++++++++++++++--------------------
> tests/functional/081.out | 4 +-
> tests/functional/082.out | 4 +-
> 3 files changed, 160 insertions(+), 122 deletions(-)
>
> diff --git a/sheep/http/kv.c b/sheep/http/kv.c
> index 5843bb3..c0b3c4f 100644
> --- a/sheep/http/kv.c
> +++ b/sheep/http/kv.c
> @@ -43,7 +43,7 @@ struct kv_onode {
> uint8_t inlined;
> };
>
> - uint8_t __pad[BLOCK_SIZE];
> + uint8_t pad[BLOCK_SIZE];
> };
> union {
> uint8_t data[SD_DATA_OBJ_SIZE - BLOCK_SIZE];
> @@ -88,69 +88,6 @@ static int kv_create_hyper_volume(const char *name,
> uint32_t *vdi_id)
> return ret;
> }
>
> -/*
> - * Find an free object index by hash of name in the vid and create an
> object
> - * that holds the kv node{kv_bnode, kv_onode}.
> - */
> -#define kv_generic_object_create(node, vid, node_do_create) \
> -({ \
> - struct sd_inode *__inode = xmalloc(sizeof(struct sd_inode)); \
> - uint32_t __tmp_vid, __idx, __i; \
> - uint64_t __hval; \
> - int __ret; \
> - \
> - __ret = sd_read_object(vid_to_vdi_oid(vid), (char *)__inode, \
> - sizeof(*__inode), 0); \
> - if (__ret != SD_RES_SUCCESS) { \
> - sd_err("failed to read %" PRIx32 " %s", vid, \
> - sd_strerror(__ret)); \
> - goto out; \
> - } \
> - \
> - __hval = sd_hash(node->name, strlen(node->name)); \
> - for (__i = 0; __i < MAX_DATA_OBJS; __i++) { \
> - __idx = (__hval + __i) % MAX_DATA_OBJS; \
> - __tmp_vid = INODE_GET_VID(__inode, __idx); \
> - if (__tmp_vid) \
> - continue; \
> - else \
> - break; \
> - } \
> - if (__i == MAX_DATA_OBJS) { \
> - __ret = SD_RES_NO_SPACE; \
> - goto out; \
> - } \
> - __ret = node_do_create(node, __inode, __idx); \
> -out: \
> - free(__inode); \
> - __ret; \
> -})
> -
> -/* Find the object in the vid which holds the 'node' that matches 'name'
> */
> -#define kv_generic_object_lookup(node, vid, name) \
> -({ \
> - uint64_t __hval; \
> - uint32_t __i; \
> - int __ret; \
> - \
> - __hval = sd_hash(name, strlen(name)); \
> - for (__i = 0; __i < MAX_DATA_OBJS; __i++) { \
> - uint32_t __idx = (__hval + __i) % MAX_DATA_OBJS; \
> - uint64_t __oid = vid_to_data_oid(vid, __idx); \
> - \
> - __ret = sd_read_object(__oid, (char *)node, sizeof(*node),
> 0); \
> - if (__ret != SD_RES_SUCCESS) \
> - goto out; \
> - if (strcmp(node->name, name) == 0) \
> - break; \
> - } \
> - \
> - if (__i == MAX_DATA_OBJS) \
> - __ret = SD_RES_NO_OBJ; \
> -out: \
> - __ret; \
> -})
> -
> /* Account operations */
>
> /*
> @@ -320,7 +257,36 @@ out:
>
> static int bnode_create(struct kv_bnode *bnode, uint32_t account_vid)
> {
> - return kv_generic_object_create(bnode, account_vid,
> bnode_do_create);
> + struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
> + uint32_t tmp_vid, idx, i;
> + uint64_t hval;
> + int ret;
> +
> + ret = sd_read_object(vid_to_vdi_oid(account_vid), (char *)inode,
> + sizeof(*inode), 0);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("failed to read %" PRIx32 " %s", account_vid,
> + sd_strerror(ret));
> + goto out;
> + }
> +
> + hval = sd_hash(bnode->name, strlen(bnode->name));
> + for (i = 0; i < MAX_DATA_OBJS; i++) {
>
MAX_DATA_OBJS is (2**32) and 'i' is type of uint32_t, so 'i' will always be
smaller than MAX_DATA_OBJS.
If all the oid in this vdi is not empty, we will fall into dead-loop.
So I think type of 'i' should be 'uint64_t'.
> + idx = (hval + i) % MAX_DATA_OBJS;
> + tmp_vid = INODE_GET_VID(inode, idx);
> + if (tmp_vid)
> + continue;
> + else
> + break;
> + }
> + if (i == MAX_DATA_OBJS) {
> + ret = SD_RES_NO_SPACE;
> + goto out;
> + }
> + ret = bnode_do_create(bnode, inode, idx);
> +out:
> + free(inode);
> + return ret;
> }
>
> static int bucket_create(const char *account, uint32_t account_vid,
> @@ -366,9 +332,28 @@ err:
> return ret;
> }
>
> -static int bucket_lookup(struct kv_bnode *bnode, uint32_t vid, const char
> *name)
> +static int bnode_lookup(struct kv_bnode *bnode, uint32_t vid, const char
> *name)
> {
> - return kv_generic_object_lookup(bnode, vid, name);
> + uint64_t hval;
> + uint32_t i;
> + int ret;
> +
> + hval = sd_hash(name, strlen(name));
> + for (i = 0; i < MAX_DATA_OBJS; i++) {
>
As above
> + uint32_t idx = (hval + i) % MAX_DATA_OBJS;
> + uint64_t oid = vid_to_data_oid(vid, idx);
> +
> + ret = sd_read_object(oid, (char *)bnode, sizeof(*bnode),
> 0);
> + if (ret != SD_RES_SUCCESS)
> + goto out;
> + if (strcmp(bnode->name, name) == 0)
> + break;
> + }
> +
> + if (i == MAX_DATA_OBJS)
> + ret = SD_RES_NO_OBJ;
> +out:
> + return ret;
> }
>
> static int bucket_delete(const char *account, uint32_t avid, const char
> *bucket)
> @@ -382,7 +367,7 @@ static int bucket_delete(const char *account, uint32_t
> avid, const char *bucket)
> snprintf(alloc_name, SD_MAX_VDI_LEN, "%s/%s/allocator", account,
> bucket);
>
> - ret = bucket_lookup(&bnode, avid, bucket);
> + ret = bnode_lookup(&bnode, avid, bucket);
> if (ret != SD_RES_SUCCESS)
> return ret;
>
> @@ -684,7 +669,7 @@ out:
> }
>
> static int onode_do_create(struct kv_onode *onode, struct sd_inode *inode,
> - uint32_t idx)
> + uint32_t idx, bool create)
> {
> uint32_t vid = inode->vdi_id;
> uint64_t oid = vid_to_data_oid(vid, idx), len;
> @@ -697,11 +682,14 @@ static int onode_do_create(struct kv_onode *onode,
> struct sd_inode *inode,
> len = sizeof(struct onode_extent) * onode->nr_extent;
>
> ret = sd_write_object(oid, (char *)onode, BLOCK_SIZE + len,
> - 0, true);
> + 0, create);
> if (ret != SD_RES_SUCCESS) {
> sd_err("failed to create object, %" PRIx64, oid);
> goto out;
> }
> + if (!create)
> + goto out;
> +
> INODE_SET_VID(inode, idx, vid);
> ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx,
> vid, vid, 0, false, false);
> @@ -716,12 +704,48 @@ out:
>
> static int onode_create(struct kv_onode *onode, uint32_t bucket_vid)
> {
> + struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
> + uint32_t tmp_vid, idx, i;
> + uint64_t hval;
> int ret;
> + bool create = true;
>
> sys->cdrv->lock(bucket_vid);
> - ret = kv_generic_object_create(onode, bucket_vid, onode_do_create);
> - sys->cdrv->unlock(bucket_vid);
> + ret = sd_read_object(vid_to_vdi_oid(bucket_vid), (char *)inode,
> + sizeof(*inode), 0);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("failed to read %" PRIx32 " %s", bucket_vid,
> + sd_strerror(ret));
> + goto out;
> + }
> +
> + hval = sd_hash(onode->name, strlen(onode->name));
> + for (i = 0; i < MAX_DATA_OBJS; i++) {
>
As above
> + idx = (hval + i) % MAX_DATA_OBJS;
> + tmp_vid = INODE_GET_VID(inode, idx);
> + if (tmp_vid) {
> + uint64_t oid = vid_to_data_oid(bucket_vid, idx);
> + char name[SD_MAX_OBJECT_NAME] = { };
>
> + ret = sd_read_object(oid, name, sizeof(name), 0);
> + if (ret != SD_RES_SUCCESS)
> + goto out;
> + if (name[0] == 0) {
> + create = false;
> + goto create;
> + }
> + } else
> + break;
> + }
> + if (i == MAX_DATA_OBJS) {
> + ret = SD_RES_NO_SPACE;
> + goto out;
> + }
> +create:
> + ret = onode_do_create(onode, inode, idx, create);
> +out:
> + free(inode);
> + sys->cdrv->unlock(bucket_vid);
> return ret;
> }
>
> @@ -773,12 +797,60 @@ out:
> return ret;
> }
>
> +/*
> + * Check if object by name exists in a bucket and init 'onode' if it
> exists.
> + *
> + * Return SD_RES_SUCCESS if found, SD_RES_NO_OBJ if not found.
> + *
> + * We check adjacent objects one by one once we get a start index by
> hashing
> + * name. Unallocated slot marks the end of the check window.
> + *
> + * For e.g, if we are going to check if fish in the following bucket,
> assume
> + * fish hashes to 'sheep', so we compare the name one by one from 'sheep'
> to
> + * 'fish'. '\0' indicates that object was deleted before checking.
> + *
> + * [ sheep, dog, wolve, '\0', fish, {unallocated}, tiger, ]
> + */
> static int onode_lookup(struct kv_onode *onode, uint32_t ovid, const char
> *name)
> {
> + struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
> + uint32_t tmp_vid, idx, i;
> + uint64_t hval;
> int ret;
>
> sys->cdrv->lock(ovid);
> - ret = kv_generic_object_lookup(onode, ovid, name);
> + ret = sd_read_object(vid_to_vdi_oid(ovid), (char *)inode,
> + sizeof(*inode), 0);
> + if (ret != SD_RES_SUCCESS) {
> + sd_err("failed to read %" PRIx32 " %s", ovid,
> + sd_strerror(ret));
> + goto out;
> + }
> +
> + hval = sd_hash(name, strlen(name));
> + for (i = 0; i < MAX_DATA_OBJS; i++) {
>
As above
> + idx = (hval + i) % MAX_DATA_OBJS;
> + tmp_vid = INODE_GET_VID(inode, idx);
> + if (tmp_vid) {
> + uint64_t oid = vid_to_data_oid(ovid, idx);
> +
> + ret = sd_read_object(oid, (char *)onode,
> + sizeof(*onode), 0);
> + if (ret != SD_RES_SUCCESS)
> + goto out;
> + if (strcmp(onode->name, name) == 0)
> + break;
> + } else {
> + ret = SD_RES_NO_OBJ;
> + break;
> + }
> + }
> + if (i == MAX_DATA_OBJS) {
> + ret = SD_RES_NO_OBJ;
> + goto out;
> + }
> +out:
> + free(inode);
> sys->cdrv->unlock(ovid);
> return ret;
> }
> @@ -800,7 +872,9 @@ static int onode_read_data(struct kv_onode *onode,
> struct http_request *req)
> /*
> * We free the data and meta data in following sequence:
> *
> - * 1. discard onode
> + * 1. zero onode
> + * - we can't discard it because onode_lookup() need it to find if some
> object
> + * exists or not by checking adjacent objects
> * 2. discard data
> *
> * If (1) success, we consdier it a successful deletion of user object.
> If (2)
> @@ -810,11 +884,12 @@ static int onode_read_data(struct kv_onode *onode,
> struct http_request *req)
> */
> static int onode_delete(struct kv_onode *onode)
> {
> + char name[SD_MAX_OBJECT_NAME] = {};
> int ret;
>
> - ret = sd_discard_object(onode->oid);
> + ret = sd_write_object(onode->oid, name, sizeof(name), 0, 0);
> if (ret != SD_RES_SUCCESS) {
> - sd_err("failed to discard onode for %s", onode->name);
> + sd_err("failed to zero onode for %s", onode->name);
> return ret;
> }
>
> @@ -825,36 +900,6 @@ static int onode_delete(struct kv_onode *onode)
> return SD_RES_SUCCESS;
> }
>
> -struct find_object_arg {
> - const char *object_name;
> - int ret;
> -};
> -
> -static void find_object_cb(const char *name, void *opaque)
> -{
> - struct find_object_arg *foarg = (struct find_object_arg *)opaque;
> -
> - if (!strncmp(foarg->object_name, name, SD_MAX_OBJECT_NAME))
> - foarg->ret = SD_RES_SUCCESS;
> -}
> -
> -/*
> - * If we don't know if object exists in the bucket, we should use
> onode_find,
> - * which iterate effeciently the bucket, instead of onode_lookup(), that
> assumes
> - * object alrady exists and tries to look it up.
> - */
> -static int onode_find(uint32_t ovid, const char *name)
> -{
> - struct find_object_arg arg = {name, SD_RES_NO_OBJ};
> - int ret;
> -
> - ret = bucket_iterate_object(ovid, find_object_cb, &arg);
> - if (ret != SD_RES_SUCCESS)
> - return ret;
> -
> - return arg.ret;
> -}
> -
> /*
> * user object name -> struct kv_onode -> sheepdog objects -> user data
> *
> @@ -874,23 +919,24 @@ int kv_create_object(struct http_request *req, const
> char *account,
> if (ret != SD_RES_SUCCESS)
> return ret;
>
> - ret = onode_find(bucket_vid, name);
> + onode = xzalloc(sizeof(*onode));
> + ret = onode_lookup(onode, bucket_vid, name);
> if (ret == SD_RES_SUCCESS) {
> /* For overwrite, we delete old object and then create */
> ret = kv_delete_object(account, bucket, name);
> if (ret != SD_RES_SUCCESS) {
> sd_err("Failed to delete exists object %s", name);
> - return ret;
> + goto out;
> }
> } else if (ret != SD_RES_NO_OBJ)
> - return ret;
> + goto out;
>
> snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator", account,
> bucket);
> ret = sd_lookup_vdi(vdi_name, &data_vid);
> if (ret != SD_RES_SUCCESS)
> return ret;
>
> - onode = xzalloc(sizeof(*onode));
> + memset(onode, 0, sizeof(*onode));
> pstrcpy(onode->name, sizeof(onode->name), name);
> onode->data_vid = data_vid;
>
> @@ -923,10 +969,6 @@ int kv_read_object(struct http_request *req, const
> char *account,
> if (ret != SD_RES_SUCCESS)
> return ret;
>
> - ret = onode_find(bucket_vid, name);
> - if (ret != SD_RES_SUCCESS)
> - return ret;
> -
> onode = xzalloc(sizeof(*onode));
> ret = onode_lookup(onode, bucket_vid, name);
> if (ret != SD_RES_SUCCESS)
> @@ -954,10 +996,6 @@ int kv_delete_object(const char *account, const char
> *bucket, const char *name)
> if (ret != SD_RES_SUCCESS)
> return ret;
>
> - ret = onode_find(bucket_vid, name);
> - if (ret != SD_RES_SUCCESS)
> - return ret;
> -
> onode = xzalloc(sizeof(*onode));
> ret = onode_lookup(onode, bucket_vid, name);
> if (ret != SD_RES_SUCCESS)
> diff --git a/tests/functional/081.out b/tests/functional/081.out
> index f0cc8f2..d9d7dca 100644
> --- a/tests/functional/081.out
> +++ b/tests/functional/081.out
> @@ -59,11 +59,11 @@ data97
> Name Id Size Used Shared Creation time VDI id
> Copies Tag
> sd/dog 0 16 PB 52 MB 0.0 MB DATE 5a5cbf 6
> sd 0 16 PB 8.0 MB 0.0 MB DATE 7927f2 6
> - sd/sheep 0 16 PB 16 MB 0.0 MB DATE 8ad11e 6
> + sd/sheep 0 16 PB 124 MB 0.0 MB DATE 8ad11e 6
> sd/dog/allocator 0 16 PB 4.0 MB 0.0 MB DATE 936d95 6
> sd/sheep/allocator 0 16 PB 268 MB 0.0 MB DATE fd57fc 6
> sheep
> Name Id Size Used Shared Creation time VDI id
> Copies Tag
> sd 0 16 PB 4.0 MB 0.0 MB DATE 7927f2 6
> - sd/sheep 0 16 PB 0.0 MB 0.0 MB DATE 8ad11e 6
> + sd/sheep 0 16 PB 124 MB 0.0 MB DATE 8ad11e 6
> sd/sheep/allocator 0 16 PB 4.0 MB 0.0 MB DATE fd57fc 6
> diff --git a/tests/functional/082.out b/tests/functional/082.out
> index 6d6a4dc..c23828e 100644
> --- a/tests/functional/082.out
> +++ b/tests/functional/082.out
> @@ -75,11 +75,11 @@ data9
> Name Id Size Used Shared Creation time VDI id
> Copies Tag
> sd/dog 0 16 PB 52 MB 0.0 MB DATE 5a5cbf 6
> sd 0 16 PB 8.0 MB 0.0 MB DATE 7927f2 6
> - sd/sheep 0 16 PB 48 MB 0.0 MB DATE 8ad11e 6
> + sd/sheep 0 16 PB 156 MB 0.0 MB DATE 8ad11e 6
> sd/dog/allocator 0 16 PB 4.0 MB 0.0 MB DATE 936d95 6
> sd/sheep/allocator 0 16 PB 316 MB 0.0 MB DATE fd57fc 6
> sheep
> Name Id Size Used Shared Creation time VDI id
> Copies Tag
> sd 0 16 PB 4.0 MB 0.0 MB DATE 7927f2 6
> - sd/sheep 0 16 PB 0.0 MB 0.0 MB DATE 8ad11e 6
> + sd/sheep 0 16 PB 156 MB 0.0 MB DATE 8ad11e 6
> sd/sheep/allocator 0 16 PB 4.0 MB 0.0 MB DATE fd57fc 6
> --
> 1.8.1.2
>
> --
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog
>
--
--
Best Regard
Robin Dong
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.wpkg.org/pipermail/sheepdog/attachments/20131225/0b649256/attachment-0004.html>
More information about the sheepdog
mailing list