[sheepdog] [PATCH] sheep/http: unify onode lookup/find operation
Liu Yuan
namei.unix at gmail.com
Wed Dec 25 06:21:51 CET 2013
On Wed, Dec 25, 2013 at 01:16:05PM +0800, Liu Yuan wrote:
> On Wed, Dec 25, 2013 at 01:12:26PM +0800, Liu Yuan wrote:
> > On Wed, Dec 25, 2013 at 01:11:13PM +0800, Robin Dong wrote:
> > > 2013/12/24 Liu Yuan <namei.unix at gmail.com>
> > >
> > > > Previously we discard the onode object, so can't check if object exists by
> > > > loop
> > > > around the adjacent objects one by one becaue there is might be hole
> > > > between
> > > > collisioned hash chains. For this reason, we have to iterate all the
> > > > objects
> > > > in a container to check if some one is in. This is obviously:
> > > >
> > > > - time consuming because CREATE/DELETE/GET operations will firstly call
> > > > onode_find() to iterate the container.
> > > > - can't scale at all. Every object operation will read the whole onodes
> > > > in the
> > > > bucket.
> > > >
> > > > This patch reworks the onode lookup operation:
> > > >
> > > > - We check adjacent objects one by one once we get a start index by hashing
> > > > name. Unallocated slot marks the end of the check window.
> > > >
> > > > For e.g, if we are going to check if fish in the following bucket, assume
> > > > fish hashes to 'sheep', so we compare the name one by one from 'sheep' to
> > > > 'fish'. '\0' indicates that object was deleted before checking.
> > > >
> > > > [ sheep, dog, wolve, '\0', fish, {unallocated}, tiger, ]
> > > >
> > > > With this lookup mechanism, we can safely remove onode_find(). With this
> > > > patch
> > > > we can observe a huge time reduction even in tests/func/081, 082:
> > > >
> > > > before:
> > > > 081 Last Used:20s. Test http service with a single server
> > > > 082 Last Used:21s. Test http service with multiple servers
> > > >
> > > > after:
> > > > 081 Last Used:11s. Test http service with a single server
> > > > 082 Last Used:12s. Test http service with multiple servers
> > > >
> > > > Signed-off-by: Liu Yuan <namei.unix at gmail.com>
> > > > ---
> > > > sheep/http/kv.c | 274
> > > > +++++++++++++++++++++++++++--------------------
> > > > tests/functional/081.out | 4 +-
> > > > tests/functional/082.out | 4 +-
> > > > 3 files changed, 160 insertions(+), 122 deletions(-)
> > > >
> > > > diff --git a/sheep/http/kv.c b/sheep/http/kv.c
> > > > index 5843bb3..c0b3c4f 100644
> > > > --- a/sheep/http/kv.c
> > > > +++ b/sheep/http/kv.c
> > > > @@ -43,7 +43,7 @@ struct kv_onode {
> > > > uint8_t inlined;
> > > > };
> > > >
> > > > - uint8_t __pad[BLOCK_SIZE];
> > > > + uint8_t pad[BLOCK_SIZE];
> > > > };
> > > > union {
> > > > uint8_t data[SD_DATA_OBJ_SIZE - BLOCK_SIZE];
> > > > @@ -88,69 +88,6 @@ static int kv_create_hyper_volume(const char *name,
> > > > uint32_t *vdi_id)
> > > > return ret;
> > > > }
> > > >
> > > > -/*
> > > > - * Find an free object index by hash of name in the vid and create an
> > > > object
> > > > - * that holds the kv node{kv_bnode, kv_onode}.
> > > > - */
> > > > -#define kv_generic_object_create(node, vid, node_do_create) \
> > > > -({ \
> > > > - struct sd_inode *__inode = xmalloc(sizeof(struct sd_inode)); \
> > > > - uint32_t __tmp_vid, __idx, __i; \
> > > > - uint64_t __hval; \
> > > > - int __ret; \
> > > > - \
> > > > - __ret = sd_read_object(vid_to_vdi_oid(vid), (char *)__inode, \
> > > > - sizeof(*__inode), 0); \
> > > > - if (__ret != SD_RES_SUCCESS) { \
> > > > - sd_err("failed to read %" PRIx32 " %s", vid, \
> > > > - sd_strerror(__ret)); \
> > > > - goto out; \
> > > > - } \
> > > > - \
> > > > - __hval = sd_hash(node->name, strlen(node->name)); \
> > > > - for (__i = 0; __i < MAX_DATA_OBJS; __i++) { \
> > > > - __idx = (__hval + __i) % MAX_DATA_OBJS; \
> > > > - __tmp_vid = INODE_GET_VID(__inode, __idx); \
> > > > - if (__tmp_vid) \
> > > > - continue; \
> > > > - else \
> > > > - break; \
> > > > - } \
> > > > - if (__i == MAX_DATA_OBJS) { \
> > > > - __ret = SD_RES_NO_SPACE; \
> > > > - goto out; \
> > > > - } \
> > > > - __ret = node_do_create(node, __inode, __idx); \
> > > > -out: \
> > > > - free(__inode); \
> > > > - __ret; \
> > > > -})
> > > > -
> > > > -/* Find the object in the vid which holds the 'node' that matches 'name'
> > > > */
> > > > -#define kv_generic_object_lookup(node, vid, name) \
> > > > -({ \
> > > > - uint64_t __hval; \
> > > > - uint32_t __i; \
> > > > - int __ret; \
> > > > - \
> > > > - __hval = sd_hash(name, strlen(name)); \
> > > > - for (__i = 0; __i < MAX_DATA_OBJS; __i++) { \
> > > > - uint32_t __idx = (__hval + __i) % MAX_DATA_OBJS; \
> > > > - uint64_t __oid = vid_to_data_oid(vid, __idx); \
> > > > - \
> > > > - __ret = sd_read_object(__oid, (char *)node, sizeof(*node),
> > > > 0); \
> > > > - if (__ret != SD_RES_SUCCESS) \
> > > > - goto out; \
> > > > - if (strcmp(node->name, name) == 0) \
> > > > - break; \
> > > > - } \
> > > > - \
> > > > - if (__i == MAX_DATA_OBJS) \
> > > > - __ret = SD_RES_NO_OBJ; \
> > > > -out: \
> > > > - __ret; \
> > > > -})
> > > > -
> > > > /* Account operations */
> > > >
> > > > /*
> > > > @@ -320,7 +257,36 @@ out:
> > > >
> > > > static int bnode_create(struct kv_bnode *bnode, uint32_t account_vid)
> > > > {
> > > > - return kv_generic_object_create(bnode, account_vid,
> > > > bnode_do_create);
> > > > + struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
> > > > + uint32_t tmp_vid, idx, i;
> > > > + uint64_t hval;
> > > > + int ret;
> > > > +
> > > > + ret = sd_read_object(vid_to_vdi_oid(account_vid), (char *)inode,
> > > > + sizeof(*inode), 0);
> > > > + if (ret != SD_RES_SUCCESS) {
> > > > + sd_err("failed to read %" PRIx32 " %s", account_vid,
> > > > + sd_strerror(ret));
> > > > + goto out;
> > > > + }
> > > > +
> > > > + hval = sd_hash(bnode->name, strlen(bnode->name));
> > > > + for (i = 0; i < MAX_DATA_OBJS; i++) {
> > > >
> > >
> > > MAX_DATA_OBJS is (2**32) and 'i' is type of uint32_t, so 'i' will always be
> > > smaller than MAX_DATA_OBJS.
> > > If all the oid in this vdi is not empty, we will fall into dead-loop.
> > > So I think type of 'i' should be 'uint64_t'.
> >
> > Oops, good catch. I'll rework it on patch v2
>
> For a second thought, MAX_DATA_OBJS might be max of uint32_t, so your comment
> doesn't hold true, no?
>
Okay, you are right :)
Thanks
Yuan
More information about the sheepdog
mailing list