<div dir="ltr">Reviewed-by: Robin Dong <<a href="mailto:sanbai@taobao.com">sanbai@taobao.com</a>></div><div class="gmail_extra"><br><br><div class="gmail_quote">2013/12/25 Liu Yuan <span dir="ltr"><<a href="mailto:namei.unix@gmail.com" target="_blank">namei.unix@gmail.com</a>></span><br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="HOEnZb"><div class="h5">Previously we discard the onode object, so can't check if object exists by loop<br>
around the adjacent objects one by one becaue there is might be hole between<br>
collisioned hash chains. For this reason, we have to iterate all the objects<br>
in a container to check if some one is in. This is obviously:<br>
<br>
- time consuming because CREATE/DELETE/GET operations will firstly call<br>
onode_find() to iterate the container.<br>
- can't scale at all. Every object operation will read the whole onodes in the<br>
bucket.<br>
<br>
This patch reworks the onode lookup operation:<br>
<br>
- We check adjacent objects one by one once we get a start index by hashing<br>
name. Unallocated slot marks the end of the check window.<br>
<br>
For e.g, if we are going to check if fish in the following bucket, assume<br>
fish hashes to 'sheep', so we compare the name one by one from 'sheep' to<br>
'fish'. '\0' indicates that object was deleted before checking.<br>
<br>
[ sheep, dog, wolve, '\0', fish, {unallocated}, tiger, ]<br>
<br>
With this lookup mechanism, we can safely remove onode_find(). With this patch<br>
we can observe a huge time reduction even in tests/func/081, 082:<br>
<br>
before:<br>
081 Last Used:20s. Test http service with a single server<br>
082 Last Used:21s. Test http service with multiple servers<br>
<br>
after:<br>
081 Last Used:11s. Test http service with a single server<br>
082 Last Used:12s. Test http service with multiple servers<br>
<br>
Signed-off-by: Liu Yuan <<a href="mailto:namei.unix@gmail.com">namei.unix@gmail.com</a>><br>
---<br>
</div></div> v2:<br>
- fix a bug that i should be uint64_t<br>
<br>
sheep/http/kv.c | 276 ++++++++++++++++++++++++++---------------------<br>
<div class="im"> tests/functional/081.out | 4 +-<br>
tests/functional/082.out | 4 +-<br>
</div> 3 files changed, 160 insertions(+), 124 deletions(-)<br>
<br>
diff --git a/sheep/http/kv.c b/sheep/http/kv.c<br>
index 5843bb3..5fde277 100644<br>
<div><div class="h5">--- a/sheep/http/kv.c<br>
+++ b/sheep/http/kv.c<br>
@@ -43,7 +43,7 @@ struct kv_onode {<br>
uint8_t inlined;<br>
};<br>
<br>
- uint8_t __pad[BLOCK_SIZE];<br>
+ uint8_t pad[BLOCK_SIZE];<br>
};<br>
union {<br>
uint8_t data[SD_DATA_OBJ_SIZE - BLOCK_SIZE];<br>
@@ -88,69 +88,6 @@ static int kv_create_hyper_volume(const char *name, uint32_t *vdi_id)<br>
return ret;<br>
}<br>
<br>
-/*<br>
- * Find an free object index by hash of name in the vid and create an object<br>
- * that holds the kv node{kv_bnode, kv_onode}.<br>
- */<br>
-#define kv_generic_object_create(node, vid, node_do_create) \<br>
-({ \<br>
- struct sd_inode *__inode = xmalloc(sizeof(struct sd_inode)); \<br>
- uint32_t __tmp_vid, __idx, __i; \<br>
- uint64_t __hval; \<br>
- int __ret; \<br>
- \<br>
- __ret = sd_read_object(vid_to_vdi_oid(vid), (char *)__inode, \<br>
- sizeof(*__inode), 0); \<br>
- if (__ret != SD_RES_SUCCESS) { \<br>
- sd_err("failed to read %" PRIx32 " %s", vid, \<br>
- sd_strerror(__ret)); \<br>
- goto out; \<br>
- } \<br>
- \<br>
- __hval = sd_hash(node->name, strlen(node->name)); \<br>
- for (__i = 0; __i < MAX_DATA_OBJS; __i++) { \<br>
- __idx = (__hval + __i) % MAX_DATA_OBJS; \<br>
- __tmp_vid = INODE_GET_VID(__inode, __idx); \<br>
- if (__tmp_vid) \<br>
- continue; \<br>
- else \<br>
- break; \<br>
- } \<br>
- if (__i == MAX_DATA_OBJS) { \<br>
- __ret = SD_RES_NO_SPACE; \<br>
- goto out; \<br>
- } \<br>
- __ret = node_do_create(node, __inode, __idx); \<br>
-out: \<br>
- free(__inode); \<br>
- __ret; \<br>
-})<br>
-<br>
-/* Find the object in the vid which holds the 'node' that matches 'name' */<br>
-#define kv_generic_object_lookup(node, vid, name) \<br>
-({ \<br>
- uint64_t __hval; \<br>
- uint32_t __i; \<br>
- int __ret; \<br>
- \<br>
- __hval = sd_hash(name, strlen(name)); \<br>
- for (__i = 0; __i < MAX_DATA_OBJS; __i++) { \<br>
- uint32_t __idx = (__hval + __i) % MAX_DATA_OBJS; \<br>
- uint64_t __oid = vid_to_data_oid(vid, __idx); \<br>
- \<br>
- __ret = sd_read_object(__oid, (char *)node, sizeof(*node), 0); \<br>
- if (__ret != SD_RES_SUCCESS) \<br>
- goto out; \<br>
- if (strcmp(node->name, name) == 0) \<br>
- break; \<br>
- } \<br>
- \<br>
- if (__i == MAX_DATA_OBJS) \<br>
- __ret = SD_RES_NO_OBJ; \<br>
-out: \<br>
- __ret; \<br>
-})<br>
-<br>
/* Account operations */<br>
<br>
/*<br>
@@ -320,7 +257,36 @@ out:<br>
<br>
static int bnode_create(struct kv_bnode *bnode, uint32_t account_vid)<br>
{<br>
- return kv_generic_object_create(bnode, account_vid, bnode_do_create);<br>
+ struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));<br>
</div></div>+ uint32_t tmp_vid, idx;<br>
+ uint64_t hval, i;<br>
<div class="im">+ int ret;<br>
+<br>
+ ret = sd_read_object(vid_to_vdi_oid(account_vid), (char *)inode,<br>
+ sizeof(*inode), 0);<br>
+ if (ret != SD_RES_SUCCESS) {<br>
+ sd_err("failed to read %" PRIx32 " %s", account_vid,<br>
+ sd_strerror(ret));<br>
+ goto out;<br>
+ }<br>
+<br>
+ hval = sd_hash(bnode->name, strlen(bnode->name));<br>
+ for (i = 0; i < MAX_DATA_OBJS; i++) {<br>
</div><div class="im">+ idx = (hval + i) % MAX_DATA_OBJS;<br>
+ tmp_vid = INODE_GET_VID(inode, idx);<br>
+ if (tmp_vid)<br>
+ continue;<br>
+ else<br>
+ break;<br>
+ }<br>
+ if (i == MAX_DATA_OBJS) {<br>
+ ret = SD_RES_NO_SPACE;<br>
</div>+ goto out;<br>
+ }<br>
+ ret = bnode_do_create(bnode, inode, idx);<br>
<div class="im">+out:<br>
+ free(inode);<br>
+ return ret;<br>
}<br>
<br>
static int bucket_create(const char *account, uint32_t account_vid,<br>
</div>@@ -366,9 +332,27 @@ err:<br>
<div class="im"> return ret;<br>
}<br>
<br>
-static int bucket_lookup(struct kv_bnode *bnode, uint32_t vid, const char *name)<br>
+static int bnode_lookup(struct kv_bnode *bnode, uint32_t vid, const char *name)<br>
{<br>
- return kv_generic_object_lookup(bnode, vid, name);<br>
</div>+ uint64_t hval, i;<br>
+ int ret;<br>
+<br>
+ hval = sd_hash(name, strlen(name));<br>
<div class="im">+ for (i = 0; i < MAX_DATA_OBJS; i++) {<br>
</div><div class="im">+ uint32_t idx = (hval + i) % MAX_DATA_OBJS;<br>
+ uint64_t oid = vid_to_data_oid(vid, idx);<br>
+<br>
</div>+ ret = sd_read_object(oid, (char *)bnode, sizeof(*bnode), 0);<br>
<div class="im">+ if (ret != SD_RES_SUCCESS)<br>
</div>+ goto out;<br>
<div class="im">+ if (strcmp(bnode->name, name) == 0)<br>
+ break;<br>
+ }<br>
+<br>
+ if (i == MAX_DATA_OBJS)<br>
+ ret = SD_RES_NO_OBJ;<br>
+out:<br>
+ return ret;<br>
}<br>
<br>
static int bucket_delete(const char *account, uint32_t avid, const char *bucket)<br>
</div>@@ -382,7 +366,7 @@ static int bucket_delete(const char *account, uint32_t avid, const char *bucket)<br>
<div class="im"> snprintf(alloc_name, SD_MAX_VDI_LEN, "%s/%s/allocator", account,<br>
bucket);<br>
<br>
- ret = bucket_lookup(&bnode, avid, bucket);<br>
+ ret = bnode_lookup(&bnode, avid, bucket);<br>
if (ret != SD_RES_SUCCESS)<br>
return ret;<br>
<br>
</div>@@ -684,7 +668,7 @@ out:<br>
<div class="im"> }<br>
<br>
static int onode_do_create(struct kv_onode *onode, struct sd_inode *inode,<br>
- uint32_t idx)<br>
+ uint32_t idx, bool create)<br>
{<br>
uint32_t vid = inode->vdi_id;<br>
uint64_t oid = vid_to_data_oid(vid, idx), len;<br>
</div>@@ -697,11 +681,14 @@ static int onode_do_create(struct kv_onode *onode, struct sd_inode *inode,<br>
<div class="im"> len = sizeof(struct onode_extent) * onode->nr_extent;<br>
<br>
ret = sd_write_object(oid, (char *)onode, BLOCK_SIZE + len,<br>
- 0, true);<br>
+ 0, create);<br>
if (ret != SD_RES_SUCCESS) {<br>
sd_err("failed to create object, %" PRIx64, oid);<br>
goto out;<br>
}<br>
+ if (!create)<br>
+ goto out;<br>
+<br>
INODE_SET_VID(inode, idx, vid);<br>
ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx,<br>
vid, vid, 0, false, false);<br>
</div>@@ -716,12 +703,48 @@ out:<br>
<div class="im"><br>
static int onode_create(struct kv_onode *onode, uint32_t bucket_vid)<br>
{<br>
</div><div class="im">+ struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));<br>
</div>+ uint32_t tmp_vid, idx;<br>
+ uint64_t hval, i;<br>
<div class="im"> int ret;<br>
+ bool create = true;<br>
<br>
sys->cdrv->lock(bucket_vid);<br>
- ret = kv_generic_object_create(onode, bucket_vid, onode_do_create);<br>
- sys->cdrv->unlock(bucket_vid);<br>
</div>+ ret = sd_read_object(vid_to_vdi_oid(bucket_vid), (char *)inode,<br>
<div class="im">+ sizeof(*inode), 0);<br>
+ if (ret != SD_RES_SUCCESS) {<br>
</div>+ sd_err("failed to read %" PRIx32 " %s", bucket_vid,<br>
<div class="im">+ sd_strerror(ret));<br>
+ goto out;<br>
+ }<br>
+<br>
</div>+ hval = sd_hash(onode->name, strlen(onode->name));<br>
<div class="im">+ for (i = 0; i < MAX_DATA_OBJS; i++) {<br>
</div><div class="im">+ idx = (hval + i) % MAX_DATA_OBJS;<br>
+ tmp_vid = INODE_GET_VID(inode, idx);<br>
+ if (tmp_vid) {<br>
+ uint64_t oid = vid_to_data_oid(bucket_vid, idx);<br>
+ char name[SD_MAX_OBJECT_NAME] = { };<br>
<br>
</div>+ ret = sd_read_object(oid, name, sizeof(name), 0);<br>
<div class="im">+ if (ret != SD_RES_SUCCESS)<br>
</div>+ goto out;<br>
<div class="im">+ if (name[0] == 0) {<br>
+ create = false;<br>
+ goto create;<br>
+ }<br>
+ } else<br>
+ break;<br>
+ }<br>
+ if (i == MAX_DATA_OBJS) {<br>
+ ret = SD_RES_NO_SPACE;<br>
+ goto out;<br>
+ }<br>
+create:<br>
+ ret = onode_do_create(onode, inode, idx, create);<br>
+out:<br>
+ free(inode);<br>
+ sys->cdrv->unlock(bucket_vid);<br>
return ret;<br>
}<br>
<br>
</div>@@ -742,8 +765,7 @@ static int onode_free_data(struct kv_onode *onode)<br>
static int onode_read_extents(struct kv_onode *onode, struct http_request *req)<br>
{<br>
struct onode_extent *ext;<br>
- uint64_t size, total, total_size, offset, done = 0;<br>
- uint32_t i;<br>
+ uint64_t size, total, total_size, offset, done = 0, i;<br>
int ret;<br>
char *data_buf = NULL;<br>
<br>
@@ -773,12 +795,60 @@ out:<br>
<div class="im"> return ret;<br>
}<br>
<br>
+/*<br>
+ * Check if object by name exists in a bucket and init 'onode' if it exists.<br>
+ *<br>
+ * Return SD_RES_SUCCESS if found, SD_RES_NO_OBJ if not found.<br>
+ *<br>
+ * We check adjacent objects one by one once we get a start index by hashing<br>
+ * name. Unallocated slot marks the end of the check window.<br>
+ *<br>
+ * For e.g, if we are going to check if fish in the following bucket, assume<br>
+ * fish hashes to 'sheep', so we compare the name one by one from 'sheep' to<br>
+ * 'fish'. '\0' indicates that object was deleted before checking.<br>
+ *<br>
+ * [ sheep, dog, wolve, '\0', fish, {unallocated}, tiger, ]<br>
+ */<br>
static int onode_lookup(struct kv_onode *onode, uint32_t ovid, const char *name)<br>
{<br>
</div><div class="im">+ struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));<br>
</div>+ uint32_t tmp_vid, idx;<br>
+ uint64_t hval, i;<br>
<div class="im"> int ret;<br>
<br>
sys->cdrv->lock(ovid);<br>
- ret = kv_generic_object_lookup(onode, ovid, name);<br>
</div>+ ret = sd_read_object(vid_to_vdi_oid(ovid), (char *)inode,<br>
<div class="im">+ sizeof(*inode), 0);<br>
+ if (ret != SD_RES_SUCCESS) {<br>
</div>+ sd_err("failed to read %" PRIx32 " %s", ovid,<br>
<div class="im">+ sd_strerror(ret));<br>
+ goto out;<br>
+ }<br>
+<br>
</div>+ hval = sd_hash(name, strlen(name));<br>
<div class="im">+ for (i = 0; i < MAX_DATA_OBJS; i++) {<br>
</div><div class="im">+ idx = (hval + i) % MAX_DATA_OBJS;<br>
+ tmp_vid = INODE_GET_VID(inode, idx);<br>
+ if (tmp_vid) {<br>
+ uint64_t oid = vid_to_data_oid(ovid, idx);<br>
+<br>
+ ret = sd_read_object(oid, (char *)onode,<br>
</div>+ sizeof(*onode), 0);<br>
<div class="im">+ if (ret != SD_RES_SUCCESS)<br>
</div>+ goto out;<br>
<div class="im">+ if (strcmp(onode->name, name) == 0)<br>
+ break;<br>
+ } else {<br>
+ ret = SD_RES_NO_OBJ;<br>
+ break;<br>
+ }<br>
+ }<br>
+ if (i == MAX_DATA_OBJS) {<br>
+ ret = SD_RES_NO_OBJ;<br>
+ goto out;<br>
+ }<br>
+out:<br>
+ free(inode);<br>
sys->cdrv->unlock(ovid);<br>
return ret;<br>
}<br>
</div>@@ -800,7 +870,9 @@ static int onode_read_data(struct kv_onode *onode, struct http_request *req)<br>
<div class="im"> /*<br>
* We free the data and meta data in following sequence:<br>
*<br>
- * 1. discard onode<br>
+ * 1. zero onode<br>
+ * - we can't discard it because onode_lookup() need it to find if some object<br>
+ * exists or not by checking adjacent objects<br>
* 2. discard data<br>
*<br>
* If (1) success, we consdier it a successful deletion of user object. If (2)<br>
</div>@@ -810,11 +882,12 @@ static int onode_read_data(struct kv_onode *onode, struct http_request *req)<br>
<div class="im"> */<br>
static int onode_delete(struct kv_onode *onode)<br>
{<br>
+ char name[SD_MAX_OBJECT_NAME] = {};<br>
int ret;<br>
<br>
- ret = sd_discard_object(onode->oid);<br>
+ ret = sd_write_object(onode->oid, name, sizeof(name), 0, 0);<br>
</div> if (ret != SD_RES_SUCCESS) {<br>
- sd_err("failed to discard onode for %s", onode->name);<br>
<div class="im">+ sd_err("failed to zero onode for %s", onode->name);<br>
return ret;<br>
}<br>
<br>
</div>@@ -825,36 +898,6 @@ static int onode_delete(struct kv_onode *onode)<br>
<div><div class="h5"> return SD_RES_SUCCESS;<br>
}<br>
<br>
-struct find_object_arg {<br>
- const char *object_name;<br>
- int ret;<br>
-};<br>
-<br>
-static void find_object_cb(const char *name, void *opaque)<br>
-{<br>
- struct find_object_arg *foarg = (struct find_object_arg *)opaque;<br>
-<br>
- if (!strncmp(foarg->object_name, name, SD_MAX_OBJECT_NAME))<br>
- foarg->ret = SD_RES_SUCCESS;<br>
-}<br>
-<br>
-/*<br>
- * If we don't know if object exists in the bucket, we should use onode_find,<br>
- * which iterate effeciently the bucket, instead of onode_lookup(), that assumes<br>
- * object alrady exists and tries to look it up.<br>
- */<br>
-static int onode_find(uint32_t ovid, const char *name)<br>
-{<br>
- struct find_object_arg arg = {name, SD_RES_NO_OBJ};<br>
- int ret;<br>
-<br>
- ret = bucket_iterate_object(ovid, find_object_cb, &arg);<br>
- if (ret != SD_RES_SUCCESS)<br>
- return ret;<br>
-<br>
- return arg.ret;<br>
-}<br>
-<br>
/*<br>
* user object name -> struct kv_onode -> sheepdog objects -> user data<br>
*<br>
</div></div>@@ -874,23 +917,24 @@ int kv_create_object(struct http_request *req, const char *account,<br>
<div class="im"> if (ret != SD_RES_SUCCESS)<br>
return ret;<br>
<br>
- ret = onode_find(bucket_vid, name);<br>
+ onode = xzalloc(sizeof(*onode));<br>
+ ret = onode_lookup(onode, bucket_vid, name);<br>
if (ret == SD_RES_SUCCESS) {<br>
/* For overwrite, we delete old object and then create */<br>
ret = kv_delete_object(account, bucket, name);<br>
if (ret != SD_RES_SUCCESS) {<br>
sd_err("Failed to delete exists object %s", name);<br>
- return ret;<br>
+ goto out;<br>
}<br>
} else if (ret != SD_RES_NO_OBJ)<br>
- return ret;<br>
+ goto out;<br>
<br>
snprintf(vdi_name, SD_MAX_VDI_LEN, "%s/%s/allocator", account, bucket);<br>
ret = sd_lookup_vdi(vdi_name, &data_vid);<br>
if (ret != SD_RES_SUCCESS)<br>
return ret;<br>
<br>
- onode = xzalloc(sizeof(*onode));<br>
+ memset(onode, 0, sizeof(*onode));<br>
pstrcpy(onode->name, sizeof(onode->name), name);<br>
onode->data_vid = data_vid;<br>
<br>
</div>@@ -923,10 +967,6 @@ int kv_read_object(struct http_request *req, const char *account,<br>
<div class="im"> if (ret != SD_RES_SUCCESS)<br>
return ret;<br>
<br>
- ret = onode_find(bucket_vid, name);<br>
- if (ret != SD_RES_SUCCESS)<br>
- return ret;<br>
-<br>
onode = xzalloc(sizeof(*onode));<br>
ret = onode_lookup(onode, bucket_vid, name);<br>
if (ret != SD_RES_SUCCESS)<br>
</div>@@ -954,10 +994,6 @@ int kv_delete_object(const char *account, const char *bucket, const char *name)<br>
<div class="HOEnZb"><div class="h5"> if (ret != SD_RES_SUCCESS)<br>
return ret;<br>
<br>
- ret = onode_find(bucket_vid, name);<br>
- if (ret != SD_RES_SUCCESS)<br>
- return ret;<br>
-<br>
onode = xzalloc(sizeof(*onode));<br>
ret = onode_lookup(onode, bucket_vid, name);<br>
if (ret != SD_RES_SUCCESS)<br>
diff --git a/tests/functional/081.out b/tests/functional/081.out<br>
index f0cc8f2..d9d7dca 100644<br>
--- a/tests/functional/081.out<br>
+++ b/tests/functional/081.out<br>
@@ -59,11 +59,11 @@ data97<br>
Name Id Size Used Shared Creation time VDI id Copies Tag<br>
sd/dog 0 16 PB 52 MB 0.0 MB DATE 5a5cbf 6<br>
sd 0 16 PB 8.0 MB 0.0 MB DATE 7927f2 6<br>
- sd/sheep 0 16 PB 16 MB 0.0 MB DATE 8ad11e 6<br>
+ sd/sheep 0 16 PB 124 MB 0.0 MB DATE 8ad11e 6<br>
sd/dog/allocator 0 16 PB 4.0 MB 0.0 MB DATE 936d95 6<br>
sd/sheep/allocator 0 16 PB 268 MB 0.0 MB DATE fd57fc 6<br>
sheep<br>
Name Id Size Used Shared Creation time VDI id Copies Tag<br>
sd 0 16 PB 4.0 MB 0.0 MB DATE 7927f2 6<br>
- sd/sheep 0 16 PB 0.0 MB 0.0 MB DATE 8ad11e 6<br>
+ sd/sheep 0 16 PB 124 MB 0.0 MB DATE 8ad11e 6<br>
sd/sheep/allocator 0 16 PB 4.0 MB 0.0 MB DATE fd57fc 6<br>
diff --git a/tests/functional/082.out b/tests/functional/082.out<br>
index 6d6a4dc..c23828e 100644<br>
--- a/tests/functional/082.out<br>
+++ b/tests/functional/082.out<br>
@@ -75,11 +75,11 @@ data9<br>
Name Id Size Used Shared Creation time VDI id Copies Tag<br>
sd/dog 0 16 PB 52 MB 0.0 MB DATE 5a5cbf 6<br>
sd 0 16 PB 8.0 MB 0.0 MB DATE 7927f2 6<br>
- sd/sheep 0 16 PB 48 MB 0.0 MB DATE 8ad11e 6<br>
+ sd/sheep 0 16 PB 156 MB 0.0 MB DATE 8ad11e 6<br>
sd/dog/allocator 0 16 PB 4.0 MB 0.0 MB DATE 936d95 6<br>
sd/sheep/allocator 0 16 PB 316 MB 0.0 MB DATE fd57fc 6<br>
sheep<br>
Name Id Size Used Shared Creation time VDI id Copies Tag<br>
sd 0 16 PB 4.0 MB 0.0 MB DATE 7927f2 6<br>
- sd/sheep 0 16 PB 0.0 MB 0.0 MB DATE 8ad11e 6<br>
+ sd/sheep 0 16 PB 156 MB 0.0 MB DATE 8ad11e 6<br>
sd/sheep/allocator 0 16 PB 4.0 MB 0.0 MB DATE fd57fc 6<br>
--<br>
1.8.1.2<br>
<br>
--<br>
sheepdog mailing list<br>
<a href="mailto:sheepdog@lists.wpkg.org">sheepdog@lists.wpkg.org</a><br>
<a href="http://lists.wpkg.org/mailman/listinfo/sheepdog" target="_blank">http://lists.wpkg.org/mailman/listinfo/sheepdog</a><br>
</div></div></blockquote></div><br><br clear="all"><div><br></div>-- <br>--<br>Best Regard<br>Robin Dong
</div>