[sheepdog] [PATCH 2/2] trim redundant zero bytes of network and disk I/O data
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Sat Oct 6 12:56:27 CEST 2012
This will save a lot of network and disk I/O, especially when
recovering sparse objects.
This updates the protocol version between sheep and other programs,
but the older one is also supported.
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
collie/common.c | 2 ++
collie/vdi.c | 4 ++++
include/sheepdog_proto.h | 7 +++++--
include/util.h | 6 ++++++
lib/util.c | 43 +++++++++++++++++++++++++++++++++++++++++++
sheep/object_cache.c | 19 ++++++++++++++++---
sheep/ops.c | 19 ++++++++++++++++++-
sheep/recovery.c | 2 ++
sheep/request.c | 5 ++++-
sheep/store.c | 3 +++
10 files changed, 103 insertions(+), 7 deletions(-)
diff --git a/collie/common.c b/collie/common.c
index fecdbe1..a29c86d 100644
--- a/collie/common.c
+++ b/collie/common.c
@@ -78,6 +78,8 @@ int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
return rsp->result;
}
+ set_trimmed_sectors(data, rsp->obj.offset, rsp->data_length, datalen);
+
return SD_RES_SUCCESS;
}
diff --git a/collie/vdi.c b/collie/vdi.c
index 1b8ec4c..104fbb1 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -314,6 +314,7 @@ static void parse_objs(uint64_t oid, obj_parser_func_t func, void *data, unsigne
if (ret)
fprintf(stderr, "Failed to connect to %s\n", name);
else {
+ set_trimmed_sectors(buf, rsp->obj.offset, rlen, size);
cb_ret = func(name, oid, rsp, buf, data);
if (cb_ret)
break;
@@ -1398,6 +1399,9 @@ static void *read_object_from(struct sd_vnode *vnode, uint64_t oid)
sd_strerror(rsp->result));
exit(EXIT_FAILURE);
}
+
+ set_trimmed_sectors(buf, rsp->obj.offset, rlen, SD_DATA_OBJ_SIZE);
+
return buf;
}
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index e97d029..cccdfa2 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -15,7 +15,10 @@
#include <stdint.h>
#include "util.h"
-#define SD_PROTO_VER 0x01
+#define SD_PROTO_VER 0x02
+
+/* This and later versions support trimming zero sectors from read responses */
+#define SD_PROTO_VER_TRIM_ZERO_SECTORS 0x02
#define SD_LISTEN_PORT 7000
@@ -97,7 +100,6 @@
#define SD_NR_VDIS (1U << 24)
#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
-#define SECTOR_SIZE (1U << 9)
#define SD_INODE_SIZE (sizeof(struct sheepdog_inode))
#define SD_INODE_HEADER_SIZE (sizeof(struct sheepdog_inode) - \
@@ -144,6 +146,7 @@ struct sd_rsp {
struct {
uint32_t __pad;
uint32_t copies;
+ uint64_t offset;
} obj;
struct {
uint32_t __pad;
diff --git a/include/util.h b/include/util.h
index 90ed414..6f0e993 100644
--- a/include/util.h
+++ b/include/util.h
@@ -12,6 +12,8 @@
#include "list.h"
#include "logger.h"
+#define SECTOR_SIZE (1U << 9)
+
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
@@ -77,6 +79,10 @@ extern ssize_t xpread(int fd, void *buf, size_t count, off_t offset);
extern ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset);
extern int rmdir_r(char *dir_path);
+void trim_zero_sectors(void *buf, uint64_t *offset, uint32_t *len);
+void set_trimmed_sectors(void *buf, uint64_t offset,uint32_t len,
+ uint32_t requested_len);
+
#ifdef assert
#undef assert
#endif
diff --git a/lib/util.c b/lib/util.c
index 78bba8c..3c28165 100644
--- a/lib/util.c
+++ b/lib/util.c
@@ -17,6 +17,7 @@
#include <sys/stat.h>
#include <unistd.h>
#include <stdio.h>
+#include <assert.h>
#include "util.h"
#include "logger.h"
@@ -262,3 +263,45 @@ out:
closedir(dir);
return ret;
}
+
+/* Strip leading/trailing zero sectors of buf in place; update *offset, *len */
+void trim_zero_sectors(void *buf, uint64_t *offset, uint32_t *len)
+{
+ const uint8_t zero[SECTOR_SIZE] = {0}; /* all-zero reference sector */
+ uint8_t *p = buf;
+
+ assert(*offset == 0); /* the caller must pass in a zero offset */
+
+ /* skip leading zero sectors: *offset grows and *len shrinks per sector */
+ while (*len >= SECTOR_SIZE) {
+ if (memcmp(p + *offset, zero, SECTOR_SIZE) != 0)
+ break;
+
+ *offset += SECTOR_SIZE;
+ *len -= SECTOR_SIZE;
+ }
+ memmove(buf, p + *offset, *len); /* move the remaining data to the front */
+
+ /* drop trailing zero sectors; only whole sectors are removed */
+ while (*len >= SECTOR_SIZE) {
+ if (memcmp(p + *len - SECTOR_SIZE, zero, SECTOR_SIZE) != 0)
+ break;
+
+ *len -= SECTOR_SIZE;
+ }
+}
+
+/* Restore trimmed zero sectors: data goes back to offset, the rest is zeroed */
+void set_trimmed_sectors(void *buf, uint64_t offset, uint32_t len,
+ uint32_t requested_len)
+{
+ uint8_t *p = buf;
+
+ if (offset > 0) { /* data does not start at the beginning of buf */
+ memmove(p + offset, buf, len); /* NOTE(review): no bounds check here */
+ memset(p, 0, offset); /* zero-fill the gap before the data */
+ }
+
+ if (offset + len < requested_len) /* zero-fill up to requested_len */
+ memset(p + offset + len, 0, requested_len- offset - len);
+}
diff --git a/sheep/object_cache.c b/sheep/object_cache.c
index 6fd505c..fb606c0 100644
--- a/sheep/object_cache.c
+++ b/sheep/object_cache.c
@@ -724,7 +724,8 @@ out:
}
static int create_cache_object(struct object_cache *oc, uint32_t idx,
- void *buffer, size_t buf_size)
+ void *buffer, size_t buf_size, off_t offset,
+ size_t obj_size)
{
int flags = def_open_flags | O_CREAT | O_EXCL, fd;
int ret = SD_RES_OID_EXIST;
@@ -745,7 +746,16 @@ static int create_cache_object(struct object_cache *oc, uint32_t idx,
goto out;
}
- ret = xpwrite(fd, buffer, buf_size, 0);
+ if (offset != 0 || buf_size != obj_size) {
+ ret = prealloc(fd, obj_size);
+ if (ret < 0) {
+ ret = SD_RES_EIO;
+ eprintf("%m\n");
+ goto out_close;
+ }
+ }
+
+ ret = xpwrite(fd, buffer, buf_size, offset);
if (ret != buf_size) {
ret = SD_RES_EIO;
eprintf("failed, vid %"PRIx32", idx %"PRIx32"\n", oc->vid, idx);
@@ -764,6 +774,7 @@ out:
static int object_cache_pull(struct object_cache *oc, uint32_t idx)
{
struct sd_req hdr;
+ struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
int ret = SD_RES_NO_MEM;
uint64_t oid;
uint32_t data_length;
@@ -791,7 +802,9 @@ static int object_cache_pull(struct object_cache *oc, uint32_t idx)
if (ret == SD_RES_SUCCESS) {
dprintf("oid %"PRIx64" pulled successfully\n", oid);
- ret = create_cache_object(oc, idx, buf, data_length);
+
+ ret = create_cache_object(oc, idx, buf, rsp->data_length,
+ rsp->obj.offset, data_length);
if (ret == SD_RES_SUCCESS)
add_to_object_cache(oc, idx, 0);
else if (ret == SD_RES_OID_EXIST)
diff --git a/sheep/ops.c b/sheep/ops.c
index ac02683..3c8aa4d 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -718,6 +718,8 @@ static int read_copy_from_replica(struct request *req, uint32_t epoch,
{
struct request read_req = { };
struct sd_req *hdr = &read_req.rq;
+ struct sd_rsp *rsp = &read_req.rp;
+ int ret;
/* Create a fake gateway read request */
sd_init_req(hdr, SD_OP_READ_OBJ);
@@ -732,7 +734,13 @@ static int read_copy_from_replica(struct request *req, uint32_t epoch,
read_req.op = get_sd_op(hdr->opcode);
read_req.vinfo = req->vinfo;
- return gateway_read_obj(&read_req);
+ ret = gateway_read_obj(&read_req);
+
+ if (ret == SD_RES_SUCCESS)
+ set_trimmed_sectors(buf, rsp->obj.offset, rsp->data_length,
+ SD_DATA_OBJ_SIZE);
+
+ return ret;
}
int peer_remove_obj(struct request *req)
@@ -767,6 +775,13 @@ int peer_read_obj(struct request *req)
goto out;
rsp->data_length = hdr->data_length;
+
+ if (hdr->proto_ver >= SD_PROTO_VER_TRIM_ZERO_SECTORS) {
+ rsp->obj.offset = 0;
+ trim_zero_sectors(req->data, &rsp->obj.offset,
+ &rsp->data_length);
+ }
+
if (hdr->obj.copies)
rsp->obj.copies = hdr->obj.copies;
else
@@ -863,6 +878,8 @@ int peer_create_and_write_obj(struct request *req)
memcpy(&cow_hdr, hdr, sizeof(cow_hdr));
cow_hdr.data_length = SD_DATA_OBJ_SIZE;
cow_hdr.obj.offset = 0;
+ trim_zero_sectors(buf, &cow_hdr.obj.offset,
+ &cow_hdr.data_length);
ret = do_create_and_write_obj(&iocb, &cow_hdr, epoch, buf);
} else
diff --git a/sheep/recovery.c b/sheep/recovery.c
index dec9688..17dbbbb 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -64,6 +64,7 @@ static int recover_object_from_replica(uint64_t oid, struct sd_vnode *vnode,
uint32_t epoch, uint32_t tgt_epoch)
{
struct sd_req hdr;
+ struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
unsigned wlen = 0, rlen;
int ret = SD_RES_NO_MEM;
void *buf = NULL;
@@ -95,6 +96,7 @@ static int recover_object_from_replica(uint64_t oid, struct sd_vnode *vnode,
goto out;
iocb.epoch = epoch;
iocb.length = rlen;
+ iocb.offset = rsp->obj.offset;
iocb.buf = buf;
ret = sd_store->create_and_write(oid, &iocb);
out:
diff --git a/sheep/request.c b/sheep/request.c
index c53a487..2a76547 100644
--- a/sheep/request.c
+++ b/sheep/request.c
@@ -338,7 +338,7 @@ static void queue_request(struct request *req)
goto done;
}
} else if (hdr->proto_ver) {
- if (hdr->proto_ver != SD_PROTO_VER) {
+ if (hdr->proto_ver > SD_PROTO_VER) {
rsp->result = SD_RES_VER_MISMATCH;
goto done;
}
@@ -476,6 +476,9 @@ again:
goto again;
}
+ /* fill rq with response header as exec_req does */
+ memcpy(rq, &req->rp, sizeof(req->rp));
+
close(req->wait_efd);
ret = req->rp.result;
free_local_request(req);
diff --git a/sheep/store.c b/sheep/store.c
index d3d50e1..fa2d5dc 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -493,6 +493,7 @@ int read_backend_object(uint64_t oid, char *data, unsigned int datalen,
uint64_t offset, int nr_copies)
{
struct sd_req hdr;
+ struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
int ret;
sd_init_req(&hdr, SD_OP_READ_OBJ);
@@ -505,6 +506,8 @@ int read_backend_object(uint64_t oid, char *data, unsigned int datalen,
if (ret != SD_RES_SUCCESS)
eprintf("failed to read object %" PRIx64 ", %x\n", oid, ret);
+ set_trimmed_sectors(data, rsp->obj.offset, rsp->data_length, datalen);
+
return ret;
}
--
1.7.2.5
More information about the sheepdog
mailing list