[sheepdog] [PATCH 2/2] trim redundant zero bytes of network and disk I/O data

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Sat Oct 6 12:56:27 CEST 2012


This will save a lot of network and disk I/O, especially when
recovering sparse objects.

This updates the protocol version between sheep and other programs,
but the older one is also supported.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 collie/common.c          |    2 ++
 collie/vdi.c             |    4 ++++
 include/sheepdog_proto.h |    7 +++++--
 include/util.h           |    6 ++++++
 lib/util.c               |   43 +++++++++++++++++++++++++++++++++++++++++++
 sheep/object_cache.c     |   19 ++++++++++++++++---
 sheep/ops.c              |   19 ++++++++++++++++++-
 sheep/recovery.c         |    2 ++
 sheep/request.c          |    5 ++++-
 sheep/store.c            |    3 +++
 10 files changed, 103 insertions(+), 7 deletions(-)

diff --git a/collie/common.c b/collie/common.c
index fecdbe1..a29c86d 100644
--- a/collie/common.c
+++ b/collie/common.c
@@ -78,6 +78,8 @@ int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
 		return rsp->result;
 	}
 
+	set_trimmed_sectors(data, rsp->obj.offset, rsp->data_length, datalen);
+
 	return SD_RES_SUCCESS;
 }
 
diff --git a/collie/vdi.c b/collie/vdi.c
index 1b8ec4c..104fbb1 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -314,6 +314,7 @@ static void parse_objs(uint64_t oid, obj_parser_func_t func, void *data, unsigne
 		if (ret)
 			fprintf(stderr, "Failed to connect to %s\n", name);
 		else {
+			set_trimmed_sectors(buf, rsp->obj.offset, rlen, size);
 			cb_ret = func(name, oid, rsp, buf, data);
 			if (cb_ret)
 				break;
@@ -1398,6 +1399,9 @@ static void *read_object_from(struct sd_vnode *vnode, uint64_t oid)
 			sd_strerror(rsp->result));
 		exit(EXIT_FAILURE);
 	}
+
+	set_trimmed_sectors(buf, rsp->obj.offset, rlen, SD_DATA_OBJ_SIZE);
+
 	return buf;
 }
 
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index e97d029..cccdfa2 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -15,7 +15,10 @@
 #include <stdint.h>
 #include "util.h"
 
-#define SD_PROTO_VER 0x01
+#define SD_PROTO_VER 0x02
+
+/* This or later version supports trimming zero sectors from read response */
+#define SD_PROTO_VER_TRIM_ZERO_SECTORS 0x02
 
 #define SD_LISTEN_PORT 7000
 
@@ -97,7 +100,6 @@
 #define SD_NR_VDIS   (1U << 24)
 #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
 #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
-#define SECTOR_SIZE (1U << 9)
 
 #define SD_INODE_SIZE (sizeof(struct sheepdog_inode))
 #define SD_INODE_HEADER_SIZE (sizeof(struct sheepdog_inode) - \
@@ -144,6 +146,7 @@ struct sd_rsp {
 		struct {
 			uint32_t	__pad;
 			uint32_t	copies;
+			uint64_t	offset;
 		} obj;
 		struct {
 			uint32_t	__pad;
diff --git a/include/util.h b/include/util.h
index 90ed414..6f0e993 100644
--- a/include/util.h
+++ b/include/util.h
@@ -12,6 +12,8 @@
 #include "list.h"
 #include "logger.h"
 
+#define SECTOR_SIZE (1U << 9)
+
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
 
@@ -77,6 +79,10 @@ extern ssize_t xpread(int fd, void *buf, size_t count, off_t offset);
 extern ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset);
 extern int rmdir_r(char *dir_path);
 
+void trim_zero_sectors(void *buf, uint64_t *offset, uint32_t *len);
+void set_trimmed_sectors(void *buf, uint64_t offset,uint32_t len,
+			 uint32_t requested_len);
+
 #ifdef assert
 #undef assert
 #endif
diff --git a/lib/util.c b/lib/util.c
index 78bba8c..3c28165 100644
--- a/lib/util.c
+++ b/lib/util.c
@@ -17,6 +17,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #include <stdio.h>
+#include <assert.h>
 
 #include "util.h"
 #include "logger.h"
@@ -262,3 +263,45 @@ out:
 	closedir(dir);
 	return ret;
 }
+
+/* Trim zero sectors at both ends of buf in place; *offset must start at 0 */
+void trim_zero_sectors(void *buf, uint64_t *offset, uint32_t *len)
+{
+	const uint8_t zero[SECTOR_SIZE] = {0};
+	uint8_t *p = buf;
+
+	assert(*offset == 0);
+
+	/* skip leading zero sectors, advancing *offset and shrinking *len */
+	while (*len >= SECTOR_SIZE) {
+		if (memcmp(p + *offset, zero, SECTOR_SIZE) != 0)
+			break;
+
+		*offset += SECTOR_SIZE;
+		*len -= SECTOR_SIZE;
+	}
+	memmove(buf, p + *offset, *len);
+
+	/* drop trailing zero sectors; data now starts at buf after memmove */
+	while (*len >= SECTOR_SIZE) {
+		if (memcmp(p + *len - SECTOR_SIZE, zero, SECTOR_SIZE) != 0)
+			break;
+
+		*len -= SECTOR_SIZE;
+	}
+}
+
+/* Undo trimming: move len bytes to offset and zero-fill up to requested_len */
+void set_trimmed_sectors(void *buf, uint64_t offset, uint32_t len,
+			 uint32_t requested_len)
+{
+	uint8_t *p = buf;
+
+	if (offset > 0) {
+		memmove(p + offset, buf, len);
+		memset(p, 0, offset);
+	}
+
+	if (offset + len < requested_len)
+		memset(p + offset + len, 0, requested_len- offset - len);
+}
diff --git a/sheep/object_cache.c b/sheep/object_cache.c
index 6fd505c..fb606c0 100644
--- a/sheep/object_cache.c
+++ b/sheep/object_cache.c
@@ -724,7 +724,8 @@ out:
 }
 
 static int create_cache_object(struct object_cache *oc, uint32_t idx,
-			       void *buffer, size_t buf_size)
+			       void *buffer, size_t buf_size, off_t offset,
+			       size_t obj_size)
 {
 	int flags = def_open_flags | O_CREAT | O_EXCL, fd;
 	int ret = SD_RES_OID_EXIST;
@@ -745,7 +746,16 @@ static int create_cache_object(struct object_cache *oc, uint32_t idx,
 		goto out;
 	}
 
-	ret = xpwrite(fd, buffer, buf_size, 0);
+	if (offset != 0 || buf_size != obj_size) {
+		ret = prealloc(fd, obj_size);
+		if (ret < 0) {
+			ret = SD_RES_EIO;
+			eprintf("%m\n");
+			goto out_close;
+		}
+	}
+
+	ret = xpwrite(fd, buffer, buf_size, offset);
 	if (ret != buf_size) {
 		ret = SD_RES_EIO;
 		eprintf("failed, vid %"PRIx32", idx %"PRIx32"\n", oc->vid, idx);
@@ -764,6 +774,7 @@ out:
 static int object_cache_pull(struct object_cache *oc, uint32_t idx)
 {
 	struct sd_req hdr;
+	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
 	int ret = SD_RES_NO_MEM;
 	uint64_t oid;
 	uint32_t data_length;
@@ -791,7 +802,9 @@ static int object_cache_pull(struct object_cache *oc, uint32_t idx)
 
 	if (ret == SD_RES_SUCCESS) {
 		dprintf("oid %"PRIx64" pulled successfully\n", oid);
-		ret = create_cache_object(oc, idx, buf, data_length);
+
+		ret = create_cache_object(oc, idx, buf, rsp->data_length,
+					  rsp->obj.offset, data_length);
 		if (ret == SD_RES_SUCCESS)
 			add_to_object_cache(oc, idx, 0);
 		else if (ret == SD_RES_OID_EXIST)
diff --git a/sheep/ops.c b/sheep/ops.c
index ac02683..3c8aa4d 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -718,6 +718,8 @@ static int read_copy_from_replica(struct request *req, uint32_t epoch,
 {
 	struct request read_req = { };
 	struct sd_req *hdr = &read_req.rq;
+	struct sd_rsp *rsp = &read_req.rp;
+	int ret;
 
 	/* Create a fake gateway read request */
 	sd_init_req(hdr, SD_OP_READ_OBJ);
@@ -732,7 +734,13 @@ static int read_copy_from_replica(struct request *req, uint32_t epoch,
 	read_req.op = get_sd_op(hdr->opcode);
 	read_req.vinfo = req->vinfo;
 
-	return gateway_read_obj(&read_req);
+	ret = gateway_read_obj(&read_req);
+
+	if (ret == SD_RES_SUCCESS)
+		set_trimmed_sectors(buf, rsp->obj.offset, rsp->data_length,
+				    SD_DATA_OBJ_SIZE);
+
+	return ret;
 }
 
 int peer_remove_obj(struct request *req)
@@ -767,6 +775,13 @@ int peer_read_obj(struct request *req)
 		goto out;
 
 	rsp->data_length = hdr->data_length;
+
+	if (hdr->proto_ver >= SD_PROTO_VER_TRIM_ZERO_SECTORS) {
+		rsp->obj.offset = 0;
+		trim_zero_sectors(req->data, &rsp->obj.offset,
+				  &rsp->data_length);
+	}
+
 	if (hdr->obj.copies)
 		rsp->obj.copies = hdr->obj.copies;
 	else
@@ -863,6 +878,8 @@ int peer_create_and_write_obj(struct request *req)
 		memcpy(&cow_hdr, hdr, sizeof(cow_hdr));
 		cow_hdr.data_length = SD_DATA_OBJ_SIZE;
 		cow_hdr.obj.offset = 0;
+		trim_zero_sectors(buf, &cow_hdr.obj.offset,
+				  &cow_hdr.data_length);
 
 		ret = do_create_and_write_obj(&iocb, &cow_hdr, epoch, buf);
 	} else
diff --git a/sheep/recovery.c b/sheep/recovery.c
index dec9688..17dbbbb 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -64,6 +64,7 @@ static int recover_object_from_replica(uint64_t oid, struct sd_vnode *vnode,
 				       uint32_t epoch, uint32_t tgt_epoch)
 {
 	struct sd_req hdr;
+	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
 	unsigned wlen = 0, rlen;
 	int ret = SD_RES_NO_MEM;
 	void *buf = NULL;
@@ -95,6 +96,7 @@ static int recover_object_from_replica(uint64_t oid, struct sd_vnode *vnode,
 		goto out;
 	iocb.epoch = epoch;
 	iocb.length = rlen;
+	iocb.offset = rsp->obj.offset;
 	iocb.buf = buf;
 	ret = sd_store->create_and_write(oid, &iocb);
 out:
diff --git a/sheep/request.c b/sheep/request.c
index c53a487..2a76547 100644
--- a/sheep/request.c
+++ b/sheep/request.c
@@ -338,7 +338,7 @@ static void queue_request(struct request *req)
 			goto done;
 		}
 	} else if (hdr->proto_ver) {
-		if (hdr->proto_ver != SD_PROTO_VER) {
+		if (hdr->proto_ver > SD_PROTO_VER) {
 			rsp->result = SD_RES_VER_MISMATCH;
 			goto done;
 		}
@@ -476,6 +476,9 @@ again:
 		goto again;
 	}
 
+	/* fill rq with response header as exec_req does */
+	memcpy(rq, &req->rp, sizeof(req->rp));
+
 	close(req->wait_efd);
 	ret = req->rp.result;
 	free_local_request(req);
diff --git a/sheep/store.c b/sheep/store.c
index d3d50e1..fa2d5dc 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -493,6 +493,7 @@ int read_backend_object(uint64_t oid, char *data, unsigned int datalen,
 		       uint64_t offset, int nr_copies)
 {
 	struct sd_req hdr;
+	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
 	int ret;
 
 	sd_init_req(&hdr, SD_OP_READ_OBJ);
@@ -505,6 +506,8 @@ int read_backend_object(uint64_t oid, char *data, unsigned int datalen,
 	if (ret != SD_RES_SUCCESS)
 		eprintf("failed to read object %" PRIx64 ", %x\n", oid, ret);
 
+	set_trimmed_sectors(data, rsp->obj.offset, rsp->data_length, datalen);
+
 	return ret;
 }
 
-- 
1.7.2.5




More information about the sheepdog mailing list