[sheepdog] [PATCH v2 02/11] use 4KB for trim_zero_sectors instead of 512B

MORITA Kazutaka morita.kazutaka at gmail.com
Tue Jun 18 19:14:22 CEST 2013


From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>

Most filesystems use 4KB for their block sizes, and in that case
trimming 512B may be meaningless.  It's better to use the block size
than the sector size for faster comparison.

This also allows the caller to specify the offset of the buffer so
that it can get a block-aligned offset and length back.  This is in
preparation for allowing sparse objects.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 collie/common.c         |    2 +-
 collie/farm/sha1_file.c |    2 +-
 collie/vdi.c            |    8 +++----
 include/util.h          |    7 +++---
 lib/util.c              |   61 ++++++++++++++++++++++++++++++++++-------------
 sheep/gateway.c         |    4 ++--
 sheep/ops.c             |   10 ++++----
 sheep/plain_store.c     |    2 +-
 sheep/store.c           |    2 +-
 9 files changed, 64 insertions(+), 34 deletions(-)

diff --git a/collie/common.c b/collie/common.c
index 51da1f4..8fae140 100644
--- a/collie/common.c
+++ b/collie/common.c
@@ -92,7 +92,7 @@ int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
 		return rsp->result;
 	}
 
-	untrim_zero_sectors(data, rsp->obj.offset, rsp->data_length, datalen);
+	untrim_zero_blocks(data, rsp->obj.offset, rsp->data_length, datalen);
 
 	return SD_RES_SUCCESS;
 }
diff --git a/collie/farm/sha1_file.c b/collie/farm/sha1_file.c
index 452ee96..a2f3561 100644
--- a/collie/farm/sha1_file.c
+++ b/collie/farm/sha1_file.c
@@ -36,7 +36,7 @@ static void get_sha1(unsigned char *buf, unsigned len, unsigned char *sha1)
 	void *tmp = valloc(length);
 
 	memcpy(tmp, buf, len);
-	trim_zero_sectors(tmp, &offset, &length);
+	trim_zero_blocks(tmp, &offset, &length);
 
 	sha1_init(&c);
 	sha1_update(&c, (uint8_t *)&offset, sizeof(offset));
diff --git a/collie/vdi.c b/collie/vdi.c
index bc4ffb5..987be29 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -382,8 +382,8 @@ static void parse_objs(uint64_t oid, obj_parser_func_t func, void *data, unsigne
 		snprintf(name + strlen(name), sizeof(name) - strlen(name),
 			 ":%d", sd_nodes[i].nid.port);
 
-		untrim_zero_sectors(buf, rsp->obj.offset,
-				    rsp->data_length, size);
+		untrim_zero_blocks(buf, rsp->obj.offset, rsp->data_length,
+				   size);
 		cb_ret = func(name, oid, rsp, buf, data);
 		if (cb_ret)
 			break;
@@ -1389,8 +1389,8 @@ static void *read_object_from(const struct sd_vnode *vnode, uint64_t oid)
 
 	switch (rsp->result) {
 	case SD_RES_SUCCESS:
-		untrim_zero_sectors(buf, rsp->obj.offset, rsp->data_length,
-				    size);
+		untrim_zero_blocks(buf, rsp->obj.offset, rsp->data_length,
+				   size);
 		break;
 	case SD_RES_NO_OBJ:
 		free(buf);
diff --git a/include/util.h b/include/util.h
index 2fb9683..e70bb9f 100644
--- a/include/util.h
+++ b/include/util.h
@@ -15,6 +15,7 @@
 #include "list.h"
 
 #define SECTOR_SIZE (1U << 9)
+#define BLOCK_SIZE (1U << 12)
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #define round_up(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
@@ -97,9 +98,9 @@ void reraise_crash_signal(int signo, int status);
 pid_t gettid(void);
 bool is_xattr_enabled(const char *path);
 
-void trim_zero_sectors(void *buf, uint64_t *offset, uint32_t *len);
-void untrim_zero_sectors(void *buf, uint64_t offset, uint32_t len,
-			 uint32_t requested_len);
+void trim_zero_blocks(void *buf, uint64_t *offset, uint32_t *len);
+void untrim_zero_blocks(void *buf, uint64_t offset, uint32_t len,
+			uint32_t requested_len);
 int atomic_create_and_write(const char *path, char *buf, size_t len);
 
 /* a type safe version of qsort() */
diff --git a/lib/util.c b/lib/util.c
index fac468a..2fbd2f7 100644
--- a/lib/util.c
+++ b/lib/util.c
@@ -356,36 +356,65 @@ int rmdir_r(char *dir_path)
 	return ret;
 }
 
-/* Trim zero sectors from the beginning and end of buffer */
-void trim_zero_sectors(void *buf, uint64_t *offset, uint32_t *len)
+/*
+ * Trim zero blocks from the beginning and end of buffer
+ *
+ * The caller passes the offset of 'buf' with 'poffset' so that this function
+ * can align the return values to BLOCK_SIZE.  'plen' points to the length of
+ * the buffer.  If there are zero blocks at the beginning of the buffer, this
+ * function increases the offset and decreases the length on condition that
+ * '*poffset' is block-aligned.  If there are zero blocks at the end of the
+ * buffer, this function also decreases the length on condition that '*plen' is
+ * block-aligned.
+ */
+void trim_zero_blocks(void *buf, uint64_t *poffset, uint32_t *plen)
 {
-	const uint8_t zero[SECTOR_SIZE] = {0};
+	const uint8_t zero[BLOCK_SIZE] = {0};
 	uint8_t *p = buf;
+	uint64_t start = *poffset;
+	uint64_t offset = 0;
+	uint32_t len = *plen;
 
-	assert(*offset == 0);
+	/* trim zero blocks from the beginning of buffer */
+	while (len >= BLOCK_SIZE) {
+		size_t size = (start + offset) % BLOCK_SIZE;
+		if (size == 0)
+			size = BLOCK_SIZE;
 
-	/* trim zero sectors from the beginning of buffer */
-	while (*len >= SECTOR_SIZE) {
-		if (memcmp(p + *offset, zero, SECTOR_SIZE) != 0)
+		if (memcmp(p + offset, zero, size) != 0)
 			break;
 
-		*offset += SECTOR_SIZE;
-		*len -= SECTOR_SIZE;
+		offset += size;
+		len -= size;
 	}
-	memmove(buf, p + *offset, *len);
+	if (offset > 0)
+		memmove(buf, p + offset, len);
 
 	/* trim zero sectors from the end of buffer */
-	while (*len >= SECTOR_SIZE) {
-		if (memcmp(p + *len - SECTOR_SIZE, zero, SECTOR_SIZE) != 0)
+	while (len >= BLOCK_SIZE) {
+		size_t size = (start + len) % BLOCK_SIZE;
+		if (size == 0)
+			size = BLOCK_SIZE;
+
+		if (memcmp(p + len - size, zero, size) != 0)
 			break;
 
-		*len -= SECTOR_SIZE;
+		len -= size;
 	}
+
+	*plen = len;
+	*poffset = start + offset;
 }
 
-/* Untrim zero sectors to the beginning and end of buffer */
-void untrim_zero_sectors(void *buf, uint64_t offset, uint32_t len,
-			 uint32_t requested_len)
+/*
+ * Untrim zero blocks to the beginning and end of buffer
+ *
+ * 'offset' is the offset of 'buf' in the original buffer, 'len' is the length
+ * of 'buf', and 'requested_len' is the length of the original buffer.  'buf'
+ * must have enough space to hold 'requested_len' bytes.
+ */
+void untrim_zero_blocks(void *buf, uint64_t offset, uint32_t len,
+			uint32_t requested_len)
 {
 	uint8_t *p = buf;
 
diff --git a/sheep/gateway.c b/sheep/gateway.c
index 8495380..2496709 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -89,8 +89,8 @@ out:
 	if (ret == SD_RES_SUCCESS &&
 	    req->rq.proto_ver < SD_PROTO_VER_TRIM_ZERO_SECTORS) {
 		/* the client doesn't support trimming zero bytes */
-		untrim_zero_sectors(req->data, req->rp.obj.offset,
-				    req->rp.data_length, req->rq.data_length);
+		untrim_zero_blocks(req->data, req->rp.obj.offset,
+				   req->rp.data_length, req->rq.data_length);
 		req->rp.data_length = req->rq.data_length;
 		req->rp.obj.offset = 0;
 	}
diff --git a/sheep/ops.c b/sheep/ops.c
index 2b5c50d..ff06d81 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -908,8 +908,8 @@ static int read_copy_from_replica(struct request *req, uint32_t epoch,
 	ret = gateway_read_obj(&read_req);
 
 	if (ret == SD_RES_SUCCESS)
-		untrim_zero_sectors(buf, rsp->obj.offset, rsp->data_length,
-				    SD_DATA_OBJ_SIZE);
+		untrim_zero_blocks(buf, rsp->obj.offset, rsp->data_length,
+				   SD_DATA_OBJ_SIZE);
 
 	return ret;
 }
@@ -945,7 +945,7 @@ int peer_read_obj(struct request *req)
 
 	rsp->data_length = hdr->data_length;
 	rsp->obj.offset = 0;
-	trim_zero_sectors(req->data, &rsp->obj.offset, &rsp->data_length);
+	trim_zero_blocks(req->data, &rsp->obj.offset, &rsp->data_length);
 
 	if (hdr->obj.copies)
 		rsp->obj.copies = hdr->obj.copies;
@@ -1010,8 +1010,8 @@ int peer_create_and_write_obj(struct request *req)
 		memcpy(&cow_hdr, hdr, sizeof(cow_hdr));
 		cow_hdr.data_length = SD_DATA_OBJ_SIZE;
 		cow_hdr.obj.offset = 0;
-		trim_zero_sectors(buf, &cow_hdr.obj.offset,
-				  &cow_hdr.data_length);
+		trim_zero_blocks(buf, &cow_hdr.obj.offset,
+				 &cow_hdr.data_length);
 
 		ret = do_create_and_write_obj(&iocb, &cow_hdr, epoch, buf);
 	} else
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 722504e..ae713ee 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -559,7 +559,7 @@ int default_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1)
 		return ret;
 	}
 
-	trim_zero_sectors(buf, &offset, &length);
+	trim_zero_blocks(buf, &offset, &length);
 
 	sha1_init(&c);
 	sha1_update(&c, (uint8_t *)&offset, sizeof(offset));
diff --git a/sheep/store.c b/sheep/store.c
index c8253ca..a804d0d 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -423,7 +423,7 @@ int read_backend_object(uint64_t oid, char *data, unsigned int datalen,
 		sd_eprintf("failed to read object %" PRIx64 ", %s", oid,
 			   sd_strerror(ret));
 
-	untrim_zero_sectors(data, rsp->obj.offset, rsp->data_length, datalen);
+	untrim_zero_blocks(data, rsp->obj.offset, rsp->data_length, datalen);
 
 	return ret;
 }
-- 
1.7.9.5




More information about the sheepdog mailing list