[sheepdog] [PATCH v3 6/9] sheep: introduce sparse objects

Hitoshi Mitake mitake.hitoshi at gmail.com
Sun Feb 23 06:28:25 CET 2014


We don't need to care about fragmentation when
 - the object is unlikely to be accessed sequentially, and
 - the object is read-only.

In that sense, we can make objects sparse unless they are writable data
objects.

This fixes the problem that sheepdog consumes a lot of disk space for
deleted vdi objects, provided your filesystem supports FALLOC_FL_PUNCH_HOLE.

Cc: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
Cc: Valerio Pachera <sirio81 at gmail.com>
Cc: Alessandro Bolgia <alessandro at extensys.it>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
v3: revive find_zero_blocks() and trim_zero_blocks()

 include/util.h      |  3 +++
 lib/util.c          | 62 +++++++++++++++++++++++++++++++++++++++++++++++
 sheep/plain_store.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 sheep/sheep_priv.h  | 11 +++++++++
 4 files changed, 143 insertions(+), 3 deletions(-)

diff --git a/include/util.h b/include/util.h
index 7f439bc..b3ff2b0 100644
--- a/include/util.h
+++ b/include/util.h
@@ -122,6 +122,9 @@ void make_path(char *path, size_t size, size_t nr_segs, const char **segs);
 int atomic_create_and_write(const char *path, const char *buf, size_t len,
 			    bool force_create);
 
+void find_zero_blocks(const void *buf, uint64_t *poffset, uint32_t *plen);
+void trim_zero_blocks(void *buf, uint64_t *poffset, uint32_t *plen);
+
 /* a type safe version of qsort() */
 #define xqsort(base, nmemb, compar)					\
 ({									\
diff --git a/lib/util.c b/lib/util.c
index 7035512..408fc19 100644
--- a/lib/util.c
+++ b/lib/util.c
@@ -744,3 +744,65 @@ void list_sort(void *priv, struct list_head *head,
 
 	merge_and_restore_back_links(priv, cmp, head, part[max_lev], list);
 }
+
+/*
+ * Find zero blocks at the beginning and end of the buffer
+ *
+ * The caller passes the offset of 'buf' in '*poffset' so that this function
+ * can cut on BLOCK_SIZE boundaries of that offset; '*plen' holds the length
+ * of the buffer.  While at least BLOCK_SIZE bytes remain, an all-zero run up
+ * to the next block boundary is dropped from the front ('*poffset' grows and
+ * '*plen' shrinks), then an all-zero run back to the previous block boundary
+ * is dropped from the end ('*plen' shrinks).  'buf' itself is not modified;
+ * only '*poffset' and '*plen' are updated.
+ */
+void find_zero_blocks(const void *buf, uint64_t *poffset, uint32_t *plen)
+{
+	const uint8_t zero[BLOCK_SIZE] = {0};
+	const uint8_t *p = buf;
+	uint64_t start = *poffset;
+	uint64_t offset = 0;
+	uint32_t len = *plen;
+
+	/* trim zero blocks from the beginning of buffer */
+	while (len >= BLOCK_SIZE) {
+		size_t size = BLOCK_SIZE - (start + offset) % BLOCK_SIZE;
+
+		if (memcmp(p + offset, zero, size) != 0)
+			break;
+
+		offset += size;
+		len -= size;
+	}
+
+	/* trim zero blocks from the end of buffer */
+	while (len >= BLOCK_SIZE) {
+		size_t size = (start + offset + len) % BLOCK_SIZE;
+		if (size == 0)
+			size = BLOCK_SIZE;
+
+		if (memcmp(p + offset + len - size, zero, size) != 0)
+			break;
+
+		len -= size;
+	}
+
+	*plen = len;
+	*poffset = start + offset;
+}
+
+/*
+ * Trim zero blocks from the beginning and end of buffer
+ *
+ * This function is similar to find_zero_blocks(), but when leading zeros are
+ * skipped it also moves the remaining '*plen' bytes to the start of 'buf'.
+ */
+void trim_zero_blocks(void *buf, uint64_t *poffset, uint32_t *plen)
+{
+	uint8_t *p = buf;
+	uint64_t orig_offset = *poffset;
+
+	find_zero_blocks(buf, poffset, plen);
+	if (orig_offset < *poffset)
+		memmove(p, p + *poffset - orig_offset, *plen);
+}
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 754a25a..4fca8ec 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -10,9 +10,14 @@
  */
 
 #include <libgen.h>
+#include <linux/falloc.h>
 
 #include "sheep_priv.h"
 
+#ifndef FALLOC_FL_PUNCH_HOLE
+#define FALLOC_FL_PUNCH_HOLE 0x02
+#endif
+
 #define sector_algined(x) ({ ((x) & (SECTOR_SIZE - 1)) == 0; })
 
 #define ECNAME "user.ec.index"
@@ -122,12 +127,59 @@ static int err_to_sderr(const char *path, uint64_t oid, int err)
 	}
 }
 
+/* Punch a hole over [start, end) of 'fd' (file size is kept); < 0 on error. */
+static int discard(int fd, uint64_t start, uint64_t end)
+{
+	int ret = xfallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+			     start, end - start);
+	if (ret < 0) {
+		if (errno == ENOSYS || errno == EOPNOTSUPP)
+			sd_info("FALLOC_FL_PUNCH_HOLE is not supported "
+				"on this filesystem");
+		else
+			sd_err("failed to discard object, %m");
+	}
+	return ret;
+}
+
+/* Trim zero blocks at both ends of the write; returns 0, or -1 on failure. */
+static int default_trim(int fd, uint64_t oid, const struct siocb *iocb,
+			uint64_t *poffset, uint32_t *plen)
+{
+	uint64_t off = *poffset;
+	uint32_t len = *plen;
+	/* probe only: iocb->buf must stay intact for the caller if we fail */
+	find_zero_blocks(iocb->buf, &off, &len);
+	if (iocb->offset < off) {
+		sd_debug("discard between %" PRIu64 ", %" PRIu64 ", %" PRIx64,
+			 (uint64_t)iocb->offset, off, oid);
+		if (discard(fd, iocb->offset, off) < 0)
+			return -1;
+	}
+	if (off + len < iocb->offset + iocb->length) {
+		uint64_t end = iocb->offset + iocb->length;
+		if (end == get_objsize(oid))
+			/* This is necessary to punch the last block */
+			end = round_up(end, BLOCK_SIZE);
+		sd_debug("discard between %" PRIu64 ", %" PRIu64 ", %" PRIx64,
+			 off + len, end, oid);
+		if (discard(fd, off + len, end) < 0)
+			return -1;
+	}
+	/* holes punched: now compact iocb->buf and report the new range */
+	trim_zero_blocks(iocb->buf, poffset, plen);
+	return 0;
+}
+
 int default_write(uint64_t oid, const struct siocb *iocb)
 {
 	int flags = prepare_iocb(oid, iocb, false), fd,
 	    ret = SD_RES_SUCCESS;
 	char path[PATH_MAX];
 	ssize_t size;
+	uint32_t len = iocb->length;
+	uint64_t offset = iocb->offset;
+	static bool trim_is_supported = true;
 
 	if (iocb->epoch < sys_epoch()) {
 		sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch());
@@ -158,7 +210,15 @@ int default_write(uint64_t oid, const struct siocb *iocb)
 	if (unlikely(fd < 0))
 		return err_to_sderr(path, oid, errno);
 
-	size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
+	if (trim_is_supported && is_sparse_object(oid)) {
+		if (default_trim(fd, oid, iocb, &offset, &len) < 0) {
+			trim_is_supported = false;
+			offset = iocb->offset;
+			len = iocb->length;
+		}
+	}
+
+	size = xpwrite(fd, iocb->buf, len, offset);
 	if (unlikely(size != iocb->length)) {
 		sd_err("failed to write object %"PRIx64", path=%s, offset=%"
 		       PRId32", size=%"PRId32", result=%zd, %m", oid, path,
@@ -374,6 +434,7 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
 	int flags = prepare_iocb(oid, iocb, true);
 	int ret, fd;
 	uint32_t len = iocb->length;
+	uint32_t offset = iocb->offset;
 	bool ec = is_erasure_obj(oid, iocb->copy_policy);
 	size_t obj_size;
 
@@ -418,13 +479,16 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
 	} else
 		obj_size = get_objsize(oid);
 
-	ret = prealloc(fd, obj_size);
+	if (is_sparse_object(oid))
+		ret = xftruncate(fd, obj_size);
+	else
+		ret = prealloc(fd, obj_size);
 	if (ret < 0) {
 		ret = err_to_sderr(path, oid, errno);
 		goto out;
 	}
 
-	ret = xpwrite(fd, iocb->buf, len, iocb->offset);
+	ret = xpwrite(fd, iocb->buf, len, offset);
 	if (ret != len) {
 		sd_err("failed to write object. %m");
 		ret = err_to_sderr(path, oid, errno);
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 8b45d5a..f967e75 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -431,6 +431,17 @@ static inline bool node_is_local(const struct sd_node *n)
 	return node_eq(n, &sys->this_node);
 }
 
+/*
+ * If the object is read-only, fragmentation doesn't happen.  In addition, if
+ * the object is unlikely to be accessed sequentially, fragmentation is not a
+ * problem.  We can make such objects sparse to use space more efficiently:
+ * an object is sparse when it is not a data object or when it is read-only.
+ */
+static inline bool is_sparse_object(uint64_t oid)
+{
+	return !is_data_obj(oid) || oid_is_readonly(oid);
+}
+
 /* gateway operations */
 int gateway_read_obj(struct request *req);
 int gateway_write_obj(struct request *req);
-- 
1.8.3.2




More information about the sheepdog mailing list