[sheepdog] [PATCH v8 06/19] sheep: introduce sparse objects

Hitoshi Mitake mitake.hitoshi at gmail.com
Thu May 15 17:22:33 CEST 2014


We don't need to care about fragmentation when
 - the object is unlikely to be accessed sequentially, and
 - the object is read-only.

In that sense, we can make objects sparse if they are not data
objects or if they are read-only.

This fixes the problem that sheepdog consumes a lot of disk space for
deleted vdi objects if your filesystem supports FALLOC_FL_PUNCH_HOLE.

Cc: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
Cc: Valerio Pachera <sirio81 at gmail.com>
Cc: Alessandro Bolgia <alessandro at extensys.it>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 include/util.h      |  3 ++
 lib/util.c          | 62 +++++++++++++++++++++++++++++++++++++++
 sheep/plain_store.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 sheep/sheep_priv.h  | 11 +++++++
 4 files changed, 152 insertions(+), 7 deletions(-)

diff --git a/include/util.h b/include/util.h
index d188ccf..c7a1921 100644
--- a/include/util.h
+++ b/include/util.h
@@ -122,6 +122,9 @@ void make_path(char *path, size_t size, size_t nr_segs, const char **segs);
 int atomic_create_and_write(const char *path, const char *buf, size_t len,
 			    bool force_create);
 
+void find_zero_blocks(const void *buf, uint64_t *poffset, uint32_t *plen);
+void trim_zero_blocks(void *buf, uint64_t *poffset, uint32_t *plen);
+
 /* a type safe version of qsort() */
 #define xqsort(base, nmemb, compar)					\
 ({									\
diff --git a/lib/util.c b/lib/util.c
index 7035512..408fc19 100644
--- a/lib/util.c
+++ b/lib/util.c
@@ -744,3 +744,65 @@ void list_sort(void *priv, struct list_head *head,
 
 	merge_and_restore_back_links(priv, cmp, head, part[max_lev], list);
 }
+
+/*
+ * Find zero blocks from the beginning and end of buffer
+ *
+ * The caller passes the offset of 'buf' with 'poffset' so that this function
+ * can align the return values to BLOCK_SIZE.  'plen' points to the length of the
+ * buffer.  If there are zero blocks at the beginning of the buffer, this
+ * function increases the offset and decreases the length on condition that
+ * '*poffset' is block-aligned.  If there are zero blocks at the end of the
+ * buffer, this function also decreases the length on condition that '*plen' is
+ * block-aligned.
+ */
+void find_zero_blocks(const void *buf, uint64_t *poffset, uint32_t *plen)
+{
+	const uint8_t zero[BLOCK_SIZE] = {0};
+	const uint8_t *p = buf;
+	uint64_t start = *poffset;
+	uint64_t offset = 0;
+	uint32_t len = *plen;
+
+	/* trim zero blocks from the beginning of buffer */
+	while (len >= BLOCK_SIZE) {
+		size_t size = BLOCK_SIZE - (start + offset) % BLOCK_SIZE;
+
+		if (memcmp(p + offset, zero, size) != 0)
+			break;
+
+		offset += size;
+		len -= size;
+	}
+
+	/* trim zero blocks from the end of buffer */
+	while (len >= BLOCK_SIZE) {
+		size_t size = (start + offset + len) % BLOCK_SIZE;
+		if (size == 0)
+			size = BLOCK_SIZE;
+
+		if (memcmp(p + offset + len - size, zero, size) != 0)
+			break;
+
+		len -= size;
+	}
+
+	*plen = len;
+	*poffset = start + offset;
+}
+
+/*
+ * Trim zero blocks from the beginning and end of buffer
+ *
+ * This function is similar to find_zero_blocks(), but this updates 'buf' so
+ * that the zero blocks are removed from the beginning of the buffer.
+ */
+void trim_zero_blocks(void *buf, uint64_t *poffset, uint32_t *plen)
+{
+	uint8_t *p = buf;
+	uint64_t orig_offset = *poffset;
+
+	find_zero_blocks(buf, poffset, plen);
+	if (orig_offset < *poffset)
+		memmove(p, p + *poffset - orig_offset, *plen);
+}
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 4388133..07bd107 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -10,9 +10,14 @@
  */
 
 #include <libgen.h>
+#include <linux/falloc.h>
 
 #include "sheep_priv.h"
 
+#ifndef FALLOC_FL_PUNCH_HOLE
+#define FALLOC_FL_PUNCH_HOLE 0x02
+#endif
+
 #define sector_algined(x) ({ ((x) & (SECTOR_SIZE - 1)) == 0; })
 
 static inline bool iocb_is_aligned(const struct siocb *iocb)
@@ -116,12 +121,59 @@ static int err_to_sderr(const char *path, uint64_t oid, int err)
 	}
 }
 
+static int discard(int fd, uint64_t start, uint32_t end)
+{
+	int ret = xfallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+			     start, end - start);
+	if (ret < 0) {
+		if (errno == ENOSYS || errno == EOPNOTSUPP)
+			sd_info("FALLOC_FL_PUNCH_HOLE is not supported "
+				"on this filesystem");
+		else
+			sd_err("failed to discard object, %m");
+	}
+
+	return ret;
+}
+
+/* Trim zero blocks from the beginning and end of the object. */
+static int default_trim(int fd, uint64_t oid, const struct siocb *iocb,
+			uint64_t *poffset, uint32_t *plen)
+{
+	trim_zero_blocks(iocb->buf, poffset, plen);
+
+	if (iocb->offset < *poffset) {
+		sd_debug("discard between %d, %ld, %" PRIx64, iocb->offset,
+			 *poffset, oid);
+
+		if (discard(fd, iocb->offset, *poffset) < 0)
+			return -1;
+	}
+
+	if (*poffset + *plen < iocb->offset + iocb->length) {
+		uint64_t end = iocb->offset + iocb->length;
+		if (end == get_objsize(oid))
+			/* This is necessary to punch the last block */
+			end = round_up(end, BLOCK_SIZE);
+		sd_debug("discard between %ld, %ld, %" PRIx64, *poffset + *plen,
+			 end, oid);
+
+		if (discard(fd, *poffset + *plen, end) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
 int default_write(uint64_t oid, const struct siocb *iocb)
 {
 	int flags = prepare_iocb(oid, iocb, false), fd,
 	    ret = SD_RES_SUCCESS;
 	char path[PATH_MAX];
 	ssize_t size;
+	uint32_t len = iocb->length;
+	uint64_t offset = iocb->offset;
+	static bool trim_is_supported = true;
 
 	if (iocb->epoch < sys_epoch()) {
 		sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch());
@@ -152,8 +204,16 @@ int default_write(uint64_t oid, const struct siocb *iocb)
 	if (unlikely(fd < 0))
 		return err_to_sderr(path, oid, errno);
 
-	size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
-	if (unlikely(size != iocb->length)) {
+	if (trim_is_supported && is_sparse_object(oid)) {
+		if (default_trim(fd, oid, iocb, &offset, &len) < 0) {
+			trim_is_supported = false;
+			offset = iocb->offset;
+			len = iocb->length;
+		}
+	}
+
+	size = xpwrite(fd, iocb->buf, len, offset);
+	if (unlikely(size != len)) {
 		sd_err("failed to write object %"PRIx64", path=%s, offset=%"
 		       PRId32", size=%"PRId32", result=%zd, %m", oid, path,
 		       iocb->offset, iocb->length, size);
@@ -350,6 +410,7 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
 	int ret, fd;
 	uint32_t len = iocb->length;
 	size_t obj_size;
+	uint64_t offset = iocb->offset;
 
 	sd_debug("%"PRIx64, oid);
 	get_store_path(oid, iocb->ec_index, path);
@@ -384,13 +445,21 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
 	}
 
 	obj_size = get_store_objsize(oid);
-	ret = prealloc(fd, obj_size);
-	if (ret < 0) {
-		ret = err_to_sderr(path, oid, errno);
-		goto out;
+
+	trim_zero_blocks(iocb->buf, &offset, &len);
+
+	if (offset != 0 || len != get_objsize(oid)) {
+		if (is_sparse_object(oid))
+			ret = xftruncate(fd, obj_size);
+		else
+			ret = prealloc(fd, obj_size);
+		if (ret < 0) {
+			ret = err_to_sderr(path, oid, errno);
+			goto out;
+		}
 	}
 
-	ret = xpwrite(fd, iocb->buf, len, iocb->offset);
+	ret = xpwrite(fd, iocb->buf, len, offset);
 	if (ret != len) {
 		sd_err("failed to write object. %m");
 		ret = err_to_sderr(path, oid, errno);
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index d1662da..1b95901 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -434,6 +434,17 @@ static inline bool node_is_local(const struct sd_node *n)
 	return node_eq(n, &sys->this_node);
 }
 
+/*
+ * If the object is read-only, fragmentation doesn't happen.  In addition,
+ * if the object is unlikely to be accessed sequentially, fragmentation is
+ * not a problem.  We can make such objects sparse so that we can use disk
+ * space more efficiently.
+ */
+static inline bool is_sparse_object(uint64_t oid)
+{
+	return !is_data_obj(oid) || oid_is_readonly(oid);
+}
+
 /* gateway operations */
 int gateway_read_obj(struct request *req);
 int gateway_write_obj(struct request *req);
-- 
1.9.1




More information about the sheepdog mailing list