[sheepdog] [PATCH v2 08/11] sheep: introduce sparse objects

MORITA Kazutaka morita.kazutaka at gmail.com
Tue Jun 18 19:14:28 CEST 2013


From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>

We don't need to care about fragmentation when
 - the object is unlikely to be accessed sequentially, and
 - the object is read-only.

In that sense, we can make an object sparse unless it is a writable
data object (i.e. if it is not a data object, or if it is read-only).

If your filesystem supports FALLOC_FL_PUNCH_HOLE, this fixes the problem
that sheepdog consumes a lot of disk space for deleted vdi objects.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 sheep/plain_store.c |   77 +++++++++++++++++++++++++++++++++++++++++++++++----
 sheep/sheep_priv.h  |   11 ++++++++
 2 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index ae713ee..1abc6b6 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -15,11 +15,16 @@
 #include <sys/xattr.h>
 #include <unistd.h>
 #include <libgen.h>
+#include <linux/falloc.h>
 
 #include "sheep_priv.h"
 #include "config.h"
 #include "sha1.h"
 
+#ifndef FALLOC_FL_PUNCH_HOLE
+#define FALLOC_FL_PUNCH_HOLE 0x02
+#endif
+
 #define sector_algined(x) ({ ((x) & (SECTOR_SIZE - 1)) == 0; })
 
 static inline bool iocb_is_aligned(const struct siocb *iocb)
@@ -98,12 +103,59 @@ static int err_to_sderr(char *path, uint64_t oid, int err)
 	}
 }
 
+static int discard(int fd, uint64_t start, uint32_t end)
+{
+	int ret = xfallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+			     start, end - start);
+	if (ret < 0) {
+		if (errno == ENOSYS || errno == EOPNOTSUPP)
+			sd_iprintf("FALLOC_FL_PUNCH_HOLE is not supported "
+				   "on this filesystem");
+		else
+			sd_eprintf("failed to discard object, %m");
+	}
+
+	return ret;
+}
+
+/* Trim zero blocks at the beginning and end of the object. */
+static int default_trim(int fd, uint64_t oid, const struct siocb *iocb,
+			uint64_t *poffset, uint32_t *plen)
+{
+	trim_zero_blocks(iocb->buf, poffset, plen);
+
+	if (iocb->offset < *poffset) {
+		sd_dprintf("discard between %ld, %ld, %" PRIx64, iocb->offset,
+			   *poffset, oid);
+
+		if (discard(fd, iocb->offset, *poffset) < 0)
+			return -1;
+	}
+
+	if (*poffset + *plen < iocb->offset + iocb->length) {
+		uint64_t end = iocb->offset + iocb->length;
+		if (end == get_objsize(oid))
+			/* This is necessary to punch the last block */
+			end = round_up(end, BLOCK_SIZE);
+		sd_dprintf("discard between %ld, %ld, %" PRIx64,
+			   *poffset + *plen, end, oid);
+
+		if (discard(fd, *poffset + *plen, end) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
 int default_write(uint64_t oid, const struct siocb *iocb)
 {
 	int flags = prepare_iocb(oid, iocb, false), fd,
 	    ret = SD_RES_SUCCESS;
 	char path[PATH_MAX];
 	ssize_t size;
+	uint32_t len = iocb->length;
+	uint64_t offset = iocb->offset;
+	static bool trim_is_supported = true;
 
 	if (iocb->epoch < sys_epoch()) {
 		sd_dprintf("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch());
@@ -126,8 +178,16 @@ int default_write(uint64_t oid, const struct siocb *iocb)
 	if (fd < 0)
 		return err_to_sderr(path, oid, errno);
 
-	size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
-	if (size != iocb->length) {
+	if (trim_is_supported && is_sparse_object(oid)) {
+		if (default_trim(fd, oid, iocb, &offset, &len) < 0) {
+			trim_is_supported = false;
+			offset = iocb->offset;
+			len = iocb->length;
+		}
+	}
+
+	size = xpwrite(fd, iocb->buf, len, offset);
+	if (size != len) {
 		sd_eprintf("failed to write object %"PRIx64", path=%s, offset=%"
 			   PRId64", size=%"PRId32", result=%zd, %m", oid, path,
 			   iocb->offset, iocb->length, size);
@@ -300,6 +360,7 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
 	int flags = prepare_iocb(oid, iocb, true);
 	int ret, fd;
 	uint32_t len = iocb->length;
+	uint64_t offset = iocb->offset;
 
 	get_obj_path(oid, path);
 	get_tmp_obj_path(oid, tmp_path);
@@ -332,15 +393,21 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
 		return err_to_sderr(path, oid, errno);
 	}
 
-	if (iocb->offset != 0 || iocb->length != get_objsize(oid)) {
-		ret = prealloc(fd, get_objsize(oid));
+	trim_zero_blocks(iocb->buf, &offset, &len);
+
+	if (offset != 0 || len != get_objsize(oid)) {
+		if (is_sparse_object(oid))
+			ret = xftruncate(fd, get_objsize(oid));
+		else
+			ret = prealloc(fd, get_objsize(oid));
+
 		if (ret < 0) {
 			ret = err_to_sderr(path, oid, errno);
 			goto out;
 		}
 	}
 
-	ret = xpwrite(fd, iocb->buf, len, iocb->offset);
+	ret = xpwrite(fd, iocb->buf, len, offset);
 	if (ret != len) {
 		sd_eprintf("failed to write object. %m");
 		ret = err_to_sderr(path, oid, errno);
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 07d878c..20b2554 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -349,6 +349,17 @@ static inline bool node_is_local(const struct sd_node *n)
 	return node_eq(n, &sys->this_node);
 }
 
+/*
+ * If the object is read-only, the fragmentation doesn't happen.  In addition,
+ * if the object is unlikely to be accessed sequentially, the fragmentation is
+ * not a problem.  We can make such objects sparse so that we can use space
+ * more efficiently.
+ */
+static inline bool is_sparse_object(uint64_t oid)
+{
+	return !is_data_obj(oid) || oid_is_readonly(oid);
+}
+
 /* gateway operations */
 int gateway_read_obj(struct request *req);
 int gateway_write_obj(struct request *req);
-- 
1.7.9.5




More information about the sheepdog mailing list