From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp> We don't need to care about fragmentation when - the object is unlikely to be accessed sequentially, or - the object is read-only. In that sense, we can make an object sparse unless it is a writable data object. This fixes the problem that sheepdog consumes a lot of disk space for deleted vdi objects, provided your filesystem supports FALLOC_FL_PUNCH_HOLE. Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp> --- sheep/plain_store.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++---- sheep/sheep_priv.h | 11 ++++++++ 2 files changed, 83 insertions(+), 5 deletions(-) diff --git a/sheep/plain_store.c b/sheep/plain_store.c index 02d0bd7..19557b4 100644 --- a/sheep/plain_store.c +++ b/sheep/plain_store.c @@ -15,11 +15,16 @@ #include <sys/xattr.h> #include <unistd.h> #include <libgen.h> +#include <linux/falloc.h> #include "sheep_priv.h" #include "config.h" #include "sha1.h" +#ifndef FALLOC_FL_PUNCH_HOLE +#define FALLOC_FL_PUNCH_HOLE 0x02 +#endif + static int get_open_flags(uint64_t oid, bool create) { int flags = O_DSYNC | O_RDWR; @@ -90,12 +95,59 @@ static int err_to_sderr(char *path, uint64_t oid, int err) } } +static int discard(int fd, uint64_t start, uint32_t end) +{ + int ret = xfallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + start, end - start); + if (ret < 0) { + if (errno == ENOSYS || errno == EOPNOTSUPP) + sd_iprintf("FALLOC_FL_PUNCH_HOLE is not supported " + "on this filesystem"); + else + sd_eprintf("failed to discard object, %m"); + } + + return ret; +} + +/* Trim zero blocks of the beginning and end of the object. 
*/ +static int default_trim(int fd, uint64_t oid, const struct siocb *iocb, + uint64_t *poffset, uint32_t *plen) +{ + trim_zero_blocks(iocb->buf, poffset, plen); + + if (iocb->offset < *poffset) { + sd_dprintf("discard between %ld, %ld, %" PRIx64, iocb->offset, + *poffset, oid); + + if (discard(fd, iocb->offset, *poffset) < 0) + return -1; + } + + if (*poffset + *plen < iocb->offset + iocb->length) { + uint64_t end = iocb->offset + iocb->length; + if (end == get_objsize(oid)) + /* This is necessary to punch the last block */ + end = roundup(end, 4096); + sd_dprintf("discard between %ld, %ld, %" PRIx64, + *poffset + *plen, end, oid); + + if (discard(fd, *poffset + *plen, end) < 0) + return -1; + } + + return 0; +} + int default_write(uint64_t oid, const struct siocb *iocb) { int flags = get_open_flags(oid, false), fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; + uint32_t len = iocb->length; + uint64_t offset = iocb->offset; + static bool trim_is_supported = true; if (iocb->epoch < sys_epoch()) { sd_dprintf("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch()); @@ -118,8 +170,16 @@ int default_write(uint64_t oid, const struct siocb *iocb) if (fd < 0) return err_to_sderr(path, oid, errno); - size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); - if (size != iocb->length) { + if (trim_is_supported && is_sparse_obj(oid)) { + if (default_trim(fd, oid, iocb, &offset, &len) < 0) { + trim_is_supported = false; + offset = iocb->offset; + len = iocb->length; + } + } + + size = xpwrite(fd, iocb->buf, len, offset); + if (size != len) { sd_eprintf("failed to write object %"PRIx64", path=%s, offset=%" PRId64", size=%"PRId32", result=%zd, %m", oid, path, iocb->offset, iocb->length, size); @@ -297,6 +357,7 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb) int flags = get_open_flags(oid, true); int ret, fd; uint32_t len = iocb->length; + uint64_t offset = iocb->offset; get_obj_path(oid, path); get_tmp_obj_path(oid, tmp_path); @@ -329,15 +390,21 
@@ int default_create_and_write(uint64_t oid, const struct siocb *iocb) return err_to_sderr(path, oid, errno); } - if (iocb->offset != 0 || iocb->length != get_objsize(oid)) { - ret = prealloc(fd, get_objsize(oid)); + trim_zero_blocks(iocb->buf, &offset, &len); + + if (offset != 0 || len != get_objsize(oid)) { + if (is_sparse_obj(oid)) + ret = xftruncate(fd, get_objsize(oid)); + else + ret = prealloc(fd, get_objsize(oid)); + if (ret < 0) { ret = err_to_sderr(path, oid, errno); goto out; } } - ret = xpwrite(fd, iocb->buf, len, iocb->offset); + ret = xpwrite(fd, iocb->buf, len, offset); if (ret != len) { sd_eprintf("failed to write object. %m"); ret = err_to_sderr(path, oid, errno); diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h index e1f534d..e8b0b21 100644 --- a/sheep/sheep_priv.h +++ b/sheep/sheep_priv.h @@ -345,6 +345,17 @@ static inline bool node_is_local(const struct sd_node *n) return node_eq(n, &sys->this_node); } +/* + * If the object is read-only, the fragmentation doesn't happen. In addition, + * if the object is unlikely to be accessed sequentially, the fragmentation is + * not a problem. We can make such objects sparse so that we can use spaces + * more efficiently. + */ +static inline bool is_sparse_obj(uint64_t oid) +{ + return !is_data_obj(oid) || oid_is_readonly(oid); +} + /* gateway operations */ int gateway_read_obj(struct request *req); int gateway_write_obj(struct request *req); -- 1.7.9.5 |