[sheepdog] [PATCH v2 1/2] sheep: introduce journal file to boost IO performance
MORITA Kazutaka
morita.kazutaka at gmail.com
Sun Nov 11 20:08:29 CET 2012
At Thu, 8 Nov 2012 21:26:10 +0800,
Liu Yuan wrote:
>
> From: Liu Yuan <tailai.ly at taobao.com>
>
> The basic the idea is very simple: use a dedicated device to log all the IO
> operations in a sequential manner and then we are safe to change the backend IO
> operations from O_DSYNC & O_DIRECT into O_RDWR (buffered IO), which will
> benefit us both read & write performance a lot.
>
> Usage:
> $ sheep -j dir=/path/to/dir,size=256, # enable external journaling with the size 256M
> $ sheep -j dir=/path/to/dir,size=256,skip #like above, but skip recovery at startup
> $ sheep -j # enable internal journaling with the default size 512M
getopt_long cannot parse this syntax. If you want to make the
argument optional, spaces are not allowed after '-j'.
How about making the "size=" parameter mandatory instead of allowing
no argument?
> +
> +/* JOURNAL_DESC + JOURNAL_MARKER must be 512 algined for DIO */
> +#define JOURNAL_DESC_MAGIC 0xfee1900d
> +#define JOURNAL_DESC_SIZE 508
> +#define JOURNAL_MARKER_SIZE 4 /* Use marker to detect partial write */
> +#define JOURNAL_META_SIZE SECTOR_SIZE
Should be (JOURNAL_DESC_SIZE + JOURNAL_MARKER_SIZE) to clarify what it
is?
> +
> +#define JOURNAL_DEFAULT_SIZE (1024*1024*256) /* 256M */
> +#define JOURNAL_END_MARKER 0xdeadbeef
> +
> +static const char *jfile_name[2] = { "journal_file0", "journal_file1", };
> +static int jfile_fds[2];
> +static size_t jfile_size = JOURNAL_DEFAULT_SIZE;
> +
> +static struct journal_file jfile;
> +static pthread_spinlock_t jfile_lock;
> +
> +static int zero_out_jfile(int fd)
> +{
> + char *buf;
> + ssize_t wlen;
> +
> + buf = valloc(jfile_size);
This requires too much memory if we specify a big journal size
(e.g. several giga bytes or more). We should use fixed size buffer
(e.g. 4 MB) and iterate xpwrite to fill the file with zero bytes.
However, I wonder again if this zero-fill is really beneficial. I've
tested this patch with 4 GB journal size, and I've encountered the
following problems.
1. It takes 50 seconds for sheep to start up.
2. There is a big performance penalty to VM I/Os while sheep fills
zero bytes to journal files. (This happens when switching journal
files)
Does this zero-filling bring us more benefits than those problems?
> + if (!buf)
> + panic("%m\n");
> + memset(buf, 0, jfile_size);
> + wlen = xpwrite(fd, buf, jfile_size, 0);
> + if (wlen != jfile_size) {
> + eprintf("WARN: failed, %m\n");
> + return -1;
> + }
> +
> + free(buf);
> + return 0;
> +}
(snip)
> +int journal_file_write(uint64_t oid, const char *buf, size_t size,
> + off_t offset, bool create)
> +{
> + struct journal_descriptor jd;
jd should be initialized with zero bytes to avoid writing
uninitialized data to the journal file.
> + uint32_t marker = JOURNAL_END_MARKER;
> + int ret = SD_RES_SUCCESS;
> + ssize_t written, rusize = roundup(size, SECTOR_SIZE),
> + wsize = JOURNAL_META_SIZE + rusize;
> + off_t woff;
> + char *wbuffer, *p;
> +
> + jd.magic = JOURNAL_DESC_MAGIC;
> + jd.offset = offset;
> + jd.size = size;
> + jd.oid = oid;
> + jd.create = create;
> +
> + pthread_spin_lock(&jfile_lock);
> + if (!jfile_enough_space(wsize))
> + switch_journal_file();
> + woff = jfile.pos;
> + jfile.pos += wsize;
> + pthread_spin_unlock(&jfile_lock);
> +
> + p = wbuffer = valloc(wsize);
> + if (!wbuffer)
> + panic("%m\n");
> + memcpy(p, &jd, JOURNAL_DESC_SIZE);
> + p += JOURNAL_DESC_SIZE;
> + memcpy(p, buf, rusize);
> + p += rusize;
The size of buf can be smaller than rusize when size is not
SECTOR_SIZE aligned. This memcpy should be something like as follows.
memcpy(p, buf, size);
p += size;
if (size < rusize) {
memset(p, 0, rusize - size);
p += rusize - size;
}
> + memcpy(p, &marker, JOURNAL_MARKER_SIZE);
> +
> + dprintf("oid %lx, pos %zu, wsize %zu\n", oid, jfile.pos, wsize);
> + /*
> + * Concurrent writes with the same FD is okay because we don't have any
> + * critical sections that need lock inside kernel write path, since we
> + * a) bypass page cache, b) don't modify i_size of this inode.
> + *
> + * Feel free to correct me If I am wrong.
> + */
> + written = xpwrite(jfile.fd, wbuffer, wsize, woff);
> + if (written != wsize) {
> + eprintf("failed, written %zd, len %zu\n", written, wsize);
> + ret = err_to_sderr(oid, errno);
> + goto out;
> + }
> +out:
> + free(wbuffer);
> + return ret;
> +}
(snip)
> @@ -143,6 +141,16 @@ int default_write(uint64_t oid, const struct siocb *iocb)
> }
>
> get_obj_path(oid, path);
> +
> + if (uatomic_is_true(&sys->use_journal) &&
> + journal_file_write(oid, iocb->buf, iocb->length, iocb->offset, 0)
0 should be false.
> + != SD_RES_SUCCESS) {
> + eprintf("turn off journaling\n");
> + uatomic_set_false(&sys->use_journal);
> + flags |= O_DSYNC | O_DIRECT;
> + sync();
> + }
> +
> fd = open(path, flags, def_fmode);
> if (fd < 0)
> return err_to_sderr(oid, errno);
> @@ -305,6 +313,15 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
> get_obj_path(oid, path);
> get_tmp_obj_path(oid, tmp_path);
>
> + if (uatomic_is_true(&sys->use_journal) &&
> + journal_file_write(oid, iocb->buf, iocb->length, iocb->offset, 1)
1 should be true.
> + != SD_RES_SUCCESS) {
> + eprintf("turn off journaling\n");
> + uatomic_set_false(&sys->use_journal);
> + flags |= O_DSYNC | O_DIRECT;
> + sync();
> + }
> +
> fd = open(tmp_path, flags, def_fmode);
> if (fd < 0) {
> if (errno == EEXIST) {
> diff --git a/sheep/sheep.c b/sheep/sheep.c
> index af8da4f..91325b4 100644
> --- a/sheep/sheep.c
> +++ b/sheep/sheep.c
> @@ -59,7 +59,7 @@ static struct option const long_options[] = {
> {NULL, 0, NULL, 0},
> };
>
> -static const char *short_options = "b:c:dDfghjl:op:P:s:uw:y:z:";
> +static const char *short_options = "b:c:dDfghj:l:op:P:s:uw:y:z:";
long_options also need to be modified (no_arguemnt -> required_argument).
>
> static void usage(int status)
> {
> @@ -77,7 +77,7 @@ Options:\n\
> -f, --foreground make the program run in the foreground\n\
> -g, --gateway make the progam run as a gateway mode\n\
> -h, --help display this help and exit\n\
> - -j, --journal use jouranl to update vdi objects\n\
> + -j, --journal use jouranl file to log all the write operations\n\
> -l, --loglevel specify the level of logging detail\n\
> -o, --stdout log to stdout instead of shared logger\n\
> -p, --port specify the TCP port on which to listen\n\
> @@ -311,6 +311,34 @@ static void init_cache_type(char *arg)
> }
> }
>
> +static char jpath[PATH_MAX];
> +static bool jskip;
> +static ssize_t jsize;
> +#define MIN_JOURNAL_SIZE (64) /* 64M */
> +
> +static void init_journal_arg(char *arg)
> +{
> + const char *d = "dir=", *sz = "size=", *sp = "skip";
> + int dl = strlen(d), szl = strlen(sz), spl = strlen(sp);
> +
> + if (!strncmp(d, arg, dl)) {
> + arg += dl;
> + sprintf(jpath, "%s", arg);
> + } else if (!strncmp(sz, arg, szl)) {
> + arg += szl;
> + jsize = strtoll(arg, NULL, 10);
> + if (jsize < MIN_JOURNAL_SIZE || jsize == LLONG_MAX) {
> + fprintf(stderr, "invalid size %s", arg);
"\n" is missing, and we should show what is a valid size. For
example, "journal size must be more than 64 MB".
Thanks,
Kazutaka
More information about the sheepdog
mailing list