[sheepdog] [PATCH v2 1/2] sheep: introduce journal file to boost IO performance

MORITA Kazutaka morita.kazutaka at gmail.com
Sun Nov 11 20:08:29 CET 2012


At Thu,  8 Nov 2012 21:26:10 +0800,
Liu Yuan wrote:
> 
> From: Liu Yuan <tailai.ly at taobao.com>
> 
> The basic the idea is very simple: use a dedicated device to log all the IO
> operations in a sequential manner and then we are safe to change the backend IO
> operations from O_DSYNC & O_DIRECT into O_RDWR (buffered IO), which will
> benefit us both read & write performance a lot.
> 
> Usage:
>  $ sheep -j dir=/path/to/dir,size=256, # enable external journaling with the size 256M
>  $ sheep -j dir=/path/to/dir,size=256,skip #like above, but skip recovery at startup
>  $ sheep -j # enable internal journaling with the default size 512M

getopt_long cannot parse this syntax.  If you want to make the
argument optional, spaces are not allowed after '-j'.

How about making the "size=" parameter mandatory instead of allowing
no argument?


> +
> +/* JOURNAL_DESC + JOURNAL_MARKER must be 512 algined for DIO */
> +#define JOURNAL_DESC_MAGIC 0xfee1900d
> +#define JOURNAL_DESC_SIZE 508
> +#define JOURNAL_MARKER_SIZE 4 /* Use marker to detect partial write */
> +#define JOURNAL_META_SIZE SECTOR_SIZE

Should be (JOURNAL_DESC_SIZE + JOURNAL_MARKER_SIZE) to clarify what it
is?


> +
> +#define JOURNAL_DEFAULT_SIZE (1024*1024*256) /* 256M */
> +#define JOURNAL_END_MARKER 0xdeadbeef
> +
> +static const char *jfile_name[2] = { "journal_file0", "journal_file1", };
> +static int jfile_fds[2];
> +static size_t jfile_size = JOURNAL_DEFAULT_SIZE;
> +
> +static struct journal_file jfile;
> +static pthread_spinlock_t jfile_lock;
> +
> +static int zero_out_jfile(int fd)
> +{
> +	char *buf;
> +	ssize_t wlen;
> +
> +	buf = valloc(jfile_size);

This requires too much memory if we specify a big journal size
(e.g. several giga bytes or more).  We should use fixed size buffer
(e.g. 4 MB) and iterate xpwrite to fill the file with zero bytes.

However, I wonder again if this zero-fill is really beneficial.  I've
tested this patch with 4 GB journal size, and I've encountered the
following problems.
 1. It takes 50 seconds for sheep to start up.
 2. There is a big performance penalty to VM I/Os while sheep fills
    zero bytes to journal files. (This happens when switching journal
    files)

Does this zero-filling bring us more benefits than those problems?


> +	if (!buf)
> +		panic("%m\n");
> +	memset(buf, 0, jfile_size);
> +	wlen = xpwrite(fd, buf, jfile_size, 0);
> +	if (wlen != jfile_size) {
> +		eprintf("WARN: failed, %m\n");
> +		return -1;
> +	}
> +
> +	free(buf);
> +	return 0;
> +}

(snip)

> +int journal_file_write(uint64_t oid, const char *buf, size_t size,
> +		       off_t offset, bool create)
> +{
> +	struct journal_descriptor jd;

jd should be initialized with zero bytes to avoid writing
uninitialized data to the journal file.


> +	uint32_t marker = JOURNAL_END_MARKER;
> +	int ret = SD_RES_SUCCESS;
> +	ssize_t written, rusize = roundup(size, SECTOR_SIZE),
> +		wsize = JOURNAL_META_SIZE + rusize;
> +	off_t woff;
> +	char *wbuffer, *p;
> +
> +	jd.magic = JOURNAL_DESC_MAGIC;
> +	jd.offset = offset;
> +	jd.size = size;
> +	jd.oid = oid;
> +	jd.create = create;
> +
> +	pthread_spin_lock(&jfile_lock);
> +	if (!jfile_enough_space(wsize))
> +		switch_journal_file();
> +	woff = jfile.pos;
> +	jfile.pos += wsize;
> +	pthread_spin_unlock(&jfile_lock);
> +
> +	p = wbuffer = valloc(wsize);
> +	if (!wbuffer)
> +		panic("%m\n");
> +	memcpy(p, &jd, JOURNAL_DESC_SIZE);
> +	p += JOURNAL_DESC_SIZE;
> +	memcpy(p, buf, rusize);
> +	p += rusize;

The size of buf can be smaller than rusize when size is not
SECTOR_SIZE aligned.  This memcpy should be something like as follows.

    memcpy(p, buf, size);
    p += size;
    if (size < rusize) {
        memset(p, 0, rusize - size);
        p += rusize - size;
    }

> +	memcpy(p, &marker, JOURNAL_MARKER_SIZE);
> +
> +	dprintf("oid %lx, pos %zu, wsize %zu\n", oid, jfile.pos, wsize);
> +	/*
> +	 * Concurrent writes with the same FD is okay because we don't have any
> +	 * critical sections that need lock inside kernel write path, since we
> +	 * a) bypass page cache, b) don't modify i_size of this inode.
> +	 *
> +	 * Feel free to correct me If I am wrong.
> +	 */
> +	written = xpwrite(jfile.fd, wbuffer, wsize, woff);
> +	if (written != wsize) {
> +		eprintf("failed, written %zd, len %zu\n", written, wsize);
> +		ret = err_to_sderr(oid, errno);
> +		goto out;
> +	}
> +out:
> +	free(wbuffer);
> +	return ret;
> +}

(snip)

> @@ -143,6 +141,16 @@ int default_write(uint64_t oid, const struct siocb *iocb)
>  	}
>  
>  	get_obj_path(oid, path);
> +
> +	if (uatomic_is_true(&sys->use_journal) &&
> +	    journal_file_write(oid, iocb->buf, iocb->length, iocb->offset, 0)

0 should be false.


> +	    != SD_RES_SUCCESS) {
> +		eprintf("turn off journaling\n");
> +		uatomic_set_false(&sys->use_journal);
> +		flags |= O_DSYNC | O_DIRECT;
> +		sync();
> +	}
> +
>  	fd = open(path, flags, def_fmode);
>  	if (fd < 0)
>  		return err_to_sderr(oid, errno);
> @@ -305,6 +313,15 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
>  	get_obj_path(oid, path);
>  	get_tmp_obj_path(oid, tmp_path);
>  
> +	if (uatomic_is_true(&sys->use_journal) &&
> +	    journal_file_write(oid, iocb->buf, iocb->length, iocb->offset, 1)

1 should be true.


> +	    != SD_RES_SUCCESS) {
> +		eprintf("turn off journaling\n");
> +		uatomic_set_false(&sys->use_journal);
> +		flags |= O_DSYNC | O_DIRECT;
> +		sync();
> +	}
> +
>  	fd = open(tmp_path, flags, def_fmode);
>  	if (fd < 0) {
>  		if (errno == EEXIST) {
> diff --git a/sheep/sheep.c b/sheep/sheep.c
> index af8da4f..91325b4 100644
> --- a/sheep/sheep.c
> +++ b/sheep/sheep.c
> @@ -59,7 +59,7 @@ static struct option const long_options[] = {
>  	{NULL, 0, NULL, 0},
>  };
>  
> -static const char *short_options = "b:c:dDfghjl:op:P:s:uw:y:z:";
> +static const char *short_options = "b:c:dDfghj:l:op:P:s:uw:y:z:";

long_options also need to be modified (no_arguemnt -> required_argument).


>  
>  static void usage(int status)
>  {
> @@ -77,7 +77,7 @@ Options:\n\
>    -f, --foreground        make the program run in the foreground\n\
>    -g, --gateway           make the progam run as a gateway mode\n\
>    -h, --help              display this help and exit\n\
> -  -j, --journal           use jouranl to update vdi objects\n\
> +  -j, --journal           use jouranl file to log all the write operations\n\
>    -l, --loglevel          specify the level of logging detail\n\
>    -o, --stdout            log to stdout instead of shared logger\n\
>    -p, --port              specify the TCP port on which to listen\n\
> @@ -311,6 +311,34 @@ static void init_cache_type(char *arg)
>  	}
>  }
>  
> +static char jpath[PATH_MAX];
> +static bool jskip;
> +static ssize_t jsize;
> +#define MIN_JOURNAL_SIZE (64) /* 64M */
> +
> +static void init_journal_arg(char *arg)
> +{
> +	const char *d = "dir=", *sz = "size=", *sp = "skip";
> +	int dl = strlen(d), szl = strlen(sz), spl = strlen(sp);
> +
> +	if (!strncmp(d, arg, dl)) {
> +		arg += dl;
> +		sprintf(jpath, "%s", arg);
> +	} else if (!strncmp(sz, arg, szl)) {
> +		arg += szl;
> +		jsize = strtoll(arg, NULL, 10);
> +		if (jsize < MIN_JOURNAL_SIZE || jsize == LLONG_MAX) {
> +			fprintf(stderr, "invalid size %s", arg);

"\n" is missing, and we should show what is a valid size.  For
example, "journal size must be more than 64 MB".

Thanks,

Kazutaka



More information about the sheepdog mailing list