[Sheepdog] [PATCH 2/2] object cache: introduce async flush
Liu Yuan
namei.unix at gmail.com
Tue Apr 3 07:47:21 CEST 2012
On 04/02/2012 04:21 PM, Liu Yuan wrote:
> From: Liu Yuan <tailai.ly at taobao.com>
>
> We flush dirty objects asynchronously by default to achieve the best
> performance. Users who prefer strong consistency over performance can
> launch sheep with the -S or --sync option.
>
> We need async flush because:
> 1) Some applications are response-time (RT) sensitive. The writeback of
> dirty bits in the guest mostly hurts RT because the guest needs to wait for
> its completion, which is a considerably long operation in the sheep cluster.
> 2) Some applications are just memory and CPU intensive and have little
> concern for disk data (e.g., they only use the disk to store their logs).
> 3) People simply prefer performance over consistency.
>
> Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
> ---
> sheep/object_cache.c | 23 +++++++++++++++++++++++
> sheep/ops.c | 34 ++++++++++++++++++++++++++++++++--
> sheep/sheep.c | 9 ++++++++-
> sheep/sheep_priv.h | 2 ++
> 4 files changed, 65 insertions(+), 3 deletions(-)
>
> diff --git a/sheep/object_cache.c b/sheep/object_cache.c
> index 389dc6d..ed10b86 100644
> --- a/sheep/object_cache.c
> +++ b/sheep/object_cache.c
> @@ -203,9 +203,20 @@ static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t coun
> flags |= O_DIRECT;
>
> fd = open(p.buf, flags, def_fmode);
> + if (flock(fd, LOCK_EX) < 0) {
> + ret = SD_RES_EIO;
> + eprintf("%m\n");
> + goto out;
> + }
> size = xpwrite(fd, buf, count, offset);
> + if (flock(fd, LOCK_UN) < 0) {
> + ret = SD_RES_EIO;
> + eprintf("%m\n");
> + goto out;
> + }
> if (size != count)
> ret = SD_RES_EIO;
> +out:
> close(fd);
> strbuf_release(&p);
> return ret;
> @@ -225,9 +236,21 @@ static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count
> flags |= O_DIRECT;
>
> fd = open(p.buf, flags, def_fmode);
> + if (flock(fd, LOCK_SH) < 0) {
> + ret = SD_RES_EIO;
> + eprintf("%m\n");
> + goto out;
> + }
> +
> size = xpread(fd, buf, count, offset);
> + if (flock(fd, LOCK_UN) < 0) {
> + ret = SD_RES_EIO;
> + eprintf("%m\n");
> + goto out;
> + }
> if (size != count)
> ret = SD_RES_EIO;
> +out:
> close(fd);
> strbuf_release(&p);
> return ret;
> diff --git a/sheep/ops.c b/sheep/ops.c
> index bedafbe..7fd5a18 100644
> --- a/sheep/ops.c
> +++ b/sheep/ops.c
> @@ -51,6 +51,11 @@ struct sd_op_template {
> int (*process_main)(const struct sd_req *req, struct sd_rsp *rsp, void *data);
> };
>
> +struct flush_work {
> + struct object_cache *cache;
> + struct work work;
> +};
> +
> static int cluster_new_vdi(const struct sd_req *req, struct sd_rsp *rsp,
> void *data)
> {
> @@ -476,6 +481,22 @@ static int local_get_snap_file(const struct sd_req *req, struct sd_rsp *rsp,
> return ret;
> }
>
> +static void flush_vdi_fn(struct work *work)
> +{
> + struct flush_work *fw = container_of(work, struct flush_work, work);
> +
> + dprintf("flush vdi %"PRIx32"\n", fw->cache->vid);
> + if (object_cache_push(fw->cache) != SD_RES_SUCCESS)
> + eprintf("failed to flush vdi %"PRIx32"\n", fw->cache->vid);
> +}
> +
> +static void flush_vdi_done(struct work *work)
> +{
> + struct flush_work *fw = container_of(work, struct flush_work, work);
> + dprintf("flush vdi %"PRIx32" done\n", fw->cache->vid);
> + free(fw);
> +}
> +
> static int local_flush_vdi(const struct sd_req *req, struct sd_rsp *rsp, void *data)
> {
> struct sd_obj_req *hdr = (struct sd_obj_req *)req;
> @@ -483,8 +504,17 @@ static int local_flush_vdi(const struct sd_req *req, struct sd_rsp *rsp, void *d
> uint32_t vid = oid_to_vid(oid);
> struct object_cache *cache = find_object_cache(vid, 0);
>
> - if (cache)
> - return object_cache_push(cache);
> + if (cache) {
> + if (sys->sync_flush)
> + return object_cache_push(cache);
> + else {
> + struct flush_work *fw = xmalloc(sizeof(*fw));
> + fw->work.fn = flush_vdi_fn;
> + fw->work.done = flush_vdi_done;
> + fw->cache = cache;
> + queue_work(sys->flush_wqueue, &fw->work);
> + }
> + }
>
> return SD_RES_SUCCESS;
> }
> diff --git a/sheep/sheep.c b/sheep/sheep.c
> index 6d64a40..ef28f87 100644
> --- a/sheep/sheep.c
> +++ b/sheep/sheep.c
> @@ -59,6 +59,7 @@ Options:\n\
> -l, --loglevel specify the level of logging detail\n\
> -d, --debug include debug messages in the log\n\
> -D, --directio use direct IO when accessing the object from cache or backend store\n\
> + -S, --sync flush the object cache synchronously\n\
> -z, --zone specify the zone id\n\
> -v, --vnodes specify the number of virtual nodes\n\
> -c, --cluster specify the cluster driver\n\
> @@ -132,6 +133,10 @@ int main(int argc, char **argv)
> dprintf("direct IO mode\n");
> sys->use_directio = 1;
> break;
> + case 'S':
> + dprintf("sync flush\n");
> + sys->sync_flush = 1;
> + break;
> case 'z':
> zone = strtol(optarg, &p, 10);
> if (optarg == p || zone < 0 || UINT32_MAX < zone) {
> @@ -215,8 +220,10 @@ int main(int argc, char **argv)
> sys->io_wqueue = init_work_queue(NR_IO_WORKER_THREAD);
> sys->recovery_wqueue = init_work_queue(1);
> sys->deletion_wqueue = init_work_queue(1);
> + sys->flush_wqueue = init_work_queue(1);
> if (!sys->cpg_wqueue || !sys->gateway_wqueue || !sys->io_wqueue ||
> - !sys->recovery_wqueue || !sys->deletion_wqueue)
> + !sys->recovery_wqueue || !sys->deletion_wqueue ||
> + !sys->flush_wqueue)
> exit(1);
>
> vprintf(SDOG_NOTICE, "sheepdog daemon (version %s) started\n", PACKAGE_VERSION);
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index a316e2c..b932e83 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -148,12 +148,14 @@ struct cluster_info {
> uint32_t recovered_epoch;
>
> int use_directio;
> + uint8_t sync_flush;
>
> struct work_queue *cpg_wqueue;
> struct work_queue *gateway_wqueue;
> struct work_queue *io_wqueue;
> struct work_queue *deletion_wqueue;
> struct work_queue *recovery_wqueue;
> + struct work_queue *flush_wqueue;
> };
>
> struct siocb {
Applied.
Thanks,
Yuan
More information about the sheepdog
mailing list