On 04/02/2012 04:21 PM, Liu Yuan wrote: > From: Liu Yuan <tailai.ly at taobao.com> > > We flush dirty objects asynchronously by default to achieve the best performance. > If users prefer strong consistency over performance, they can launch > sheep with the -S or --sync option. > > We need async flush because: > 1) some applications are response-time sensitive; the writeback of dirty bits in > the guest will mostly hurt RT because the guest needs to await its completion. > This is a considerably long operation in the sheep cluster. > 2) some applications are just memory and CPU intensive and have little concern for disk > data. (E.g., they just use the disk to store application logs.) > 3) People simply prefer performance over consistency. > > Signed-off-by: Liu Yuan <tailai.ly at taobao.com> > --- > sheep/object_cache.c | 23 +++++++++++++++++++++++ > sheep/ops.c | 34 ++++++++++++++++++++++++++++++++-- > sheep/sheep.c | 9 ++++++++- > sheep/sheep_priv.h | 2 ++ > 4 files changed, 65 insertions(+), 3 deletions(-) > > diff --git a/sheep/object_cache.c b/sheep/object_cache.c > index 389dc6d..ed10b86 100644 > --- a/sheep/object_cache.c > +++ b/sheep/object_cache.c > @@ -203,9 +203,20 @@ static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t coun > flags |= O_DIRECT; > > fd = open(p.buf, flags, def_fmode); > + if (flock(fd, LOCK_EX) < 0) { > + ret = SD_RES_EIO; > + eprintf("%m\n"); > + goto out; > + } > size = xpwrite(fd, buf, count, offset); > + if (flock(fd, LOCK_UN) < 0) { > + ret = SD_RES_EIO; > + eprintf("%m\n"); > + goto out; > + } > if (size != count) > ret = SD_RES_EIO; > +out: > close(fd); > strbuf_release(&p); > return ret; > @@ -225,9 +236,21 @@ static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count > flags |= O_DIRECT; > > fd = open(p.buf, flags, def_fmode); > + if (flock(fd, LOCK_SH) < 0) { > + ret = SD_RES_EIO; > + eprintf("%m\n"); > + goto out; > + } > + > size = xpread(fd, buf, count, offset); > + if (flock(fd, LOCK_UN) < 0) { > + ret = SD_RES_EIO; > + 
eprintf("%m\n"); > + goto out; > + } > if (size != count) > ret = SD_RES_EIO; > +out: > close(fd); > strbuf_release(&p); > return ret; > diff --git a/sheep/ops.c b/sheep/ops.c > index bedafbe..7fd5a18 100644 > --- a/sheep/ops.c > +++ b/sheep/ops.c > @@ -51,6 +51,11 @@ struct sd_op_template { > int (*process_main)(const struct sd_req *req, struct sd_rsp *rsp, void *data); > }; > > +struct flush_work { > + struct object_cache *cache; > + struct work work; > +}; > + > static int cluster_new_vdi(const struct sd_req *req, struct sd_rsp *rsp, > void *data) > { > @@ -476,6 +481,22 @@ static int local_get_snap_file(const struct sd_req *req, struct sd_rsp *rsp, > return ret; > } > > +static void flush_vdi_fn(struct work *work) > +{ > + struct flush_work *fw = container_of(work, struct flush_work, work); > + > + dprintf("flush vdi %"PRIx32"\n", fw->cache->vid); > + if (object_cache_push(fw->cache) != SD_RES_SUCCESS) > + eprintf("failed to flush vdi %"PRIx32"\n", fw->cache->vid); > +} > + > +static void flush_vdi_done(struct work *work) > +{ > + struct flush_work *fw = container_of(work, struct flush_work, work); > + dprintf("flush vdi %"PRIx32" done\n", fw->cache->vid); > + free(fw); > +} > + > static int local_flush_vdi(const struct sd_req *req, struct sd_rsp *rsp, void *data) > { > struct sd_obj_req *hdr = (struct sd_obj_req *)req; > @@ -483,8 +504,17 @@ static int local_flush_vdi(const struct sd_req *req, struct sd_rsp *rsp, void *d > uint32_t vid = oid_to_vid(oid); > struct object_cache *cache = find_object_cache(vid, 0); > > - if (cache) > - return object_cache_push(cache); > + if (cache) { > + if (sys->sync_flush) > + return object_cache_push(cache); > + else { > + struct flush_work *fw = xmalloc(sizeof(*fw)); > + fw->work.fn = flush_vdi_fn; > + fw->work.done = flush_vdi_done; > + fw->cache = cache; > + queue_work(sys->flush_wqueue, &fw->work); > + } > + } > > return SD_RES_SUCCESS; > } > diff --git a/sheep/sheep.c b/sheep/sheep.c > index 6d64a40..ef28f87 100644 > --- 
a/sheep/sheep.c > +++ b/sheep/sheep.c > @@ -59,6 +59,7 @@ Options:\n\ > -l, --loglevel specify the level of logging detail\n\ > -d, --debug include debug messages in the log\n\ > -D, --directio use direct IO when accessing the object from cache or backend store\n\ > + -S, --sync flush the object cache synchronously\n\ > -z, --zone specify the zone id\n\ > -v, --vnodes specify the number of virtual nodes\n\ > -c, --cluster specify the cluster driver\n\ > @@ -132,6 +133,10 @@ int main(int argc, char **argv) > dprintf("direct IO mode\n"); > sys->use_directio = 1; > break; > + case 'S': > + dprintf("sync flush\n"); > + sys->sync_flush = 1; > + break; > case 'z': > zone = strtol(optarg, &p, 10); > if (optarg == p || zone < 0 || UINT32_MAX < zone) { > @@ -215,8 +220,10 @@ int main(int argc, char **argv) > sys->io_wqueue = init_work_queue(NR_IO_WORKER_THREAD); > sys->recovery_wqueue = init_work_queue(1); > sys->deletion_wqueue = init_work_queue(1); > + sys->flush_wqueue = init_work_queue(1); > if (!sys->cpg_wqueue || !sys->gateway_wqueue || !sys->io_wqueue || > - !sys->recovery_wqueue || !sys->deletion_wqueue) > + !sys->recovery_wqueue || !sys->deletion_wqueue || > + !sys->flush_wqueue) > exit(1); > > vprintf(SDOG_NOTICE, "sheepdog daemon (version %s) started\n", PACKAGE_VERSION); > diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h > index a316e2c..b932e83 100644 > --- a/sheep/sheep_priv.h > +++ b/sheep/sheep_priv.h > @@ -148,12 +148,14 @@ struct cluster_info { > uint32_t recovered_epoch; > > int use_directio; > + uint8_t sync_flush; > > struct work_queue *cpg_wqueue; > struct work_queue *gateway_wqueue; > struct work_queue *io_wqueue; > struct work_queue *deletion_wqueue; > struct work_queue *recovery_wqueue; > + struct work_queue *flush_wqueue; > }; > > struct siocb { Applied. Thanks, Yuan |