[sheepdog] [PATCH v1, RFC] sheep: writeback cache semantics in backend store
Hitoshi Mitake
h.mitake at gmail.com
Tue Aug 28 19:00:33 CEST 2012
Really sorry, I mistakenly sent this email to LKML as well...
If you reply, please don't reply to the previous one, because the reply
will be sent to LKML.
On Wed, Aug 29, 2012 at 1:56 AM, Hitoshi Mitake <h.mitake at gmail.com> wrote:
> v1: differences from v0 are,
> * check syncfs() in configure script
> * send SD_OP_FLUSH_PEER to all sheeps
>
> This patch implements writeback cache semantics in the backend store of
> sheep. The current backend stores, farm and plain, call open() with
> O_DSYNC, so every object write causes slow disk access. This overhead is
> unnecessary, because the current qemu block driver already invokes
> SD_OP_FLUSH_VDI explicitly for the object cache. Flushing the disk cache
> when SD_OP_FLUSH_VDI arrives, instead of on every object write, is enough
> for current sheep.
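
To make the tradeoff concrete, here is a stripped-down sketch of the two
modes. It is only an illustration with function names of my own choosing;
the real changes are in the plain_store.c and ops.c hunks below.

#define _GNU_SOURCE		/* for syncfs() */
#include <fcntl.h>
#include <unistd.h>

/* writethrough: O_DSYNC makes every object write wait for the disk */
static int open_object(const char *path, int writeback)
{
	int flags = O_RDWR | O_CREAT;

	if (!writeback)
		flags |= O_DSYNC;

	return open(path, flags, 0644);
}

/*
 * writeback: object writes only reach the page cache; data is pushed to
 * disk when an explicit flush request arrives, by syncing the filesystem
 * that holds the object directory
 */
static int flush_store(const char *store_dir)
{
	int fd = open(store_dir, O_RDONLY);
	int ret;

	if (fd < 0)
		return -1;
	ret = syncfs(fd);
	close(fd);
	return ret;
}
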
>
> To improve performance by reducing needless disk access, this patch
> adds a new inter-sheep operation, SD_OP_FLUSH_PEER. A typical situation
> looks like this:
> qemu sends SD_OP_FLUSH_VDI -> the gateway sheep sends SD_OP_FLUSH_PEER ->
> all other sheep
>
> Sheep that receive SD_OP_FLUSH_PEER flush the disk cache with the
> syncfs() system call. If syncfs() is not available, sync() is used
> instead. Whether syncfs() is available is checked at build time.
>
> This patch adds a new command line option, -W, to sheep. With -W, sheep
> uses writeback cache semantics in the backend store.
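
For example, a sheep with writeback semantics in the backend store could be
started like this (the store path is only an example):

  $ sheep -W /var/lib/sheepdog
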
>
> Below are the evaluation results:
> * simple dd benchmark, 1 gateway sheep and 10 sheep for storage, on xfs
>
> without -W (writethrough)
> bs=1M count=100 oflag=direct,sync: 23.7 MB/s
> bs=1M count=100 oflag=direct: 29.8 MB/s
> bs=4K count=25600 oflag=direct,sync: 321 kB/s
> bs=4K count=25600 oflag=direct: 320 kB/s
>
> with -W (writeback)
> bs=1M count=100 oflag=direct,sync: 36.8 MB/s
> bs=1M count=100 oflag=direct: 38.8 MB/s
> bs=4K count=25600 oflag=direct,sync: 458 kB/s
> bs=4K count=25600 oflag=direct: 5.8 MB/s
>
> * benchmark for disk access from several VMs at once
>
> I wrote a rough benchmark program for measuring the performance of
> environments that contain several VMs. The program behaves like dd: it
> repeatedly write()s a zero-filled buffer of a specified size to a single
> file open()ed with O_DIRECT and O_SYNC, for a specified amount of time.
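
For reference, a minimal sketch of such a dd-like benchmark might look like
the following. It is only an illustration under my assumptions (argument
handling, defaults and the reporting are made up), not the actual program
used for the numbers below.

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	size_t bufsize = 4096;	/* write size, e.g. 4k or 1M */
	int duration = 10;	/* seconds to run */
	long long written = 0;
	void *buf;
	time_t end;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s FILE [BUFSIZE] [SECONDS]\n", argv[0]);
		return 1;
	}
	if (argc > 2)
		bufsize = strtoul(argv[2], NULL, 0);
	if (argc > 3)
		duration = atoi(argv[3]);

	/* O_DIRECT requires an aligned buffer (and an aligned write size) */
	if (posix_memalign(&buf, 4096, bufsize))
		return 1;
	memset(buf, 0, bufsize);

	fd = open(argv[1], O_WRONLY | O_CREAT | O_DIRECT | O_SYNC, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	end = time(NULL) + duration;
	while (time(NULL) < end) {
		if (write(fd, buf, bufsize) != (ssize_t)bufsize) {
			perror("write");
			break;
		}
		written += bufsize;
	}
	close(fd);

	printf("%.3f MB/s\n", written / 1000000.0 / duration);
	return 0;
}
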
>
> Below are the results. n VMs and n sheep are distributed equally across
> n physical hosts.
>
> 4k buffer size
> 1 VM and 1 physical host: 4.389 MB/s
> 4 VMs and 4 physical hosts: 2.378 MB/s
> 8 VMs and 8 physical hosts: 2.434 MB/s
>
> 1M buffer size
> 1 VM and 1 physical host: 39.12 MB/s
> 4 VMs and 4 physical hosts: 22.575 MB/s
> 8 VMs and 8 physical hosts: 18.6 MB/s
>
> Some performance degradation can be observed, but the benchmark produces
> an extremely artificial workload, so I think this patch might still be
> suitable for environments with fewer active VMs.
>
> Cc: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
> ---
> configure.ac | 2 +-
> include/internal_proto.h | 1 +
> sheep/ops.c | 151 +++++++++++++++++++++++++++++++++++++++++++++-
> sheep/plain_store.c | 5 +-
> sheep/sheep.c | 7 ++-
> sheep/sheep_priv.h | 10 +++
> 6 files changed, 170 insertions(+), 6 deletions(-)
>
> diff --git a/configure.ac b/configure.ac
> index 91126e2..ede61ad 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -124,7 +124,7 @@ AC_FUNC_VPRINTF
> AC_CHECK_FUNCS([alarm alphasort atexit bzero dup2 endgrent endpwent fcntl \
> getcwd getpeerucred getpeereid gettimeofday inet_ntoa memmove \
> memset mkdir scandir select socket strcasecmp strchr strdup \
> - strerror strrchr strspn strstr])
> + strerror strrchr strspn strstr syncfs])
>
> AC_CONFIG_FILES([Makefile
> collie/Makefile
> diff --git a/include/internal_proto.h b/include/internal_proto.h
> index c1d116a..e63080a 100644
> --- a/include/internal_proto.h
> +++ b/include/internal_proto.h
> @@ -65,6 +65,7 @@
> #define SD_OP_INFO_RECOVER 0xAA
> #define SD_OP_GET_VDI_COPIES 0xAB
> #define SD_OP_COMPLETE_RECOVERY 0xAC
> +#define SD_OP_FLUSH_PEER 0xAD
>
> /* internal flags for hdr.flags, must be above 0x80 */
> #define SD_FLAG_CMD_RECOVERY 0x0080
> diff --git a/sheep/ops.c b/sheep/ops.c
> index ccb1c5e..ca4c5f9 100644
> --- a/sheep/ops.c
> +++ b/sheep/ops.c
> @@ -21,6 +21,15 @@
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <pthread.h>
> +#include <sys/epoll.h>
> +
> +#ifndef HAVE_SYNCFS
> +static int syncfs(int fd)
> +{
> + sync();
> + return 0;
> +}
> +#endif
>
> #include "sheep_priv.h"
> #include "strbuf.h"
> @@ -645,11 +654,115 @@ static int local_get_snap_file(struct request *req)
> return ret;
> }
>
> +static int flush_all_node(struct request *req)
> +{
> + int i, ret, err_ret, epfd, waiting, cnt;
> + struct sd_node *s;
> + struct node_id *node_sent[SD_MAX_NODES];
> + struct sockfd *sfd, *sfd_sent[SD_MAX_NODES];
> + struct sd_req hdr;
> + struct vnode_info *vinfo = req->vinfo;
> + struct epoll_event ev;
> +
> + err_ret = SD_RES_SUCCESS;
> +
> + epfd = epoll_create(SD_MAX_NODES);
> + if (epfd == -1) {
> + eprintf("failed to create epoll file descriptor");
> + return SD_RES_EIO;
> + }
> +
> + sd_init_req(&hdr, SD_OP_FLUSH_PEER);
> +
> + bzero(&ev, sizeof(struct epoll_event));
> + ev.events = EPOLLIN;
> +
> + for (waiting = 0, i = 0; i < vinfo->nr_nodes; i++) {
> + unsigned int wlen = 0;
> +
> + s = &vinfo->nodes[i];
> +
> + if (node_is_local(s)) {
> + _peer_flush();
> + continue;
> + }
> +
> + sfd = sheep_get_sockfd(&s->nid);
> + if (!sfd) {
> + err_ret = SD_RES_NETWORK_ERROR;
> + goto put_sockfd;
> + }
> +
> + node_sent[waiting] = &s->nid;
> + sfd_sent[waiting] = sfd;
> +
> + ret = send_req(sfd->fd, &hdr, NULL, &wlen);
> + if (ret) {
> + eprintf("failed at send_req()");
> + sheep_del_sockfd(&s->nid, sfd);
> + err_ret = SD_RES_NETWORK_ERROR;
> + goto put_sockfd;
> + }
> +
> + ev.data.fd = sfd->fd;
> + if (epoll_ctl(epfd, EPOLL_CTL_ADD, sfd->fd, &ev) == -1) {
> + eprintf("failed at epoll_ctl(), errno: %s", strerror(errno));
> + err_ret = SD_RES_EIO;
> + goto put_sockfd;
> + }
> +
> + waiting++;
> + }
> +
> + cnt = waiting;
> + while (cnt) {
> + struct epoll_event ev_nodes[SD_MAX_NODES];
> +
> + bzero(ev_nodes, sizeof(struct epoll_event) * cnt);
> +
> + ret = epoll_wait(epfd, ev_nodes, cnt, -1);
> + if (ret == -1) {
> + eprintf("failed at epoll_wait(), errno: %s", strerror(errno));
> + err_ret = SD_RES_EIO;
> + break;
> + }
> +
> + cnt -= ret;
> +
> + for (i = 0; i < ret; i++) {
> + struct sd_rsp rsp;
> +
> + if (do_read(ev_nodes[i].data.fd, &rsp, sizeof(struct sd_rsp))) {
> + eprintf("failed to receive response from node");
> + err_ret = SD_RES_NETWORK_ERROR;
> + goto put_sockfd;
> + }
> + }
> + }
> +
> +put_sockfd:
> + for (i = 0; i < waiting; i++)
> + sheep_put_sockfd(node_sent[i], sfd_sent[i]);
> +
> + close(epfd);
> +
> + return err_ret;
> +}
> +
> static int local_flush_vdi(struct request *req)
> {
> - if (!sys->enable_write_cache)
> - return SD_RES_SUCCESS;
> - return object_cache_flush_vdi(req);
> + int ret = SD_RES_SUCCESS;
> +
> + if (sys->enable_write_cache) {
> + ret = object_cache_flush_vdi(req);
> + if (ret != SD_RES_SUCCESS)
> + return ret;
> + }
> +
> + if (sys->store_writeback)
> + return flush_all_node(req);
> +
> + return ret;
> }
>
> static int local_flush_and_del(struct request *req)
> @@ -904,6 +1017,31 @@ out:
> return ret;
> }
>
> +int _peer_flush(void)
> +{
> + int fd;
> +
> + fd = open(obj_path, O_RDONLY);
> + if (fd < 0) {
> + eprintf("error at open() %s, %s\n", obj_path, strerror(errno));
> + return SD_RES_NO_OBJ;
> + }
> +
> + if (syncfs(fd)) {
> + eprintf("error at syncfs(), %s\n", strerror(errno));
> + return SD_RES_EIO;
> + }
> +
> + close(fd);
> +
> + return SD_RES_SUCCESS;
> +}
> +
> +int peer_flush(struct request *req)
> +{
> + return _peer_flush();
> +}
> +
> static struct sd_op_template sd_ops[] = {
>
> /* cluster operations */
> @@ -1170,6 +1308,12 @@ static struct sd_op_template sd_ops[] = {
> .type = SD_OP_TYPE_LOCAL,
> .process_main = local_info_recover,
> },
> +
> + [SD_OP_FLUSH_PEER] = {
> + .name = "FLUSH_PEER",
> + .type = SD_OP_TYPE_PEER,
> + .process_work = peer_flush,
> + },
> };
>
> struct sd_op_template *get_sd_op(uint8_t opcode)
> @@ -1255,6 +1399,7 @@ static int map_table[] = {
> [SD_OP_READ_OBJ] = SD_OP_READ_PEER,
> [SD_OP_WRITE_OBJ] = SD_OP_WRITE_PEER,
> [SD_OP_REMOVE_OBJ] = SD_OP_REMOVE_PEER,
> + [SD_OP_FLUSH_VDI] = SD_OP_FLUSH_PEER,
> };
>
> int gateway_to_peer_opcode(int opcode)
> diff --git a/sheep/plain_store.c b/sheep/plain_store.c
> index 26aa6dc..b72a55b 100644
> --- a/sheep/plain_store.c
> +++ b/sheep/plain_store.c
> @@ -21,7 +21,7 @@ static char stale_dir[PATH_MAX];
>
> static int get_open_flags(uint64_t oid, bool create)
> {
> - int flags = O_DSYNC | O_RDWR;
> + int flags = O_RDWR;
>
> if (is_data_obj(oid))
> flags |= O_DIRECT;
> @@ -29,6 +29,9 @@ static int get_open_flags(uint64_t oid, bool create)
> if (create)
> flags |= O_CREAT | O_TRUNC;
>
> + if (!sys->store_writeback)
> + flags |= O_DSYNC;
> +
> return flags;
> }
>
> diff --git a/sheep/sheep.c b/sheep/sheep.c
> index 10c0501..77e8d7c 100644
> --- a/sheep/sheep.c
> +++ b/sheep/sheep.c
> @@ -52,10 +52,11 @@ static struct option const long_options[] = {
> {"enable-cache", required_argument, NULL, 'w'},
> {"zone", required_argument, NULL, 'z'},
> {"pidfile", required_argument, NULL, 'P'},
> + {"writeback", no_argument, NULL, 'W'},
> {NULL, 0, NULL, 0},
> };
>
> -static const char *short_options = "c:dDfghl:op:P:s:w:y:z:";
> +static const char *short_options = "c:dDfghl:op:P:s:w:y:z:W";
>
> static void usage(int status)
> {
> @@ -81,6 +82,7 @@ Options:\n\
> -w, --enable-cache enable object cache and specify the max size (M) and mode\n\
> -y, --myaddr specify the address advertised to other sheep\n\
> -z, --zone specify the zone id\n\
> + -W, --writeback use writeback semantics in backend store\n\
> ", PACKAGE_VERSION, program_name);
> exit(status);
> }
> @@ -312,6 +314,9 @@ int main(int argc, char **argv)
>
> sys->cdrv_option = get_cdrv_option(sys->cdrv, optarg);
> break;
> + case 'W':
> + sys->store_writeback = 1;
> + break;
> case 'h':
> usage(0);
> break;
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index 90006f6..9575504 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -119,6 +119,8 @@ struct cluster_info {
> struct work_queue *block_wqueue;
> struct work_queue *sockfd_wqueue;
> struct work_queue *reclaim_wqueue;
> +
> + int store_writeback;
> };
>
> struct siocb {
> @@ -343,6 +345,11 @@ static inline int vnode_is_local(struct sd_vnode *v)
> return is_myself(v->nid.addr, v->nid.port);
> }
>
> +static inline int node_is_local(struct sd_node *n)
> +{
> + return is_myself(n->nid.addr, n->nid.port);
> +}
> +
> /* gateway operations */
> int gateway_read_obj(struct request *req);
> int gateway_write_obj(struct request *req);
> @@ -354,6 +361,7 @@ int peer_read_obj(struct request *req);
> int peer_write_obj(struct request *req);
> int peer_create_and_write_obj(struct request *req);
> int peer_remove_obj(struct request *req);
> +int peer_flush(struct request *req);
>
> /* object_cache */
>
> @@ -386,4 +394,6 @@ struct sockfd *sheep_get_sockfd(struct node_id *);
> void sheep_put_sockfd(struct node_id *, struct sockfd *);
> void sheep_del_sockfd(struct node_id *, struct sockfd *);
>
> +int _peer_flush(void);
> +
> #endif
> --
> 1.7.5.1
>
--
Hitoshi Mitake
h.mitake at gmail.com