[sheepdog] [Qemu-devel] [PATCH] sheepdog: serialize requests to overwrapping area

Jeff Cody jcody at redhat.com
Mon Jul 27 17:23:02 CEST 2015


On Sat, Jul 18, 2015 at 01:44:24AM +0900, Hitoshi Mitake wrote:
> The current sheepdog driver only serializes create requests at oid
> granularity. This mechanism isn't enough for handling requests to
> overlapping areas spanning multiple oids, so it can result in bugs
> like the one below:
> https://bugs.launchpad.net/sheepdog-project/+bug/1456421
> 
> This patch adds a new serialization mechanism for the problem. The
> differences from the old one are:
> 1. serialize entire aiocbs if their target areas overlap
> 2. serialize all requests (read, write, and discard), not only creates
> 
> This patch also removes the old mechanism, since the new one
> supersedes it.
> 
> Cc: Kevin Wolf <kwolf at redhat.com>
> Cc: Stefan Hajnoczi <stefanha at redhat.com>
> Cc: Teruaki Ishizaki <ishizaki.teruaki at lab.ntt.co.jp>
> Cc: Vasiliy Tolstov <v.tolstov at selfip.ru>
> Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
> ---
>  block/sheepdog.c | 152 ++++++++++++++++++++++++++-----------------------------
>  1 file changed, 71 insertions(+), 81 deletions(-)
> 
> diff --git a/block/sheepdog.c b/block/sheepdog.c
> index bd7cbed..9585beb 100644
> --- a/block/sheepdog.c
> +++ b/block/sheepdog.c
> @@ -318,6 +318,10 @@ enum AIOCBState {
>      AIOCB_DISCARD_OBJ,
>  };
>  
> +#define AIOCBOverwrapping(x, y)                                 \
> +    (!(x->max_affect_data_idx < y->min_affect_data_idx          \
> +       || y->max_affect_data_idx < x->min_affect_data_idx))
> +
>  struct SheepdogAIOCB {
>      BlockAIOCB common;
>  
> @@ -334,6 +338,11 @@ struct SheepdogAIOCB {
>  
>      bool cancelable;
>      int nr_pending;
> +
> +    uint32_t min_affect_data_idx;
> +    uint32_t max_affect_data_idx;
> +
> +    QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
>  };
>  
>  typedef struct BDRVSheepdogState {
> @@ -362,8 +371,10 @@ typedef struct BDRVSheepdogState {
>  
>      /* Every aio request must be linked to either of these queues. */
>      QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
> -    QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head;
>      QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head;
> +
> +    CoQueue overwrapping_queue;
> +    QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
>  } BDRVSheepdogState;
>  
>  static const char * sd_strerror(int err)
> @@ -498,13 +509,7 @@ static void sd_aio_cancel(BlockAIOCB *blockacb)
>      AIOReq *aioreq, *next;
>  
>      if (sd_acb_cancelable(acb)) {
> -        /* Remove outstanding requests from pending and failed queues.  */
> -        QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings,
> -                           next) {
> -            if (aioreq->aiocb == acb) {
> -                free_aio_req(s, aioreq);
> -            }
> -        }
> +        /* Remove outstanding requests from failed queue.  */
>          QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings,
>                             next) {
>              if (aioreq->aiocb == acb) {
> @@ -529,6 +534,10 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
>                                     int64_t sector_num, int nb_sectors)
>  {
>      SheepdogAIOCB *acb;
> +    uint32_t object_size;
> +    BDRVSheepdogState *s = bs->opaque;
> +
> +    object_size = (UINT32_C(1) << s->inode.block_size_shift);
>  
>      acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL);
>  
> @@ -542,6 +551,11 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
>      acb->coroutine = qemu_coroutine_self();
>      acb->ret = 0;
>      acb->nr_pending = 0;
> +
> +    acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
> +    acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE +
> +                              acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size;
> +
>      return acb;
>  }
>  
> @@ -703,38 +717,6 @@ static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag);
>  static int get_sheep_fd(BDRVSheepdogState *s, Error **errp);
>  static void co_write_request(void *opaque);
>  
> -static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid)
> -{
> -    AIOReq *aio_req;
> -
> -    QLIST_FOREACH(aio_req, &s->pending_aio_head, aio_siblings) {
> -        if (aio_req->oid == oid) {
> -            return aio_req;
> -        }
> -    }
> -
> -    return NULL;
> -}
> -
> -/*
> - * This function searchs pending requests to the object `oid', and
> - * sends them.
> - */
> -static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)
> -{
> -    AIOReq *aio_req;
> -    SheepdogAIOCB *acb;
> -
> -    while ((aio_req = find_pending_req(s, oid)) != NULL) {
> -        acb = aio_req->aiocb;
> -        /* move aio_req from pending list to inflight one */
> -        QLIST_REMOVE(aio_req, aio_siblings);
> -        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
> -        add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
> -                        acb->aiocb_type);
> -    }
> -}
> -
>  static coroutine_fn void reconnect_to_sdog(void *opaque)
>  {
>      BDRVSheepdogState *s = opaque;
> @@ -840,12 +822,6 @@ static void coroutine_fn aio_read_response(void *opaque)
>                  s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
>                  s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
>              }
> -            /*
> -             * Some requests may be blocked because simultaneous
> -             * create requests are not allowed, so we search the
> -             * pending requests here.
> -             */
> -            send_pending_req(s, aio_req->oid);
>          }
>          break;
>      case AIOCB_READ_UDATA:
> @@ -1341,30 +1317,6 @@ out:
>      return ret;
>  }
>  
> -/* Return true if the specified request is linked to the pending list. */
> -static bool check_simultaneous_create(BDRVSheepdogState *s, AIOReq *aio_req)
> -{
> -    AIOReq *areq;
> -    QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) {
> -        if (areq != aio_req && areq->oid == aio_req->oid) {
> -            /*
> -             * Sheepdog cannot handle simultaneous create requests to the same
> -             * object, so we cannot send the request until the previous request
> -             * finishes.
> -             */
> -            DPRINTF("simultaneous create to %" PRIx64 "\n", aio_req->oid);
> -            aio_req->flags = 0;
> -            aio_req->base_oid = 0;
> -            aio_req->create = false;
> -            QLIST_REMOVE(aio_req, aio_siblings);
> -            QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings);
> -            return true;
> -        }
> -    }
> -
> -    return false;
> -}
> -
>  static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
>  {
>      SheepdogAIOCB *acb = aio_req->aiocb;
> @@ -1379,10 +1331,6 @@ static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
>              goto out;
>          }
>  
> -        if (check_simultaneous_create(s, aio_req)) {
> -            return;
> -        }
> -
>          if (s->inode.data_vdi_id[idx]) {
>              aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
>              aio_req->flags |= SD_FLAG_CMD_COW;
> @@ -1458,8 +1406,8 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
>      filename = qemu_opt_get(opts, "filename");
>  
>      QLIST_INIT(&s->inflight_aio_head);
> -    QLIST_INIT(&s->pending_aio_head);
>      QLIST_INIT(&s->failed_aio_head);
> +    QLIST_INIT(&s->inflight_aiocb_head);
>      s->fd = -1;
>  
>      memset(vdi, 0, sizeof(vdi));
> @@ -1524,6 +1472,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
>      bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
>      pstrcpy(s->name, sizeof(s->name), vdi);
>      qemu_co_mutex_init(&s->lock);
> +    qemu_co_queue_init(&s->overwrapping_queue);
>      qemu_opts_del(opts);
>      g_free(buf);
>      return 0;
> @@ -2195,12 +2144,6 @@ static int coroutine_fn sd_co_rw_vector(void *p)
>                                  old_oid, done);
>          QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
>  
> -        if (create) {
> -            if (check_simultaneous_create(s, aio_req)) {
> -                goto done;
> -            }
> -        }
> -
>          add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
>                          acb->aiocb_type);
>      done:
> @@ -2215,6 +2158,20 @@ out:
>      return 1;
>  }
>  
> +static bool check_overwrapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
> +{
> +    SheepdogAIOCB *cb;
> +
> +    QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
> +        if (AIOCBOverwrapping(aiocb, cb)) {
> +            return true;
> +        }
> +    }
> +
> +    QLIST_INSERT_HEAD(&s->inflight_aiocb_head, aiocb, aiocb_siblings);
> +    return false;
> +}
> +
>  static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
>                          int nb_sectors, QEMUIOVector *qiov)
>  {
> @@ -2234,14 +2191,25 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
>      acb->aio_done_func = sd_write_done;
>      acb->aiocb_type = AIOCB_WRITE_UDATA;
>  
> +retry:
> +    if (check_overwrapping_aiocb(s, acb)) {
> +        qemu_co_queue_wait(&s->overwrapping_queue);
> +        goto retry;
> +    }
> +
>      ret = sd_co_rw_vector(acb);
>      if (ret <= 0) {
> +        QLIST_REMOVE(acb, aiocb_siblings);
> +        qemu_co_queue_restart_all(&s->overwrapping_queue);
>          qemu_aio_unref(acb);
>          return ret;
>      }
>  
>      qemu_coroutine_yield();
>  
> +    QLIST_REMOVE(acb, aiocb_siblings);
> +    qemu_co_queue_restart_all(&s->overwrapping_queue);
> +
>      return acb->ret;
>  }
>  
> @@ -2250,19 +2218,30 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
>  {
>      SheepdogAIOCB *acb;
>      int ret;
> +    BDRVSheepdogState *s = bs->opaque;
>  
>      acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
>      acb->aiocb_type = AIOCB_READ_UDATA;
>      acb->aio_done_func = sd_finish_aiocb;
>  
> +retry:
> +    if (check_overwrapping_aiocb(s, acb)) {
> +        qemu_co_queue_wait(&s->overwrapping_queue);
> +        goto retry;
> +    }
> +
>      ret = sd_co_rw_vector(acb);
>      if (ret <= 0) {
> +        QLIST_REMOVE(acb, aiocb_siblings);
> +        qemu_co_queue_restart_all(&s->overwrapping_queue);
>          qemu_aio_unref(acb);
>          return ret;
>      }
>  
>      qemu_coroutine_yield();
>  
> +    QLIST_REMOVE(acb, aiocb_siblings);
> +    qemu_co_queue_restart_all(&s->overwrapping_queue);
>      return acb->ret;
>  }
>  
> @@ -2610,14 +2589,25 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
>      acb->aiocb_type = AIOCB_DISCARD_OBJ;
>      acb->aio_done_func = sd_finish_aiocb;
>  
> +retry:
> +    if (check_overwrapping_aiocb(s, acb)) {
> +        qemu_co_queue_wait(&s->overwrapping_queue);
> +        goto retry;
> +    }
> +
>      ret = sd_co_rw_vector(acb);
>      if (ret <= 0) {
> +        QLIST_REMOVE(acb, aiocb_siblings);
> +        qemu_co_queue_restart_all(&s->overwrapping_queue);
>          qemu_aio_unref(acb);
>          return ret;
>      }
>  
>      qemu_coroutine_yield();
>  
> +    QLIST_REMOVE(acb, aiocb_siblings);
> +    qemu_co_queue_restart_all(&s->overwrapping_queue);
> +
>      return acb->ret;
>  }
>  
> -- 
> 1.9.1
> 
>
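For reference, below is a minimal standalone sketch (not part of the patch
or of the applied series) of the serialization predicate this change
introduces: two requests must wait on each other when the ranges of
data-object indices they touch intersect. The field names and the range
computation mirror the hunks quoted above; the Request type, the
make_request() helper, the main() driver, and the 4 MiB object size are
assumptions made only for this illustration.

    #include <stdint.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative only: mirrors the min/max_affect_data_idx fields
     * added to SheepdogAIOCB in the patch above. */
    typedef struct {
        uint32_t min_affect_data_idx;
        uint32_t max_affect_data_idx;
    } Request;

    /* Same predicate as the AIOCBOverwrapping() macro in the patch:
     * two index ranges conflict unless one ends before the other starts. */
    static bool ranges_overlap(const Request *x, const Request *y)
    {
        return !(x->max_affect_data_idx < y->min_affect_data_idx ||
                 y->max_affect_data_idx < x->min_affect_data_idx);
    }

    /* Compute the affected index range the same way sd_aio_setup() does,
     * assuming 512-byte sectors (BDRV_SECTOR_SIZE). */
    static Request make_request(int64_t sector_num, int nb_sectors,
                                uint32_t object_size)
    {
        const int64_t sector_size = 512;
        Request r;
        r.min_affect_data_idx = sector_num * sector_size / object_size;
        r.max_affect_data_idx = (sector_num * sector_size +
                                 nb_sectors * sector_size) / object_size;
        return r;
    }

    int main(void)
    {
        const uint32_t object_size = 4 * 1024 * 1024; /* assumed 4 MiB objects */
        /* A write crossing the boundary between objects 0 and 1 ... */
        Request a = make_request(8184, 16, object_size);
        /* ... and a write entirely inside object 1: they must serialize. */
        Request b = make_request(8192, 8, object_size);
        printf("overlap: %s\n", ranges_overlap(&a, &b) ? "yes" : "no");
        return 0;
    }

In the driver itself this check corresponds to check_overwrapping_aiocb()
plus the retry loops around qemu_co_queue_wait(&s->overwrapping_queue) in
sd_co_writev(), sd_co_readv(), and sd_co_discard() shown in the patch.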

Thanks, applied to my block branch:
https://github.com/codyprime/qemu-kvm-jtc/tree/block


Jeff
