[stgt] [PATCH 3/3] bs_aio rewrite (standard aio and eventfd; cmd batches; aio resource limit)
Alexander Nezhinsky
alexandern at mellanox.com
Mon Nov 7 16:12:24 CET 2011
Reimplementation of "aio" backing store.
Factors that reshaped the code:
- current implementation dates back to the days when both native aio api and
eventfd were not a standard feature of linux yet. Thus it was quite broken.
This implementation uses libaio and eventfd, assuming they are available,
otherwise bs_aio isn't compiled in.
- introducing multiple commands "batching", accounting for NOT_LAST bit
- keeping track of the currently submitted and queued cmds; as the aio_init()
allocates limited cmd resources, exceeding the limit may put tgtd to sleep,
so when the number of submitted commands reaches the limit, the next commands
are retained on queue.
Signed-off-by: Alexander Nezhinsky <alexandern at mellanox.com>
---
usr/Makefile | 8 +-
usr/bs_aio.c | 434 +++++++++++++++++++++++++++++++++++++++------------------
usr/bs_aio.h | 44 ------
3 files changed, 304 insertions(+), 182 deletions(-)
diff --git a/usr/Makefile b/usr/Makefile
index e15ff6a..2c9eb60 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -11,7 +11,13 @@ endif
TGTD_OBJS += $(addprefix iscsi/, conn.o param.o session.o \
iscsid.o target.o chap.o sha1.o md5.o transport.o iscsi_tcp.o \
isns.o)
-TGTD_OBJS += bs_rdwr.o bs_aio.o
+TGTD_OBJS += bs_rdwr.o
+
+ifneq ($(shell test -e /usr/include/sys/eventfd.h && echo 1),)
+CFLAGS += -DUSE_EVENTFD
+TGTD_OBJS += bs_aio.o
+LIBS += -laio
+endif
ifneq ($(ISCSI_RDMA),)
TGTD_OBJS += iscsi/iser.o iscsi/iser_text.o
diff --git a/usr/bs_aio.c b/usr/bs_aio.c
index 2828961..8d07df1 100644
--- a/usr/bs_aio.c
+++ b/usr/bs_aio.c
@@ -1,8 +1,9 @@
/*
- * AIO file backing store routine
+ * AIO backing store
*
* Copyright (C) 2006-2007 FUJITA Tomonori <tomof at acm.org>
* Copyright (C) 2006-2007 Mike Christie <michaelc at cs.wisc.edu>
+ * Copyright (C) 2011 Alexander Nezhinsky <alexandern at mellanox.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
@@ -20,147 +21,344 @@
* 02110-1301 USA
*/
#include <errno.h>
-#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
-#include <pthread.h>
-
#include <linux/fs.h>
#include <sys/epoll.h>
#include <sys/types.h>
#include <sys/stat.h>
+#include <sys/eventfd.h>
+#include <libaio.h>
#include "list.h"
#include "util.h"
#include "tgtd.h"
+#include "target.h"
#include "scsi.h"
-#include "bs_aio.h"
-
#ifndef O_DIRECT
#define O_DIRECT 040000
#endif
-/* FIXME */
-#define MAX_AIO_REQS 2048
+#define AIO_MAX_IODEPTH 128
struct bs_aio_info {
- int afd;
+ struct list_head dev_list_entry;
io_context_t ctx;
- struct io_event events[MAX_AIO_REQS];
+ struct list_head cmd_wait_list;
+ unsigned int nwaiting;
+ unsigned int npending;
+ unsigned int iodepth;
+
+ int resubmit;
+
+ struct scsi_lu *lu;
+ int evt_fd;
+
+ struct iocb iocb_arr[AIO_MAX_IODEPTH];
+ struct iocb *piocb_arr[AIO_MAX_IODEPTH];
+ struct io_event io_evts[AIO_MAX_IODEPTH];
};
-static void bs_aio_endio(int fd, int events, void *data)
+static struct list_head bs_aio_dev_list = LIST_HEAD_INIT(bs_aio_dev_list);
+
+static inline struct bs_aio_info *BS_AIO_I(struct scsi_lu *lu)
{
- struct bs_aio_info *info = data;
- int i, ret;
- uint64_t total, nr;
+ return (struct bs_aio_info *) ((char *)lu + sizeof(*lu));
+}
-retry:
- ret = read(info->afd, &total, sizeof(total));
- if (ret < 0) {
- eprintf("AIO pthread will be dead, %m\n");
- if (errno == EAGAIN || errno == EINTR)
- goto retry;
+static void bs_aio_iocb_prep(struct bs_aio_info *info, int idx,
+ struct scsi_cmd *cmd)
+{
+ struct iocb *iocb = &info->iocb_arr[idx];
+ unsigned int scsi_op = (unsigned int)cmd->scb[0];
+
+ iocb->data = cmd;
+ iocb->key = 0;
+ iocb->aio_reqprio = 0;
+ iocb->aio_fildes = info->lu->fd;
+
+ switch (scsi_op) {
+ case WRITE_6:
+ case WRITE_10:
+ case WRITE_12:
+ case WRITE_16:
+ iocb->aio_lio_opcode = IO_CMD_PWRITE;
+ iocb->u.c.buf = scsi_get_out_buffer(cmd);
+ iocb->u.c.nbytes = scsi_get_out_length(cmd);
+
+ dprintf("prep WR cmd:%p op:%x buf:0x%p sz:%lx\n",
+ cmd, scsi_op, iocb->u.c.buf, iocb->u.c.nbytes);
+ break;
+
+ case READ_6:
+ case READ_10:
+ case READ_12:
+ case READ_16:
+ iocb->aio_lio_opcode = IO_CMD_PREAD;
+ iocb->u.c.buf = scsi_get_in_buffer(cmd);
+ iocb->u.c.nbytes = scsi_get_in_length(cmd);
+ dprintf("prep RD cmd:%p op:%x buf:0x%p sz:%lx\n",
+ cmd, scsi_op, iocb->u.c.buf, iocb->u.c.nbytes);
+ break;
+
+ default:
return;
}
-get_events:
- nr = min_t(long, total, MAX_AIO_REQS);
- ret = io_getevents(info->ctx, 1, nr, info->events, NULL);
- if (ret <= 0)
- return;
- nr = ret;
- total -= nr;
-
- for (i = 0; i < nr; i++) {
- struct io_event *ep = &info->events[i];
- struct scsi_cmd *cmd = (void *)(unsigned long)ep->data;
- int result;
- uint32_t length;
-
- switch (cmd->scb[0]) {
- case WRITE_6:
- case WRITE_10:
- case WRITE_12:
- case WRITE_16:
- length = scsi_get_out_length(cmd);
- break;
- default:
- length = scsi_get_in_length(cmd);
+ iocb->u.c.offset = cmd->offset;
+ iocb->u.c.flags |= (1 << 0); /* IOCB_FLAG_RESFD - use eventfd file desc. */
+ iocb->u.c.resfd = info->evt_fd;
+}
+
+static int bs_aio_submit_dev_batch(struct bs_aio_info *info)
+{
+ int nsubmit, nsuccess;
+ struct scsi_cmd *cmd, *next;
+ int i = 0;
+
+ nsubmit = info->iodepth - info->npending; /* max allowed to submit */
+ if (nsubmit > info->nwaiting)
+ nsubmit = info->nwaiting;
+
+ dprintf("nsubmit:%d waiting:%d pending:%d, tgt:%d lun:%"PRId64 "\n",
+ nsubmit, info->nwaiting, info->npending,
+ info->lu->tgt->tid, info->lu->lun);
+
+ if (!nsubmit)
+ return 0;
+
+ list_for_each_entry_safe(cmd, next, &info->cmd_wait_list, bs_list) {
+ bs_aio_iocb_prep(info, i, cmd);
+ list_del(&cmd->bs_list);
+ if (++i == nsubmit)
break;
- }
+ }
- if (ep->res == length)
- result = SAM_STAT_GOOD;
+ nsuccess = io_submit(info->ctx, nsubmit, info->piocb_arr);
+ if (unlikely(nsuccess < 0)) {
+ if (nsuccess == -EAGAIN) {
+ eprintf("delayed submit %d cmds to tgt:%d lun:%"PRId64 "\n",
+ nsubmit, info->lu->tgt->tid, info->lu->lun);
+ nsuccess = 0; /* leave the dev pending with all cmds */
+ }
else {
- sense_data_build(cmd, MEDIUM_ERROR, 0);
- result = SAM_STAT_CHECK_CONDITION;
+ eprintf("failed to submit %d cmds to tgt:%d lun:%"PRId64
+ ", err: %d\n",
+ nsubmit, info->lu->tgt->tid,
+ info->lu->lun, -nsuccess);
+ return nsuccess;
+ }
+ }
+ if (unlikely(nsuccess < nsubmit)) {
+ for (i=nsubmit-1; i >= nsuccess; i--) {
+ cmd = info->iocb_arr[i].data;
+ list_add(&cmd->bs_list, &info->cmd_wait_list);
}
-
- target_cmd_io_done(cmd, result);
}
- if (total)
- goto get_events;
+ info->npending += nsuccess;
+ info->nwaiting -= nsuccess;
+ /* if no cmds remain, remove the dev from the pending list */
+ if (likely(!info->nwaiting))
+ list_del(&info->dev_list_entry);
+
+ dprintf("submitted %d of %d cmds to tgt:%d lun:%"PRId64
+ ", waiting:%d pending:%d\n",
+ nsuccess, nsubmit, info->lu->tgt->tid, info->lu->lun,
+ info->nwaiting, info->npending);
+ return 0;
}
-static int bs_aio_open(struct scsi_lu *lu, char *path, int *fd, uint64_t *size)
+static int bs_aio_submit_all_devs(void)
{
- *fd = backed_file_open(path, O_RDWR|O_LARGEFILE|O_DIRECT, size);
- /* If we get access denied, try opening the file in readonly mode */
- if (*fd == -1 && (errno == EACCES || errno == EROFS)) {
- *fd = backed_file_open(path, O_RDONLY|O_LARGEFILE|O_DIRECT,
- size);
- lu->attrs.readonly = 1;
+ struct bs_aio_info *dev_info, *next_dev;
+ int err;
+
+ /* pass over all devices having some queued cmds and submit */
+ list_for_each_entry_safe(dev_info, next_dev, &bs_aio_dev_list, dev_list_entry) {
+ err = bs_aio_submit_dev_batch(dev_info);
+ if (unlikely(err))
+ return err;
}
- if (*fd < 0)
- return *fd;
return 0;
}
-static int bs_aio_init(struct scsi_lu *lu)
+static int bs_aio_cmd_submit(struct scsi_cmd *cmd)
+{
+ struct scsi_lu *lu = cmd->dev;
+ struct bs_aio_info *info = BS_AIO_I(lu);
+ unsigned int scsi_op = (unsigned int)cmd->scb[0];
+
+ switch (scsi_op) {
+ case WRITE_6:
+ case WRITE_10:
+ case WRITE_12:
+ case WRITE_16:
+ case READ_6:
+ case READ_10:
+ case READ_12:
+ case READ_16:
+ break;
+
+ case SYNCHRONIZE_CACHE:
+ case SYNCHRONIZE_CACHE_16:
+ default:
+ dprintf("skipped cmd:%p op:%x\n", cmd, scsi_op);
+ return 0;
+ }
+
+ list_add_tail(&cmd->bs_list, &info->cmd_wait_list);
+ if (!info->nwaiting)
+ list_add_tail(&info->dev_list_entry, &bs_aio_dev_list);
+ info->nwaiting++;
+ set_cmd_async(cmd);
+
+ if (!cmd_not_last(cmd)) /* last cmd in batch */
+ return bs_aio_submit_all_devs();
+
+ if (info->nwaiting == info->iodepth - info->npending)
+ return bs_aio_submit_dev_batch(info);
+
+ return 0;
+}
+
+static void bs_aio_complete_one(struct io_event *ep)
+{
+ struct scsi_cmd *cmd = (void *)(unsigned long)ep->data;
+ uint32_t length;
+ int result;
+
+ switch (cmd->scb[0]) {
+ case WRITE_6:
+ case WRITE_10:
+ case WRITE_12:
+ case WRITE_16:
+ length = scsi_get_out_length(cmd);
+ break;
+ default:
+ length = scsi_get_in_length(cmd);
+ break;
+ }
+
+ if (likely(ep->res == length))
+ result = SAM_STAT_GOOD;
+ else {
+ sense_data_build(cmd, MEDIUM_ERROR, 0);
+ result = SAM_STAT_CHECK_CONDITION;
+ }
+ dprintf("cmd: %p\n", cmd);
+ target_cmd_io_done(cmd, result);
+}
+
+static void bs_aio_get_completions(int fd, int events, void *data)
+{
+ struct bs_aio_info *info = data;
+ int i, ret;
+ uint64_t ncomplete, nevents;
+
+retry_read:
+ ret = read(info->evt_fd, &ncomplete, sizeof(ncomplete));
+ if (unlikely(ret < 0)) {
+ eprintf("failed to read AIO completions, %m\n");
+ if (errno == EAGAIN || errno == EINTR)
+ goto retry_read;
+
+ return;
+ }
+
+ while (ncomplete) {
+ nevents = min_t(long, ncomplete, ARRAY_SIZE(info->io_evts));
+retry_getevts:
+ ret = io_getevents(info->ctx, 1, nevents, info->io_evts, NULL);
+ if (likely(ret > 0)) {
+ nevents = ret;
+ info->npending -= nevents;
+ } else {
+ if (ret == -EINTR)
+ goto retry_getevts;
+ eprintf("io_getevents failed, err:%d\n", -ret);
+ return;
+ }
+ dprintf("got %ld ioevents out of %ld, pending %d\n",
+ nevents, ncomplete, info->npending);
+
+ for (i = 0; i < nevents; i++)
+ bs_aio_complete_one(&info->io_evts[i]);
+ ncomplete -= nevents;
+ }
+
+ if (info->nwaiting) {
+ dprintf("submit waiting cmds to tgt:%d lun:%"PRId64 "\n",
+ info->lu->tgt->tid, info->lu->lun);
+ bs_aio_submit_dev_batch(info);
+ }
+}
+
+static int bs_aio_open(struct scsi_lu *lu, char *path, int *fd, uint64_t *size)
{
+ struct bs_aio_info *info = BS_AIO_I(lu);
int ret, afd;
- struct bs_aio_info *info =
- (struct bs_aio_info *) ((char *)lu + sizeof(*lu));
- ret = io_setup(MAX_AIO_REQS, &info->ctx);
+ info->iodepth = AIO_MAX_IODEPTH;
+ eprintf("create aio context for tgt:%d lun:%"PRId64 ", max iodepth:%d\n",
+ info->lu->tgt->tid, info->lu->lun, info->iodepth);
+ ret = io_setup(info->iodepth, &info->ctx);
if (ret) {
- eprintf("fail to create aio_queue, %m\n");
+ eprintf("failed to create aio context, %m\n");
return -1;
}
- afd = eventfd(0);
+ afd = eventfd(0, O_NONBLOCK);
if (afd < 0) {
- eprintf("fail to create eventfd, %m\n");
+ eprintf("failed to create eventfd for tgt:%d lun:%"PRId64 ", %m\n",
+ info->lu->tgt->tid, info->lu->lun);
+ ret = afd;
goto close_ctx;
}
+ dprintf("eventfd:%d for tgt:%d lun:%"PRId64 "\n",
+ afd, info->lu->tgt->tid, info->lu->lun);
- ret = fcntl(afd, F_SETFL, fcntl(afd, F_GETFL, 0) | O_NONBLOCK);
- if (ret) {
- eprintf("fail to configure eventfd, %m\n");
- goto close_eventfd;
- }
-
- ret = tgt_event_add(afd, EPOLLIN, bs_aio_endio, info);
+ ret = tgt_event_add(afd, EPOLLIN, bs_aio_get_completions, info);
if (ret)
goto close_eventfd;
+ info->evt_fd = afd;
- info->afd = afd;
-
- return 0;
+ eprintf("open %s, RW, O_DIRECT for tgt:%d lun:%"PRId64 "\n",
+ path, info->lu->tgt->tid, info->lu->lun);
+ *fd = backed_file_open(path, O_RDWR|O_LARGEFILE|O_DIRECT, size);
+ /* If we get access denied, try opening the file in readonly mode */
+ if (*fd == -1 && (errno == EACCES || errno == EROFS)) {
+ eprintf("open %s, READONLY, O_DIRECT for tgt:%d lun:%"PRId64 "\n",
+ path, info->lu->tgt->tid, info->lu->lun);
+ *fd = backed_file_open(path, O_RDONLY|O_LARGEFILE|O_DIRECT,
+ size);
+ lu->attrs.readonly = 1;
+ }
+ if (*fd < 0) {
+ eprintf("failed to open %s, for tgt:%d lun:%"PRId64 ", %m\n",
+ path, info->lu->tgt->tid, info->lu->lun);
+ ret = *fd;
+ goto remove_tgt_evt;
+ }
+ else {
+ eprintf("%s opened successfully for tgt:%d lun:%"PRId64 "\n",
+ path, info->lu->tgt->tid, info->lu->lun);
+ return 0;
+ }
+remove_tgt_evt:
+ tgt_event_del(afd);
close_eventfd:
close(afd);
close_ctx:
io_destroy(info->ctx);
- return -1;
+ return ret;
}
static void bs_aio_close(struct scsi_lu *lu)
@@ -168,80 +366,42 @@ static void bs_aio_close(struct scsi_lu *lu)
close(lu->fd);
}
-static void bs_aio_exit(struct scsi_lu *lu)
+static int bs_aio_init(struct scsi_lu *lu)
{
- struct bs_aio_info *info =
- (struct bs_aio_info *) ((char *)lu + sizeof(*lu));
+ struct bs_aio_info *info = BS_AIO_I(lu);
+ int i;
- close(info->afd);
- io_destroy(info->ctx);
-}
+ memset(info, 0, sizeof(*info));
+ INIT_LIST_HEAD(&info->dev_list_entry);
+ INIT_LIST_HEAD(&info->cmd_wait_list);
+ info->lu = lu;
-static int bs_aio_cmd_submit(struct scsi_cmd *cmd)
-{
- struct scsi_lu *lu = cmd->dev;
- struct bs_aio_info *info =
- (struct bs_aio_info *)((char *)lu + sizeof(*lu));
- struct iocb iocb, *io;
- uint32_t length;
- int ret = 0, do_io = 0;
-
- io = &iocb;
- memset(io, 0, sizeof(*io));
-
- switch (cmd->scb[0]) {
- case SYNCHRONIZE_CACHE:
- case SYNCHRONIZE_CACHE_16:
- break;
- case WRITE_6:
- case WRITE_10:
- case WRITE_12:
- case WRITE_16:
- do_io = 1;
- length = scsi_get_out_length(cmd);
- io_prep_pwrite(io, lu->fd, scsi_get_out_buffer(cmd),
- length, cmd->offset, info->afd);
+ for (i=0; i < ARRAY_SIZE(info->iocb_arr); i++)
+ info->piocb_arr[i] = &info->iocb_arr[i];
- break;
- case READ_6:
- case READ_10:
- case READ_12:
- case READ_16:
- do_io = 1;
- length = scsi_get_in_length(cmd);
- io_prep_pread(io, lu->fd, scsi_get_in_buffer(cmd),
- length, cmd->offset, info->afd);
- break;
- default:
- break;
- }
+ return 0;
+}
- if (do_io) {
- io->aio_data = (uint64_t)(unsigned long)cmd;
- ret = io_submit(info->ctx, 1, &io);
- if (ret == 1) {
- set_cmd_async(cmd);
- ret = 0;
- } else {
- sense_data_build(cmd, MEDIUM_ERROR, 0);
- ret = SAM_STAT_CHECK_CONDITION;
- }
- }
+static void bs_aio_exit(struct scsi_lu *lu)
+{
+ struct bs_aio_info *info = BS_AIO_I(lu);
- return ret;
+ close(info->evt_fd);
+ io_destroy(info->ctx);
}
static struct backingstore_template aio_bst = {
.bs_name = "aio",
- .bs_datasize = sizeof(struct bs_aio_info),
+ .bs_datasize = sizeof(struct bs_aio_info),
.bs_init = bs_aio_init,
.bs_exit = bs_aio_exit,
.bs_open = bs_aio_open,
- .bs_close = bs_aio_close,
- .bs_cmd_submit = bs_aio_cmd_submit,
+ .bs_close = bs_aio_close,
+ .bs_cmd_submit = bs_aio_cmd_submit,
};
__attribute__((constructor)) static void bs_rdwr_constructor(void)
{
register_backingstore_template(&aio_bst);
}
+
diff --git a/usr/bs_aio.h b/usr/bs_aio.h
index f62e99c..82c5655 100644
--- a/usr/bs_aio.h
+++ b/usr/bs_aio.h
@@ -39,24 +39,6 @@ enum {
IOCB_CMD_PWRITEV = 8,
};
-#ifndef __NR_eventfd
-#if defined(__x86_64__)
-#define __NR_eventfd 284
-#elif defined(__i386__)
-#define __NR_eventfd 323
-#elif defined(__powerpc__)
-#define __NR_eventfd 307
-#elif defined(__powerpc64__)
-#define __NR_eventfd 307
-#elif defined(__ia64__)
-#define __NR_eventfd 1309
-#elif defined(__sparc__) || defined(__sparc64__)
-#define __NR_eventfd 313
-#else
-#error Cannot detect your architecture
-#endif
-#endif
-
#define IOCB_FLAG_RESFD (1 << 0)
#if defined(__LITTLE_ENDIAN)
@@ -104,32 +86,6 @@ struct iocb {
int32_t aio_resfd;
}; /* 64 bytes */
-static inline void io_prep_pread(struct iocb *iocb, int fd, void *buf,
- int nr_segs, int64_t offset, int afd)
-{
- iocb->aio_fildes = fd;
- iocb->aio_lio_opcode = IOCB_CMD_PREAD;
- iocb->aio_reqprio = 0;
- iocb->aio_buf = (uint64_t)(unsigned long)buf;
- iocb->aio_nbytes = nr_segs;
- iocb->aio_offset = offset;
- iocb->aio_flags = IOCB_FLAG_RESFD;
- iocb->aio_resfd = afd;
-}
-
-static inline void io_prep_pwrite(struct iocb *iocb, int fd, void const *buf,
- int nr_segs, int64_t offset, int afd)
-{
- iocb->aio_fildes = fd;
- iocb->aio_lio_opcode = IOCB_CMD_PWRITE;
- iocb->aio_reqprio = 0;
- iocb->aio_buf = (uint64_t)(unsigned long)buf;
- iocb->aio_nbytes = nr_segs;
- iocb->aio_offset = offset;
- iocb->aio_flags = IOCB_FLAG_RESFD;
- iocb->aio_resfd = afd;
-}
-
static inline int io_setup(unsigned nr_reqs, io_context_t *ctx)
{
return syscall(__NR_io_setup, nr_reqs, ctx);
--
1.7.1
--
To unsubscribe from this list: send the line "unsubscribe stgt" in
the body of a message to majordomo at vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
More information about the stgt
mailing list