[stgt] [PATCH 1/1] nonblocking epoll_wait loop, sched events, ISER/IB polling
Alexander Nezhinsky
nezhinsky at gmail.com
Thu Sep 18 20:38:28 CEST 2008
This patch introduces a custom event scheduler, a non-blocking
epoll_wait when events are pending, and delayed IB completion
notifications, which leads to a significant reduction in the interrupt
rate for iser/ib, while adding flexibility to the tgtd event processing scheme.
Signed-off-by: Alexander Nezhinsky <nezhinsky at gmail.com>
---
usr/iscsi/iscsi_rdma.c | 240 +++++++++++++++++++++++++++---------------------
usr/tgtd.c | 136 ++++++++++++---------------
usr/tgtd.h | 30 +++++-
3 files changed, 219 insertions(+), 187 deletions(-)
diff --git a/usr/iscsi/iscsi_rdma.c b/usr/iscsi/iscsi_rdma.c
index 46e6ea8..35b0f13 100644
--- a/usr/iscsi/iscsi_rdma.c
+++ b/usr/iscsi/iscsi_rdma.c
@@ -144,6 +144,8 @@ struct conn_info {
/* but count so we can drain CQ on close */
int recvl_posted;
+ struct tgt_event tx_sched;
+
/* login phase resources, freed at full-feature */
void *srbuf_login;
void *listbuf_login;
@@ -194,6 +196,8 @@ struct iser_device {
void *mempool_listbuf;
struct ibv_mr *mempool_mr;
+ struct tgt_event poll_sched;
+
/* free and allocated mempool entries */
struct list_head mempool_free, mempool_alloc;
};
@@ -217,10 +221,6 @@ static struct list_head iser_conn_list;
/* if any task needs an rdma read or write slot to proceed */
static int waiting_rdma_slot;
-/* progress available, used with tgt_counter_event */
-static int num_tx_ready;
-static int num_rx_ready;
-
#define uint64_from_ptr(p) (uint64_t)(uintptr_t)(p)
#define ptr_from_int64(p) (void *)(unsigned long)(p)
@@ -270,13 +270,17 @@ static inline struct conn_info *RDMA_CONN(struct iscsi_connection *conn)
return container_of(conn, struct conn_info, iscsi_conn);
}
-static void iser_cqe_handler(int fd, int events, void *data);
-static void iser_rx_progress(int *counter, void *data);
+static void iser_cqe_handler(int fd __attribute__((unused)),
+ int events __attribute__((unused)),
+ void *data);
static void iser_rdma_read_completion(struct rdmalist *rdma);
static void iscsi_rdma_release(struct iscsi_connection *conn);
static int iscsi_rdma_show(struct iscsi_connection *conn, char *buf,
int rest);
static void iscsi_rdma_event_modify(struct iscsi_connection *conn, int events);
+static void iser_sched_poll_cq(struct tgt_event *tev);
+static void iser_sched_drain_cq(struct tgt_event *tev);
+static void iser_sched_tx(struct tgt_event *evt);
/*
* Called when ready for full feature, builds resources.
@@ -612,6 +616,8 @@ static int iser_device_init(struct iser_device *dev)
goto out;
}
+ tgt_init_sched_event(&dev->poll_sched,iser_sched_poll_cq,dev);
+
ret = ibv_req_notify_cq(dev->cq, 0);
if (ret) {
eprintf("ibv_req_notify failed: %s\n", strerror(ret));
@@ -691,7 +697,10 @@ static void iser_accept_connection(struct rdma_cm_event *event)
ci->login_phase = LOGIN_PHASE_START;
INIT_LIST_HEAD(&ci->conn_tx_ready);
list_add(&ci->iser_conn_list, &temp_conn);
- /* initiator sits at dst, we are src */
+
+ tgt_init_sched_event(&ci->tx_sched,iser_sched_tx,ci);
+
+ /* initiator sits at dst, we are src */
memcpy(&ci->peer_addr, &event->id->route.addr.dst_addr,
sizeof(ci->peer_addr));
memcpy(&ci->self_addr, &event->id->route.addr.src_addr,
@@ -940,7 +949,7 @@ static void handle_wc(struct ibv_wc *wc)
list_add(&rdmal->list, &ci->rdmal);
if (waiting_rdma_slot) {
waiting_rdma_slot = 0;
- num_tx_ready = 1;
+ tgt_add_sched_event(&ci->tx_sched);
}
break;
@@ -957,7 +966,7 @@ static void handle_wc(struct ibv_wc *wc)
list_add(&rdmal->list, &ci->rdmal);
if (waiting_rdma_slot) {
waiting_rdma_slot = 0;
- num_tx_ready = 1;
+ tgt_add_sched_event(&ci->tx_sched);
}
break;
@@ -974,85 +983,14 @@ close_err:
}
/*
- * Called directly from main event loop when a CQ notification is
- * available.
- */
-static void iser_cqe_handler(int fd __attribute__((unused)),
- int events __attribute__((unused)),
- void *data)
-{
- int ret;
- void *cq_context;
- struct iser_device *dev = data;
-
- ret = ibv_get_cq_event(dev->cq_channel, &dev->cq, &cq_context);
- if (ret != 0) {
- eprintf("notification, but no CQ event\n");
- exit(1);
- }
-
- ibv_ack_cq_events(dev->cq, 1);
-
- ret = ibv_req_notify_cq(dev->cq, 0);
- if (ret) {
- eprintf("ibv_req_notify_cq: %s\n", strerror(ret));
- exit(1);
- }
-
- iser_rx_progress(NULL, dev);
-}
-
-/*
- * Called from tgtd when num_tx_ready (counter) non-zero. Walks the
- * list of active connections and tries to push tx on each, until nothing
- * is ready anymore. No progress limit here.
- */
-static void iser_tx_progress(int *counter __attribute__((unused)),
- void *data __attribute__((unused)))
-{
- int reloop, ret;
- struct conn_info *ci, *cin;
- struct iscsi_connection *conn;
-
- dprintf("entry\n");
- num_tx_ready = 0;
-
- do {
- reloop = 0;
- list_for_each_entry_safe(ci, cin, &conn_tx_ready,
- conn_tx_ready) {
- conn = &ci->iscsi_conn;
- if (conn->state == STATE_CLOSE) {
- dprintf("ignoring tx for closed conn\n");
- } else {
- dprintf("trying tx\n");
- ret = iscsi_tx_handler(conn);
- if (conn->state == STATE_CLOSE) {
- conn_close(conn);
- dprintf("connection %p closed\n", ci);
- } else {
- if (ret == 0) {
- reloop = 1;
- } else {
- /* but leave on tx ready list */
- waiting_rdma_slot = 1;
- }
- }
- }
- }
- } while (reloop);
-}
-
-/*
* Could read as many entries as possible without blocking, but
* that just fills up a list of tasks. Instead pop out of here
* so that tx progress, like issuing rdma reads and writes, can
* happen periodically.
*/
-#define MAX_RX_PROGRESS 8
-static void iser_rx_progress_one(struct iser_device *dev)
+static int iser_poll_cq(struct iser_device *dev, int max_wc)
{
- int ret, numwc = 0;
+ int ret = 0, numwc = 0;
struct ibv_wc wc;
struct conn_info *ci;
struct recvlist *recvl;
@@ -1069,8 +1007,8 @@ static void iser_rx_progress_one(struct iser_device *dev)
VALGRIND_MAKE_MEM_DEFINED(&wc, sizeof(wc));
if (wc.status == IBV_WC_SUCCESS) {
handle_wc(&wc);
- if (++numwc == MAX_RX_PROGRESS) {
- num_rx_ready = 1;
+ if (++numwc == max_wc) {
+ ret = 1;
break;
}
} else if (wc.status == IBV_WC_WR_FLUSH_ERR) {
@@ -1089,24 +1027,121 @@ static void iser_rx_progress_one(struct iser_device *dev)
wc.status, (unsigned long long) wc.wr_id);
}
}
+ return ret;
+}
+
+static void iser_poll_cq_normal(struct iser_device *dev)
+{
+ int ret;
+
+ ret = iser_poll_cq(dev,8);
+ if (ret < 0)
+ exit(1);
+
+ if (ret == 0) {
+ ret = ibv_req_notify_cq(dev->cq, 0);
+ if (ret) {
+ eprintf("ibv_req_notify_cq: %s\n", strerror(ret));
+ exit(1);
+ }
+ dev->poll_sched.sched_handler = iser_sched_drain_cq;
+ }
+ else
+ dev->poll_sched.sched_handler = iser_sched_poll_cq;
+
+ tgt_add_sched_event(&dev->poll_sched);
+}
+
+static void iser_poll_cq_drain(struct iser_device *dev)
+{
+ int ret;
+
+ ret = iser_poll_cq(dev,4);
+ if (ret < 0)
+ exit(1);
+
+ dev->poll_sched.sched_handler = iser_sched_poll_cq;
+ if (ret == 0) {
+ ret = ibv_req_notify_cq(dev->cq, 0);
+ if (ret) {
+ eprintf("ibv_req_notify_cq: %s\n", strerror(ret));
+ exit(1);
+ }
+ }
+}
+
+static void iser_sched_poll_cq(struct tgt_event *tev)
+{
+ struct iser_device *dev = tev->data;
+ iser_poll_cq_normal(dev);
+}
+
+static void iser_sched_drain_cq(struct tgt_event *tev)
+{
+ struct iser_device *dev = tev->data;
+ iser_poll_cq_drain(dev);
+}
+
+/*
+ * Called directly from main event loop when a CQ notification is
+ * available.
+ */
+static void iser_cqe_handler(int fd __attribute__((unused)),
+ int events __attribute__((unused)),
+ void *data)
+{
+ struct iser_device *dev = data;
+ void *cq_context;
+ int ret;
+
+ ret = ibv_get_cq_event(dev->cq_channel, &dev->cq, &cq_context);
+ if (ret != 0) {
+ eprintf("notification, but no CQ event\n");
+ exit(1);
+ }
+
+ ibv_ack_cq_events(dev->cq, 1);
+
+ /* if a poll was previously scheduled, remove it; it
+ will be re-scheduled if necessary */
+ if (dev->poll_sched.scheduled)
+ tgt_remove_sched_event(&dev->poll_sched);
+
+ iser_poll_cq_normal(dev);
}
/*
- * Only one progress counter, must look across all devs.
+ * Called from tgtd as a scheduled event
+ * tries to push tx on a connection, until nothing
+ * is ready anymore. No progress limit here.
*/
-static void iser_rx_progress(int *counter __attribute__((unused)), void *data)
+static void iser_sched_tx(struct tgt_event *evt)
{
- struct iser_device *dev;
+ struct conn_info *ci = evt->data;
+ struct iscsi_connection *conn = &ci->iscsi_conn;
+ int ret;
dprintf("entry\n");
- num_rx_ready = 0;
- if (data == NULL) {
- list_for_each_entry(dev, &iser_dev_list, list)
- iser_rx_progress_one(dev);
- } else {
- dev = data;
- iser_rx_progress_one(dev);
- }
+
+ if (conn->state == STATE_CLOSE) {
+ dprintf("ignoring tx for closed conn\n");
+ return;
+ }
+
+ for(;;) {
+ dprintf("trying tx\n");
+ ret = iscsi_tx_handler(conn);
+ if (conn->state == STATE_CLOSE) {
+ conn_close(conn);
+ dprintf("connection %p closed\n", ci);
+ break;
+ }
+ if (ret != 0) {
+ /* but leave on tx ready list */
+ waiting_rdma_slot = 1;
+ break;
+ }
+ }
}
/*
@@ -1165,11 +1200,8 @@ static int iscsi_rdma_init(void)
INIT_LIST_HEAD(&iser_dev_list);
INIT_LIST_HEAD(&iser_conn_list);
INIT_LIST_HEAD(&temp_conn);
- num_tx_ready = 0;
- num_rx_ready = 0;
- ret = tgt_counter_event_add(&num_tx_ready, iser_tx_progress, NULL);
- ret = tgt_counter_event_add(&num_rx_ready, iser_rx_progress, NULL);
- return ret;
+
+ return ret;
}
/*
@@ -1397,10 +1429,6 @@ static void iscsi_iser_write_end(struct iscsi_connection *conn)
ci->writeb = 0; /* reset count */
ci->send_comm_event = NULL;
-
- /* wake up the progress engine to do the done */
- dprintf("inc progress to finish cmd\n");
- num_tx_ready = 1;
}
/*
@@ -1505,7 +1533,7 @@ static int iscsi_rdma_rdma_write(struct iscsi_connection *conn)
iscsi_rdma_event_modify(conn, EPOLLIN);
} else {
/* poke ourselves to do the next rdma */
- num_tx_ready = 1;
+ tgt_add_sched_event(&ci->tx_sched);
}
return ret;
@@ -1628,7 +1656,7 @@ static void iscsi_rdma_event_modify(struct iscsi_connection *conn, int events)
dprintf("tx ready adding %p\n", ci);
list_add(&ci->conn_tx_ready, &conn_tx_ready);
}
- num_tx_ready = 1;
+ tgt_add_sched_event(&ci->tx_sched);
} else {
dprintf("tx ready removing %p\n", ci);
list_del_init(&ci->conn_tx_ready);
diff --git a/usr/tgtd.c b/usr/tgtd.c
index 0b1cb4c..287f051 100644
--- a/usr/tgtd.c
+++ b/usr/tgtd.c
@@ -38,26 +38,13 @@
#include "work.h"
#include "util.h"
-struct tgt_event {
- union {
- event_handler_t *handler;
- counter_event_handler_t *counter_handler;
- };
- union {
- int fd;
- int *counter;
- };
- void *data;
- struct list_head e_list;
-};
-
unsigned long pagesize, pageshift, pagemask;
int system_active = 1;
static int ep_fd;
static char program_name[] = "tgtd";
static LIST_HEAD(tgt_events_list);
-static LIST_HEAD(tgt_counter_events_list);
+static LIST_HEAD(tgt_sched_events_list);
static struct option const long_options[] =
{
@@ -136,22 +123,6 @@ int tgt_event_add(int fd, int events, event_handler_t handler, void *data)
return err;
}
-int tgt_counter_event_add(int *counter, counter_event_handler_t handler,
- void *data)
-{
- struct tgt_event *tev;
-
- tev = zalloc(sizeof(*tev));
- if (!tev)
- return -ENOMEM;
-
- tev->data = data;
- tev->counter_handler = handler;
- tev->counter = counter;
- list_add(&tev->e_list, &tgt_counter_events_list);
- return 0;
-}
-
static struct tgt_event *tgt_event_lookup(int fd)
{
struct tgt_event *tev;
@@ -163,17 +134,6 @@ static struct tgt_event *tgt_event_lookup(int fd)
return NULL;
}
-static struct tgt_event *tgt_counter_event_lookup(int *counter)
-{
- struct tgt_event *tev;
-
- list_for_each_entry(tev, &tgt_counter_events_list, e_list) {
- if (tev->counter == counter)
- return tev;
- }
- return NULL;
-}
-
void tgt_event_del(int fd)
{
struct tgt_event *tev;
@@ -189,20 +149,6 @@ void tgt_event_del(int fd)
free(tev);
}
-void tgt_counter_event_del(int *counter)
-{
- struct tgt_event *tev;
-
- tev = tgt_counter_event_lookup(counter);
- if (!tev) {
- eprintf("Cannot find counter event %p\n", counter);
- return;
- }
-
- list_del(&tev->e_list);
- free(tev);
-}
-
int tgt_event_modify(int fd, int events)
{
struct epoll_event ev;
@@ -221,26 +167,54 @@ int tgt_event_modify(int fd, int events)
return epoll_ctl(ep_fd, EPOLL_CTL_MOD, fd, &ev);
}
-static void event_loop(void)
+void tgt_init_sched_event(struct tgt_event *evt,
+ sched_event_handler_t sched_handler, void *data)
+{
+ evt->sched_handler = sched_handler;
+ evt->scheduled = 0;
+ evt->data = data;
+ INIT_LIST_HEAD(&evt->e_list);
+}
+
+void tgt_add_sched_event(struct tgt_event *evt)
{
- int nevent, i, done, timeout = TGTD_TICK_PERIOD * 1000;
- struct epoll_event events[1024];
- struct tgt_event *tev, *tevn;
-
-retry:
- /*
- * Check the counter events to see if they have any work to run.
- */
- do {
- done = 1;
- list_for_each_entry_safe(tev, tevn, &tgt_counter_events_list,
- e_list) {
- if (*tev->counter) {
- done = 0;
- tev->counter_handler(tev->counter, tev->data);
- }
- }
- } while (!done);
+ if (!evt->scheduled) {
+ evt->scheduled = 1;
+ list_add_tail(&evt->e_list,&tgt_sched_events_list);
+ }
+}
+
+void tgt_remove_sched_event(struct tgt_event *evt)
+{
+ if (evt->scheduled) {
+ evt->scheduled = 0;
+ list_del_init(&evt->e_list);
+ }
+}
+
+static void tgt_exec_scheduled(void)
+{
+ struct list_head *last_sched;
+ struct tgt_event *tev, *tevn;
+
+ if (list_empty(&tgt_sched_events_list))
+ return;
+
+ /* execute only work scheduled till now */
+ last_sched = tgt_sched_events_list.prev;
+ list_for_each_entry_safe(tev,tevn,&tgt_sched_events_list,e_list) {
+ tgt_remove_sched_event(tev);
+ tev->sched_handler(tev);
+ if (&tev->e_list == last_sched)
+ break;
+ }
+}
+
+static void tgt_poll_events(int timeout)
+{
+ int nevent, i;
+ struct tgt_event *tev;
+ struct epoll_event events[1024];
nevent = epoll_wait(ep_fd, events, ARRAY_SIZE(events), timeout);
if (nevent < 0) {
@@ -255,9 +229,19 @@ retry:
}
} else
schedule();
+}
- if (system_active)
- goto retry;
+static void event_loop(void)
+{
+ int timeout, wait_timeout = TGTD_TICK_PERIOD * 1000;
+
+ while (system_active) {
+ tgt_exec_scheduled();
+ /* wait if no scheduled work, poll if there is */
+ timeout = list_empty(&tgt_sched_events_list) ?
+ wait_timeout : 0;
+ tgt_poll_events(timeout);
+ }
}
static int lld_init(int *use_kernel, char *args)
diff --git a/usr/tgtd.h b/usr/tgtd.h
index 4febcd3..0e226f7 100644
--- a/usr/tgtd.h
+++ b/usr/tgtd.h
@@ -206,13 +206,20 @@ extern int tgt_bind_host_to_target(int tid, int host_no);
extern int tgt_unbind_host_to_target(int tid, int host_no);
extern int tgt_bound_target_lookup(int host_no);
-typedef void (event_handler_t)(int fd, int events, void *data);
-typedef void (counter_event_handler_t)(int *counter, void *data);
+struct tgt_event;
+typedef void (* sched_event_handler_t)(struct tgt_event *tev);
+
+extern void tgt_init_sched_event(struct tgt_event *evt,
+ sched_event_handler_t sched_handler, void *data);
+
+typedef void (* event_handler_t)(int fd, int events, void *data);
+
extern int tgt_event_add(int fd, int events, event_handler_t handler, void *data);
-extern int tgt_counter_event_add(int *counter, counter_event_handler_t handler,
- void *data);
extern void tgt_event_del(int fd);
-extern void tgt_counter_event_del(int *counter);
+
+extern void tgt_add_sched_event(struct tgt_event *evt);
+extern void tgt_remove_sched_event(struct tgt_event *evt);
+
extern int tgt_event_modify(int fd, int events);
extern int target_cmd_queue(int tid, struct scsi_cmd *cmd);
extern void target_cmd_done(struct scsi_cmd *cmd);
@@ -262,4 +269,17 @@ extern int dtd_load_unload(int tid, uint64_t lun, int load, char *file);
extern int register_backingstore_template(struct backingstore_template *bst);
extern struct backingstore_template *get_backingstore_template(const char *name);
+struct tgt_event {
+ union {
+ event_handler_t handler;
+ sched_event_handler_t sched_handler;
+ };
+ union {
+ int fd;
+ int scheduled;
+ };
+ void *data;
+ struct list_head e_list;
+};
+
#endif
--
1.5.6.5
--
To unsubscribe from this list: send the line "unsubscribe stgt" in
the body of a message to majordomo at vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
More information about the stgt
mailing list