[stgt] [PATCH RFC 2/3] - new iser code (after pthreads revert)
Alexander Nezhinsky
alexandern at Voltaire.COM
Thu Jul 22 13:14:57 CEST 2010
Signed-off-by: Alexander Nezhinsky <alexandern at voltaire.com>
---
usr/iscsi/iser_ib.c | 3103 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 3103 insertions(+), 0 deletions(-)
create mode 100644 usr/iscsi/iser_ib.c
diff --git a/usr/iscsi/iser_ib.c b/usr/iscsi/iser_ib.c
new file mode 100644
index 0000000..e1a1f15
--- /dev/null
+++ b/usr/iscsi/iser_ib.c
@@ -0,0 +1,3103 @@
+/*
+ * iSCSI extensions for RDMA (iSER) data path
+ *
+ * Copyright (C) 2007 Dennis Dalessandro (dennis at osc.edu)
+ * Copyright (C) 2007 Ananth Devulapalli (ananth at osc.edu)
+ * Copyright (C) 2007 Pete Wyckoff (pw at osc.edu)
+ * Copyright (C) 2010 Voltaire, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <assert.h>
+#include <netdb.h>
+#include <sys/epoll.h>
+#include <infiniband/verbs.h>
+#include <rdma/rdma_cma.h>
+
+#include "util.h"
+#include "iscsid.h"
+#include "target.h"
+#include "iser.h"
+#include "driver.h"
+#include "scsi.h"
+
+#if defined(HAVE_VALGRIND) && !defined(NDEBUG)
+#include <valgrind/memcheck.h>
+#else
+#define VALGRIND_MAKE_MEM_DEFINED(addr, len)
+#endif
+
+/* global, across all devices */
+static struct rdma_event_channel *rdma_evt_channel;
+static struct rdma_cm_id *cma_listen_id;
+
+/* accepted at RDMA layer, but not yet established */
+static struct list_head temp_conn;
+
+/* all devices */
+static struct list_head iser_dev_list;
+
+/* all iser connections */
+static struct list_head iser_conn_list;
+
+#define uint64_from_ptr(p) (uint64_t)(uintptr_t)(p)
+#define ptr_from_int64(p) (void *)(unsigned long)(p)
+
+#define ISER_LISTEN_PORT 3260
+
+short int iser_listen_port = ISER_LISTEN_PORT;
+char *iser_portal_addr;
+
+/*
+ * Crazy hard-coded linux iser settings need 128 * 8 slots + slop, plus
+ * room for our rdmas and send requests.
+ */
+#define MAX_WQE 1800
+
+#define RDMA_TRANSFER_SIZE (512 * 1024)
+
+#define MAX_POLL_WC 32
+
+#define ISER_MAX_QUEUE_CMD 128 // iSCSI cmd window size
+
+/* Sense data as carried inside an iSCSI response PDU: a 16-bit length
+ * followed by the variable-size sense bytes (flexible trailing array). */
+struct iscsi_sense_data {
+	uint16_t length;
+	uint8_t data[0];
+} __packed;
+
+/*
+ * Number of allocatable data buffers, each of this size. Do at least 128
+ * for linux iser. The membuf size is rounded up at initialization time
+ * to the hardware page size so that allocations for direct IO devices are
+ * aligned.
+ */
+static int membuf_num = 16 * ISER_MAX_QUEUE_CMD;
+static size_t membuf_size = RDMA_TRANSFER_SIZE;
+
+static int iser_conn_get(struct iser_conn *conn);
+static void iser_conn_put(struct iser_conn *conn);
+
+static void iser_free_login_resources(struct iser_conn *conn);
+static void iser_free_ff_resources(struct iser_conn *conn);
+
+static void iser_cqe_handler(int fd __attribute__ ((unused)),
+ int events __attribute__ ((unused)),
+ void *data);
+
+static void iser_sched_poll_cq(struct event_data *tev);
+static void iser_sched_consume_cq(struct event_data *tev);
+static void iser_sched_tx(struct event_data *evt);
+static void iser_sched_post_recv(struct event_data *evt);
+static void iser_sched_buf_alloc(struct event_data *evt);
+static void iser_sched_iosubmit(struct event_data *evt);
+static void iser_sched_rdma_rd(struct event_data *evt);
+
+
+/* Map a generic iscsi_connection back to its enclosing iser_conn
+ * (the iscsi_connection is embedded as member 'h'). */
+static inline struct iser_conn *ISER_CONN(struct iscsi_connection *iscsi_conn)
+{
+	return container_of(iscsi_conn, struct iser_conn, h);
+}
+
+/* Lay the pdu pointers out over one contiguous buffer:
+ * [iser_hdr][iscsi bhs][ahs + data]; ahs and data sizes start at 0. */
+static void iser_pdu_init(struct iser_pdu *pdu, void *buf)
+{
+	pdu->iser_hdr = (struct iser_hdr *) buf;
+	buf += sizeof(struct iser_hdr);
+	pdu->bhs = (struct iscsi_hdr *) buf;
+	buf += sizeof(struct iscsi_hdr);
+	pdu->ahs = pdu->data.buf = buf;
+	pdu->ahssize = pdu->data.size = 0;
+}
+
+/* Like iser_pdu_init(), for an outgoing pdu: additionally zero the iSER
+ * header and mark the pdu as an iSCSI control-type message. */
+static void iser_send_pdu_init(struct iser_pdu *pdu, void *buf)
+{
+	iser_pdu_init(pdu, buf);
+	memset(pdu->iser_hdr, 0, sizeof(struct iser_hdr));
+	pdu->iser_hdr->flags = ISCSI_CTRL;
+}
+
+/* Prepare a task's receive work request: single SGE over the given
+ * buffer, using the lkey of the pool-wide registered region (srmr). */
+static void iser_rxd_init(struct iser_work_req *rxd,
+			  struct iser_task *task,
+			  void *buf, unsigned size,
+			  struct ibv_mr *srmr)
+{
+	rxd->recv = 1;
+	rxd->task = task;
+
+	rxd->sge.addr = uint64_from_ptr(buf);
+	rxd->sge.length = size;
+	rxd->sge.lkey = srmr->lkey;
+
+	/* wr_id carries the descriptor itself, recovered at completion time */
+	rxd->recv_wr.wr_id = uint64_from_ptr(rxd);
+	rxd->recv_wr.sg_list = &rxd->sge;
+	rxd->recv_wr.num_sge = 1;
+	rxd->recv_wr.next = NULL;
+}
+
+/* Prepare a task's send work request: a signaled SEND over the given
+ * buffer; the actual length is set just before posting. */
+static void iser_txd_init(struct iser_work_req *txd,
+			  struct iser_task *task,
+			  void *buf, unsigned size,
+			  struct ibv_mr *srmr)
+{
+	txd->recv = 0;
+	txd->task = task;
+
+	txd->sge.addr = uint64_from_ptr(buf);
+	txd->sge.length = size;
+	txd->sge.lkey = srmr->lkey;
+
+	/* wr_id carries the descriptor itself, recovered at completion time */
+	txd->send_wr.wr_id = uint64_from_ptr(txd);
+	txd->send_wr.next = NULL;
+	txd->send_wr.sg_list = &txd->sge;
+	txd->send_wr.num_sge = 1;
+	txd->send_wr.opcode = IBV_WR_SEND;
+	txd->send_wr.send_flags = IBV_SEND_SIGNALED;
+
+	INIT_LIST_HEAD(&txd->wr_list);
+}
+
+/* Prepare a task's RDMA work request (shared by the RDMA-Read and
+ * RDMA-Write paths); buffer address/length, opcode and remote keys are
+ * filled in by the prep routines just before posting. */
+static void iser_rdmad_init(struct iser_work_req *rdmad,
+			    struct iser_task *task,
+			    struct ibv_mr *srmr)
+{
+	rdmad->recv = 0;
+	rdmad->task = task;
+
+	// rdmad->sge.addr, rdmad->sge.length to be set before posting
+	rdmad->sge.lkey = srmr->lkey;
+
+	rdmad->send_wr.wr_id = uint64_from_ptr(rdmad);
+	rdmad->send_wr.sg_list = &rdmad->sge;
+	rdmad->send_wr.num_sge = 1;
+	rdmad->send_wr.next = NULL;
+	// rdmad->send_wr.opcode to be set before posting
+	rdmad->send_wr.send_flags = IBV_SEND_SIGNALED;
+	// rdmad->send_wr.wr.rdma.(remote_addr,rkey) to be set before post
+
+	INIT_LIST_HEAD(&rdmad->wr_list);
+}
+
+/* Wire up one full-feature task over its slice of the pdu data pool:
+ * rx descriptor + request pdu at the start, tx descriptor + response pdu
+ * conn->rsize bytes further in, plus the RDMA descriptor (which uses the
+ * device-wide membuf MR) and all per-task list heads. */
+static void iser_task_init(struct iser_task *task,
+			   struct iser_conn *conn,
+			   void *pdu_data_buf,
+			   struct ibv_mr *srmr)
+{
+	task->conn = conn;
+	task->unsolicited = 0;
+
+	iser_rxd_init(&task->rxd, task, pdu_data_buf, conn->rsize, srmr);
+	iser_pdu_init(&task->req, pdu_data_buf);
+	pdu_data_buf += conn->rsize;
+
+	iser_txd_init(&task->txd, task, pdu_data_buf, conn->ssize, srmr);
+	iser_send_pdu_init(&task->rsp, pdu_data_buf);
+
+	INIT_LIST_HEAD(&task->in_buf_list);
+	INIT_LIST_HEAD(&task->out_buf_list);
+	iser_rdmad_init(&task->rdmad, task, conn->dev->membuf_mr);
+
+	INIT_LIST_HEAD(&task->exec_list);
+	INIT_LIST_HEAD(&task->rdma_list);
+	INIT_LIST_HEAD(&task->tx_list);
+	INIT_LIST_HEAD(&task->recv_list);
+
+	INIT_LIST_HEAD(&task->session_list);
+	INIT_LIST_HEAD(&task->dout_task_list);
+}
+
+/* Initialize a task used only for target-initiated (unsolicited) sends,
+ * e.g. nop-in: it gets a tx descriptor/pdu but no rx side. */
+static void iser_unsolicited_task_init(struct iser_task *task,
+				       struct iser_conn *conn,
+				       void *pdu_data_buf,
+				       struct ibv_mr *srmr)
+{
+	task->conn = conn;
+	task->unsolicited = 1;
+
+	iser_txd_init(&task->txd, task, pdu_data_buf, conn->ssize, srmr);
+	iser_send_pdu_init(&task->rsp, pdu_data_buf);
+}
+
+/* Insert a data-out pdu buffer into the task's out-buffer list, keeping
+ * the list sorted by the buffer's offset within the overall transfer. */
+static void iser_task_add_out_pdu_buf(struct iser_task *task,
+				      struct membuf *buf,
+				      unsigned int offset)
+{
+	buf->offset = offset;
+	task->out_buf_num ++;
+	if (!list_empty(&task->out_buf_list)) {
+		struct membuf *cur_buf;
+		/* insert before the first buffer with a larger offset */
+		list_for_each_entry(cur_buf, &task->out_buf_list, task_list) {
+			if (offset < cur_buf->offset) {
+				dprintf("task:%p offset:%d size:%d buf:%p add before:%p\n",
+					task, offset, buf->size, buf->buf, cur_buf->buf);
+				list_add_tail(&buf->task_list, &cur_buf->task_list);
+				return;
+			}
+		}
+	}
+	/* empty list, or largest offset so far: append at the tail */
+	dprintf("task:%p offset:%d size:%d buf:%p add last\n",
+		task, offset, buf->size, buf->buf);
+	list_add_tail(&buf->task_list, &task->out_buf_list);
+}
+
+/* Append an RDMA-Read destination buffer at the tail of the task's
+ * out-buffer list; the rdma-rd prep path relies on the tail position. */
+static void iser_task_add_out_rdma_buf(struct iser_task *task,
+				       struct membuf *buf,
+				       unsigned int offset)
+{
+	buf->offset = offset;
+	task->out_buf_num ++;
+	dprintf("task:%p offset:%d size:%d buf:%p add last\n",
+		task, offset, buf->size, buf->buf);
+	list_add_tail(&buf->task_list, &task->out_buf_list);
+}
+
+/* Unlink one buffer from the task's out-buffer list and drop the count. */
+static void iser_task_del_out_buf(struct iser_task *task,
+				  struct membuf *buf)
+{
+	dprintf("task:%p offset:%d size:%d buf:%p\n",
+		task, buf->offset, buf->size, buf->buf);
+	list_del(&buf->task_list);
+	task->out_buf_num --;
+}
+
+/* Append an RDMA-Write (data-in) source buffer at the tail of the task's
+ * in-buffer list.
+ * @offset: byte offset of this buffer within the overall transfer. */
+static void iser_task_add_in_rdma_buf(struct iser_task *task,
+				      struct membuf *buf,
+				      unsigned int offset)
+{
+	/* fixed format string: was "offset:0x%d" — a hex prefix glued to a
+	 * decimal conversion, printing misleading values */
+	dprintf("task:%p offset:%d size:%d buf:%p add last\n",
+		task, offset, buf->size, buf->buf);
+	buf->offset = offset;
+	task->in_buf_num ++;
+	list_add_tail(&buf->task_list, &task->in_buf_list);
+}
+
+/* Unlink one buffer from the task's in-buffer list and drop the count. */
+static void iser_task_del_in_buf(struct iser_task *task,
+				 struct membuf *buf)
+{
+	dprintf("task:%p offset:%d size:%d buf:%p\n",
+		task, buf->offset, buf->size, buf->buf);
+	list_del(&buf->task_list);
+	task->in_buf_num --;
+}
+
+/*
+ * Post a chain of receive WRs headed by task->rxd on the connection QP
+ * and take one connection reference per buffer actually posted.
+ * @num_recv_bufs: number of WRs in the chain headed by this task.
+ * Returns the ibv_post_recv() result (0 on success).
+ */
+static int iser_post_recv(struct iser_conn *conn, struct iser_task *task,
+			  int num_recv_bufs)
+{
+	struct ibv_recv_wr *bad_wr;
+	int err, nr_posted;
+
+	err = ibv_post_recv(conn->qp_hndl, &task->rxd.recv_wr, &bad_wr);
+	if (!err)
+		nr_posted = num_recv_bufs;
+	else {
+		nr_posted = 0;
+		/* added missing '\n' so the log line is terminated */
+		eprintf("failed to post rx buf %p\n", &task->rxd.recv_wr);
+	}
+	//conn->recvl_posted += nr_posted;
+
+	dprintf("chained:%d last task:%p wr_id:0x%lx sge_sz:%u\n",
+		nr_posted, task, (unsigned long)task->rxd.recv_wr.wr_id,
+		task->rxd.sge.length);
+
+	/* one conn reference is held per outstanding receive buffer */
+	while (nr_posted--)
+		iser_conn_get(conn);
+
+	return err;
+}
+
+/* Finalize the response pdu (ahs/data lengths in the bhs) and size the
+ * SEND wr to cover headers + ahs + immediate data; optionally chain it
+ * to next_wr and request a signaled completion. */
+static void iser_prep_resp_send_req(struct iser_task *task,
+				    struct iser_work_req *next_wr,
+				    int signaled)
+{
+	struct iser_pdu *rsp = &task->rsp;
+	struct iscsi_hdr *bhs = rsp->bhs;
+	struct iser_work_req *txd = &task->txd;
+
+	/* bhs carries ahs length in 4-byte words, data length in 24 bits */
+	bhs->hlength = rsp->ahssize / 4;
+	hton24(bhs->dlength, rsp->data.size);
+
+	txd->sge.length = sizeof(struct iscsi_hdr) + sizeof(struct iser_hdr);
+	txd->sge.length += rsp->ahssize;
+	txd->sge.length += rsp->data.size;
+
+	txd->send_wr.next = (next_wr ? &next_wr->send_wr : NULL);
+	txd->send_wr.send_flags = (signaled ? IBV_SEND_SIGNALED : 0);
+
+	dprintf("task:%p wr_id:0x%lx tag:0x%04lx "
+		"dtbuf:%p dtsz:%u ahs_sz:%u stat:0x%x statsn:0x%x expcmdsn:0x%x\n",
+		task, (unsigned long)txd->send_wr.wr_id,
+		(unsigned long)task->tag,
+		rsp->data.buf, rsp->data.size, rsp->ahssize,
+		bhs->rsvd2[1], ntohl(bhs->statsn), ntohl(bhs->exp_statsn));
+}
+
+/* Build an RDMA-Write wr pushing read data into the initiator's buffer
+ * advertised via rem_read_va/rem_read_stag.  Transfers at most one local
+ * buffer per call; task->rdma_wr_remains tracks what is left to send.
+ * Optionally chains to next_wr and requests a signaled completion. */
+static void iser_prep_rdma_wr_send_req(struct iser_task *task,
+				       struct iser_work_req *next_wr,
+				       int signaled)
+{
+	struct iser_work_req *rdmad = &task->rdmad;
+	struct membuf *rdma_buf;
+	uint64_t offset = 0; // ToDo: multiple RDMA-Write buffers
+
+	/* RDMA-Write buffer is the only one on the list */
+	// ToDo: multiple RDMA-Write buffers, use rdma_buf->offset
+	rdma_buf = list_first_entry(&task->in_buf_list, struct membuf, task_list);
+	rdmad->sge.addr = uint64_from_ptr(rdma_buf->buf);
+
+	/* clamp this wr to the local buffer size */
+	if (task->rdma_wr_remains > rdma_buf->size)
+		rdmad->sge.length = rdma_buf->size;
+	else
+		rdmad->sge.length = task->rdma_wr_remains;
+	task->rdma_wr_remains -= rdmad->sge.length;
+
+	rdmad->send_wr.next = (next_wr ? &next_wr->send_wr : NULL);
+	rdmad->send_wr.opcode = IBV_WR_RDMA_WRITE;
+	rdmad->send_wr.send_flags = (signaled ? IBV_SEND_SIGNALED : 0);
+
+	rdmad->send_wr.wr.rdma.remote_addr = task->rem_read_va + offset;
+	// ToDo: multiple RDMA-Write buffers
+	// offset += rdmad->sge.length
+	rdmad->send_wr.wr.rdma.rkey = task->rem_read_stag;
+
+	dprintf(" task:%p wr_id:0x%lx tag:0x%04lx daddr:0x%lx dsz:%u "
+		"bufsz:%u rdma:%d lkey:%x raddr:%lx rkey:%x rems:%u\n",
+		task, (unsigned long)rdmad->send_wr.wr_id,
+		(unsigned long)task->tag, (unsigned long)rdmad->sge.addr,
+		rdmad->sge.length, rdma_buf->size, rdma_buf->rdma,
+		rdmad->sge.lkey, (unsigned long)rdmad->send_wr.wr.rdma.remote_addr,
+		rdmad->send_wr.wr.rdma.rkey, task->rdma_wr_remains);
+}
+
+/* Build an RDMA-Read wr pulling write data from the initiator's buffer
+ * advertised via rem_write_va/rem_write_stag into the task's last
+ * out-buffer (the rdma buffer is always appended at the list tail).
+ * Optionally chains to next_wr and requests a signaled completion. */
+static void iser_prep_rdma_rd_send_req(struct iser_task *task,
+				       struct iser_work_req *next_wr,
+				       int signaled)
+{
+	struct iser_work_req *rdmad = &task->rdmad;
+	struct membuf *rdma_buf;
+	uint64_t offset;
+	int cur_req_sz;
+
+	/* RDMA-Read buffer is always put at the list's tail */
+	// ToDo: multiple RDMA-Write buffers
+	rdma_buf = container_of(task->out_buf_list.prev, struct membuf, task_list);
+	offset = rdma_buf->offset;
+	rdmad->sge.addr = uint64_from_ptr(rdma_buf->buf) + offset;
+
+	/* clamp this wr to the local buffer size */
+	if (task->rdma_rd_sz <= rdma_buf->size)
+		cur_req_sz = task->rdma_rd_sz;
+	else
+		cur_req_sz = rdma_buf->size;
+
+	// ToDo: multiple RDMA-Write buffers
+	// task->rdma_rd_offset += cur_req_sz;
+
+	rdmad->sge.length = cur_req_sz;
+
+	rdmad->send_wr.next = (next_wr ? &next_wr->send_wr : NULL);
+	rdmad->send_wr.opcode = IBV_WR_RDMA_READ;
+	rdmad->send_wr.send_flags = (signaled ? IBV_SEND_SIGNALED : 0);
+	rdmad->send_wr.wr.rdma.remote_addr =
+		(uint64_t) task->rem_write_va;
+	// ToDo: multiple RDMA-Write buffers: + (offset - task->unsol_sz);
+	rdmad->send_wr.wr.rdma.rkey =
+		(uint32_t) task->rem_write_stag;
+
+	dprintf("task:%p wr_id:0x%lx tag:0x%04lx daddr:0x%lx dsz:%u "
+		"bufsz:%u rdma:%d lkey:%x raddr:%lx rkey:%x rems:%u\n",
+		task, (unsigned long)rdmad->send_wr.wr_id,
+		(unsigned long)task->tag, (unsigned long)rdmad->sge.addr,
+		rdmad->sge.length, rdma_buf->size, rdma_buf->rdma,
+		rdmad->sge.lkey, (unsigned long)rdmad->send_wr.wr.rdma.remote_addr,
+		rdmad->send_wr.wr.rdma.rkey, task->rdma_rd_sz);
+}
+
+/* Post a single send/rdma work request on the connection's QP.
+ * Returns the ibv_post_send() result (0 on success). */
+static int iser_post_send(struct iser_conn *conn, struct iser_work_req *iser_send)
+{
+	struct ibv_send_wr *bad_send_wr;
+	int ret = ibv_post_send(conn->qp_hndl, &iser_send->send_wr,
+				&bad_send_wr);
+
+	if (ret)
+		eprintf("ibv_post_send err %d\n", ret);
+	return ret;
+}
+
+/* Advertise the command window in an outgoing response.
+ * NOTE(review): the session's exp_cmd_sn is written into the
+ * {exp,max}_statsn fields — these overlay exp_cmdsn/max_cmdsn in the
+ * response bhs; confirm against the iscsi_hdr layout. */
+static inline void iser_set_rsp_cmd_sn(struct iscsi_session *session,
+				       struct iscsi_hdr *rsp)
+{
+	if (session) {
+		rsp->exp_statsn = cpu_to_be32(session->exp_cmd_sn);
+		rsp->max_statsn = cpu_to_be32(session->exp_cmd_sn + ISER_MAX_QUEUE_CMD);
+	}
+}
+
+/*
+ * One pool of registered memory per device (per PD that is).
+ */
+static int iser_init_rdma_buf_pool(struct iser_device *dev)
+{
+	uint8_t *regbuf, *task_buf;
+	size_t pool_size, list_size;
+	struct membuf *mp;
+	int i;
+
+	/* round buffers up to the hw page size so direct-IO stays aligned */
+	membuf_size = roundup(membuf_size, pagesize);
+	pool_size = membuf_num * membuf_size;
+	/* valloc: the data pool itself must be page-aligned */
+	regbuf = valloc(pool_size);
+	if (!regbuf) {
+		eprintf("malloc regbuf sz:%zu\n", pool_size);
+		return -ENOMEM;
+	}
+
+	/* membuf headers live in a separate, unregistered array */
+	list_size = membuf_num * sizeof(*mp);
+	task_buf = malloc(list_size);
+	if (!task_buf) {
+		eprintf("malloc task_buf sz:%zu\n", list_size);
+		free(regbuf);
+		return -ENOMEM;
+	}
+
+	/* one MR covers the whole pool; buffers share its lkey */
+	dev->membuf_mr = ibv_reg_mr(dev->pd, regbuf, pool_size,
+				    IBV_ACCESS_LOCAL_WRITE);
+	if (!dev->membuf_mr) {
+		eprintf("register regbuf\n");
+		free(regbuf);
+		free(task_buf);
+		return -1;
+	}
+	dprintf("pool buf:%p list:%p mr:%p lkey:0x%x\n",
+		regbuf, task_buf, dev->membuf_mr, dev->membuf_mr->lkey);
+
+	dev->membuf_regbuf = regbuf;
+	dev->membuf_listbuf = task_buf;
+	INIT_LIST_HEAD(&dev->membuf_free);
+	INIT_LIST_HEAD(&dev->membuf_alloc);
+
+	/* carve the pool into membuf_num fixed-size buffers, all free */
+	for (i = 0; i < membuf_num; i++) {
+		mp = (void *) task_buf;
+		task_buf += sizeof(*mp);
+
+		list_add_tail(&mp->pool_list, &dev->membuf_free);
+		INIT_LIST_HEAD(&mp->task_list);
+		mp->buf = regbuf;
+		mp->size = membuf_size;
+		mp->rdma = 1;
+
+		regbuf += membuf_size;
+	}
+
+	return 0;
+}
+
+/* Take one buffer from the device free pool.  Returns NULL when the
+ * pool is empty, setting waiting_for_mem so the next free reschedules
+ * the starved connection.  A request larger than membuf_size is a
+ * fatal programming error (buffers are fixed-size). */
+static struct membuf *iser_alloc_rdma_buf(struct iser_conn *conn,
+					  size_t sz)
+{
+	struct iser_device *dev = conn->dev;
+	struct list_head *buf_list;
+	struct membuf *mem;
+
+	if (list_empty(&dev->membuf_free)) {
+		// eprintf("free list empty\n");
+		dev->waiting_for_mem = 1;
+		return NULL;
+	}
+	if (sz > membuf_size) {
+		eprintf("size %zu too big\n", sz);
+		exit(1);
+	}
+
+	/* move from the free to the allocated pool list */
+	buf_list = dev->membuf_free.next;
+	list_del(buf_list);
+	list_add_tail(buf_list, &dev->membuf_alloc);
+	mem = list_entry(buf_list, struct membuf, pool_list);
+
+	dprintf("alloc:%p sz:%zu\n", mem, sz);
+	return mem;
+}
+
+/* Return an RDMA buffer to the device free pool and kick the buffer
+ * allocation scheduler if some task was starved.  Ignores NULL and
+ * non-pool (non-rdma) membufs. */
+static void iser_free_rdma_buf(struct iser_conn *conn, struct membuf *mem)
+{
+	struct iser_device *dev;
+
+	if (!mem || !mem->rdma)
+		return;
+
+	dprintf("free %p\n", mem);
+	list_del(&mem->pool_list);
+	// add to the free list head to reuse recently used buffers first
+	dev = conn->dev;
+	list_add(&mem->pool_list, &dev->membuf_free);
+	if (dev->waiting_for_mem) {
+		dev->waiting_for_mem = 0;
+		tgt_add_sched_event(&conn->sched_buf_alloc);
+	}
+}
+
+/* Release all of a task's out (data-out/RDMA-Read) buffers; only
+ * pool-backed rdma buffers are returned to the device pool. */
+static void iser_task_free_out_bufs(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct membuf *membuf, *mbnext;
+
+	list_for_each_entry_safe(membuf, mbnext, &task->out_buf_list, task_list) {
+		iser_task_del_out_buf(task, membuf);
+		if (membuf->rdma)
+			iser_free_rdma_buf(conn, membuf);
+	}
+	assert(task->out_buf_num == 0);
+}
+
+/* Release all of a task's in (data-in/RDMA-Write) buffers back to the
+ * device pool. */
+static void iser_task_free_in_bufs(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct membuf *membuf, *mbnext;
+
+	list_for_each_entry_safe(membuf, mbnext, &task->in_buf_list, task_list) {
+		iser_task_del_in_buf(task, membuf);
+		iser_free_rdma_buf(conn, membuf);
+	}
+	assert(task->in_buf_num == 0);
+}
+
+/* Final per-task teardown: for SCSI commands, notify the target core,
+ * unlink from the session and release any data buffers.  Non-SCSI pdus
+ * need no cleanup here. */
+static void iser_complete_task(struct iser_task *task)
+{
+	if (task->opcode == ISCSI_OP_SCSI_CMD) {
+		target_cmd_done(&task->scmd);
+		list_del(&task->session_list);
+		if (task->is_read)
+			iser_task_free_in_bufs(task);
+		if (task->is_write) {
+			iser_task_free_out_bufs(task);
+			// ToDo: multiple out buffers, no copy
+			// iser_task_free_dout_tasks(task);
+		}
+	}
+	dprintf("task:%p\n", task);
+	//iser_conn_put(task->conn);
+}
+
+/*
+ * Called at accept time, builds resources just for login phase.
+ */
+
+#define NUM_LOGIN_TASKS 2 /* one posted for req rx, one for reply tx */
+
+/* Allocate the two login-phase tasks (one rx, one tx) over their own
+ * data pool/MR, post the rx task and stash the tx task for the reply.
+ * Returns 0 on success, negative on failure (everything rolled back). */
+static int iser_alloc_login_resources(struct iser_conn *conn)
+{
+	unsigned long size;
+	struct iser_task *login_task[NUM_LOGIN_TASKS];
+	uint8_t *pdu_data_buf, *task_buf;
+	unsigned int i;
+	int err = 0;
+
+	dprintf("login tasks num:%u, rx/tx sz %u/%u\n",
+		NUM_LOGIN_TASKS, conn->ssize, conn->rsize);
+
+	size = NUM_LOGIN_TASKS * (conn->rsize + conn->ssize);
+	conn->login_data_pool = malloc(size);
+	if (!conn->login_data_pool) {
+		eprintf("malloc pdu_data_buf %lu\n", size);
+		err = -1;
+		goto out;
+	}
+
+	conn->login_data_mr =
+		ibv_reg_mr(conn->dev->pd, conn->login_data_pool, size,
+			   IBV_ACCESS_LOCAL_WRITE);
+	if (!conn->login_data_mr) {
+		eprintf("ibv_reg_mr pdu_data_buf failed\n");
+		free(conn->login_data_pool);
+		conn->login_data_pool = NULL;
+		err = -1;
+		goto out;
+	}
+
+	size = NUM_LOGIN_TASKS * sizeof(struct iser_task);
+	conn->login_task_pool = malloc(size);
+	if (!conn->login_task_pool) {
+		eprintf("malloc login_task_buf %lu\n", size);
+		ibv_dereg_mr(conn->login_data_mr);
+		conn->login_data_mr = NULL;
+		free(conn->login_data_pool);
+		conn->login_data_pool = NULL;
+		/* bug fix: err was left at 0 here, silently reporting
+		 * success after freeing all the resources */
+		err = -1;
+		goto out;
+	}
+	memset(conn->login_task_pool, 0, size);
+
+	conn->login_res_alloc = 1;
+	pdu_data_buf = conn->login_data_pool;
+	task_buf = conn->login_task_pool;
+	for (i = 0; i < NUM_LOGIN_TASKS; i++) {
+		login_task[i] = (struct iser_task *) task_buf;
+		task_buf += sizeof(struct iser_task);
+
+		iser_task_init(login_task[i], conn, pdu_data_buf,
+			       conn->login_data_mr);
+		pdu_data_buf += conn->rsize + conn->ssize;
+	}
+
+	dprintf("post_recv login rx task:%p\n", login_task[0]);
+	err = iser_post_recv(conn, login_task[0], 1);
+	if (err) {
+		eprintf("post_recv login rx task failed, %m\n");
+		iser_free_login_resources(conn);
+		goto out;
+	}
+	dprintf("saved login tx task:%p\n", login_task[1]);
+	conn->login_tx_task = login_task[1];
+out:
+	return err;
+}
+
+/*
+ * When ready for full-feature mode, free login-phase resources.
+ */
+static void iser_free_login_resources(struct iser_conn *conn)
+{
+	int err;
+
+	/* idempotent: safe to call whether or not login alloc happened */
+	if (!conn->login_res_alloc)
+		return;
+
+	dprintf("freeing, login phase %d\n", conn->login_phase);
+
+	/* release mr and free the lists */
+	if (conn->login_data_mr) {
+		err = ibv_dereg_mr(conn->login_data_mr);
+		if (err)
+			eprintf("ibv_dereg_mr\n");
+	}
+	if (conn->login_data_pool)
+		free(conn->login_data_pool);
+	if (conn->login_task_pool)
+		free(conn->login_task_pool);
+	conn->login_res_alloc = 0;
+}
+
+/*
+ * Called when ready for full feature, builds resources.
+ */
+/* Allocate the full-feature task pool and pdu data pool/MR, post all
+ * receive buffers, and reserve the last task slot for unsolicited
+ * (nop-in) sends.  Returns 0 on success, negative on failure. */
+static int iser_alloc_ff_resources(struct iser_conn *conn)
+{
+	unsigned long size;
+	uint8_t *pdu_data_buf, *task_buf;
+	struct iser_task *task;
+	unsigned int i;
+	int err = 0;
+
+	dprintf("conn ssize:%u rsize:%u max_outst:%u\n",
+		conn->ssize, conn->rsize, conn->max_outst_pdu);
+
+	size = (conn->rsize + conn->ssize) * conn->max_outst_pdu;
+	conn->pdu_data_pool = malloc(size);
+	if (!conn->pdu_data_pool) {
+		eprintf("malloc pdu_data_buf %lu\n", size);
+		err = -1;
+		goto out;
+	}
+
+	conn->pdu_data_mr = ibv_reg_mr(conn->dev->pd, conn->pdu_data_pool, size,
+				       IBV_ACCESS_LOCAL_WRITE);
+	if (!conn->pdu_data_mr) {
+		eprintf("register pdu_data_buf\n");
+		free(conn->pdu_data_pool);
+		conn->pdu_data_pool = NULL;
+		err = -1;
+		goto out;
+	}
+
+	size = conn->max_outst_pdu * sizeof(struct iser_task);
+	conn->task_pool = malloc(size);
+	if (!conn->task_pool) {
+		eprintf("malloc task_buf %lu\n", size);
+		ibv_dereg_mr(conn->pdu_data_mr);
+		conn->pdu_data_mr = NULL;
+		free(conn->pdu_data_pool);
+		conn->pdu_data_pool = NULL;
+		/* bug fix: err was left at 0 here, silently reporting
+		 * success after freeing all the resources */
+		err = -1;
+		goto out;
+	}
+	memset(conn->task_pool, 0, size);
+
+	conn->ff_res_alloc = 1;
+	pdu_data_buf = conn->pdu_data_pool;
+	task_buf = conn->task_pool;
+	/* slot 0 is skipped: the last slot is used below for the
+	 * unsolicited (nop-in) task */
+	for (i = 1; i < conn->max_outst_pdu; i++) {
+		task = (void *) task_buf;
+		task_buf += sizeof(*task);
+
+		iser_task_init(task, conn, pdu_data_buf, conn->pdu_data_mr);
+		pdu_data_buf += conn->rsize + conn->ssize;
+
+		err = iser_post_recv(conn, task, 1);
+		if (err) {
+			eprintf("post_recv (%d/%d): %m\n",
+				i, conn->max_outst_pdu);
+			iser_free_ff_resources(conn);
+			/* bug fix: was 'break', which fell through to
+			 * touch the just-freed task/data pools below */
+			goto out;
+		}
+	}
+	task = (void *) task_buf;
+	iser_unsolicited_task_init(task, conn, pdu_data_buf, conn->pdu_data_mr);
+	conn->nop_in_task = task;
+
+out:
+	return err;
+}
+
+/*
+ * On connection shutdown.
+ */
+static void iser_free_ff_resources(struct iser_conn *conn)
+{
+	int err;
+
+	/* idempotent: safe to call whether or not ff alloc happened */
+	if (!conn->ff_res_alloc)
+		return;
+
+	dprintf("freeing conn %p\n", conn);
+
+	/* release mr and free the lists */
+	dprintf("dereg mr %p\n", conn->pdu_data_mr);
+	if (conn->pdu_data_mr) {
+		err = ibv_dereg_mr(conn->pdu_data_mr);
+		if (err)
+			eprintf("ibv_dereg_mr\n");
+	}
+	if (conn->pdu_data_pool)
+		free(conn->pdu_data_pool);
+	if (conn->task_pool)
+		free(conn->task_pool);
+	conn->ff_res_alloc = 0;
+	dprintf("exit\n");
+}
+
+/*
+ * Allocate resources for this new connection. Called after login, when
+ * final negotiated transfer parameters are known.
+ */
+int iser_login_complete(struct iscsi_connection *iscsi_conn)
+{
+	struct iser_conn *conn = ISER_CONN(iscsi_conn);
+	unsigned int irdsl, trdsl, outst_pdu, hdrsz;
+	int err = -1;
+
+	dprintf("entry\n");
+
+	/*
+	 * Build full feature connection structures, but leave around the
+	 * login ones until the final send finishes.
+	 */
+	conn->login_phase = LOGIN_PHASE_LAST_SEND; /* one more send, then done */
+	irdsl = iscsi_conn->session_param[ISCSI_PARAM_INITIATOR_RDSL].val;
+	trdsl = iscsi_conn->session_param[ISCSI_PARAM_TARGET_RDSL].val;
+	outst_pdu =
+		iscsi_conn->session_param[ISCSI_PARAM_MAX_OUTST_PDU].val; // ToDo: outstanding pdus num
+
+	/* hack, ib/ulp/iser does not have this param, but reading the code
+	 * shows their formula for max tx dtos outstanding
+	 * = cmds_max * (1 + dataouts) + rx_misc + tx_misc
+	 */
+#define ISER_INFLIGHT_DATAOUTS 0
+#define ISER_MAX_RX_MISC_PDUS 4
+#define ISER_MAX_TX_MISC_PDUS 6
+
+	/* the negotiated value is currently ignored and overridden by the
+	 * formula below */
+	// ToDo: outstanding pdus num
+//	if (outst_pdu == 0)
+		outst_pdu =
+			3 * ISER_MAX_QUEUE_CMD * (1 + ISER_INFLIGHT_DATAOUTS) +
+			ISER_MAX_RX_MISC_PDUS + ISER_MAX_TX_MISC_PDUS;
+
+	/* RDSLs do not include headers. */
+	hdrsz = sizeof(struct iser_hdr) +
+		sizeof(struct iscsi_hdr) +
+		sizeof(struct iscsi_ecdb_ahdr) +
+		sizeof(struct iscsi_rlength_ahdr);
+
+	conn->ssize = hdrsz + irdsl;
+	conn->rsize = hdrsz + trdsl;
+	conn->max_outst_pdu = outst_pdu;
+	err = iser_alloc_ff_resources(conn);
+	if (err) {
+		eprintf("iser_alloc_resources failed\n");
+		goto out;
+	}
+	// How much data to grab in an RDMA operation, read or write.
+	//iscsi_conn->data_inout_max_length = RDMA_TRANSFER_SIZE;
+	iscsi_conn->session_param[ISCSI_PARAM_MAX_XMIT_DLENGTH].val =
+		RDMA_TRANSFER_SIZE;
+ out:
+	return err;
+}
+
+/*
+ * First time a new connection is received on an RDMA device, record
+ * it and build a PD and static memory.
+ */
+static int iser_device_init(struct iser_device *dev)
+{
+	int cqe_num;
+	int err = -1;
+
+	dprintf("dev %p\n", dev);
+	dev->pd = ibv_alloc_pd(dev->ibv_hndl);
+	if (dev->pd == NULL) {
+		eprintf("ibv_alloc_pd failed\n");
+		goto out;
+	}
+
+	err = iser_init_rdma_buf_pool(dev);
+	if (err) {
+		eprintf("iser_init_rdma_buf_pool failed\n");
+		goto out;
+	}
+
+	err = ibv_query_device(dev->ibv_hndl, &dev->device_attr);
+	if (err < 0) {
+		eprintf("ibv_query_device: %m\n");
+		goto out;
+	}
+	/* size the CQ to the device maximum */
+	cqe_num = dev->device_attr.max_cqe;
+	dprintf("max %d CQEs\n", cqe_num);
+
+	err = -1;
+	dev->cq_channel = ibv_create_comp_channel(dev->ibv_hndl);
+	if (dev->cq_channel == NULL) {
+		eprintf("ibv_create_comp_channel failed: %m\n");
+		goto out;
+	}
+
+	dev->cq = ibv_create_cq(dev->ibv_hndl, cqe_num, NULL,
+				dev->cq_channel, 0);
+	if (dev->cq == NULL) {
+		eprintf("ibv_create_cq failed: %m\n");
+		goto out;
+	}
+
+	tgt_init_sched_event(&dev->poll_sched, iser_sched_poll_cq, dev);
+
+	/* arm the CQ: deliver a channel event on the next completion */
+	err = ibv_req_notify_cq(dev->cq, 0);
+	if (err) {
+		eprintf("ibv_req_notify failed: %s\n", strerror(err));
+		goto out;
+	}
+
+	err = tgt_event_add(dev->cq_channel->fd, EPOLLIN, iser_cqe_handler,
+			    dev);
+	if (err) {
+		eprintf("tgt_event_add failed: %m\n");
+		goto out;
+
+	}
+
+	list_add_tail(&dev->list, &iser_dev_list);
+
+	/* NOTE(review): error paths leave partially built resources
+	 * (pd, pool, cq_channel, cq) attached to dev; the caller only
+	 * frees the dev struct — possible leak, confirm intent */
+out:
+	return err;
+}
+
+/* On close: drop every task still queued for I/O submission, completing
+ * each one so its buffers are reclaimed.  Always returns 0. */
+static int iser_ib_clear_iosubmit_list(struct iser_conn *conn)
+{
+	struct iser_task *task, *next;
+
+	dprintf("start\n");
+	list_for_each_entry_safe(task, next, &conn->iosubmit_list, exec_list) {
+		list_del(&task->exec_list);
+		iser_complete_task(task);
+	}
+	return 0;
+}
+
+/* On close: unlink every task still waiting for an RDMA-Read.
+ * Always returns 0. */
+static int iser_ib_clear_rdma_rd_list(struct iser_conn *conn)
+{
+	struct iser_task *task, *next;
+
+	dprintf("start\n");
+	list_for_each_entry_safe(task, next, &conn->rdma_rd_list, rdma_list)
+		list_del(&task->rdma_list);
+	return 0;
+}
+
+/* On close: unlink every task still queued for response transmission.
+ * Always returns 0. */
+static int iser_ib_clear_tx_list(struct iser_conn *conn)
+{
+	struct iser_task *task, *next;
+
+	dprintf("start\n");
+	list_for_each_entry_safe(task, next, &conn->resp_tx_list, tx_list)
+		list_del(&task->tx_list);
+	return 0;
+}
+
+/* On close: unlink every task whose response was sent but whose send
+ * completion has not arrived.  Always returns 0. */
+static int iser_ib_clear_sent_list(struct iser_conn *conn)
+{
+	struct iser_task *task, *next;
+
+	dprintf("start\n");
+	list_for_each_entry_safe(task, next, &conn->sent_list, tx_list)
+		list_del(&task->tx_list);
+	return 0;
+}
+
+/* On close: unlink every task queued for receive-buffer reposting.
+ * Always returns 0. */
+static int iser_ib_clear_post_recv_list(struct iser_conn *conn)
+{
+	struct iser_task *task, *next;
+
+	dprintf("start\n");
+	list_for_each_entry_safe(task, next, &conn->post_recv_list, recv_list)
+		list_del(&task->recv_list);
+	return 0;
+}
+
+/* Initialize a freshly allocated connection: default session params and
+ * empty scheduling/work lists.  Always returns 0. */
+int iser_conn_init(struct iser_conn *conn)
+{
+	conn->h.refcount = 0;
+	conn->h.state = STATE_INIT;
+	param_set_defaults(conn->h.session_param, session_keys);
+
+	INIT_LIST_HEAD(&conn->h.clist);
+
+	INIT_LIST_HEAD(&conn->buf_alloc_list);
+	INIT_LIST_HEAD(&conn->rdma_rd_list);
+	/* INIT_LIST_HEAD(&conn->rdma_wr_list); */
+	INIT_LIST_HEAD(&conn->iosubmit_list);
+	INIT_LIST_HEAD(&conn->resp_tx_list);
+	INIT_LIST_HEAD(&conn->sent_list);
+	INIT_LIST_HEAD(&conn->post_recv_list);
+
+	return 0;
+}
+
+/*
+ * Start closing connection. Transfer IB QP to error state.
+ * This will be followed by WC error and buffers flush events.
+ * We also should expect DISCONNECTED and TIMEWAIT_EXIT events.
+ * Only after the draining is over we are sure to have reclaimed
+ * all buffers (and tasks). After the RDMA CM events are collected,
+ * the connection QP may be destroyed, and its number may be recycled.
+ */
+/* Begin connection teardown (idempotent): disconnect at the RDMA-CM
+ * level, unlink from the global list, cancel scheduled work and purge
+ * all pending task lists.  Final free happens via iser_conn_put(). */
+void iser_conn_close(struct iser_conn *conn)
+{
+	int err;
+
+	if (conn->h.state == STATE_CLOSE)
+		return;
+	conn->h.state = STATE_CLOSE;
+
+	dprintf("rdma_disconnect conn:%p\n", &conn->h);
+	err = rdma_disconnect(conn->cm_id);
+	if (err)
+		/* bug fix: rdma_disconnect() returns -1 and sets errno;
+		 * strerror(-err) always printed EPERM */
+		eprintf("rdma_disconnect: %s\n", strerror(errno));
+
+	list_del(&conn->conn_list);
+
+	tgt_remove_sched_event(&conn->sched_buf_alloc);
+	tgt_remove_sched_event(&conn->sched_rdma_rd);
+	tgt_remove_sched_event(&conn->sched_iosubmit);
+	tgt_remove_sched_event(&conn->sched_tx);
+	tgt_remove_sched_event(&conn->sched_post_recv);
+
+	if (conn->login_phase != LOGIN_PHASE_FF) {
+		iser_ib_clear_iosubmit_list(conn);
+		iser_ib_clear_rdma_rd_list(conn);
+	}
+	iser_ib_clear_tx_list(conn);
+	iser_ib_clear_sent_list(conn);
+	iser_ib_clear_post_recv_list(conn);
+
+	eprintf("connection:%p closed, refcnt:%d\n", conn, conn->h.refcount);
+}
+
+/*
+ * Called when the connection is freed, from iscsi, but won't do anything until
+ * all posted WRs have gone away. So also called again from RX progress when
+ * it notices this happens.
+ */
+/* Final release of a connection once its refcount hit zero: free login
+ * and full-feature resources, destroy the QP and CM id, detach from the
+ * session and free the structure itself. */
+void iser_conn_free(struct iser_conn *conn)
+{
+	int err;
+
+	dprintf("conn:%p rx.posted:%d refcnt:%d\n", conn, conn->recvl_posted,
+		conn->h.refcount);
+	assert(conn->h.refcount == 0);
+
+	/* try to free unconditionally, resources freed only if necessary */
+	iser_free_login_resources(conn);
+	iser_free_ff_resources(conn);
+
+	if (conn->qp_hndl) {
+		err = ibv_destroy_qp(conn->qp_hndl);
+		if (err)
+			/* bug fix: ibv_destroy_qp() returns a positive
+			 * errno value, so strerror(-err) was invalid */
+			eprintf("ibv_destroy_qp: %s\n", strerror(err));
+		else
+			dprintf("destroyed qp:%p\n", conn->qp_hndl);
+	}
+	if (conn->cm_id) {
+		err = rdma_destroy_id(conn->cm_id);
+		if (err)
+			/* bug fix: rdma_destroy_id() returns -1 and sets
+			 * errno; strerror(-err) always printed EPERM */
+			eprintf("rdma_destroy_id conn:%p cm_id:%p, %s\n",
+				&conn->h, conn->cm_id, strerror(errno));
+		else
+			dprintf("destroyed cm_id:%p\n", conn->cm_id);
+	}
+
+	/* delete from session; was put there by conn_add_to_session() */
+	list_del(&conn->h.clist);
+
+	/* free(NULL) is a no-op, no guard needed */
+	free(conn->h.initiator);
+
+	if (conn->h.session)
+		session_put(conn->h.session);
+
+	conn->h.state = STATE_INIT;
+	free(conn);
+	dprintf("conn:%p freed\n", conn);
+}
+
+/* Scheduler callback: deferred release of a fully closed connection. */
+static void iser_sched_conn_free(struct event_data *evt)
+{
+	iser_conn_free((struct iser_conn *) evt->data);
+}
+
+/* Take a connection reference (held per posted WR).  Always returns 0.
+ * Consistency fix: the file-scope declaration says 'static' — the
+ * definition now matches it. */
+static int iser_conn_get(struct iser_conn *conn)
+{
+	/* TODO: check state */
+	conn->h.refcount++;
+	dprintf("refcnt:%d\n", conn->h.refcount);
+	return 0;
+}
+
+/* Drop a connection reference; when it reaches zero (connection must
+ * already be in STATE_CLOSE) schedule the final free.
+ * Consistency fix: the file-scope declaration says 'static' — the
+ * definition now matches it. */
+static void iser_conn_put(struct iser_conn *conn)
+{
+	conn->h.refcount--;
+	dprintf("refcnt:%d\n", conn->h.refcount);
+	if (!conn->h.refcount) {
+		assert(conn->h.state == STATE_CLOSE);
+		tgt_add_sched_event(&conn->sched_conn_free);
+	}
+}
+
+/* Format the peer's numeric RDMA IP address into buf (at most 'rest'
+ * bytes).  Returns the snprintf() count, or 0 on lookup failure. */
+static int iser_show(struct iscsi_connection *iscsi_conn, char *buf,
+		     int rest)
+{
+	struct iser_conn *conn = ISER_CONN(iscsi_conn);
+	int err;
+	char host[NI_MAXHOST];
+
+	err = getnameinfo((struct sockaddr *) &conn->peer_addr,
+			  sizeof(conn->peer_addr), host, sizeof(host),
+			  NULL, 0, NI_NUMERICHOST);
+	if (err) {
+		/* bug fix: getnameinfo() returns EAI_* codes, not errno,
+		 * so the previous "%m" printed an unrelated message */
+		eprintf("getnameinfo: %s\n", gai_strerror(err));
+		return 0;
+	}
+	return snprintf(buf, rest, "RDMA IP Address: %s", host);
+}
+
+/* Copy the connection's local (target-side) address into *sa,
+ * truncating to the caller-supplied *len.  Always returns 0. */
+static int iser_getsockname(struct iscsi_connection *iscsi_conn,
+			    struct sockaddr *sa, socklen_t * len)
+{
+	struct iser_conn *conn = ISER_CONN(iscsi_conn);
+	socklen_t avail = sizeof(conn->self_addr);
+
+	if (*len > avail)
+		*len = avail;
+	memcpy(sa, &conn->self_addr, *len);
+	return 0;
+}
+
+/* Copy the connection's remote (initiator-side) address into *sa,
+ * truncating to the caller-supplied *len.  Always returns 0. */
+static int iser_getpeername(struct iscsi_connection *iscsi_conn,
+			    struct sockaddr *sa, socklen_t * len)
+{
+	struct iser_conn *conn = ISER_CONN(iscsi_conn);
+	socklen_t avail = sizeof(conn->peer_addr);
+
+	if (*len > avail)
+		*len = avail;
+	memcpy(sa, &conn->peer_addr, *len);
+	return 0;
+}
+
/*
 * Handle RDMA_CM_EVENT_CONNECT_REQUEST: locate (or lazily create) the
 * per-HCA device object, allocate and wire up a new iser connection,
 * create its QP, post login-phase buffers and accept the CM request.
 * On any failure the CM request is rejected.
 */
static void iser_cm_connect_request(struct rdma_cm_event *ev)
{
	struct rdma_cm_id *cm_id = ev->id;
	struct ibv_qp_init_attr qp_init_attr;
	struct iser_conn *conn;
	struct iser_device *dev;
	unsigned int hdrsz;
	int err, dev_found;

	struct rdma_conn_param conn_param = {
		.responder_resources = 1,
		.initiator_depth = 1,
		.retry_count = 5,
	};

	/* find the iser device wrapping this verbs context; created and
	   initialized on first connection through a given HCA */
	dev_found = 0;
	list_for_each_entry(dev, &iser_dev_list, list) {
		if (dev->ibv_hndl == cm_id->verbs) {
			dev_found = 1;
			break;
		}
	}
	if (!dev_found) {
		dev = malloc(sizeof(*dev));
		if (dev == NULL) {
			eprintf("unable to allocate dev\n");
			goto reject;
		}
		dev->ibv_hndl = cm_id->verbs;
		/* NOTE(review): presumably iser_device_init() links dev
		   into iser_dev_list on success - confirm, otherwise the
		   lookup above can never match */
		err = iser_device_init(dev);
		if (err) {
			free(dev);
			goto reject;
		}
	}

	/* build a new connection structure */
	conn = zalloc(sizeof(*conn));
	if (!conn) {
		eprintf("unable to allocate conn\n");
		goto reject;
	}

	err = iser_conn_init(conn);
	if (err) {
		free(conn);
		goto reject;
	}
	dprintf("Created conn:%p cm_id:%p\n", &conn->h, cm_id);
	list_add(&conn->conn_list, &iser_conn_list);

	/* ToDo: finalize these callbacks after tcp/iser code merge */
	conn->h.op.conn_show = iser_show;
	conn->h.op.conn_getsockname = iser_getsockname;
	conn->h.op.conn_getpeername = iser_getpeername;

	/* relate iser and rdma connections */
	conn->cm_id = cm_id;
	cm_id->context = conn;

	conn->dev = dev;
	conn->login_phase = LOGIN_PHASE_START;

	/* one scheduler event per deferred activity of this connection */
	tgt_init_sched_event(&conn->sched_buf_alloc, iser_sched_buf_alloc,
			     conn);
	tgt_init_sched_event(&conn->sched_iosubmit, iser_sched_iosubmit,
			     conn);
	tgt_init_sched_event(&conn->sched_rdma_rd, iser_sched_rdma_rd,
			     conn);
	tgt_init_sched_event(&conn->sched_tx, iser_sched_tx,
			     conn);
	tgt_init_sched_event(&conn->sched_post_recv, iser_sched_post_recv,
			     conn);
	tgt_init_sched_event(&conn->sched_conn_free, iser_sched_conn_free,
			     conn);

	/* initiator is dst, target is src */
	memcpy(&conn->peer_addr, &cm_id->route.addr.dst_addr,
	       sizeof(conn->peer_addr));
	memcpy(&conn->self_addr, &cm_id->route.addr.src_addr,
	       sizeof(conn->self_addr));
#ifndef NDEBUG
	{
		char str[256];

		iser_show(&conn->h, str, sizeof(str));
		str[sizeof(str) - 1] = 0;
		dprintf("new conn %p from %s\n", conn, str);
	}
#endif

	/* create qp next */
	memset(&qp_init_attr, 0, sizeof(qp_init_attr));
	/* wire both send and recv to the same CQ */
	qp_init_attr.send_cq = dev->cq;
	qp_init_attr.recv_cq = dev->cq;
	qp_init_attr.cap.max_send_wr = MAX_WQE;
	qp_init_attr.cap.max_recv_wr = MAX_WQE;
	qp_init_attr.cap.max_send_sge = 1;	/* scatter/gather entries */
	qp_init_attr.cap.max_recv_sge = 1;
	qp_init_attr.qp_type = IBV_QPT_RC;
	/* only generate completion queue entries if requested */
	qp_init_attr.sq_sig_all = 0;

	err = rdma_create_qp(cm_id, dev->pd, &qp_init_attr);
	if (err) {
		eprintf("create qp failed\n");
		goto free_conn;
	}
	conn->qp_hndl = cm_id->qp;
	VALGRIND_MAKE_MEM_DEFINED(conn->qp_hndl, sizeof(*conn->qp_hndl));

	/*
	 * Post buffers for the login phase, only.
	 * Sized for BHS + the largest AHS combination we accept, plus 8K
	 * of login negotiation payload.
	 */
	hdrsz = sizeof(struct iser_hdr) +
		sizeof(struct iscsi_hdr) +
		sizeof(struct iscsi_ecdb_ahdr) +
		sizeof(struct iscsi_rlength_ahdr);
	conn->ssize = hdrsz + 8192;
	conn->rsize = hdrsz + 8192;
	err = iser_alloc_login_resources(conn);
	if (err)
		goto free_conn;

	/* do not offer more RDMA-Read depth than either side supports */
	conn_param.initiator_depth = dev->device_attr.max_qp_init_rd_atom;
	if (conn_param.initiator_depth > ev->param.conn.initiator_depth)
		conn_param.initiator_depth = ev->param.conn.initiator_depth;

	/* Increment reference count to be able to wait for TIMEWAIT_EXIT
	   when finalizing the disconnect process */
	iser_conn_get(conn);

	/* now we can actually accept the connection */
	err = rdma_accept(conn->cm_id, &conn_param);
	if (err) {
		eprintf("rdma_accept failed conn:%p cm_id:%p\n",
			&conn->h, cm_id);
		goto free_conn;
	}

	conn->h.state = STATE_START;
	dprintf("accepted conn:%p cm_id:%p\n", &conn->h, cm_id);

	return;

free_conn:
	/* NOTE(review): frees conn directly even though iser_conn_get()
	   may already have been taken above - verify iser_conn_free()
	   copes with a non-zero refcount on this path */
	iser_conn_free(conn);

reject:
	err = rdma_reject(cm_id, NULL, 0);
	if (err)
		eprintf("rdma_reject failed: %s\n", strerror(-err));
}
+
/*
 * Handle RDMA_CM_EVENT_ESTABLISHED: the initiator has ACKed our
 * rdma_accept(), so the connection transitions from STATE_START to
 * STATE_READY and may start carrying full-feature iSCSI traffic.
 */
+static void iser_cm_conn_established(struct rdma_cm_event *ev)
+{
+ struct rdma_cm_id *cm_id = ev->id;
+ struct iser_conn *conn = cm_id->context;
+
+ eprintf("conn:%p cm_id:%p\n", &conn->h, cm_id);
+ assert(conn->h.state == STATE_START);
+ conn->h.state = STATE_READY;
+}
+
+/*
+ * Handle RDMA_CM_EVENT_DISCONNECTED or an equivalent event.
+ * Start closing the target's side connection.
+*/
+static void iser_cm_disconnected(struct rdma_cm_event *ev)
+{
+ struct rdma_cm_id *cm_id = ev->id;
+ struct iser_conn *conn = cm_id->context;
+ enum rdma_cm_event_type ev_type = ev->event;
+
+ eprintf("conn:%p cm_id:%p event:%d, %s\n", &conn->h, cm_id,
+ ev_type, rdma_event_str(ev_type));
+ iser_conn_close(conn);
+}
+
+/*
+ * Handle RDMA_CM_EVENT_TIMEWAIT_EXIT which is expected to be the last
+ * event during the lifecycle of a connection, when it had been shut down
+ * and the network has cleared from the remaining in-flight messages.
+*/
+static void iser_cm_timewait_exit(struct rdma_cm_event *ev)
+{
+ struct rdma_cm_id *cm_id = ev->id;
+ struct iser_conn *conn = cm_id->context;
+
+ eprintf("conn:%p cm_id:%p\n", &conn->h, cm_id);
+
+ /* Refcount was incremented just before accepting the connection,
+ typically this is the last decrement and the connection will be
+ released instantly */
+ iser_conn_put(conn);
+}
+
+/*
+ * Handle RDMA CM events.
+ */
/*
 * Event-loop callback for the RDMA CM event channel: fetch one event,
 * dispatch it to the appropriate handler, then ack it. The ack must
 * come after the handlers since they read fields of *ev.
 */
static void iser_handle_rdmacm(int fd __attribute__ ((unused)),
			       int events __attribute__ ((unused)),
			       void *data __attribute__ ((unused)))
{
	struct rdma_cm_event *ev;
	enum rdma_cm_event_type ev_type;
	int err;

	err = rdma_get_cm_event(rdma_evt_channel, &ev);
	if (err) {
		eprintf("rdma_get_cm_event failed\n");
		return;
	}

	VALGRIND_MAKE_MEM_DEFINED(ev, sizeof(*ev));

	ev_type = ev->event;
	switch (ev_type) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		iser_cm_connect_request(ev);
		break;

	case RDMA_CM_EVENT_ESTABLISHED:
		iser_cm_conn_established(ev);
		break;

	/* all of these terminate the connection like a disconnect */
	case RDMA_CM_EVENT_CONNECT_ERROR:
	case RDMA_CM_EVENT_REJECTED:
	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_DISCONNECTED:
		iser_cm_disconnected(ev);
		break;

	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		iser_cm_timewait_exit(ev);
		break;

	case RDMA_CM_EVENT_MULTICAST_JOIN:
	case RDMA_CM_EVENT_MULTICAST_ERROR:
		eprintf("UD-related event:%d, %s - ignored\n", ev_type,
			rdma_event_str(ev_type));
		break;

	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		eprintf("Unsupported event:%d, %s - ignored\n", ev_type,
			rdma_event_str(ev_type));
		break;

	/* active-side (client) events; we are passive-side only */
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ADDR_ERROR:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_ERROR:
	case RDMA_CM_EVENT_CONNECT_RESPONSE:
	case RDMA_CM_EVENT_UNREACHABLE:
		eprintf("Active side event:%d, %s - ignored\n", ev_type,
			rdma_event_str(ev_type));
		break;

	default:
		eprintf("Illegal event:%d - ignored\n", ev_type);
		break;
	}

	/* ack only after the handlers are done touching *ev */
	err = rdma_ack_cm_event(ev);
	if (err) {
		eprintf("ack cm event failed, %s\n",
			rdma_event_str(ev_type));
	}
}
+
+static inline void schedule_task_iosubmit(struct iser_task *task,
+ struct iser_conn *conn)
+{
+ struct iscsi_cmd *req_bhs = (struct iscsi_cmd *) task->req.bhs;
+
+ list_add_tail(&task->exec_list, &conn->iosubmit_list);
+ tgt_add_sched_event(&conn->sched_iosubmit);
+ dprintf("task:%p tag:0x%04lx cmdsn:0x%x\n",
+ task, (unsigned long)task->tag,
+ be32_to_cpu(req_bhs->cmdsn));
+}
+
+static inline void schedule_rdma_read(struct iser_task *task,
+ struct iser_conn *conn)
+{
+ list_add_tail(&task->rdma_list, &conn->rdma_rd_list);
+ tgt_add_sched_event(&conn->sched_rdma_rd);
+ dprintf("task:%p tag:0x%04lx\n", task, (unsigned long)task->tag);
+}
+
+static inline void schedule_resp_tx(struct iser_task *task,
+ struct iser_conn *conn)
+{
+ dprintf("task:%p tag:0x%04lx\n", task, (unsigned long)task->tag);
+
+ list_add_tail(&task->tx_list, &conn->resp_tx_list);
+ tgt_add_sched_event(&conn->sched_tx);
+}
+
+static inline void schedule_post_recv(struct iser_task *task,
+ struct iser_conn *conn)
+{
+ list_add_tail(&task->recv_list, &conn->post_recv_list);
+ tgt_add_sched_event(&conn->sched_post_recv);
+ dprintf("task:%p tag:0x%04lx\n", task, (unsigned long)task->tag);
+}
+
+static int iser_logout_exec(struct iser_task *task)
+{
+ struct iser_conn *conn = task->conn;
+ struct iscsi_session *session = conn->h.session;
+ struct iscsi_logout *req = (struct iscsi_logout *) task->req.bhs;
+ uint32_t cmd_sn = be32_to_cpu(req->cmdsn);
+ int is_immediate = req->opcode & ISCSI_OP_IMMEDIATE;
+ struct iscsi_logout_rsp *rsp =
+ (struct iscsi_logout_rsp *) task->rsp.bhs;
+
+ memset(rsp, 0, BHS_SIZE);
+ rsp->opcode = ISCSI_OP_LOGOUT_RSP;
+ rsp->flags = ISCSI_FLAG_CMD_FINAL;
+ rsp->itt = req->itt;
+ rsp->statsn = cpu_to_be32(conn->h.stat_sn++);
+ if (session->exp_cmd_sn == cmd_sn && !is_immediate)
+ session->exp_cmd_sn++;
+ iser_set_rsp_cmd_sn(session, task->rsp.bhs);
+
+ task->rsp.ahssize = 0;
+ task->rsp.data.size = 0;
+
+ schedule_resp_tx(task, conn);
+ return 0;
+}
+
+static int iser_nop_out_exec(struct iser_task *task)
+{
+ struct iscsi_nopin *rsp_bhs =
+ (struct iscsi_nopin *) task->rsp.bhs;
+ struct iser_conn *conn = task->conn;
+
+ rsp_bhs->opcode = ISCSI_OP_NOOP_IN;
+ rsp_bhs->flags = ISCSI_FLAG_CMD_FINAL;
+ rsp_bhs->rsvd2 = 0;
+ rsp_bhs->rsvd3 = 0;
+ memset(rsp_bhs->lun, 0, sizeof(rsp_bhs->lun));
+ rsp_bhs->itt = task->req.bhs->itt;
+ rsp_bhs->ttt = cpu_to_be32(ISCSI_RESERVED_TAG);
+ rsp_bhs->statsn = cpu_to_be32(conn->h.stat_sn);
+ if (task->req.bhs->itt != cpu_to_be32(ISCSI_RESERVED_TAG))
+ conn->h.stat_sn++;
+
+ iser_set_rsp_cmd_sn(conn->h.session, task->rsp.bhs);
+
+ memset(rsp_bhs->rsvd4, 0, sizeof(rsp_bhs->rsvd4));
+ task->rsp.ahssize = 0;
+
+ if (task->out_len > 0)
+ task->rsp.data.buf = task->req.data.buf;
+ task->rsp.data.size = task->out_len;
+
+ schedule_resp_tx(task, conn);
+ return 0;
+}
+
+static int iser_send_ping_nop_in(struct iser_task *task)
+{
+ struct iscsi_nopin *rsp_bhs =
+ (struct iscsi_nopin *) task->rsp.bhs;
+ struct iser_conn *conn = task->conn;
+
+ task->opcode = ISCSI_OP_NOOP_IN;
+ task->tag = 0xffffffff;
+
+ rsp_bhs->opcode = ISCSI_OP_NOOP_IN;
+ rsp_bhs->flags = ISCSI_FLAG_CMD_FINAL;
+ rsp_bhs->rsvd2 = 0;
+ rsp_bhs->rsvd3 = 0;
+ memset(rsp_bhs->lun, 0, sizeof(rsp_bhs->lun));
+ rsp_bhs->itt = 0xffffffff;
+ rsp_bhs->ttt = 0xffffffff;
+ rsp_bhs->statsn = cpu_to_be32(conn->h.stat_sn);
+ iser_set_rsp_cmd_sn(conn->h.session, task->rsp.bhs);
+
+ memset(rsp_bhs->rsvd4, 0, sizeof(rsp_bhs->rsvd4));
+ task->rsp.ahssize = 0;
+ task->rsp.data.size = 0;
+
+ dprintf("task:%p conn:%p\n", task, &conn->h);
+
+ schedule_resp_tx(task, conn);
+ return 0;
+}
+
+/*
+static int iser_send_reject(struct iser_task *task, uint8 reason)
+{
+ struct iser_conn *conn = task->conn;
+ struct iscsi_session *session = conn->h.session;
+ struct iscsi_hdr *req = (struct iscsi_hdr *) task->req.bhs;
+ struct iscsi_reject *rsp = (struct iscsi_reject *) task->rsp.bhs;
+
+ memset(rsp, 0, BHS_SIZE);
+ rsp->opcode = ISCSI_OP_REJECT;
+ rsp->flags = ISCSI_FLAG_CMD_FINAL;
+ rsp->reason = reason;
+ hton24(rsp->dlength, sizeof(struct iser_hdr));
+ rsp->ffffffff = 0xffffffff;
+ rsp->itt = req->itt;
+ rsp->statsn = cpu_to_be32(conn->h.stat_sn++);
+ iser_set_rsp_cmd_sn(session, task->rsp.bhs);
+
+ task->rsp.ahssize = 0;
+ task->rsp.data.buf = task->req.bhs;
+ task->rsp.data.size = sizeof(struct iser_hdr);
+
+ schedule_resp_tx(task, conn);
+ return 0;
+}
+*/
+
+static int iser_tm_exec(struct iser_task *task)
+{
+ struct iser_conn *conn = task->conn;
+ struct iscsi_session *session = conn->h.session;
+ struct iscsi_tm *req_bhs = (struct iscsi_tm *) task->req.bhs;
+ int fn = 0;
+ int err = 0;
+
+ eprintf("TM itt:0x%x cmdsn:0x%x "
+ "ref.itt:0x%x ref.cmdsn:0x%x "
+ "exp.cmdsn:0x%x "
+ "lun:0x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+ be32_to_cpu(req_bhs->itt), be32_to_cpu(req_bhs->cmdsn),
+ be32_to_cpu(req_bhs->rtt), be32_to_cpu(req_bhs->refcmdsn),
+ be32_to_cpu(req_bhs->exp_statsn),
+ req_bhs->lun[0], req_bhs->lun[1], req_bhs->lun[2], req_bhs->lun[3],
+ req_bhs->lun[4], req_bhs->lun[5], req_bhs->lun[6], req_bhs->lun[7]);
+
+ switch (req_bhs->flags & ISCSI_FLAG_TM_FUNC_MASK) {
+ case ISCSI_TM_FUNC_ABORT_TASK:
+ fn = ABORT_TASK;
+ break;
+ case ISCSI_TM_FUNC_ABORT_TASK_SET:
+ fn = ABORT_TASK_SET;
+ break;
+ case ISCSI_TM_FUNC_CLEAR_ACA:
+ fn = CLEAR_TASK_SET;
+ break;
+ case ISCSI_TM_FUNC_CLEAR_TASK_SET:
+ fn = CLEAR_ACA;
+ break;
+ case ISCSI_TM_FUNC_LOGICAL_UNIT_RESET:
+ fn = LOGICAL_UNIT_RESET;
+ break;
+ case ISCSI_TM_FUNC_TARGET_WARM_RESET:
+ case ISCSI_TM_FUNC_TARGET_COLD_RESET:
+ case ISCSI_TM_FUNC_TASK_REASSIGN:
+ err = ISCSI_TMF_RSP_NOT_SUPPORTED;
+ eprintf("unsupported TMF %d\n",
+ req_bhs->flags & ISCSI_FLAG_TM_FUNC_MASK);
+ break;
+ default:
+ err = ISCSI_TMF_RSP_REJECTED;
+ eprintf("unknown TMF %d\n",
+ req_bhs->flags & ISCSI_FLAG_TM_FUNC_MASK);
+ }
+
+ if (err)
+ task->result = err;
+ else
+ target_mgmt_request(session->target->tid, session->tsih,
+ (unsigned long) task, fn, req_bhs->lun,
+ req_bhs->rtt, 0);
+ return err;
+}
+
+static int iser_tm_done(struct mgmt_req *mreq)
+{
+ struct iser_task *task =
+ (struct iser_task *) (unsigned long) mreq->mid;
+ struct iser_conn *conn = task->conn;
+ struct iscsi_session *session = conn->h.session;
+ struct iscsi_tm_rsp *rsp_bhs =
+ (struct iscsi_tm_rsp *) task->rsp.bhs;
+
+ memset(rsp_bhs, 0, sizeof(*rsp_bhs));
+ rsp_bhs->opcode = ISCSI_OP_SCSI_TMFUNC_RSP;
+ rsp_bhs->flags = ISCSI_FLAG_CMD_FINAL;
+ rsp_bhs->itt = task->req.bhs->itt;
+ switch (mreq->result) {
+ case 0:
+ rsp_bhs->response = ISCSI_TMF_RSP_COMPLETE;
+ break;
+ case -EINVAL:
+ rsp_bhs->response = ISCSI_TMF_RSP_NOT_SUPPORTED;
+ break;
+ case -EEXIST:
+ /*
+ * the command completed or we could not find it so
+ * we retrun no task here
+ */
+ rsp_bhs->response = ISCSI_TMF_RSP_NO_TASK;
+ break;
+ default:
+ rsp_bhs->response = ISCSI_TMF_RSP_REJECTED;
+ break;
+ }
+
+ rsp_bhs->statsn = cpu_to_be32(conn->h.stat_sn++);
+ iser_set_rsp_cmd_sn(session, task->rsp.bhs);
+
+ // Must be zero according to 10.6.3
+ task->rsp.ahssize = 0;
+ task->rsp.data.size = 0;
+
+ schedule_resp_tx(task, conn);
+
+ return 0;
+}
+
/*
 * Allocate the RDMA buffer(s) a SCSI command needs before it can be
 * submitted. Returns 0 when the task was scheduled onward (either for
 * RDMA-Read of solicited data, or directly for iosubmit), and -ENOMEM
 * when no buffer is currently available - in which case the caller
 * leaves the task queued until a buffer is freed.
 */
static int iser_scsi_cmd_buf_alloc(struct iser_task *task)
{
	struct iser_conn *conn = task->conn;
	struct membuf *mem;

	/* when a buffer needs to be allocated and it fails, leave task on
	   the list waiting until a new buffer becomes available */
	if (task->is_write) {
		if (task->out_len > 0) {
			if (task->rdma_rd_sz > 0) {
				/* ToDo: multiple RDMA-Read buffers */
				mem = iser_alloc_rdma_buf(conn, task->rdma_rd_sz);
				if (mem == NULL) {
					eprintf("free list empty, task:%p tag:0x%04lx\n",
						task, (unsigned long)task->tag);
					return -ENOMEM;
				}
				dprintf("out-buf:%p sz:%u rdma:%d\n",
					mem->buf, mem->size, mem->rdma);
				/* ToDo: multiple RDMA-Read buffers */
				iser_task_add_out_rdma_buf(task, mem, task->unsol_sz);
				/* solicited data must be fetched via RDMA-Read
				   before the command can be executed */
				schedule_rdma_read(task, conn);
				return 0;
			}
			/* all data is unsolicited; if Data-Out PDUs are
			   still in flight, wait for them (iosubmit is
			   scheduled from iser_data_out_rx) */
			if (task->unsol_remains > 0)
				return 0;
		}
	} /* ToDo: bi-dir */
	else if (task->is_read) {
		if (task->in_len > 0) {
			mem = iser_alloc_rdma_buf(conn, task->rdma_wr_sz);
			if (mem == NULL) {
				eprintf("free list empty, task:%p tag:0x%04lx\n",
					task, (unsigned long)task->tag);
				return -ENOMEM;
			}
			dprintf("in-buf:%p sz:%u rdma:%d\n",
				mem->buf, mem->size, mem->rdma);
			/* ToDo: multiple rdma write bufs */
			iser_task_add_in_rdma_buf(task, mem, 0);
		}
	}
	schedule_task_iosubmit(task, conn);
	return 0;
}
+
+static int scsi_cmd_attr(unsigned int flags)
+{
+ int attr;
+
+ switch (flags & ISCSI_FLAG_CMD_ATTR_MASK) {
+ case ISCSI_ATTR_UNTAGGED:
+ case ISCSI_ATTR_SIMPLE:
+ attr = MSG_SIMPLE_TAG;
+ break;
+ case ISCSI_ATTR_HEAD_OF_QUEUE:
+ attr = MSG_HEAD_TAG;
+ break;
+ case ISCSI_ATTR_ORDERED:
+ default:
+ attr = MSG_ORDERED_TAG;
+ }
+ return attr;
+}
+
+static void iser_task_free_dout_tasks(struct iser_task *task)
+{
+ struct iser_conn *conn = task->conn;
+ struct iser_task *dout_task, *tnext;
+
+ list_for_each_entry_safe(dout_task, tnext, &task->dout_task_list, dout_task_list) {
+ list_del(&dout_task->dout_task_list);
+ iser_task_del_out_buf(task, &dout_task->req.data);
+ schedule_post_recv(dout_task, conn);
+ }
+}
+
/*
 * Submit a fully-received SCSI command to the target core: gather any
 * scattered unsolicited data into the single RDMA buffer, set up the
 * scsi_cmd fields and queue it. Completion arrives via
 * iser_scsi_cmd_done().
 */
static void iser_scsi_cmd_iosubmit(struct iser_task *task)
{
	struct iscsi_cmd *req_bhs = (struct iscsi_cmd *) task->req.bhs;
	struct scsi_cmd *scmd = &task->scmd;
	struct iser_conn *conn = task->conn;
	struct iscsi_session *session = conn->h.session;
	struct membuf *data;

	/* ToDo: implement iser_task_handle_ahs(itask); */

	scmd->state = 0;
	scsi_set_in_length(scmd, 0);
	scsi_set_out_length(scmd, 0);

	if (task->is_write) {
		scsi_set_out_resid(scmd, 0);
		/* It's either the last buffer, which is RDMA, or the only buffer */
		data = list_entry(task->out_buf_list.prev, struct membuf, task_list);
		/* ToDo: multiple RDMA-Write buffers */
		if (task->out_buf_num > 1) {
			/* compact the PDU-carried (non-rdma) chunks into the
			   head of the RDMA buffer, in list order */
			unsigned char *ptr = data->buf;
			struct membuf *cur_buf;

			list_for_each_entry(cur_buf, &task->out_buf_list, task_list) {
				if (cur_buf->rdma)
					break;
				memcpy(ptr, cur_buf->buf, cur_buf->size);
				ptr += cur_buf->size;
			}
			/* the copied Data-Out buffers can now be reposted */
			iser_task_free_dout_tasks(task);
		}

		scsi_set_out_buffer(scmd, data->buf);
		scsi_set_out_length(scmd, task->out_len);
	}
	if (task->is_read) {
		scsi_set_in_resid(scmd, 0);
		/* ToDo: multiple RDMA-Read buffers */
		data = list_entry(task->in_buf_list.next, struct membuf, task_list);
		scsi_set_in_buffer(scmd, data->buf);
		scsi_set_in_length(scmd, task->in_len);
	}

	scmd->cmd_itn_id = session->tsih;
	scmd->scb = req_bhs->cdb;
	scmd->scb_len = sizeof(req_bhs->cdb);
	memcpy(scmd->lun, req_bhs->lun, sizeof(scmd->lun));
	scmd->attribute = scsi_cmd_attr(req_bhs->flags);
	scmd->tag = task->tag;	/* == req_bhs->itt */
	scmd->result = 0;
	scmd->mreq = NULL;
	scmd->sense_len = 0;

	dprintf("task:%p tag:0x%04lx\n", task, (unsigned long)task->tag);

	/* ToDo: handle bidi ahs and allocations;
	   the problem is that there is only one buffer
	   in scsi command */

	target_cmd_queue(session->target->tid, scmd);
}
+
+static void iser_rsp_set_read_resid(struct iscsi_cmd_rsp *rsp_bhs,
+ int in_resid)
+{
+ if (in_resid > 0) {
+ rsp_bhs->flags |= ISCSI_FLAG_CMD_UNDERFLOW;
+ rsp_bhs->residual_count = cpu_to_be32((uint32_t)in_resid);
+ }
+ else {
+ rsp_bhs->flags |= ISCSI_FLAG_CMD_OVERFLOW;
+ rsp_bhs->residual_count = cpu_to_be32(((uint32_t)-in_resid));
+ }
+ rsp_bhs->bi_residual_count = 0;
+}
+
+static void iser_rsp_set_bidir_resid(struct iscsi_cmd_rsp *rsp_bhs,
+ int in_resid)
+{
+ if (in_resid > 0) {
+ rsp_bhs->flags |= ISCSI_FLAG_CMD_BIDI_UNDERFLOW;
+ rsp_bhs->bi_residual_count = cpu_to_be32((uint32_t)in_resid);
+ }
+ else {
+ rsp_bhs->flags |= ISCSI_FLAG_CMD_BIDI_OVERFLOW;
+ rsp_bhs->bi_residual_count = cpu_to_be32(((uint32_t)-in_resid));
+ }
+ rsp_bhs->residual_count = 0;
+}
+
/*
 * Target-core completion callback for a SCSI command: build the SCSI
 * Response PDU (status, residuals, sense) and schedule it for tx.
 * For a closed connection the task is just released.
 */
static int iser_scsi_cmd_done(uint64_t nid, int result,
			      struct scsi_cmd *scmd)
{
	struct iser_task *task = container_of(scmd, struct iser_task, scmd);
	struct iscsi_cmd_rsp *rsp_bhs = (struct iscsi_cmd_rsp *) task->rsp.bhs;
	struct iser_conn *conn = task->conn;
	struct iscsi_session *session = conn->h.session;
	unsigned char sense_len = scmd->sense_len;
	int in_resid;

	if (nid != scmd->cmd_itn_id) {
		eprintf("unexpected itn_id in scsi command\n");
		exit(1);
	}

	if (conn->h.state == STATE_CLOSE) {
		/* Connection is closed, but its resources are not released
		   to allow receiving completion of such late tasks.
		   When all tasks are released, and connection refcnt
		   drops to zero, then all the resources can be freed. */
		iser_complete_task(task);
		return 0;
	}

	rsp_bhs->opcode = ISCSI_OP_SCSI_CMD_RSP;
	rsp_bhs->flags = ISCSI_FLAG_CMD_FINAL;
	rsp_bhs->response = ISCSI_STATUS_CMD_COMPLETED;
	rsp_bhs->cmd_status = scsi_get_result(scmd);
	*((uint64_t *) rsp_bhs->rsvd) = (uint64_t) 0;
	rsp_bhs->itt = task->tag;
	rsp_bhs->rsvd1 = 0;
	rsp_bhs->statsn = cpu_to_be32(conn->h.stat_sn++);
	iser_set_rsp_cmd_sn(session, task->rsp.bhs);
	rsp_bhs->exp_datasn = 0;

	if (task->is_read)
	{
		in_resid = scsi_get_in_resid(scmd);
		if (in_resid != 0) {
			/* uni-directional read reports in the regular
			   residual field, bidi in the bi-residual field */
			if (!task->is_write)
				iser_rsp_set_read_resid(rsp_bhs, in_resid);
			else
				iser_rsp_set_bidir_resid(rsp_bhs, in_resid);
			/* underflow: shrink the RDMA-Write to the number
			   of bytes actually produced */
			if (in_resid > 0)
				task->rdma_wr_remains = task->in_len - in_resid;
		}
		/* the RDMA-Write is chained in front of the response
		   by iser_sched_tx(); no separate scheduling here:
		   schedule_rdma_write(task, conn); */
	}
	else {
		rsp_bhs->bi_residual_count = 0;
		rsp_bhs->residual_count = 0;
	}
	task->rsp.ahssize = 0;
	task->rsp.data.size = 0;

	if (sense_len) {
		/* sense goes into the data segment, prefixed by its length */
		struct iscsi_sense_data *sense =
			(struct iscsi_sense_data *) task->rsp.data.buf;
		sense->length = cpu_to_be16(sense_len);
		memcpy(sense->data, scmd->sense_buffer, sense_len);
		task->rsp.data.size = sense_len + sizeof(*sense);
	}
	dprintf("task:%p tag:0x%04lx status:%x statsn:%d flags:0x%x "
		"in_len:%d out_len:%d rsd:%d birsd:%d\n",
		task, (unsigned long)task->tag, rsp_bhs->cmd_status,
		conn->h.stat_sn-1, rsp_bhs->flags,
		scsi_get_in_length(scmd), scsi_get_out_length(scmd),
		ntohl(rsp_bhs->residual_count),
		ntohl(rsp_bhs->bi_residual_count));

	schedule_resp_tx(task, conn);

	return 0;
}
+
+/*
+static int iser_task_handle_ahs(struct iser_task *task)
+{
+ struct scsi_cmd *scmd = &task->scmd;
+ struct iscsi_connection *conn = task->conn;
+ struct iscsi_cmd *req_bhs = (struct iscsi_cmd *) &task->req.;
+ uint32_t data_len;
+ uint8_t *ahs;
+ int ahslen;
+ enum data_direction dir = scsi_get_data_dir(scmd);
+
+ ahs = task->ahs;
+ ahslen = req_bhs->hlength * 4;
+ if (ahslen >= 4) {
+ struct iscsi_ecdb_ahdr *ahs_extcdb = (void *) ahs;
+
+ if (ahs_extcdb->ahstype == ISCSI_AHSTYPE_CDB) {
+ int extcdb_len = ntohs(ahs_extcdb->ahslength) - 1;
+ unsigned char *p = (void *)task->extdata;
+
+ if (4 + extcdb_len > ahslen) {
+ eprintf("AHS len:%d too short for extcdb %d\n",
+ ahslen, extcdb_len);
+ return -EINVAL;
+ }
+ if (extcdb_len + sizeof(req_bhs->cdb) > 260) {
+ eprintf("invalid extcdb len:%d\n", extcdb_len);
+
+ return -EINVAL;
+ }
+
+ memcpy(p, req_bhs->cdb, sizeof(req_bhs->cdb));
+ memmove(p + sizeof(req_bhs->cdb), ahs_extcdb->ecdb,
+ extcdb_len);
+
+ scmd->scb = p;
+ scmd->scb_len = sizeof(req_bhs->cdb) + extcdb_len;
+
+ ahs += 4 + extcdb_len;
+ ahslen -= 4 + extcdb_len;
+ }
+ }
+
+ if (dir == DATA_BIDIRECTIONAL && ahslen >= 8) {
+ struct iscsi_rlength_ahdr *ahs_bidi = (void *) ahs;
+ if (ahs_bidi->ahstype == ISCSI_AHSTYPE_RLENGTH) {
+ uint32_t in_length = ntohl(ahs_bidi->read_length);
+
+ dprintf("bidi read len %u\n", in_length);
+
+ if (in_length) {
+ uint32_t len;
+ void *buf;
+
+ len = roundup(in_length,
+ conn->tp->data_padding);
+ buf = conn->tp->alloc_data_buf(conn, len);
+ if (!buf)
+ return -ENOMEM;
+
+ scsi_set_in_buffer(scmd, buf);
+ scsi_set_in_length(scmd, in_length);
+ }
+ }
+ }
+}
+*/
+
+static void iser_sched_buf_alloc(struct event_data *evt)
+{
+ struct iser_conn *conn = (struct iser_conn *) evt->data;
+ struct iser_task *task;
+ int err;
+
+ while (!list_empty(&conn->buf_alloc_list)) {
+ task = list_first_entry(&conn->buf_alloc_list,
+ struct iser_task,
+ exec_list);
+ err = iser_scsi_cmd_buf_alloc(task);
+ if (err == -ENOMEM)
+ break;
+ list_del(&task->exec_list);
+ }
+}
+
+static inline int task_batch_type(struct iser_task *task)
+{
+ struct iscsi_cmd *req_bhs = (struct iscsi_cmd *) task->req.bhs;
+
+ switch (req_bhs->cdb[0]) {
+ case SYNCHRONIZE_CACHE:
+ case SYNCHRONIZE_CACHE_16:
+ case WRITE_6:
+ case WRITE_10:
+ case WRITE_12:
+ case WRITE_16:
+ case READ_6:
+ case READ_10:
+ case READ_12:
+ case READ_16:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static void iser_sched_iosubmit(struct event_data *evt)
+{
+ struct iser_conn *conn = (struct iser_conn *) evt->data;
+ struct iser_task *task = NULL;
+
+ while (!list_empty(&conn->iosubmit_list)) {
+ task = list_first_entry(&conn->iosubmit_list,
+ struct iser_task, exec_list);
+ list_del(&task->exec_list);
+ iser_scsi_cmd_iosubmit(task);
+ }
+}
+
/*
 * Scheduler callback: link all pending RDMA-Read work requests into a
 * single send_wr chain and post it with one ibv call.
 */
static void iser_sched_rdma_rd(struct event_data *evt)
{
	struct iser_conn *conn = (struct iser_conn *) evt->data;
	struct iser_work_req *first_wr = NULL;
	struct iser_task *prev_task = NULL;
	struct iser_task *task = NULL;
	int err;

	if (conn->h.state == STATE_CLOSE) {
		dprintf("ignoring rdma_rd for closed conn\n");
		/* ToDo: free all tasks and buffers */
		return;
	}

	while (!list_empty(&conn->rdma_rd_list)) {
		task = list_first_entry(&conn->rdma_rd_list,
					struct iser_task, rdma_list);
		list_del(&task->rdma_list);

		iser_prep_rdma_rd_send_req(task, NULL, 1);
		/* chain this WR after the previous task's WR */
		if (first_wr == NULL)
			first_wr = &task->rdmad;
		else
			prev_task->rdmad.send_wr.next = &task->rdmad.send_wr;
		prev_task = task;
	}
	if (prev_task) {
		/* terminate the chain before posting */
		prev_task->rdmad.send_wr.next = NULL;
		/* submit the chain of rdma-rd requests, start from the first */
		err = iser_post_send(conn, first_wr);
	}
	/* ToDo: error handling (err is currently ignored) */
}
+
/*
 * Scheduler callback: for every queued response, build its send WR
 * (for reads, an RDMA-Write WR chained in front of the response WR),
 * link everything into one chain and post it with a single call.
 */
static void iser_sched_tx(struct event_data *evt)
{
	struct iser_conn *conn = (struct iser_conn *) evt->data;
	struct iser_work_req *first_wr = NULL;
	struct iser_task *prev_task = NULL;
	struct iser_task *task;
	struct iser_work_req *cur_send_wr;
	int err;

	if (conn->h.state == STATE_CLOSE) {
		dprintf("ignoring tx for closed conn\n");
		return;
	}

	while (!list_empty(&conn->resp_tx_list)) {
		task = list_first_entry(&conn->resp_tx_list,
					struct iser_task,
					tx_list);
		/* move the task to the sent list; it is released on
		   send completion */
		list_del(&task->tx_list);
		list_add_tail(&task->tx_list, &conn->sent_list);

		if (task->is_read) {
			/* the RDMA-Write carrying the data precedes the
			   response WR (task->txd is chained after it) */
			iser_prep_rdma_wr_send_req(task, &task->txd, 0);
			cur_send_wr = &task->rdmad;
		}
		else
			cur_send_wr = &task->txd;

		/* chain after the previous task's response WR */
		if (prev_task == NULL)
			first_wr = cur_send_wr;
		else
			prev_task->txd.send_wr.next = &cur_send_wr->send_wr;

		iser_prep_resp_send_req(task, NULL, 1);
		prev_task = task;
	}
	if (prev_task) {
		/* terminate the chain before posting */
		prev_task->txd.send_wr.next = NULL;
		/* submit the chain of rdma-wr & tx reqs, start from the first */
		err = iser_post_send(conn, first_wr);
		/* ToDo: error handling (err is currently ignored) */
	}
}
+
+static void iser_sched_post_recv(struct event_data *evt)
+{
+ struct iser_conn *conn = (struct iser_conn *) evt->data;
+ struct iser_task *first_task = NULL;
+ struct iser_task *prev_task = NULL;
+ struct iser_task *task;
+ int num_recv_bufs = 0;
+ int err;
+
+ if (conn->h.state == STATE_CLOSE) {
+ dprintf("ignoring post recv for closed conn\n");
+ return;
+ }
+
+ while (!list_empty(&conn->post_recv_list)) {
+ task = list_first_entry(&conn->post_recv_list,
+ struct iser_task,
+ recv_list);
+ list_del(&task->recv_list);
+
+ if (prev_task == NULL)
+ first_task = task;
+ else
+ prev_task->rxd.recv_wr.next = &task->rxd.recv_wr;
+
+ prev_task = task;
+ num_recv_bufs ++;
+ }
+ if (prev_task) {
+ prev_task->rxd.recv_wr.next = NULL;
+ // post the chain of recv buffers, start from the first
+ err = iser_post_recv(conn, first_task, num_recv_bufs);
+ // ToDo: error handling
+ }
+}
+
/*
 * Parse a received SCSI Command PDU: derive the transfer directions
 * and initialize the per-task unsolicited/solicited data counters that
 * drive buffer allocation, RDMA-Read and Data-Out accounting.
 * Returns 0 on success, -EINVAL on protocol violation.
 */
static int iser_scsi_cmd_rx(struct iser_task *task)
{
	struct iser_conn *conn = task->conn;
	struct iscsi_session *session = conn->h.session;
	struct iscsi_cmd *req_bhs = (struct iscsi_cmd *) task->req.bhs;
	unsigned int flags = req_bhs->flags;
	uint32_t imm_data_sz = ntoh24(req_bhs->dlength);
	uint32_t xfer_sz = ntohl(req_bhs->data_length);

	task->is_read = flags & ISCSI_FLAG_CMD_READ;
	task->is_write = flags & ISCSI_FLAG_CMD_WRITE;

	if (task->is_write) {
		task->out_len = xfer_sz;

		/* add immediate data to the task */

		/* without InitialR2T the initiator may send up to
		   FirstBurstLength bytes unsolicited */
		if (!conn->h.session_param[ISCSI_PARAM_INITIAL_R2T_EN].val) {
			task->unsol_sz = conn->h.session_param[ISCSI_PARAM_FIRST_BURST].val;
			if (task->out_len < task->unsol_sz)
				task->unsol_sz = task->out_len;
		}
		else
			task->unsol_sz = 0;

		if (conn->h.session_param[ISCSI_PARAM_IMM_DATA_EN].val) {
			if (imm_data_sz > 0) {
				if (task->unsol_sz == 0)
					task->unsol_sz = imm_data_sz;
				/* the PDU payload itself becomes the first
				   out-buffer, at offset 0 */
				iser_task_add_out_pdu_buf(task, &task->req.data, 0);
			}
		}
		else if (imm_data_sz > 0) {
			eprintf("ImmediateData disabled but received\n");
			return -EINVAL;
		}

		/* immediate data is the first chunk of the unsolicited data */
		task->unsol_remains = task->unsol_sz - imm_data_sz;
		/* rdma-reads cover the entire solicited data */
		task->rdma_rd_sz = task->rdma_rd_remains =
			task->out_len - task->unsol_sz;
		/* ToDo: multiple RDMA-Write buffers */
		/* task->rdma_rd_offset = task->unsol_sz; */

		task->in_len = task->rdma_wr_sz = task->rdma_wr_remains = 0;

		if (!task->is_read)
			scsi_set_data_dir(&task->scmd, DATA_WRITE);
		else
			scsi_set_data_dir(&task->scmd, DATA_BIDIRECTIONAL);
	} else {
		/* read or no-data command: the whole transfer (if any)
		   is returned via RDMA-Write */
		task->in_len = task->rdma_wr_sz =
			task->rdma_wr_remains = xfer_sz;
		task->out_len = task->unsol_sz = task->unsol_remains =
			task->rdma_rd_sz = task->rdma_rd_remains = 0;

		if (task->is_read)
			scsi_set_data_dir(&task->scmd, DATA_READ);
		else
			scsi_set_data_dir(&task->scmd, DATA_NONE);
	}

	/* track the command for Data-Out matching and TMF lookup */
	list_add_tail(&task->session_list, &session->cmd_list);

	dprintf("task:%p tag:0x%04lx scsi_op:0x%x %s%s in_len:%d out_len:%d "
		"imm_sz:%d unsol_sz:%d cmdsn:0x%x expcmdsn:0x%x\n",
		task, (long unsigned)task->tag, req_bhs->cdb[0],
		task->is_read ? "rd" : "", task->is_write ? "wr" : "",
		task->in_len, task->out_len, imm_data_sz, task->unsol_sz,
		ntohl(req_bhs->cmdsn), session->exp_cmd_sn);

	return 0;
}
+
/*
 * Handle a received Data-Out PDU: find the owning SCSI command by ITT,
 * attach the payload as an out-buffer at the PDU's buffer offset, and
 * submit the command once all unsolicited and RDMA-Read data arrived.
 * Returns 0 on success, -EINVAL if no matching command exists.
 */
static int iser_data_out_rx(struct iser_task *dout_task)
{
	struct iser_conn *conn = dout_task->conn;
	struct iscsi_session *session = conn->h.session;
	struct iscsi_data *req_bhs =
		(struct iscsi_data *) dout_task->req.bhs;
	struct iser_task *task;
	int err = 0;

	/* locate the command this Data-Out belongs to */
	list_for_each_entry(task, &session->cmd_list, session_list) {
		if (task->tag == req_bhs->itt)
			goto found;
	}
	return -EINVAL;

found:
	iser_task_add_out_pdu_buf(task, &dout_task->req.data,
				  be32_to_cpu(req_bhs->offset));
	list_add_tail(&dout_task->dout_task_list, &task->dout_task_list);

	/* ToDo: BUG!!! add an indication that it's data-out task so that
	   it can be released when the buffer is released */

	task->unsol_remains -= ntoh24(req_bhs->dlength);

	dprintf("task:%p tag:0x%04lx, dout taskptr:%p out_len:%d "
		"uns_rem:%d rdma_rd_rem:%d expcmdsn:0x%x\n",
		task, (long unsigned)task->tag, dout_task, task->out_len,
		task->unsol_remains, task->rdma_rd_remains,
		session->exp_cmd_sn);

	/* ToDo: look at the task counters !!! */
	if (req_bhs->ttt == cpu_to_be32(ISCSI_RESERVED_TAG)) {
		/* unsolicited Data-Out */
		if (req_bhs->flags & ISCSI_FLAG_CMD_FINAL) {
			if (!task_pending(task)) {
				/* ToDo: this condition is weird ... */
				if (task->rdma_rd_remains == 0 && task->unsol_remains == 0)
					schedule_task_iosubmit(task,conn);
			}
		}
	} else {
		/* solicited (R2T-driven) Data-Out */
		if (!(req_bhs->flags & ISCSI_FLAG_CMD_FINAL))
			return err;

		if (task->rdma_rd_remains == 0 && task->unsol_remains == 0)
			schedule_task_iosubmit(task,conn);
	}
	return err;
}
+
+static void iser_pdu_copy(struct iser_pdu *dst_pdu, struct iser_pdu *src_pdu)
+{
+ memcpy(dst_pdu->iser_hdr, src_pdu->iser_hdr, sizeof(*src_pdu->iser_hdr));
+ memcpy(dst_pdu->bhs, src_pdu->bhs, sizeof(*src_pdu->bhs));
+ dst_pdu->ahssize = src_pdu->ahssize;
+ // ToDo ahs alloc and copy
+
+ dst_pdu->data.size = src_pdu->data.size;
+ memcpy(dst_pdu->data.buf, src_pdu->data.buf, src_pdu->data.size);
+}
+
+static void iser_login_rx(struct iser_task *task)
+{
+ struct iser_conn *conn = task->conn;
+ struct iser_task *tx_task = conn->login_tx_task;
+
+ iser_login_exec(task);
+ iser_pdu_copy(&tx_task->rsp, &task->rsp);
+
+ if (conn->login_phase != LOGIN_PHASE_LAST_SEND)
+ iser_post_recv(conn, task, 1);
+ else {
+ dprintf("transitioning to full-feature, no repost\n");
+ }
+ iser_prep_resp_send_req(tx_task, NULL, 1);
+ iser_post_send(conn, &tx_task->txd);
+}
+
+static int iser_nop_out_rx(struct iser_task *task)
+{
+ struct iser_conn *conn = task->conn;
+// struct iscsi_session *session = conn->h.session;
+ struct iscsi_nopout *req_bhs = (struct iscsi_nopout *) task->req.bhs;
+ int err = 0;
+
+ if (req_bhs->ttt != cpu_to_be32(ISCSI_RESERVED_TAG)) {
+ /*
+ * We don't request a NOP-Out by sending a NOP-In.
+ * See 10.18.2 in the draft 20.
+ */
+ eprintf("initiator bug\n");
+ err = -ISCSI_REASON_PROTOCOL_ERROR;
+ goto reject;
+ }
+ if (req_bhs->itt == cpu_to_be32(ISCSI_RESERVED_TAG)) {
+ if (req_bhs->opcode & ISCSI_OP_IMMEDIATE) {
+ dprintf("No response to Nop-Out\n");
+ iser_post_recv(conn, task, 1);
+ return -EAGAIN; // ToDo: fix the ret codes
+ } else {
+ eprintf("initiator bug\n");
+ err = -ISCSI_REASON_PROTOCOL_ERROR;
+ goto reject;
+ }
+ }
+
+ task->out_len = ntoh24(req_bhs->dlength);
+ dprintf("nop-out task:%p cmdsn:0x%x data_sz:%d\n",
+ task, ntohl(req_bhs->cmdsn), task->out_len);
+
+ return 0;
+
+reject: // ToDo: prepare and send reject pdu
+ /* iser_send_reject(task, ISCSI_REASON_INVALID_PDU_FIELD); */
+ return err;
+}
+
+static int iser_task_delivery(struct iser_task *task)
+{
+ int err;
+
+ switch (task->opcode) {
+ case ISCSI_OP_SCSI_CMD:
+ err = iser_scsi_cmd_buf_alloc(task);
+ if (err == -ENOMEM)
+ list_add_tail(&task->exec_list,
+ &task->conn->buf_alloc_list);
+ break;
+ case ISCSI_OP_NOOP_OUT:
+ err = iser_nop_out_exec(task);
+ break;
+ case ISCSI_OP_LOGOUT:
+ err = iser_logout_exec(task);
+ break;
+ case ISCSI_OP_SCSI_TMFUNC:
+ err = iser_tm_exec(task);
+ break;
+ case ISCSI_OP_TEXT:
+ err = iser_text_exec(task);
+ break;
+ default:
+ eprintf("Internal error: Unexpected op:0x%x\n",task->opcode);
+ err = -EINVAL;
+ break;
+ }
+ return err;
+}
+
/*
 * Compare two iSCSI command sequence numbers using serial number
 * arithmetic (RFC 1982 style), staying correct across 32-bit
 * wrap-around.
 * Returns 0 if equal, 1 if sn1 is logically after sn2, -1 if before.
 */
static inline int cmdsn_cmp(uint32_t sn1, uint32_t sn2)
{
	if (sn1 == sn2)
		return 0;

	/* Subtract in unsigned space first: the previous form
	   (int32_t)sn1 - (int32_t)sn2 is signed-overflow UB when the
	   operands lie in opposite halves of the sequence ring. */
	return (int32_t)(sn1 - sn2) > 0 ? 1 : -1;
}
+
/*
 * Queue a task for execution ordered by CmdSN; no direct exec here
 * except for commands that are already in sequence.
 * - Immediate tasks are delivered right away.
 * - An in-sequence task is delivered, then any consecutive pending
 *   tasks are drained from the session's pending_cmd_list.
 * - An out-of-sequence task within the command window is inserted
 *   into the pending list, kept sorted by CmdSN.
 * Returns 0 on success (including "parked waiting for buffers"),
 * negative on protocol errors (out-of-window or duplicate CmdSN).
 */
static int iser_queue_task(struct iscsi_session *session,
			   struct iser_task *task)
{
	struct iscsi_hdr *req_bhs = (struct iscsi_hdr *) task->req.bhs;
	int is_immediate = req_bhs->opcode & ISCSI_OP_IMMEDIATE;
	/* NOTE(review): in initiator-to-target PDUs this dword carries
	   the CmdSN; the generic iscsi_hdr layout names it statsn */
	uint32_t cmd_sn = be32_to_cpu(req_bhs->statsn);
	struct list_head *cmp_entry;
	int err;

	if (is_immediate) {
		dprintf("exec imm task task:%p cmd_sn:0x%x\n", task, cmd_sn);
		err = iser_task_delivery(task);
		if (!err || err == -ENOMEM)
			return 0;
		else
			return err;
	}

	/* if the current command is the expected one, exec it
	   and all others possibly acumulated on the queue */
	while (session->exp_cmd_sn == cmd_sn) {
		session->exp_cmd_sn++;
		dprintf("exec task:%p cmd_sn:0x%x\n", task, cmd_sn);
		err = iser_task_delivery(task);
		if (err && err != -ENOMEM) {
			// when no free buffers remain, the task will wait
			// on queue, so it is not a real error, but we should
			// not attempt to start other tasks until more
			// memory becomes available
			return err; // ToDo: what if there are more tasks in case of error
		}

		if (list_empty(&session->pending_cmd_list))
			return 0;

		/* drain the next pending command; the loop condition checks
		   whether it is now in sequence */
		task = list_first_entry(&session->pending_cmd_list,
					struct iser_task, exec_list);
		list_del(&task->exec_list);
		clear_task_pending(task);
		cmd_sn = be32_to_cpu(task->req.bhs->statsn);
	}

	/* cmd_sn > (exp_cmd_sn+ISER_MAX_QUEUE_CMD), i.e. beyond allowed window */
	if (cmdsn_cmp(cmd_sn, session->exp_cmd_sn+ISER_MAX_QUEUE_CMD) == 1) {
		eprintf("unexpected cmd_sn:0x%x, max:0x%x\n",
			cmd_sn, session->exp_cmd_sn+ISER_MAX_QUEUE_CMD);
		return -EINVAL;
	}

	/* insert the current task, ordered by cmd_sn */
	list_for_each_prev(cmp_entry, &session->pending_cmd_list) {
		struct iser_task *cmp_task;
		uint32_t cmp_cmd_sn;
		int cmp_res;

		cmp_task = list_entry(cmp_entry, struct iser_task, exec_list);
		cmp_cmd_sn = be32_to_cpu(cmp_task->req.bhs->statsn);
		cmp_res = cmdsn_cmp(cmd_sn, cmp_cmd_sn);
		if (cmp_res == 1) { /* cmd_sn > cmp_cmd_sn */
			dprintf("inserted cmdsn:0x%x after cmdsn:0x%x\n",
				cmd_sn, cmp_cmd_sn);
			break;
		}
		else if (cmp_res == -1) { /* cmd_sn < cmp_cmd_sn */
			dprintf("inserting cmdsn:0x%x skip cmdsn:0x%x\n",
				cmd_sn, cmp_cmd_sn);
			continue;
		}
		else { /* cmd_sn == cmp_cmd_sn */
			eprintf("duplicate cmd_sn:0x%x, exp:%u\n",
				cmd_sn, session->exp_cmd_sn);
			return -EINVAL;
		}
	}
	list_add(&task->exec_list, cmp_entry);
	set_task_pending(task);
	return 0;
}
+
+static int iser_parse_req_headers(struct iser_task *task)
+{
+ struct iser_hdr *iser_hdr = task->req.iser_hdr;
+ struct iscsi_hdr *iscsi_hdr = task->req.bhs;
+ int err = -1;
+
+ switch (iser_hdr->flags & 0xF0) {
+ case ISCSI_CTRL:
+ if (iser_hdr->flags & ISER_RSV) {
+ task->rem_read_stag =
+ be32_to_cpu(iser_hdr->read_stag);
+ task->rem_read_va = be64_to_cpu(iser_hdr->read_va);
+ dprintf("task:%p rstag:0x%x va:0x%lx\n", task,
+ task->rem_read_stag, task->rem_read_va);
+ }
+ if (iser_hdr->flags & ISER_WSV) {
+ task->rem_write_stag =
+ be32_to_cpu(iser_hdr->write_stag);
+ task->rem_write_va =
+ be64_to_cpu(iser_hdr->write_va);
+ dprintf("task:%p wstag:0x%x va:0x%lx\n", task,
+ task->rem_write_stag, task->rem_write_va);
+ }
+ err = 0;
+ break;
+ case ISER_HELLO:
+ dprintf("iSER Hello message??\n");
+ break;
+ default:
+ eprintf("malformed iser iser_hdr, flags 0x%02x\n",
+ iser_hdr->flags);
+ break;
+ }
+
+ task->opcode = iscsi_hdr->opcode & ISCSI_OPCODE_MASK;
+ task->req.ahssize = iscsi_hdr->hlength * 4;
+ task->req.data.buf += task->req.ahssize;
+ task->req.data.size = ntoh24(iscsi_hdr->dlength);
+ task->req.data.rdma = 0;
+
+ return err;
+}
+
/*
 * Completion handler for a received request PDU. Drops the reference
 * taken when the recv was posted, parses the iSER/iSCSI headers,
 * dispatches by opcode, and for most commands hands the task to
 * iser_queue_task for CmdSN-ordered delivery. Any error closes the
 * connection.
 */
static void iser_rx_handler(struct iser_work_req *rxd)
{
	struct iser_task *task = rxd->task;
	struct iser_conn *conn = task->conn;
	int queue_task = 1;
	int err = 0;

	//--conn->recvl_posted;
	iser_conn_put(conn);

	if (conn->h.state == STATE_CLOSE)
		return;

	conn->h.exp_stat_sn = be32_to_cpu(task->req.bhs->exp_statsn);
	task->tag = task->req.bhs->itt;
	task->is_read = task->is_write = 0;

	err = iser_parse_req_headers(task);
	if (err)
		goto out;

	INIT_LIST_HEAD(&task->in_buf_list);
	task->in_buf_num = 0;

	INIT_LIST_HEAD(&task->out_buf_list);
	task->out_buf_num = 0;

	switch (task->opcode) {
	case ISCSI_OP_LOGIN:
		dprintf("login rx\n");
		iser_login_rx(task);
		queue_task = 0; /* login is answered directly, no queueing */
		break;
	case ISCSI_OP_SCSI_CMD:
		err = iser_scsi_cmd_rx(task);
		break;
	case ISCSI_OP_SCSI_DATA_OUT:
		err = iser_data_out_rx(task);
		queue_task = 0; /* data-out continues an existing command */
		break;
	case ISCSI_OP_NOOP_OUT:
		err = iser_nop_out_rx(task);
		break;
	case ISCSI_OP_LOGOUT:
		dprintf("logout rx\n");
		break;
	case ISCSI_OP_SCSI_TMFUNC:
		dprintf("tmfunc rx\n");
		break;
	case ISCSI_OP_TEXT:
		dprintf("text rx\n");
		break;
	case ISCSI_OP_SNACK:
		eprintf("Cannot handle SNACK yet\n");
		err = -EINVAL;
		break;
	default:
		eprintf("Unknown op 0x%x\n",task->opcode);
		err = -EINVAL;
		break;
	}
	if (!err && queue_task)
		err = iser_queue_task(conn->h.session, task);
out:
	if (err) {
		eprintf("task:%p conn:%p err:%d\n", task, &conn->h, err);
		iser_conn_close(conn);
	}
}
+
+/*
+static void iser_send_chain_complete(struct iser_work_req *txd)
+{
+ struct iser_task *task = txd->task;
+
+ for (;;) {
+ if (txd->send_wr.opcode == IBV_WR_SEND) {
+ iser_complete_task(task);
+ iser_post_recv(task->conn, task);
+ }
+ else if (txd->send_wr.opcode != IBV_WR_RDMA_WRITE) {
+ eprintf("Unexpected send_wr.opcode:0x%x wr:%p\n",
+ txd->send_wr.opcode, &txd->send_wr);
+ exit(1);
+ }
+ if (!txd->send_wr.next)
+ break;
+
+ txd = container_of(txd->send_wr.next, struct iser_work_req,
+ send_wr);
+ task = txd->task;
+ }
+}
+*/
+
/*
 * Completion handler for a SEND work request. Removes the task from
 * the connection's sent list; a regular task is completed, while an
 * unsolicited (target-initiated NOP-In) task is recycled as the
 * connection's next ping task. During full-feature phase the recv
 * buffer is re-posted; completion of the final login send releases
 * the login resources and moves the connection to full-feature.
 */
static void iser_tx_complete_handler(struct iser_work_req *txd)
{
	struct iser_task *task = txd->task;
	struct iser_conn *conn = task->conn;

	dprintf("task:%p tag:0x%04lx\n", task, task->tag);

	list_del(&task->tx_list); /* remove from conn->sent_list */
	if (!task->unsolicited)
		iser_complete_task(task);
	else {
		conn->nop_in_task = task;
		return;
	}

	if (conn->h.state == STATE_CLOSE)
		return;

	if (conn->login_phase == LOGIN_PHASE_FF)
		schedule_post_recv(task, conn);
	else if (conn->login_phase == LOGIN_PHASE_LAST_SEND) {
		dprintf("last login send completed, release, to ff\n");
		iser_free_login_resources(conn);
		conn->login_phase = LOGIN_PHASE_FF;
		conn->h.state = STATE_FULL;
	}
}
+
/*
 * Completion handler for an RDMA-Write work request.
 * NOTE(review): currently a deliberate debug trap - signalled RDMA
 * write completions are not expected here, so reaching this handler
 * aborts the process. Everything after the exit(1) is unreachable.
 */
static void iser_rdma_wr_complete_handler(struct iser_work_req *rdmad)
{
	struct iser_task *task = rdmad->task;
	struct iser_conn *conn = task->conn;

	eprintf("task:%p tag:0x%04lx\n", task, task->tag);
	exit(1);

	if (conn->h.state == STATE_CLOSE)
		return;

	//iser_send_chain_complete(rdmad);
}
+
/*
 * Called from CQ processing. Hands completed write data to iscsi.
 * Decrements the task's outstanding RDMA-Read byte count by this
 * request's sge length; once all RDMA reads and all unsolicited
 * data-outs have arrived, the command is scheduled for I/O submission.
 */
static void iser_rdma_rd_complete_handler(struct iser_work_req *rdmad)
{
	struct iser_task *task = rdmad->task;
	struct iser_conn *conn = task->conn;

	// ToDo: need iser_complete_task?

	if (conn->h.state == STATE_CLOSE)
		return;

	task->rdma_rd_remains -= rdmad->sge.length;
	dprintf("task:%p tag:0x%04lx, rems rdma:%d unsol:%d\n",
		task, task->tag, task->rdma_rd_remains,
		task->unsol_remains);
	if (task->rdma_rd_remains == 0 && task->unsol_remains == 0)
		schedule_task_iosubmit(task,conn);
}
+
/*
 * Deal with just one work completion.
 * Demultiplexes a successful completion to the handler matching its
 * opcode; wr_id carries the associated iser_work_req pointer. An
 * unexpected opcode is treated as fatal.
 */
static void handle_wc(struct ibv_wc *wc)
{
	void *wr_id = ptr_from_int64(wc->wr_id);

	switch (wc->opcode) {
	case IBV_WC_SEND:
		dprintf("tx complete, wr_id:%p len:%d\n", wr_id, wc->byte_len);
		iser_tx_complete_handler(wr_id);
		break;
	case IBV_WC_RECV:
		dprintf("rx cmd, wr_id:%p len:%d\n", wr_id, wc->byte_len);
		iser_rx_handler(wr_id);
		break;
	case IBV_WC_RDMA_WRITE:
		dprintf("rdma write done, wr_id:%p len:%d\n", wr_id, wc->byte_len);
		iser_rdma_wr_complete_handler(wr_id);
		break;
	case IBV_WC_RDMA_READ:
		dprintf("rdma read done, wr_id:%p len:%d\n", wr_id, wc->byte_len);
		iser_rdma_rd_complete_handler(wr_id);
		break;
	default:
		eprintf("unexpected opcode %d\n", wc->opcode);
		exit(1);
	}
}
+
/*
 * Handle a work request flushed during connection teardown. While the
 * connection is closing, each flushed request drops one connection
 * reference taken when it was posted; a flush on a live connection is
 * unexpected and only logged.
 */
static void handle_wc_flush(struct ibv_wc *wc)
{
	struct iser_work_req *wr = ptr_from_int64(wc->wr_id);
	struct iser_task *task = wr->task;
	struct iser_conn *conn = task->conn;

	dprintf("conn:%p wr_id:0x%llx task:%p\n",
		&conn->h, (unsigned long long) wc->wr_id, task);

	if (conn->h.state == STATE_CLOSE) {
		//--conn->recvl_posted;
		iser_conn_put(conn);
	} else {
		eprintf("conn %p wr flush err\n", conn);
		/* call disconnect now? */
	}
}
+
+static void handle_wc_error(struct ibv_wc *wc)
+{
+ struct iser_work_req *wr = ptr_from_int64(wc->wr_id);
+ struct iser_task *task = wr->task;
+ struct iser_conn *conn = task->conn;
+ int complete = 1;
+ char *op_str;
+
+ if (wr->recv) {
+ op_str = "recv";
+ complete = 0;
+ } else {
+ switch (wr->send_wr.opcode) {
+ case IBV_WC_SEND:
+ op_str = "send";
+ break;
+ case IBV_WC_RDMA_WRITE:
+ op_str = "rdma_wr";
+ break;
+ case IBV_WC_RDMA_READ:
+ op_str = "rdma_rd";
+ break;
+ default:
+ op_str = "Unknown";
+ complete = 0;
+ break;
+ }
+ }
+ eprintf("conn:%p task:%p tag:0x%04lx op:%s WC status:%d wr_id:0x%llx\n",
+ &conn->h, task, task->tag, op_str, wc->status,
+ (unsigned long long) wc->wr_id);
+
+ if (complete)
+ iser_complete_task(task);
+ iser_conn_close(conn);
+}
+
+/*
+ * Could read as many entries as possible without blocking, but
+ * that just fills up a list of tasks. Instead pop out of here
+ * so that tx progress, like issuing rdma reads and writes, can
+ * happen periodically.
+ */
+static int iser_poll_cq(struct iser_device *dev, int max_wc)
+{
+ int err = 0, numwc = 0;
+ struct ibv_wc wc;
+
+ for (;;) {
+ err = ibv_poll_cq(dev->cq, 1, &wc);
+ if (err < 0) {
+ eprintf("ibv_poll_cq %d\n", err);
+ break;
+ } else if (err == 0) {
+ break;
+ }
+
+ VALGRIND_MAKE_MEM_DEFINED(&wc, sizeof(wc));
+ if (wc.status == IBV_WC_SUCCESS) {
+ handle_wc(&wc);
+ if (++numwc == max_wc) {
+ err = 1;
+ break;
+ }
+ } else if (wc.status == IBV_WC_WR_FLUSH_ERR)
+ handle_wc_flush(&wc);
+ else
+ handle_wc_error(&wc);
+ }
+ return err;
+}
+
+static int num_delayed_arm = 0;
+#define MAX_NUM_DELAYED_ARM 8
+
+static void iser_poll_cq_armable(struct iser_device *dev)
+{
+ int err;
+
+ err = iser_poll_cq(dev, MAX_POLL_WC);
+ if (err < 0)
+ exit(1);
+
+ if (err == 0) {
+ if (++num_delayed_arm == MAX_NUM_DELAYED_ARM) {
+ num_delayed_arm = 0;
+ /* no more completions on cq, arm the completion interrupts */
+ err = ibv_req_notify_cq(dev->cq, 0);
+ if (err) {
+ eprintf("ibv_req_notify_cq: %s\n", strerror(err));
+ exit(1);
+ }
+ dev->poll_sched.sched_handler = iser_sched_consume_cq;
+ }
+ else
+ dev->poll_sched.sched_handler = iser_sched_poll_cq;
+ } else
+ dev->poll_sched.sched_handler = iser_sched_poll_cq;
+
+ tgt_add_sched_event(&dev->poll_sched);
+}
+
+/* Scheduled to poll cq after a completion event has been
+ received and acknowledged, if no more completions are found
+ the interrupts are re-armed */
+static void iser_sched_poll_cq(struct event_data *tev)
+{
+ struct iser_device *dev = tev->data;
+ iser_poll_cq_armable(dev);
+}
+
+/* Scheduled to consume completion events that could arrive
+ after the cq had been seen empty but just before
+ the notification interrupts were re-armed.
+ Intended to consume those remaining completions only,
+ this function does not re-arm interrupts. */
+static void iser_sched_consume_cq(struct event_data *tev)
+{
+ struct iser_device *dev = tev->data;
+ int err;
+
+ err = iser_poll_cq(dev, MAX_POLL_WC);
+ if (err < 0)
+ exit(1);
+}
+
/*
 * Called from main event loop when a CQ notification is available.
 * Fetches and acknowledges the event, cancels any already-scheduled
 * poll (it will be rescheduled as needed) and polls the CQ. A
 * notification without a CQ event is fatal.
 */
static void iser_cqe_handler(int fd __attribute__ ((unused)),
			     int events __attribute__ ((unused)),
			     void *data)
{
	struct iser_device *dev = data;
	void *cq_context;
	int err;

	err = ibv_get_cq_event(dev->cq_channel, &dev->cq, &cq_context);
	if (err != 0) {
		eprintf("notification, but no CQ event\n");
		exit(1);
	}

	ibv_ack_cq_events(dev->cq, 1);

	/* if a poll was previously scheduled, remove it,
	   as it will be scheduled when necessary */
	if (dev->poll_sched.scheduled)
		tgt_remove_sched_event(&dev->poll_sched);

	iser_poll_cq_armable(dev);
}
+
+/*
+ * Init entire iscsi transport. Begin listening for connections.
+ */
+static int iser_ib_init(void)
+{
+ int err;
+ struct sockaddr_in sock_addr;
+ short int port = iser_listen_port;
+
+ rdma_evt_channel = rdma_create_event_channel();
+ if (!rdma_evt_channel) {
+ eprintf("Failed to initialize RDMA; load kernel modules?\n");
+ return -1;
+ }
+
+ err = rdma_create_id(rdma_evt_channel, &cma_listen_id, NULL,
+ RDMA_PS_TCP);
+ if (err) {
+ eprintf("rdma_create_id: %s\n", strerror(err));
+ return -1;
+ }
+
+ memset(&sock_addr, 0, sizeof(sock_addr));
+ sock_addr.sin_family = AF_INET;
+ sock_addr.sin_port = htons(port);
+ sock_addr.sin_addr.s_addr = INADDR_ANY;
+ err =
+ rdma_bind_addr(cma_listen_id, (struct sockaddr *) &sock_addr);
+ if (err) {
+ if (err == -1)
+ eprintf("rdma_bind_addr -1: %m\n");
+ else
+ eprintf("rdma_bind_addr: %s\n", strerror(-err));
+ return -1;
+ }
+
+ /* 0 == maximum backlog */
+ err = rdma_listen(cma_listen_id, 0);
+ if (err) {
+ if (err == -1)
+ eprintf("rdma_listen -1: %m\n");
+ else
+ eprintf("rdma_listen: %s\n", strerror(-err));
+ return -1;
+ }
+
+ dprintf("listening for iser connections on port %d\n", port);
+ err = tgt_event_add(cma_listen_id->channel->fd, EPOLLIN,
+ iser_handle_rdmacm, NULL);
+ if (err)
+ return err;
+
+ INIT_LIST_HEAD(&iser_dev_list);
+ INIT_LIST_HEAD(&iser_conn_list);
+ INIT_LIST_HEAD(&temp_conn);
+
+ return err;
+}
+
#include <signal.h>
#include <sys/time.h>

/* keep-alive ping period, in seconds */
#define ISER_TIMER_INT_SEC 5
/* periodic SIGALRM interval timer, fires every ISER_TIMER_INT_SEC */
static struct itimerval iser_timer =
	{{ISER_TIMER_INT_SEC,0},
	 {ISER_TIMER_INT_SEC,0}};
/* pipe relaying timer signals into the event loop:
   [1] written by the signal handler, [0] polled via epoll */
static int timer_fd[2] = {0,0};
/* token written to the pipe on each timer tick */
static unsigned int ntimer = 0;
+
/*
 * SIGALRM handler: forwards the timer tick into the main event loop
 * by writing a token to the timer pipe.
 * NOTE(review): write() is async-signal-safe, but eprintf (stdio) on
 * the failure path is not - confirm this is acceptable here.
 */
static void iser_timer_sig_handler(int data)
{
	int err;

	// ntimer++;
	err = write(timer_fd[1], &ntimer, sizeof(ntimer));
	if (err < 0) {
		eprintf("Failed to write to pipe, %m\n");
		return;
	}
}
+
+static void iser_timer_evt_handler(int fd, int events, void *data)
+{
+ struct iser_conn *conn;
+ struct iser_task *task;
+ unsigned int n;
+ int err;
+
+ err = read(timer_fd[0], &n, sizeof(n));
+ if (err < 0) {
+ eprintf("Failed to write to pipe, %m\n");
+ return;
+ }
+
+ list_for_each_entry(conn, &iser_conn_list, conn_list) {
+ if (conn->h.state != STATE_FULL)
+ continue;
+ task = conn->nop_in_task;
+ if (!task)
+ continue;
+ conn->nop_in_task = NULL;
+ iser_send_ping_nop_in(task);
+ }
+}
+
+static int iser_start_timer(void)
+{
+ struct sigaction s;
+ int err;
+
+ sigemptyset(&s.sa_mask);
+ sigaddset(&s.sa_mask, SIGALRM);
+ s.sa_flags = 0;
+ s.sa_handler = iser_timer_sig_handler;
+ err = sigaction(SIGALRM, &s, NULL);
+ if (err) {
+ eprintf("Failed to setup timer handler\n");
+ return err;
+ }
+
+ err = setitimer(ITIMER_REAL, &iser_timer, 0);
+ if (err) {
+ eprintf("Failed to set timer\n");
+ return err;
+ }
+
+ err = pipe(timer_fd);
+ if (err) {
+ eprintf("Failed to open timer pipe\n");
+ return err;
+ }
+
+ err = tgt_event_add(timer_fd[0], EPOLLIN, iser_timer_evt_handler, NULL);
+ if (err) {
+ eprintf("failed to add iser timer epoll event\n");
+ return err;
+ }
+
+ return 0;
+}
+
+static int iser_stop_timer(void)
+{
+ int err;
+
+ tgt_event_del(timer_fd[0]);
+
+ if (timer_fd[0] > 0)
+ close(timer_fd[0]);
+ if (timer_fd[1] > 0)
+ close(timer_fd[1]);
+
+ err = setitimer(ITIMER_REAL, 0, 0);
+ if (err)
+ eprintf("Failed to stop timer\n");
+
+ return err;
+}
+
/* Transport init hook: bring up the RDMA/CM listener.
   Starting the keep-alive timer is currently disabled. */
static int iser_init(int index, char *args)
{
	int err = iser_ib_init();

	if (err)
		return err;

	/* err = iser_start_timer();
	if (err)
		return err;
	*/

	return 0;
}
+
/* Transport teardown hook: stops the keep-alive timer machinery.
   NOTE(review): runs even though iser_start_timer is currently
   disabled in iser_init - confirm the cleanup is harmless then. */
static void iser_exit(void)
{
	iser_stop_timer();
}
+
+static int iser_target_create(struct target *t)
+{
+ struct iscsi_target *target;
+ int err;
+
+ err = iscsi_target_create(t);
+ if (err)
+ return err;
+
+ target = target_find_by_id(t->tid);
+ assert(target != NULL);
+
+ target->rdma = 1;
+ target->session_param[ISCSI_PARAM_INITIAL_R2T_EN].val = 1;
+ target->session_param[ISCSI_PARAM_IMM_DATA_EN].val = 0;
+
+ return 0;
+}
+
+static const char *lld_param_port = "port";
+
+static int iser_param_parser(char *p)
+{
+ char *q;
+
+ while (*p) {
+ if (!strncmp(p, lld_param_port, strlen(lld_param_port))) {
+ q = p + strlen(lld_param_port) + 1;
+ iser_listen_port = atoi(q);
+ eprintf("iser listen port:%d\n", iser_listen_port);
+ }
+
+ p += strcspn(p, ",");
+ if (*p == ',')
+ ++p;
+ }
+ return 0;
+}
+
/*
 * iSER transport driver descriptor: fully user-space (no kernel
 * component), reusing the generic iscsi target management entry
 * points with iSER-specific init and command-completion hooks.
 */
static struct tgt_driver iser = {
	.name = "iser",
	.use_kernel = 0,
	.init = iser_init,
	.exit = iser_exit,
	.target_create = iser_target_create,
	.target_destroy = iscsi_target_destroy,

	.update = iscsi_target_update,
	.show = iscsi_target_show,
	.cmd_end_notify = iser_scsi_cmd_done,
	.mgmt_end_notify = iser_tm_done,
	.transportid = iscsi_transportid,
	.default_bst = "rdwr",
};
+
/* Library constructor: registers the iser driver and its "iser"
   command-line parameter parser before main() runs. */
__attribute__ ((constructor))
static void iser_driver_constructor(void)
{
	register_driver(&iser);

	setup_param("iser", iser_param_parser);
}
+
--
1.6.5
--
To unsubscribe from this list: send the line "unsubscribe stgt" in
the body of a message to majordomo at vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
More information about the stgt
mailing list