These patches reflect a preliminary version of the iser-related changes
and are presented as a new implementation concept. More changes in the
iscsi infrastructure design are required.

This RFC contains a rewrite of the iser code (the file iscsi_rdma.c is
replaced by iser_ib.c, to simplify the diff). A new header iser.h is
created. A separate transport lld (named "iser") is defined. Some code,
mostly iscsi text related, is replicated and put into a new file
iser_text.c, mainly because the functions there are not general enough
and rely on specifics of the iscsi/tcp structs. Large portions of code
are moved from iscsid.c and shuffled around.

This code appears to fix an occasional data corruption seen with the
current version.

There are many unhandled error cases and rare conditions, left until
there is a solid common iscsi framework to rely upon. Most of them are
marked with ToDo comments.

The code is fairly RDMA-transport agnostic (ib/iwarp), but it was never
verified over iwarp (this fact is reflected in the file name iser_ib.c).

The code assumes an RDMA-only mode of work. This means that the first
burst, including immediate data, should be disabled, so that the entire
data transfer is performed using RDMA (see the sketch after the
diffstat). Some preparations for handling other (general) scenarios are
introduced, but as tgt has no framework for multi-buffer commands, these
extra code segments are either commented out or conditioned upon events
that should never take place. All such places have a ToDo comment over
them.

This specific patch compiles when applied to the current git head, but
was not verified to work with it (it was branched off a rather old
commit).

Files in patch 1/2:
 iser.h
 iser_ib.c

Files in patch 2/2:
 iscsi/iser_text.c
 iscsi/iscsid.c
 iscsi/iscsid.h
 iscsi/iscsi_tcp.c
 iscsi/conn.c
 iscsi/session.c
 iscsi/target.c
 iscsi/transport.h
 iscsi/chap.c
 list.h
 Makefile

Signed-off-by: Alexander Nezhinsky <alexandern at voltaire.com>
---
 usr/iscsi/iser.h    |  238 ++++
 usr/iscsi/iser_ib.c | 3103 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 3341 insertions(+), 0 deletions(-)
 create mode 100644 usr/iscsi/iser.h
 create mode 100644 usr/iscsi/iser_ib.c
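A minimal sketch of what RDMA-only mode implies for the login
negotiation, assuming tgt's usual session key indices from iscsid.h
(ISCSI_PARAM_INITIAL_R2T_EN, ISCSI_PARAM_IMM_DATA_EN,
ISCSI_PARAM_FIRST_BURST); the helper itself is hypothetical and not
part of this patch:

	static void iser_force_rdma_only(struct iscsi_conn_hdr *conn_h)
	{
		/* InitialR2T=Yes: no unsolicited data-out PDUs */
		conn_h->session_param[ISCSI_PARAM_INITIAL_R2T_EN].val = 1;
		/* ImmediateData=No: no data carried in the command PDU */
		conn_h->session_param[ISCSI_PARAM_IMM_DATA_EN].val = 0;
		/* first burst is irrelevant once both are disabled */
		conn_h->session_param[ISCSI_PARAM_FIRST_BURST].val = 0;
	}

With these keys pinned, every byte of SCSI data moves by RDMA-Read or
RDMA-Write, which is the only mode the code below implements.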
diff --git a/usr/iscsi/iser.h b/usr/iscsi/iser.h
new file mode 100644
index 0000000..b4f0d17
--- /dev/null
+++ b/usr/iscsi/iser.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2010 Voltaire, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+#ifndef ISER_H
+#define ISER_H
+
+#include "iscsid.h"
+
+/*
+ * The IB-extended version from the kernel. Stags and VAs are in
+ * big-endian format.
+ */
+struct iser_hdr {
+	uint8_t flags;
+	uint8_t rsvd[3];
+	uint32_t write_stag;	/* write rkey */
+	uint64_t write_va;
+	uint32_t read_stag;	/* read rkey */
+	uint64_t read_va;
+} __attribute__((packed));
+
+#define ISER_WSV	(0x08)
+#define ISER_RSV	(0x04)
+#define ISCSI_CTRL	(0x10)
+#define ISER_HELLO	(0x20)
+#define ISER_HELLORPLY	(0x30)
+
+struct iser_conn;
+
+/*
+ * Work requests are either posted Receives for control messages,
+ * or Send and RDMA ops (also considered "send" by IB).
+ * They have different work request descriptors.
+ * During a flush, we need to know the type of op and the
+ * task to which it is related.
+ */
+struct iser_work_req {
+	struct ibv_sge sge;
+	int recv;		/* type of op: recv or send */
+	struct iser_task *task;
+	union {
+		struct ibv_recv_wr recv_wr;
+		struct ibv_send_wr send_wr;
+	};
+	struct list_head wr_list;
+};
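+
+/*
+ * The wr_id of every work request is set to its descriptor's own
+ * address, so a completion can be demuxed back to its descriptor and
+ * task, roughly (the actual cqe handler lives in iser_ib.c):
+ *
+ *	struct iser_work_req *wr = ptr_from_int64(wc->wr_id);
+ *	struct iser_task *task = wr->task;
+ *	... dispatch on wr->recv and wc->status ...
+ */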
+
+/*
+ * Pre-registered memory. Buffers are allocated by iscsi from us, handed
+ * to the device to fill, then iser can send them directly without
+ * registration. The same applies to the write path.
+ */
+struct membuf {
+	void *buf;
+	unsigned size;
+	unsigned offset;	/* offset within task data */
+	struct list_head task_list;
+	int rdma;
+	struct list_head pool_list;
+};
+
+struct iser_pdu {
+	struct iser_hdr *iser_hdr;
+	struct iscsi_hdr *bhs;
+	unsigned int ahssize;
+	void *ahs;
+	struct membuf data;	// describes the pdu data section only,
+				// the original buffer is reflected in ibv_sge
+};
+
+/*
+ * Each SCSI command may have its own RDMA parameters. These appear on
+ * the connection, then later are assigned to the particular task to be
+ * used when the target responds.
+ */
+struct iser_task {
+	struct iser_conn *conn;
+
+	struct iser_work_req rxd;
+	struct iser_pdu req;
+
+	/* read and write from the initiator's point of view */
+	uint32_t rem_read_stag, rem_write_stag;
+	uint64_t rem_read_va, rem_write_va;
+
+	int opcode;
+	uint64_t tag;
+	int is_read;
+	int is_write;
+
+	unsigned long flags;
+
+	int in_len;
+	int out_len;
+
+	int unsol_sz;
+	int unsol_remains;
+	int rdma_rd_sz;
+	int rdma_rd_remains;
+	// int rdma_rd_offset; // ToDo: multiple RDMA-Write buffers
+	int rdma_wr_sz;
+	int rdma_wr_remains;
+
+	int unsolicited;
+
+	struct iser_work_req txd;
+	struct iser_pdu rsp;
+
+	struct list_head in_buf_list;
+	int in_buf_num;
+
+	struct list_head out_buf_list;
+	int out_buf_num;
+
+	struct iser_work_req rdmad;
+
+	struct list_head exec_list;
+	struct list_head rdma_list;
+	struct list_head tx_list;
+	struct list_head recv_list;
+
+	/* linked to session->cmd_list */
+	struct list_head session_list;
+
+	struct list_head dout_task_list;
+
+	int result;
+	struct scsi_cmd scmd;
+
+	unsigned long extdata[0];
+};
+
+struct iser_device;
+
+/*
+ * Parallels iscsi_connection. Adds more fields for iser.
+ */
+struct iser_conn {
+	struct iscsi_conn_hdr h;
+
+	struct ibv_qp *qp_hndl;
+	struct rdma_cm_id *cm_id;
+	struct iser_device *dev;
+	struct sockaddr_storage peer_addr;	/* initiator address */
+	struct sockaddr_storage self_addr;	/* target address */
+	unsigned int ssize, rsize, max_outst_pdu;
+
+	enum {
+		LOGIN_PHASE_START,	/* keep 1 send spot and 1 recv posted */
+		LOGIN_PHASE_LAST_SEND,	/* need 1 more send before ff */
+		LOGIN_PHASE_FF,		/* full feature */
+	} login_phase;
+
+	int recvl_posted;
+
+	struct event_data sched_buf_alloc;
+	struct list_head buf_alloc_list;
+
+	struct event_data sched_rdma_rd;
+	struct list_head rdma_rd_list;
+
+	struct event_data sched_iosubmit;
+	struct list_head iosubmit_list;
+
+	struct event_data sched_tx;
+	struct list_head resp_tx_list;
+
+	struct list_head sent_list;
+
+	struct event_data sched_post_recv;
+	struct list_head post_recv_list;
+
+	struct event_data sched_conn_free;
+
+	/* FF resources */
+	int ff_res_alloc;
+	void *task_pool;	/* space for the send, recv, rdma list elements */
+	void *pdu_data_pool;	/* registered space for non-rdma send and recv */
+	struct ibv_mr *pdu_data_mr;	/* mr for registered pdu_data_buf */
+	struct iser_task *nop_in_task;
+
+	/* login phase resources, freed at full-feature */
+	int login_res_alloc;
+	void *login_task_pool;
+	void *login_data_pool;
+	struct ibv_mr *login_data_mr;
+	struct iser_task *login_tx_task;
+
+	/* list of all iser conns */
+	struct list_head conn_list;
+};
+
+/*
+ * Shared variables for a particular device. The conn[] array will
+ * have to be broken out when multiple device support is added, maybe with
+ * a pointer into this "device" struct.
+ */
+struct iser_device {
+	struct list_head list;
+	struct ibv_context *ibv_hndl;
+	struct ibv_pd *pd;
+	struct ibv_cq *cq;
+	struct ibv_comp_channel *cq_channel;
+	struct ibv_device_attr device_attr;
+
+	/* membuf registered buffer, list area, handle */
+	void *membuf_regbuf;
+	void *membuf_listbuf;
+	struct ibv_mr *membuf_mr;
+	int waiting_for_mem;
+
+	struct event_data poll_sched;
+
+	/* free and allocated membuf entries */
+	struct list_head membuf_free, membuf_alloc;
+};
+
+void iser_login_exec(struct iser_task *task);
+int iser_login_complete(struct iscsi_conn_hdr *conn_h);
+int iser_text_exec(struct iser_task *task);
+void iser_conn_close(struct iser_conn *conn);
+
+#endif	/* ISER_H */
diff --git a/usr/iscsi/iser_ib.c b/usr/iscsi/iser_ib.c
new file mode 100644
index 0000000..d7ce047
--- /dev/null
+++ b/usr/iscsi/iser_ib.c
@@ -0,0 +1,3103 @@
+/*
+ * iSCSI extensions for RDMA (iSER) data path
+ *
+ * Copyright (C) 2007 Dennis Dalessandro (dennis at osc.edu)
+ * Copyright (C) 2007 Ananth Devulapalli (ananth at osc.edu)
+ * Copyright (C) 2007 Pete Wyckoff (pw at osc.edu)
+ * Copyright (C) 2010 Voltaire, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <assert.h>
+#include <netdb.h>
+#include <sys/epoll.h>
+#include <infiniband/verbs.h>
+#include <rdma/rdma_cma.h>
+
+#include "util.h"
+#include "iscsid.h"
+#include "target.h"
+#include "iser.h"
+#include "driver.h"
+#include "scsi.h"
+
+#if defined(HAVE_VALGRIND) && !defined(NDEBUG)
+#include <valgrind/memcheck.h>
+#else
+#define VALGRIND_MAKE_MEM_DEFINED(addr, len)
+#endif
+
+/* global, across all devices */
+static struct rdma_event_channel *rdma_evt_channel;
+static struct rdma_cm_id *cma_listen_id;
+
+/* accepted at RDMA layer, but not yet established */
+static struct list_head temp_conn;
+
+/* all devices */
+static struct list_head iser_dev_list;
+
+/* all iser connections */
+static struct list_head iser_conn_list;
+
+#define uint64_from_ptr(p)	(uint64_t)(uintptr_t)(p)
+#define ptr_from_int64(p)	(void *)(unsigned long)(p)
+
+#define ISER_LISTEN_PORT	3260
+
+short int iser_listen_port = ISER_LISTEN_PORT;
+char *iser_portal_addr;
+
+/*
+ * Crazy hard-coded linux iser settings need 128 * 8 slots + slop, plus
+ * room for our rdmas and send requests.
+ */
+#define MAX_WQE 1800
+
+#define RDMA_TRANSFER_SIZE	(512 * 1024)
+
+#define MAX_POLL_WC 32
+
+#define ISER_MAX_QUEUE_CMD	128	// iSCSI cmd window size
+
+struct iscsi_sense_data {
+	uint16_t length;
+	uint8_t data[0];
+} __packed;
+
+/*
+ * Number of allocatable data buffers, each of this size. Do at least 128
+ * for linux iser. The membuf size is rounded up at initialization time
+ * to the hardware page size so that allocations for direct IO devices are
+ * aligned.
+ */
+ */ +static int membuf_num = 16 * ISER_MAX_QUEUE_CMD; +static size_t membuf_size = RDMA_TRANSFER_SIZE; + +static int iser_conn_get(struct iser_conn *conn); +static void iser_conn_put(struct iser_conn *conn); + +static void iser_free_login_resources(struct iser_conn *conn); +static void iser_free_ff_resources(struct iser_conn *conn); + +static void iser_cqe_handler(int fd __attribute__ ((unused)), + int events __attribute__ ((unused)), + void *data); + +static void iser_sched_poll_cq(struct event_data *tev); +static void iser_sched_consume_cq(struct event_data *tev); +static void iser_sched_tx(struct event_data *evt); +static void iser_sched_post_recv(struct event_data *evt); +static void iser_sched_buf_alloc(struct event_data *evt); +static void iser_sched_iosubmit(struct event_data *evt); +static void iser_sched_rdma_rd(struct event_data *evt); + + +static inline struct iser_conn *ISER_CONN(struct iscsi_conn_hdr *conn_h) +{ + return container_of(conn_h, struct iser_conn, h); +} + +static void iser_pdu_init(struct iser_pdu *pdu, void *buf) +{ + pdu->iser_hdr = (struct iser_hdr *) buf; + buf += sizeof(struct iser_hdr); + pdu->bhs = (struct iscsi_hdr *) buf; + buf += sizeof(struct iscsi_hdr); + pdu->ahs = pdu->data.buf = buf; + pdu->ahssize = pdu->data.size = 0; +} + +static void iser_send_pdu_init(struct iser_pdu *pdu, void *buf) +{ + iser_pdu_init(pdu, buf); + memset(pdu->iser_hdr, 0, sizeof(struct iser_hdr)); + pdu->iser_hdr->flags = ISCSI_CTRL; +} + +static void iser_rxd_init(struct iser_work_req *rxd, + struct iser_task *task, + void *buf, unsigned size, + struct ibv_mr *srmr) +{ + rxd->recv = 1; + rxd->task = task; + + rxd->sge.addr = uint64_from_ptr(buf); + rxd->sge.length = size; + rxd->sge.lkey = srmr->lkey; + + rxd->recv_wr.wr_id = uint64_from_ptr(rxd); + rxd->recv_wr.sg_list = &rxd->sge; + rxd->recv_wr.num_sge = 1; + rxd->recv_wr.next = NULL; +} + +static void iser_txd_init(struct iser_work_req *txd, + struct iser_task *task, + void *buf, unsigned size, + struct ibv_mr *srmr) +{ + txd->recv = 0; + txd->task = task; + + txd->sge.addr = uint64_from_ptr(buf); + txd->sge.length = size; + txd->sge.lkey = srmr->lkey; + + txd->send_wr.wr_id = uint64_from_ptr(txd); + txd->send_wr.next = NULL; + txd->send_wr.sg_list = &txd->sge; + txd->send_wr.num_sge = 1; + txd->send_wr.opcode = IBV_WR_SEND; + txd->send_wr.send_flags = IBV_SEND_SIGNALED; + + INIT_LIST_HEAD(&txd->wr_list); +} + +static void iser_rdmad_init(struct iser_work_req *rdmad, + struct iser_task *task, + struct ibv_mr *srmr) +{ + rdmad->recv = 0; + rdmad->task = task; + + // txd->sge.addr, txd->sge.length to be set before posting + rdmad->sge.lkey = srmr->lkey; + + rdmad->send_wr.wr_id = uint64_from_ptr(rdmad); + rdmad->send_wr.sg_list = &rdmad->sge; + rdmad->send_wr.num_sge = 1; + rdmad->send_wr.next = NULL; + // txd->send_wr.opcode to be set before posting + rdmad->send_wr.send_flags = IBV_SEND_SIGNALED; + // rdmad->send_wr.wr.rdma.(remote_addr,rkey) to be set before post + + INIT_LIST_HEAD(&rdmad->wr_list); +} + +static void iser_task_init(struct iser_task *task, + struct iser_conn *conn, + void *pdu_data_buf, + struct ibv_mr *srmr) +{ + task->conn = conn; + task->unsolicited = 0; + + iser_rxd_init(&task->rxd, task, pdu_data_buf, conn->rsize, srmr); + iser_pdu_init(&task->req, pdu_data_buf); + pdu_data_buf += conn->rsize; + + iser_txd_init(&task->txd, task, pdu_data_buf, conn->ssize, srmr); + iser_send_pdu_init(&task->rsp, pdu_data_buf); + + INIT_LIST_HEAD(&task->in_buf_list); + INIT_LIST_HEAD(&task->out_buf_list); + 
+static void iser_rxd_init(struct iser_work_req *rxd,
+			  struct iser_task *task,
+			  void *buf, unsigned size,
+			  struct ibv_mr *srmr)
+{
+	rxd->recv = 1;
+	rxd->task = task;
+
+	rxd->sge.addr = uint64_from_ptr(buf);
+	rxd->sge.length = size;
+	rxd->sge.lkey = srmr->lkey;
+
+	rxd->recv_wr.wr_id = uint64_from_ptr(rxd);
+	rxd->recv_wr.sg_list = &rxd->sge;
+	rxd->recv_wr.num_sge = 1;
+	rxd->recv_wr.next = NULL;
+}
+
+static void iser_txd_init(struct iser_work_req *txd,
+			  struct iser_task *task,
+			  void *buf, unsigned size,
+			  struct ibv_mr *srmr)
+{
+	txd->recv = 0;
+	txd->task = task;
+
+	txd->sge.addr = uint64_from_ptr(buf);
+	txd->sge.length = size;
+	txd->sge.lkey = srmr->lkey;
+
+	txd->send_wr.wr_id = uint64_from_ptr(txd);
+	txd->send_wr.next = NULL;
+	txd->send_wr.sg_list = &txd->sge;
+	txd->send_wr.num_sge = 1;
+	txd->send_wr.opcode = IBV_WR_SEND;
+	txd->send_wr.send_flags = IBV_SEND_SIGNALED;
+
+	INIT_LIST_HEAD(&txd->wr_list);
+}
+
+static void iser_rdmad_init(struct iser_work_req *rdmad,
+			    struct iser_task *task,
+			    struct ibv_mr *srmr)
+{
+	rdmad->recv = 0;
+	rdmad->task = task;
+
+	// rdmad->sge.addr, rdmad->sge.length to be set before posting
+	rdmad->sge.lkey = srmr->lkey;
+
+	rdmad->send_wr.wr_id = uint64_from_ptr(rdmad);
+	rdmad->send_wr.sg_list = &rdmad->sge;
+	rdmad->send_wr.num_sge = 1;
+	rdmad->send_wr.next = NULL;
+	// rdmad->send_wr.opcode to be set before posting
+	rdmad->send_wr.send_flags = IBV_SEND_SIGNALED;
+	// rdmad->send_wr.wr.rdma.(remote_addr,rkey) to be set before post
+
+	INIT_LIST_HEAD(&rdmad->wr_list);
+}
+
+static void iser_task_init(struct iser_task *task,
+			   struct iser_conn *conn,
+			   void *pdu_data_buf,
+			   struct ibv_mr *srmr)
+{
+	task->conn = conn;
+	task->unsolicited = 0;
+
+	iser_rxd_init(&task->rxd, task, pdu_data_buf, conn->rsize, srmr);
+	iser_pdu_init(&task->req, pdu_data_buf);
+	pdu_data_buf += conn->rsize;
+
+	iser_txd_init(&task->txd, task, pdu_data_buf, conn->ssize, srmr);
+	iser_send_pdu_init(&task->rsp, pdu_data_buf);
+
+	INIT_LIST_HEAD(&task->in_buf_list);
+	INIT_LIST_HEAD(&task->out_buf_list);
+
+	iser_rdmad_init(&task->rdmad, task, conn->dev->membuf_mr);
+
+	INIT_LIST_HEAD(&task->exec_list);
+	INIT_LIST_HEAD(&task->rdma_list);
+	INIT_LIST_HEAD(&task->tx_list);
+	INIT_LIST_HEAD(&task->recv_list);
+
+	INIT_LIST_HEAD(&task->session_list);
+	INIT_LIST_HEAD(&task->dout_task_list);
+}
+
+static void iser_unsolicited_task_init(struct iser_task *task,
+				       struct iser_conn *conn,
+				       void *pdu_data_buf,
+				       struct ibv_mr *srmr)
+{
+	task->conn = conn;
+	task->unsolicited = 1;
+
+	iser_txd_init(&task->txd, task, pdu_data_buf, conn->ssize, srmr);
+	iser_send_pdu_init(&task->rsp, pdu_data_buf);
+}
+
+static void iser_task_add_out_pdu_buf(struct iser_task *task,
+				      struct membuf *buf,
+				      unsigned int offset)
+{
+	buf->offset = offset;
+	task->out_buf_num++;
+	if (!list_empty(&task->out_buf_list)) {
+		struct membuf *cur_buf;
+		list_for_each_entry(cur_buf, &task->out_buf_list, task_list) {
+			if (offset < cur_buf->offset) {
+				dprintf("task:%p offset:%d size:%d buf:%p add before:%p\n",
+					task, offset, buf->size, buf->buf, cur_buf->buf);
+				list_add_tail(&buf->task_list, &cur_buf->task_list);
+				return;
+			}
+		}
+	}
+	dprintf("task:%p offset:%d size:%d buf:%p add last\n",
+		task, offset, buf->size, buf->buf);
+	list_add_tail(&buf->task_list, &task->out_buf_list);
+}
+
+static void iser_task_add_out_rdma_buf(struct iser_task *task,
+				       struct membuf *buf,
+				       unsigned int offset)
+{
+	buf->offset = offset;
+	task->out_buf_num++;
+	dprintf("task:%p offset:%d size:%d buf:%p add last\n",
+		task, offset, buf->size, buf->buf);
+	list_add_tail(&buf->task_list, &task->out_buf_list);
+}
+
+static void iser_task_del_out_buf(struct iser_task *task,
+				  struct membuf *buf)
+{
+	dprintf("task:%p offset:%d size:%d buf:%p\n",
+		task, buf->offset, buf->size, buf->buf);
+	list_del(&buf->task_list);
+	task->out_buf_num--;
+}
+
+static void iser_task_add_in_rdma_buf(struct iser_task *task,
+				      struct membuf *buf,
+				      unsigned int offset)
+{
+	dprintf("task:%p offset:%d size:%d buf:%p add last\n",
+		task, offset, buf->size, buf->buf);
+	buf->offset = offset;
+	task->in_buf_num++;
+	list_add_tail(&buf->task_list, &task->in_buf_list);
+}
+
+static void iser_task_del_in_buf(struct iser_task *task,
+				 struct membuf *buf)
+{
+	dprintf("task:%p offset:%d size:%d buf:%p\n",
+		task, buf->offset, buf->size, buf->buf);
+	list_del(&buf->task_list);
+	task->in_buf_num--;
+}
+
+static int iser_post_recv(struct iser_conn *conn, struct iser_task *task,
+			  int num_recv_bufs)
+{
+	struct ibv_recv_wr *bad_wr;
+	int err, nr_posted;
+
+	err = ibv_post_recv(conn->qp_hndl, &task->rxd.recv_wr, &bad_wr);
+	if (!err) {
+		nr_posted = num_recv_bufs;
+	} else {
+		nr_posted = 0;
+		eprintf("failed to post rx buf %p\n", &task->rxd.recv_wr);
+	}
+	//conn->recvl_posted += nr_posted;
+
+	dprintf("chained:%d last task:%p wr_id:0x%lx sge_sz:%u\n",
+		nr_posted, task, (unsigned long)task->rxd.recv_wr.wr_id,
+		task->rxd.sge.length);
+
+	while (nr_posted--)
+		iser_conn_get(conn);
+
+	return err;
+}
+
+static void iser_prep_resp_send_req(struct iser_task *task,
+				    struct iser_work_req *next_wr,
+				    int signaled)
+{
+	struct iser_pdu *rsp = &task->rsp;
+	struct iscsi_hdr *bhs = rsp->bhs;
+	struct iser_work_req *txd = &task->txd;
+
+	bhs->hlength = rsp->ahssize / 4;
+	hton24(bhs->dlength, rsp->data.size);
+
+	txd->sge.length = sizeof(struct iscsi_hdr) + sizeof(struct iser_hdr);
+	txd->sge.length += rsp->ahssize;
+	txd->sge.length += rsp->data.size;
+
+	txd->send_wr.next = (next_wr ? &next_wr->send_wr : NULL);
+	txd->send_wr.send_flags = (signaled ? IBV_SEND_SIGNALED : 0);
+
+	dprintf("task:%p wr_id:0x%lx tag:0x%04lx "
+		"dtbuf:%p dtsz:%u ahs_sz:%u stat:0x%x statsn:0x%x expcmdsn:0x%x\n",
+		task, (unsigned long)txd->send_wr.wr_id,
+		(unsigned long)task->tag,
+		rsp->data.buf, rsp->data.size, rsp->ahssize,
+		bhs->rsvd2[1], ntohl(bhs->statsn), ntohl(bhs->exp_statsn));
+}
+
+static void iser_prep_rdma_wr_send_req(struct iser_task *task,
+				       struct iser_work_req *next_wr,
+				       int signaled)
+{
+	struct iser_work_req *rdmad = &task->rdmad;
+	struct membuf *rdma_buf;
+	uint64_t offset = 0;	// ToDo: multiple RDMA-Write buffers
+
+	/* RDMA-Write buffer is the only one on the list */
+	// ToDo: multiple RDMA-Write buffers, use rdma_buf->offset
+	rdma_buf = list_first_entry(&task->in_buf_list, struct membuf, task_list);
+	rdmad->sge.addr = uint64_from_ptr(rdma_buf->buf);
+
+	if (task->rdma_wr_remains > rdma_buf->size)
+		rdmad->sge.length = rdma_buf->size;
+	else
+		rdmad->sge.length = task->rdma_wr_remains;
+	task->rdma_wr_remains -= rdmad->sge.length;
+
+	rdmad->send_wr.next = (next_wr ? &next_wr->send_wr : NULL);
+	rdmad->send_wr.opcode = IBV_WR_RDMA_WRITE;
+	rdmad->send_wr.send_flags = (signaled ? IBV_SEND_SIGNALED : 0);
+
+	rdmad->send_wr.wr.rdma.remote_addr = task->rem_read_va + offset;
+	// ToDo: multiple RDMA-Write buffers
+	// offset += rdmad->sge.length
+	rdmad->send_wr.wr.rdma.rkey = task->rem_read_stag;
+
+	dprintf("task:%p wr_id:0x%lx tag:0x%04lx daddr:0x%lx dsz:%u "
+		"bufsz:%u rdma:%d lkey:%x raddr:%lx rkey:%x rems:%u\n",
+		task, (unsigned long)rdmad->send_wr.wr_id,
+		(unsigned long)task->tag, (unsigned long)rdmad->sge.addr,
+		rdmad->sge.length, rdma_buf->size, rdma_buf->rdma,
+		rdmad->sge.lkey, (unsigned long)rdmad->send_wr.wr.rdma.remote_addr,
+		rdmad->send_wr.wr.rdma.rkey, task->rdma_wr_remains);
+}
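+
+/*
+ * Naming reminder: stags and VAs are named from the initiator's point
+ * of view. A target RDMA-Write (data-in, above) therefore targets
+ * rem_read_stag/rem_read_va, while a target RDMA-Read (data-out,
+ * below) pulls from rem_write_stag/rem_write_va.
+ */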
+static void iser_prep_rdma_rd_send_req(struct iser_task *task,
+				       struct iser_work_req *next_wr,
+				       int signaled)
+{
+	struct iser_work_req *rdmad = &task->rdmad;
+	struct membuf *rdma_buf;
+	uint64_t offset;
+	int cur_req_sz;
+
+	/* RDMA-Read buffer is always put at the list's tail */
+	// ToDo: multiple RDMA-Write buffers
+	rdma_buf = container_of(task->out_buf_list.prev, struct membuf, task_list);
+	offset = rdma_buf->offset;
+	rdmad->sge.addr = uint64_from_ptr(rdma_buf->buf) + offset;
+
+	if (task->rdma_rd_sz <= rdma_buf->size)
+		cur_req_sz = task->rdma_rd_sz;
+	else
+		cur_req_sz = rdma_buf->size;
+
+	// ToDo: multiple RDMA-Write buffers
+	// task->rdma_rd_offset += cur_req_sz;
+
+	rdmad->sge.length = cur_req_sz;
+
+	rdmad->send_wr.next = (next_wr ? &next_wr->send_wr : NULL);
+	rdmad->send_wr.opcode = IBV_WR_RDMA_READ;
+	rdmad->send_wr.send_flags = (signaled ? IBV_SEND_SIGNALED : 0);
+	rdmad->send_wr.wr.rdma.remote_addr =
+		(uint64_t) task->rem_write_va;
+		// ToDo: multiple RDMA-Write buffers: + (offset - task->unsol_sz);
+	rdmad->send_wr.wr.rdma.rkey =
+		(uint32_t) task->rem_write_stag;
+
+	dprintf("task:%p wr_id:0x%lx tag:0x%04lx daddr:0x%lx dsz:%u "
+		"bufsz:%u rdma:%d lkey:%x raddr:%lx rkey:%x rems:%u\n",
+		task, (unsigned long)rdmad->send_wr.wr_id,
+		(unsigned long)task->tag, (unsigned long)rdmad->sge.addr,
+		rdmad->sge.length, rdma_buf->size, rdma_buf->rdma,
+		rdmad->sge.lkey, (unsigned long)rdmad->send_wr.wr.rdma.remote_addr,
+		rdmad->send_wr.wr.rdma.rkey, task->rdma_rd_sz);
+}
+
+static int iser_post_send(struct iser_conn *conn, struct iser_work_req *iser_send)
+{
+	struct ibv_send_wr *bad_wr;
+	int err;
+
+	err = ibv_post_send(conn->qp_hndl, &iser_send->send_wr, &bad_wr);
+	if (err)
+		eprintf("ibv_post_send err %d\n", err);
+	return err;
+}
+
+static inline void iser_set_rsp_cmd_sn(struct iscsi_session *session,
+				       struct iscsi_hdr *rsp)
+{
+	if (session) {
+		rsp->exp_statsn = cpu_to_be32(session->exp_cmd_sn);
+		rsp->max_statsn = cpu_to_be32(session->exp_cmd_sn + ISER_MAX_QUEUE_CMD);
+	}
+}
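+
+/*
+ * Note: on target-to-initiator PDUs these two BHS words carry
+ * ExpCmdSN/MaxCmdSN; the generic field names (exp_statsn/max_statsn)
+ * reflect the initiator-side view of the same offsets. The command
+ * window advertised here is exp_cmd_sn .. exp_cmd_sn + ISER_MAX_QUEUE_CMD.
+ */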
+
+/*
+ * One pool of registered memory per device (per PD, that is).
+ */
+static int iser_init_rdma_buf_pool(struct iser_device *dev)
+{
+	uint8_t *regbuf, *task_buf;
+	size_t pool_size, list_size;
+	struct membuf *mp;
+	int i;
+
+	membuf_size = roundup(membuf_size, pagesize);
+	pool_size = membuf_num * membuf_size;
+	regbuf = valloc(pool_size);
+	if (!regbuf) {
+		eprintf("valloc regbuf sz:%zu failed\n", pool_size);
+		return -ENOMEM;
+	}
+
+	list_size = membuf_num * sizeof(*mp);
+	task_buf = malloc(list_size);
+	if (!task_buf) {
+		eprintf("malloc task_buf sz:%zu failed\n", list_size);
+		free(regbuf);
+		return -ENOMEM;
+	}
+
+	dev->membuf_mr = ibv_reg_mr(dev->pd, regbuf, pool_size,
+				    IBV_ACCESS_LOCAL_WRITE);
+	if (!dev->membuf_mr) {
+		eprintf("ibv_reg_mr regbuf failed\n");
+		free(regbuf);
+		free(task_buf);
+		return -1;
+	}
+	dprintf("pool buf:%p list:%p mr:%p lkey:0x%x\n",
+		regbuf, task_buf, dev->membuf_mr, dev->membuf_mr->lkey);
+
+	dev->membuf_regbuf = regbuf;
+	dev->membuf_listbuf = task_buf;
+	INIT_LIST_HEAD(&dev->membuf_free);
+	INIT_LIST_HEAD(&dev->membuf_alloc);
+
+	for (i = 0; i < membuf_num; i++) {
+		mp = (void *) task_buf;
+		task_buf += sizeof(*mp);
+
+		list_add_tail(&mp->pool_list, &dev->membuf_free);
+		INIT_LIST_HEAD(&mp->task_list);
+		mp->buf = regbuf;
+		mp->size = membuf_size;
+		mp->rdma = 1;
+
+		regbuf += membuf_size;
+	}
+
+	return 0;
+}
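+
+/*
+ * With the defaults above (membuf_num = 16 * 128, membuf_size rounded
+ * up from 512 KiB), this registers 2048 * 512 KiB = 1 GiB of buffer
+ * space per device, plus 2048 membuf list entries to manage it.
+ */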
+static struct membuf *iser_alloc_rdma_buf(struct iser_conn *conn,
+					  size_t sz)
+{
+	struct iser_device *dev = conn->dev;
+	struct list_head *buf_list;
+	struct membuf *mem;
+
+	if (list_empty(&dev->membuf_free)) {
+		// eprintf("free list empty\n");
+		dev->waiting_for_mem = 1;
+		return NULL;
+	}
+	if (sz > membuf_size) {
+		eprintf("size %zu too big\n", sz);
+		exit(1);
+	}
+
+	buf_list = dev->membuf_free.next;
+	list_del(buf_list);
+	list_add_tail(buf_list, &dev->membuf_alloc);
+	mem = list_entry(buf_list, struct membuf, pool_list);
+
+	dprintf("alloc:%p sz:%zu\n", mem, sz);
+	return mem;
+}
+
+static void iser_free_rdma_buf(struct iser_conn *conn, struct membuf *mem)
+{
+	struct iser_device *dev;
+
+	if (!mem || !mem->rdma)
+		return;
+
+	dprintf("free %p\n", mem);
+	list_del(&mem->pool_list);
+	// add to the free list head to reuse recently used buffers first
+	dev = conn->dev;
+	list_add(&mem->pool_list, &dev->membuf_free);
+	if (dev->waiting_for_mem) {
+		dev->waiting_for_mem = 0;
+		tgt_add_sched_event(&conn->sched_buf_alloc);
+	}
+}
+
+static void iser_task_free_out_bufs(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct membuf *membuf, *mbnext;
+
+	list_for_each_entry_safe(membuf, mbnext, &task->out_buf_list, task_list) {
+		iser_task_del_out_buf(task, membuf);
+		if (membuf->rdma)
+			iser_free_rdma_buf(conn, membuf);
+	}
+	assert(task->out_buf_num == 0);
+}
+
+static void iser_task_free_in_bufs(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct membuf *membuf, *mbnext;
+
+	list_for_each_entry_safe(membuf, mbnext, &task->in_buf_list, task_list) {
+		iser_task_del_in_buf(task, membuf);
+		iser_free_rdma_buf(conn, membuf);
+	}
+	assert(task->in_buf_num == 0);
+}
+
+static void iser_complete_task(struct iser_task *task)
+{
+	if (task->opcode == ISCSI_OP_SCSI_CMD) {
+		target_cmd_done(&task->scmd);
+		list_del(&task->session_list);
+		if (task->is_read)
+			iser_task_free_in_bufs(task);
+		if (task->is_write) {
+			iser_task_free_out_bufs(task);
+			// ToDo: multiple out buffers, no copy
+			// iser_task_free_dout_tasks(task);
+		}
+	}
+	dprintf("task:%p\n", task);
+	//iser_conn_put(task->conn);
+}
+
+/*
+ * Called at accept time, builds resources just for the login phase.
+ */
+
+#define NUM_LOGIN_TASKS		2	/* one posted for req rx, one for reply tx */
+
+static int iser_alloc_login_resources(struct iser_conn *conn)
+{
+	unsigned long size;
+	struct iser_task *login_task[NUM_LOGIN_TASKS];
+	uint8_t *pdu_data_buf, *task_buf;
+	unsigned int i;
+	int err = 0;
+
+	dprintf("login tasks num:%u, tx/rx sz %u/%u\n",
+		NUM_LOGIN_TASKS, conn->ssize, conn->rsize);
+
+	size = NUM_LOGIN_TASKS * (conn->rsize + conn->ssize);
+	conn->login_data_pool = malloc(size);
+	if (!conn->login_data_pool) {
+		eprintf("malloc pdu_data_buf %lu\n", size);
+		err = -1;
+		goto out;
+	}
+
+	conn->login_data_mr =
+		ibv_reg_mr(conn->dev->pd, conn->login_data_pool, size,
+			   IBV_ACCESS_LOCAL_WRITE);
+	if (!conn->login_data_mr) {
+		eprintf("ibv_reg_mr pdu_data_buf failed\n");
+		free(conn->login_data_pool);
+		conn->login_data_pool = NULL;
+		err = -1;
+		goto out;
+	}
+
+	size = NUM_LOGIN_TASKS * sizeof(struct iser_task);
+	conn->login_task_pool = malloc(size);
+	if (!conn->login_task_pool) {
+		eprintf("malloc login_task_buf %lu\n", size);
+		ibv_dereg_mr(conn->login_data_mr);
+		conn->login_data_mr = NULL;
+		free(conn->login_data_pool);
+		conn->login_data_pool = NULL;
+		err = -1;	/* was missing, err would stay 0 */
+		goto out;
+	}
+	memset(conn->login_task_pool, 0, size);
+
+	conn->login_res_alloc = 1;
+	pdu_data_buf = conn->login_data_pool;
+	task_buf = conn->login_task_pool;
+	for (i = 0; i < NUM_LOGIN_TASKS; i++) {
+		login_task[i] = (struct iser_task *) task_buf;
+		task_buf += sizeof(struct iser_task);
+
+		iser_task_init(login_task[i], conn, pdu_data_buf,
+			       conn->login_data_mr);
+		pdu_data_buf += conn->rsize + conn->ssize;
+	}
+
+	dprintf("post_recv login rx task:%p\n", login_task[0]);
+	err = iser_post_recv(conn, login_task[0], 1);
+	if (err) {
+		eprintf("post_recv login rx task failed, %m\n");
+		iser_free_login_resources(conn);
+		goto out;
+	}
+	dprintf("saved login tx task:%p\n", login_task[1]);
+	conn->login_tx_task = login_task[1];
+out:
+	return err;
+}
+
+/*
+ * When ready for full-feature mode, free login-phase resources.
+ */
+static void iser_free_login_resources(struct iser_conn *conn)
+{
+	int err;
+
+	if (!conn->login_res_alloc)
+		return;
+
+	dprintf("freeing, login phase %d\n", conn->login_phase);
+
+	/* release mr and free the lists */
+	if (conn->login_data_mr) {
+		err = ibv_dereg_mr(conn->login_data_mr);
+		if (err)
+			eprintf("ibv_dereg_mr\n");
+	}
+	if (conn->login_data_pool)
+		free(conn->login_data_pool);
+	if (conn->login_task_pool)
+		free(conn->login_task_pool);
+	conn->login_res_alloc = 0;
+}
+
+/*
+ * Called when ready for full feature, builds resources.
+ */
+static int iser_alloc_ff_resources(struct iser_conn *conn)
+{
+	unsigned long size;
+	uint8_t *pdu_data_buf, *task_buf;
+	struct iser_task *task;
+	unsigned int i;
+	int err = 0;
+
+	dprintf("conn ssize:%u rsize:%u max_outst:%u\n",
+		conn->ssize, conn->rsize, conn->max_outst_pdu);
+
+	size = (conn->rsize + conn->ssize) * conn->max_outst_pdu;
+	conn->pdu_data_pool = malloc(size);
+	if (!conn->pdu_data_pool) {
+		eprintf("malloc pdu_data_buf %lu\n", size);
+		err = -1;
+		goto out;
+	}
+
+	conn->pdu_data_mr = ibv_reg_mr(conn->dev->pd, conn->pdu_data_pool, size,
+				       IBV_ACCESS_LOCAL_WRITE);
+	if (!conn->pdu_data_mr) {
+		eprintf("ibv_reg_mr pdu_data_buf failed\n");
+		free(conn->pdu_data_pool);
+		conn->pdu_data_pool = NULL;
+		err = -1;
+		goto out;
+	}
+
+	size = conn->max_outst_pdu * sizeof(struct iser_task);
+	conn->task_pool = malloc(size);
+	if (!conn->task_pool) {
+		eprintf("malloc task_buf %lu\n", size);
+		ibv_dereg_mr(conn->pdu_data_mr);
+		conn->pdu_data_mr = NULL;
+		free(conn->pdu_data_pool);
+		conn->pdu_data_pool = NULL;
+		err = -1;	/* was missing, err would stay 0 */
+		goto out;
+	}
+	memset(conn->task_pool, 0, size);
+
+	conn->ff_res_alloc = 1;
+	pdu_data_buf = conn->pdu_data_pool;
+	task_buf = conn->task_pool;
+	for (i = 1; i < conn->max_outst_pdu; i++) {
+		task = (void *) task_buf;
+		task_buf += sizeof(*task);
+
+		iser_task_init(task, conn, pdu_data_buf, conn->pdu_data_mr);
+		pdu_data_buf += conn->rsize + conn->ssize;
+
+		err = iser_post_recv(conn, task, 1);
+		if (err) {
+			eprintf("post_recv (%d/%d): %m\n",
+				i, conn->max_outst_pdu);
+			iser_free_ff_resources(conn);
+			goto out;	/* don't touch the freed pools below */
+		}
+	}
+	task = (void *) task_buf;
+	iser_unsolicited_task_init(task, conn, pdu_data_buf, conn->pdu_data_mr);
+	conn->nop_in_task = task;
+
+out:
+	return err;
+}
+
+/*
+ * On connection shutdown.
+ */
+static void iser_free_ff_resources(struct iser_conn *conn)
+{
+	int err;
+
+	if (!conn->ff_res_alloc)
+		return;
+
+	dprintf("freeing conn %p\n", conn);
+
+	/* release mr and free the lists */
+	dprintf("dereg mr %p\n", conn->pdu_data_mr);
+	if (conn->pdu_data_mr) {
+		err = ibv_dereg_mr(conn->pdu_data_mr);
+		if (err)
+			eprintf("ibv_dereg_mr\n");
+	}
+	if (conn->pdu_data_pool)
+		free(conn->pdu_data_pool);
+	if (conn->task_pool)
+		free(conn->task_pool);
+	conn->ff_res_alloc = 0;
+	dprintf("exit\n");
+}
+
+/*
+ * Allocate resources for this new connection. Called after login, when
+ * the final negotiated transfer parameters are known.
+ */
+int iser_login_complete(struct iscsi_conn_hdr *conn_h)
+{
+	struct iser_conn *conn = ISER_CONN(conn_h);
+	unsigned int irdsl, trdsl, outst_pdu, hdrsz;
+	int err = -1;
+
+	dprintf("entry\n");
+
+	/*
+	 * Build full feature connection structures, but leave around the
+	 * login ones until the final send finishes.
+	 */
+	conn->login_phase = LOGIN_PHASE_LAST_SEND;	/* one more send, then done */
+	irdsl = conn_h->session_param[ISCSI_PARAM_INITIATOR_RDSL].val;
+	trdsl = conn_h->session_param[ISCSI_PARAM_TARGET_RDSL].val;
+	outst_pdu =
+		conn_h->session_param[ISCSI_PARAM_MAX_OUTST_PDU].val; // ToDo: outstanding pdus num
+
+	/* hack, ib/ulp/iser does not have this param, but reading the code
+	 * shows their formula for max tx dtos outstanding
+	 *    = cmds_max * (1 + dataouts) + rx_misc + tx_misc
+	 */
+#define ISER_INFLIGHT_DATAOUTS	0
+#define ISER_MAX_RX_MISC_PDUS	4
+#define ISER_MAX_TX_MISC_PDUS	6
+
+	// ToDo: outstanding pdus num
+//	if (outst_pdu == 0)
+		outst_pdu =
+			3 * ISER_MAX_QUEUE_CMD * (1 + ISER_INFLIGHT_DATAOUTS) +
+			ISER_MAX_RX_MISC_PDUS + ISER_MAX_TX_MISC_PDUS;
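+
+	/*
+	 * With the constants above this evaluates to
+	 * 3 * 128 * 1 + 4 + 6 = 394 work requests, which also bounds the
+	 * number of pdu buffers allocated below (conn->max_outst_pdu).
+	 */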
+
+	/* RDSLs do not include headers. */
+	hdrsz = sizeof(struct iser_hdr) +
+		sizeof(struct iscsi_hdr) +
+		sizeof(struct iscsi_ecdb_ahdr) +
+		sizeof(struct iscsi_rlength_ahdr);
+
+	conn->ssize = hdrsz + irdsl;
+	conn->rsize = hdrsz + trdsl;
+	conn->max_outst_pdu = outst_pdu;
+	err = iser_alloc_ff_resources(conn);
+	if (err) {
+		eprintf("iser_alloc_ff_resources failed\n");
+		goto out;
+	}
+	// How much data to grab in an RDMA operation, read or write.
+	//conn_h->data_inout_max_length = RDMA_TRANSFER_SIZE;
+	conn_h->session_param[ISCSI_PARAM_MAX_XMIT_DLENGTH].val =
+						RDMA_TRANSFER_SIZE;
+out:
+	return err;
+}
+
+/*
+ * First time a new connection is received on an RDMA device, record
+ * it and build a PD and static memory.
+ */
+static int iser_device_init(struct iser_device *dev)
+{
+	int cqe_num;
+	int err = -1;
+
+	dprintf("dev %p\n", dev);
+	dev->pd = ibv_alloc_pd(dev->ibv_hndl);
+	if (dev->pd == NULL) {
+		eprintf("ibv_alloc_pd failed\n");
+		goto out;
+	}
+
+	err = iser_init_rdma_buf_pool(dev);
+	if (err) {
+		eprintf("iser_init_rdma_buf_pool failed\n");
+		goto out;
+	}
+
+	err = ibv_query_device(dev->ibv_hndl, &dev->device_attr);
+	if (err < 0) {
+		eprintf("ibv_query_device: %m\n");
+		goto out;
+	}
+	cqe_num = dev->device_attr.max_cqe;
+	dprintf("max %d CQEs\n", cqe_num);
+
+	err = -1;
+	dev->cq_channel = ibv_create_comp_channel(dev->ibv_hndl);
+	if (dev->cq_channel == NULL) {
+		eprintf("ibv_create_comp_channel failed: %m\n");
+		goto out;
+	}
+
+	dev->cq = ibv_create_cq(dev->ibv_hndl, cqe_num, NULL,
+				dev->cq_channel, 0);
+	if (dev->cq == NULL) {
+		eprintf("ibv_create_cq failed: %m\n");
+		goto out;
+	}
+
+	tgt_init_sched_event(&dev->poll_sched, iser_sched_poll_cq, dev);
+
+	err = ibv_req_notify_cq(dev->cq, 0);
+	if (err) {
+		eprintf("ibv_req_notify failed: %s\n", strerror(err));
+		goto out;
+	}
+
+	err = tgt_event_add(dev->cq_channel->fd, EPOLLIN, iser_cqe_handler,
+			    dev);
+	if (err) {
+		eprintf("tgt_event_add failed: %m\n");
+		goto out;
+	}
+
+	list_add_tail(&dev->list, &iser_dev_list);
+
+out:
+	return err;
+}
+
+static int iser_ib_clear_iosubmit_list(struct iser_conn *conn)
+{
+	struct iser_task *task;
+
+	dprintf("start\n");
+	while (!list_empty(&conn->iosubmit_list)) {
+		task = list_first_entry(&conn->iosubmit_list, struct iser_task, exec_list);
+		list_del(&task->exec_list);
+		iser_complete_task(task);
+	}
+	return 0;
+}
+
+static int iser_ib_clear_rdma_rd_list(struct iser_conn *conn)
+{
+	struct iser_task *task;
+
+	dprintf("start\n");
+	while (!list_empty(&conn->rdma_rd_list)) {
+		task = list_first_entry(&conn->rdma_rd_list, struct iser_task, rdma_list);
+		list_del(&task->rdma_list);
+	}
+	return 0;
+}
+
+static int iser_ib_clear_tx_list(struct iser_conn *conn)
+{
+	struct iser_task *task;
+
+	dprintf("start\n");
+	while (!list_empty(&conn->resp_tx_list)) {
+		task = list_first_entry(&conn->resp_tx_list, struct iser_task, tx_list);
+		list_del(&task->tx_list);
+	}
+	return 0;
+}
+
+static int iser_ib_clear_sent_list(struct iser_conn *conn)
+{
+	struct iser_task *task;
+
+	dprintf("start\n");
+	while (!list_empty(&conn->sent_list)) {
+		task = list_first_entry(&conn->sent_list, struct iser_task, tx_list);
+		list_del(&task->tx_list);
+	}
+	return 0;
+}
+
+static int iser_ib_clear_post_recv_list(struct iser_conn *conn)
+{
+	struct iser_task *task;
+
+	dprintf("start\n");
+	while (!list_empty(&conn->post_recv_list)) {
+		task = list_first_entry(&conn->post_recv_list, struct iser_task, recv_list);
+		list_del(&task->recv_list);
+	}
+	return 0;
+}
+
+int iser_conn_init(struct iser_conn *conn)
+{
+	conn->h.refcount = 0;
+	conn->h.state = STATE_INIT;
+	param_set_defaults(conn->h.session_param, session_keys);
+
+	INIT_LIST_HEAD(&conn->h.clist);
+
+	INIT_LIST_HEAD(&conn->buf_alloc_list);
+	INIT_LIST_HEAD(&conn->rdma_rd_list);
+	/* INIT_LIST_HEAD(&conn->rdma_wr_list); */
+	INIT_LIST_HEAD(&conn->iosubmit_list);
+	INIT_LIST_HEAD(&conn->resp_tx_list);
+	INIT_LIST_HEAD(&conn->sent_list);
+	INIT_LIST_HEAD(&conn->post_recv_list);
+
+	return 0;
+}
+
+/*
+ * Start closing the connection. Transfer the IB QP to the error state.
+ * This will be followed by WC error and buffer flush events.
+ * We also should expect DISCONNECTED and TIMEWAIT_EXIT events.
+ * Only after the draining is over are we sure to have reclaimed
+ * all buffers (and tasks). After the RDMA CM events are collected,
+ * the connection QP may be destroyed, and its number may be recycled.
+ */
+void iser_conn_close(struct iser_conn *conn)
+{
+	int err;
+
+	if (conn->h.state == STATE_CLOSE)
+		return;
+	conn->h.state = STATE_CLOSE;
+
+	dprintf("rdma_disconnect conn:%p\n", &conn->h);
+	err = rdma_disconnect(conn->cm_id);
+	if (err)
+		eprintf("rdma_disconnect: %s\n", strerror(-err));
+
+	list_del(&conn->conn_list);
+
+	tgt_remove_sched_event(&conn->sched_buf_alloc);
+	tgt_remove_sched_event(&conn->sched_rdma_rd);
+	tgt_remove_sched_event(&conn->sched_iosubmit);
+	tgt_remove_sched_event(&conn->sched_tx);
+	tgt_remove_sched_event(&conn->sched_post_recv);
+
+	if (conn->login_phase != LOGIN_PHASE_FF) {
+		iser_ib_clear_iosubmit_list(conn);
+		iser_ib_clear_rdma_rd_list(conn);
+	}
+	iser_ib_clear_tx_list(conn);
+	iser_ib_clear_sent_list(conn);
+	iser_ib_clear_post_recv_list(conn);
+
+	eprintf("connection:%p closed, refcnt:%d\n", conn, conn->h.refcount);
+}
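+
+/*
+ * Teardown ordering: iser_conn_close() above initiates the disconnect;
+ * the resulting flush errors complete all posted WRs, each completion
+ * dropping one reference, and RDMA_CM_EVENT_DISCONNECTED /
+ * _TIMEWAIT_EXIT drop the one taken at accept time. The last
+ * iser_conn_put() schedules iser_conn_free() below.
+ */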
+
+/*
+ * Called when the connection is freed, from iscsi, but won't do anything
+ * until all posted WRs have gone away. So it is also called again from
+ * RX progress when it notices that this has happened.
+ */
+void iser_conn_free(struct iser_conn *conn)
+{
+	int err;
+
+	dprintf("conn:%p rx.posted:%d refcnt:%d\n", conn, conn->recvl_posted,
+		conn->h.refcount);
+	assert(conn->h.refcount == 0);
+
+	/* try to free unconditionally, resources freed only if necessary */
+	iser_free_login_resources(conn);
+	iser_free_ff_resources(conn);
+
+	if (conn->qp_hndl) {
+		err = ibv_destroy_qp(conn->qp_hndl);
+		if (err)
+			eprintf("ibv_destroy_qp: %s\n", strerror(-err));
+		else
+			dprintf("destroyed qp:%p\n", conn->qp_hndl);
+	}
+	if (conn->cm_id) {
+		err = rdma_destroy_id(conn->cm_id);
+		if (err)
+			eprintf("rdma_destroy_id conn:%p cm_id:%p, %s\n",
+				&conn->h, conn->cm_id, strerror(-err));
+		else
+			dprintf("destroyed cm_id:%p\n", conn->cm_id);
+	}
+
+	/* delete from session; was put there by conn_add_to_session() */
+	list_del(&conn->h.clist);
+
+	if (conn->h.initiator)
+		free(conn->h.initiator);
+
+	if (conn->h.session)
+		session_put(conn->h.session);
+
+	conn->h.state = STATE_INIT;
+	free(conn);
+	dprintf("conn:%p freed\n", conn);
+}
+
+static void iser_sched_conn_free(struct event_data *evt)
+{
+	struct iser_conn *conn = (struct iser_conn *) evt->data;
+
+	iser_conn_free(conn);
+}
+
+static int iser_conn_get(struct iser_conn *conn)
+{
+	/* TODO: check state */
+	conn->h.refcount++;
+	dprintf("refcnt:%d\n", conn->h.refcount);
+	return 0;
+}
+
+static void iser_conn_put(struct iser_conn *conn)
+{
+	conn->h.refcount--;
+	dprintf("refcnt:%d\n", conn->h.refcount);
+	if (!conn->h.refcount) {
+		assert(conn->h.state == STATE_CLOSE);
+		tgt_add_sched_event(&conn->sched_conn_free);
+	}
+}
+
+static int iser_show(struct iscsi_conn_hdr *conn_h, char *buf,
+		     int rest)
+{
+	struct iser_conn *conn = ISER_CONN(conn_h);
+	int err;
+	char host[NI_MAXHOST];
+
+	err = getnameinfo((struct sockaddr *) &conn->peer_addr,
+			  sizeof(conn->peer_addr), host, sizeof(host),
+			  NULL, 0, NI_NUMERICHOST);
+	if (err) {
+		eprintf("getnameinfo: %m\n");
+		return 0;
+	}
+	return snprintf(buf, rest, "RDMA IP Address: %s", host);
+}
+
+static int iser_getsockname(struct iscsi_conn_hdr *conn_h,
+			    struct sockaddr *sa, socklen_t *len)
+{
+	struct iser_conn *conn = ISER_CONN(conn_h);
+
+	if (*len > sizeof(conn->self_addr))
+		*len = sizeof(conn->self_addr);
+	memcpy(sa, &conn->self_addr, *len);
+	return 0;
+}
+
+static int iser_getpeername(struct iscsi_conn_hdr *conn_h,
+			    struct sockaddr *sa, socklen_t *len)
+{
+	struct iser_conn *conn = ISER_CONN(conn_h);
+
+	if (*len > sizeof(conn->peer_addr))
+		*len = sizeof(conn->peer_addr);
+	memcpy(sa, &conn->peer_addr, *len);
+	return 0;
+}
+
+static void iser_cm_connect_request(struct rdma_cm_event *ev)
+{
+	struct rdma_cm_id *cm_id = ev->id;
+	struct ibv_qp_init_attr qp_init_attr;
+	struct iser_conn *conn;
+	struct iser_device *dev;
+	unsigned int hdrsz;
+	int err, dev_found;
+
+	struct rdma_conn_param conn_param = {
+		.responder_resources = 1,
+		.initiator_depth = 1,
+		.retry_count = 5,
+	};
+
+	/* find device */
+	dev_found = 0;
+	list_for_each_entry(dev, &iser_dev_list, list) {
+		if (dev->ibv_hndl == cm_id->verbs) {
+			dev_found = 1;
+			break;
+		}
+	}
+	if (!dev_found) {
+		dev = malloc(sizeof(*dev));
+		if (dev == NULL) {
+			eprintf("unable to allocate dev\n");
+			goto reject;
+		}
+		dev->ibv_hndl = cm_id->verbs;
+		err = iser_device_init(dev);
+		if (err) {
+			free(dev);
+			goto reject;
+		}
+	}
+
+	/* build a new connection structure */
+	conn = zalloc(sizeof(*conn));
+	if (!conn) {
+		eprintf("unable to allocate conn\n");
+		goto reject;
+	}
+
+	err = iser_conn_init(conn);
+	if (err) {
+		free(conn);
+		goto reject;
+	}
+	dprintf("Created conn:%p cm_id:%p\n", &conn->h, cm_id);
+	list_add(&conn->conn_list, &iser_conn_list);
+
+	/* ToDo: finalize these callbacks after tcp/iser code merge */
+	conn->h.op.conn_show = iser_show;
+	conn->h.op.conn_getsockname = iser_getsockname;
+	conn->h.op.conn_getpeername = iser_getpeername;
+
+	/* relate iser and rdma connections */
+	conn->cm_id = cm_id;
+	cm_id->context = conn;
+
+	conn->dev = dev;
+	conn->login_phase = LOGIN_PHASE_START;
+
+	tgt_init_sched_event(&conn->sched_buf_alloc, iser_sched_buf_alloc,
+			     conn);
+	tgt_init_sched_event(&conn->sched_iosubmit, iser_sched_iosubmit,
+			     conn);
+	tgt_init_sched_event(&conn->sched_rdma_rd, iser_sched_rdma_rd,
+			     conn);
+	tgt_init_sched_event(&conn->sched_tx, iser_sched_tx,
+			     conn);
+	tgt_init_sched_event(&conn->sched_post_recv, iser_sched_post_recv,
+			     conn);
+	tgt_init_sched_event(&conn->sched_conn_free, iser_sched_conn_free,
+			     conn);
+
+	/* initiator is dst, target is src */
+	memcpy(&conn->peer_addr, &cm_id->route.addr.dst_addr,
+	       sizeof(conn->peer_addr));
+	memcpy(&conn->self_addr, &cm_id->route.addr.src_addr,
+	       sizeof(conn->self_addr));
+#ifndef NDEBUG
+	{
+		char str[256];
+
+		iser_show(&conn->h, str, sizeof(str));
+		str[sizeof(str) - 1] = 0;
+		dprintf("new conn %p from %s\n", conn, str);
+	}
+#endif
+
+	/* create qp next */
+	memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+	/* wire both send and recv to the same CQ */
+	qp_init_attr.send_cq = dev->cq;
+	qp_init_attr.recv_cq = dev->cq;
+	qp_init_attr.cap.max_send_wr = MAX_WQE;
+	qp_init_attr.cap.max_recv_wr = MAX_WQE;
+	qp_init_attr.cap.max_send_sge = 1;	/* scatter/gather entries */
+	qp_init_attr.cap.max_recv_sge = 1;
+	qp_init_attr.qp_type = IBV_QPT_RC;
+	/* only generate completion queue entries if requested */
+	qp_init_attr.sq_sig_all = 0;
+
+	err = rdma_create_qp(cm_id, dev->pd, &qp_init_attr);
+	if (err) {
+		eprintf("create qp failed\n");
+		goto free_conn;
+	}
+	conn->qp_hndl = cm_id->qp;
+	VALGRIND_MAKE_MEM_DEFINED(conn->qp_hndl, sizeof(*conn->qp_hndl));
+
+	/*
+	 * Post buffers for the login phase, only.
+	 */
+	hdrsz = sizeof(struct iser_hdr) +
+		sizeof(struct iscsi_hdr) +
+		sizeof(struct iscsi_ecdb_ahdr) +
+		sizeof(struct iscsi_rlength_ahdr);
+	conn->ssize = hdrsz + 8192;
+	conn->rsize = hdrsz + 8192;
+	err = iser_alloc_login_resources(conn);
+	if (err)
+		goto free_conn;
+
+	conn_param.initiator_depth = dev->device_attr.max_qp_init_rd_atom;
+	if (conn_param.initiator_depth > ev->param.conn.initiator_depth)
+		conn_param.initiator_depth = ev->param.conn.initiator_depth;
+
+	/* Increment the reference count to be able to wait for TIMEWAIT_EXIT
+	   when finalizing the disconnect process */
+	iser_conn_get(conn);
+
+	/* now we can actually accept the connection */
+	err = rdma_accept(conn->cm_id, &conn_param);
+	if (err) {
+		eprintf("rdma_accept failed conn:%p cm_id:%p\n",
+			&conn->h, cm_id);
+		goto free_conn;
+	}
+
+	conn->h.state = STATE_START;
+	dprintf("accepted conn:%p cm_id:%p\n", &conn->h, cm_id);
+
+	return;
+
+free_conn:
+	iser_conn_free(conn);
+
+reject:
+	err = rdma_reject(cm_id, NULL, 0);
+	if (err)
+		eprintf("rdma_reject failed: %s\n", strerror(-err));
+}
+
+/*
+ * Finish putting the connection together, now that the other side
+ * has ACKed our acceptance.
+ */
+static void iser_cm_conn_established(struct rdma_cm_event *ev)
+{
+	struct rdma_cm_id *cm_id = ev->id;
+	struct iser_conn *conn = cm_id->context;
+
+	eprintf("conn:%p cm_id:%p\n", &conn->h, cm_id);
+	assert(conn->h.state == STATE_START);
+	conn->h.state = STATE_READY;
+}
+
+/*
+ * Handle RDMA_CM_EVENT_DISCONNECTED or an equivalent event.
+ * Start closing the target's side of the connection.
+ */
+static void iser_cm_disconnected(struct rdma_cm_event *ev)
+{
+	struct rdma_cm_id *cm_id = ev->id;
+	struct iser_conn *conn = cm_id->context;
+	enum rdma_cm_event_type ev_type = ev->event;
+
+	eprintf("conn:%p cm_id:%p event:%d, %s\n", &conn->h, cm_id,
+		ev_type, rdma_event_str(ev_type));
+	iser_conn_close(conn);
+}
+
+/*
+ * Handle RDMA_CM_EVENT_TIMEWAIT_EXIT, which is expected to be the last
+ * event during the lifecycle of a connection, after it has been shut down
+ * and the network has cleared the remaining in-flight messages.
+ */
+static void iser_cm_timewait_exit(struct rdma_cm_event *ev)
+{
+	struct rdma_cm_id *cm_id = ev->id;
+	struct iser_conn *conn = cm_id->context;
+
+	eprintf("conn:%p cm_id:%p\n", &conn->h, cm_id);
+
+	/* The refcount was incremented just before accepting the connection;
+	   typically this is the last decrement and the connection will be
+	   released instantly */
+	iser_conn_put(conn);
+}
+
+/*
+ * Handle RDMA CM events.
+ */
+static void iser_handle_rdmacm(int fd __attribute__ ((unused)),
+			       int events __attribute__ ((unused)),
+			       void *data __attribute__ ((unused)))
+{
+	struct rdma_cm_event *ev;
+	enum rdma_cm_event_type ev_type;
+	int err;
+
+	err = rdma_get_cm_event(rdma_evt_channel, &ev);
+	if (err) {
+		eprintf("rdma_get_cm_event failed\n");
+		return;
+	}
+
+	VALGRIND_MAKE_MEM_DEFINED(ev, sizeof(*ev));
+
+	ev_type = ev->event;
+	switch (ev_type) {
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+		iser_cm_connect_request(ev);
+		break;
+
+	case RDMA_CM_EVENT_ESTABLISHED:
+		iser_cm_conn_established(ev);
+		break;
+
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+	case RDMA_CM_EVENT_REJECTED:
+	case RDMA_CM_EVENT_ADDR_CHANGE:
+	case RDMA_CM_EVENT_DISCONNECTED:
+		iser_cm_disconnected(ev);
+		break;
+
+	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+		iser_cm_timewait_exit(ev);
+		break;
+
+	case RDMA_CM_EVENT_MULTICAST_JOIN:
+	case RDMA_CM_EVENT_MULTICAST_ERROR:
+		eprintf("UD-related event:%d, %s - ignored\n", ev_type,
+			rdma_event_str(ev_type));
+		break;
+
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		eprintf("Unsupported event:%d, %s - ignored\n", ev_type,
+			rdma_event_str(ev_type));
+		break;
+
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+	case RDMA_CM_EVENT_ADDR_ERROR:
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+	case RDMA_CM_EVENT_CONNECT_RESPONSE:
+	case RDMA_CM_EVENT_UNREACHABLE:
+		eprintf("Active side event:%d, %s - ignored\n", ev_type,
+			rdma_event_str(ev_type));
+		break;
+
+	default:
+		eprintf("Illegal event:%d - ignored\n", ev_type);
+		break;
+	}
+
+	err = rdma_ack_cm_event(ev);
+	if (err) {
+		eprintf("ack cm event failed, %s\n",
+			rdma_event_str(ev_type));
+	}
+}
+
+static inline void schedule_task_iosubmit(struct iser_task *task,
+					  struct iser_conn *conn)
+{
+	struct iscsi_cmd *req_bhs = (struct iscsi_cmd *) task->req.bhs;
+
+	list_add_tail(&task->exec_list, &conn->iosubmit_list);
+	tgt_add_sched_event(&conn->sched_iosubmit);
+	dprintf("task:%p tag:0x%04lx cmdsn:0x%x\n",
+		task, (unsigned long)task->tag,
+		be32_to_cpu(req_bhs->cmdsn));
+}
+
+static inline void schedule_rdma_read(struct iser_task *task,
+				      struct iser_conn *conn)
+{
+	list_add_tail(&task->rdma_list, &conn->rdma_rd_list);
+	tgt_add_sched_event(&conn->sched_rdma_rd);
+	dprintf("task:%p tag:0x%04lx\n", task, (unsigned long)task->tag);
+}
+
+static inline void schedule_resp_tx(struct iser_task *task,
+				    struct iser_conn *conn)
+{
+	dprintf("task:%p tag:0x%04lx\n", task, (unsigned long)task->tag);
+
+	list_add_tail(&task->tx_list, &conn->resp_tx_list);
+	tgt_add_sched_event(&conn->sched_tx);
+}
+
+static inline void schedule_post_recv(struct iser_task *task,
+				      struct iser_conn *conn)
+{
+	list_add_tail(&task->recv_list, &conn->post_recv_list);
+	tgt_add_sched_event(&conn->sched_post_recv);
+	dprintf("task:%p tag:0x%04lx\n", task, (unsigned long)task->tag);
+}
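+
+/*
+ * All four schedule_* helpers defer work the same way: queue the task
+ * on a per-connection list and arm a scheduled event. Tasks arriving
+ * within one CQ poll batch are thus coalesced into a single pass, and
+ * the rdma-rd and tx passes chain their work requests into a single
+ * ibv_post_send() call.
+ */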
+
+static int iser_logout_exec(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct iscsi_session *session = conn->h.session;
+	struct iscsi_logout *req = (struct iscsi_logout *) task->req.bhs;
+	uint32_t cmd_sn = be32_to_cpu(req->cmdsn);
+	int is_immediate = req->opcode & ISCSI_OP_IMMEDIATE;
+	struct iscsi_logout_rsp *rsp =
+		(struct iscsi_logout_rsp *) task->rsp.bhs;
+
+	memset(rsp, 0, BHS_SIZE);
+	rsp->opcode = ISCSI_OP_LOGOUT_RSP;
+	rsp->flags = ISCSI_FLAG_CMD_FINAL;
+	rsp->itt = req->itt;
+	rsp->statsn = cpu_to_be32(conn->h.stat_sn++);
+	if (session->exp_cmd_sn == cmd_sn && !is_immediate)
+		session->exp_cmd_sn++;
+	iser_set_rsp_cmd_sn(session, task->rsp.bhs);
+
+	task->rsp.ahssize = 0;
+	task->rsp.data.size = 0;
+
+	schedule_resp_tx(task, conn);
+	return 0;
+}
+
+static int iser_nop_out_exec(struct iser_task *task)
+{
+	struct iscsi_nopin *rsp_bhs =
+		(struct iscsi_nopin *) task->rsp.bhs;
+	struct iser_conn *conn = task->conn;
+
+	rsp_bhs->opcode = ISCSI_OP_NOOP_IN;
+	rsp_bhs->flags = ISCSI_FLAG_CMD_FINAL;
+	rsp_bhs->rsvd2 = 0;
+	rsp_bhs->rsvd3 = 0;
+	memset(rsp_bhs->lun, 0, sizeof(rsp_bhs->lun));
+	rsp_bhs->itt = task->req.bhs->itt;
+	rsp_bhs->ttt = cpu_to_be32(ISCSI_RESERVED_TAG);
+	rsp_bhs->statsn = cpu_to_be32(conn->h.stat_sn);
+	if (task->req.bhs->itt != cpu_to_be32(ISCSI_RESERVED_TAG))
+		conn->h.stat_sn++;
+
+	iser_set_rsp_cmd_sn(conn->h.session, task->rsp.bhs);
+
+	memset(rsp_bhs->rsvd4, 0, sizeof(rsp_bhs->rsvd4));
+	task->rsp.ahssize = 0;
+
+	if (task->out_len > 0)
+		task->rsp.data.buf = task->req.data.buf;
+	task->rsp.data.size = task->out_len;
+
+	schedule_resp_tx(task, conn);
+	return 0;
+}
+
+static int iser_send_ping_nop_in(struct iser_task *task)
+{
+	struct iscsi_nopin *rsp_bhs =
+		(struct iscsi_nopin *) task->rsp.bhs;
+	struct iser_conn *conn = task->conn;
+
+	task->opcode = ISCSI_OP_NOOP_IN;
+	task->tag = 0xffffffff;
+
+	rsp_bhs->opcode = ISCSI_OP_NOOP_IN;
+	rsp_bhs->flags = ISCSI_FLAG_CMD_FINAL;
+	rsp_bhs->rsvd2 = 0;
+	rsp_bhs->rsvd3 = 0;
+	memset(rsp_bhs->lun, 0, sizeof(rsp_bhs->lun));
+	rsp_bhs->itt = 0xffffffff;
+	rsp_bhs->ttt = 0xffffffff;
+	rsp_bhs->statsn = cpu_to_be32(conn->h.stat_sn);
+	iser_set_rsp_cmd_sn(conn->h.session, task->rsp.bhs);
+
+	memset(rsp_bhs->rsvd4, 0, sizeof(rsp_bhs->rsvd4));
+	task->rsp.ahssize = 0;
+	task->rsp.data.size = 0;
+
+	dprintf("task:%p conn:%p\n", task, &conn->h);
+
+	schedule_resp_tx(task, conn);
+	return 0;
+}
+
+/*
+static int iser_send_reject(struct iser_task *task, uint8_t reason)
+{
+	struct iser_conn *conn = task->conn;
+	struct iscsi_session *session = conn->h.session;
+	struct iscsi_hdr *req = (struct iscsi_hdr *) task->req.bhs;
+	struct iscsi_reject *rsp = (struct iscsi_reject *) task->rsp.bhs;
+
+	memset(rsp, 0, BHS_SIZE);
+	rsp->opcode = ISCSI_OP_REJECT;
+	rsp->flags = ISCSI_FLAG_CMD_FINAL;
+	rsp->reason = reason;
+	hton24(rsp->dlength, sizeof(struct iser_hdr));
+	rsp->ffffffff = 0xffffffff;
+	rsp->itt = req->itt;
+	rsp->statsn = cpu_to_be32(conn->h.stat_sn++);
+	iser_set_rsp_cmd_sn(session, task->rsp.bhs);
+
+	task->rsp.ahssize = 0;
+	task->rsp.data.buf = task->req.bhs;
+	task->rsp.data.size = sizeof(struct iser_hdr);
+
+	schedule_resp_tx(task, conn);
+	return 0;
+}
+*/
+
+static int iser_tm_exec(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct iscsi_session *session = conn->h.session;
+	struct iscsi_tm *req_bhs = (struct iscsi_tm *) task->req.bhs;
+	int fn = 0;
+	int err = 0;
+
+	eprintf("TM itt:0x%x cmdsn:0x%x "
+		"ref.itt:0x%x ref.cmdsn:0x%x "
+		"exp.statsn:0x%x "
+		"lun:0x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+		be32_to_cpu(req_bhs->itt), be32_to_cpu(req_bhs->cmdsn),
+		be32_to_cpu(req_bhs->rtt), be32_to_cpu(req_bhs->refcmdsn),
+		be32_to_cpu(req_bhs->exp_statsn),
+		req_bhs->lun[0], req_bhs->lun[1], req_bhs->lun[2], req_bhs->lun[3],
+		req_bhs->lun[4], req_bhs->lun[5], req_bhs->lun[6], req_bhs->lun[7]);
+
+	switch (req_bhs->flags & ISCSI_FLAG_TM_FUNC_MASK) {
+	case ISCSI_TM_FUNC_ABORT_TASK:
+		fn = ABORT_TASK;
+		break;
+	case ISCSI_TM_FUNC_ABORT_TASK_SET:
+		fn = ABORT_TASK_SET;
+		break;
+	case ISCSI_TM_FUNC_CLEAR_ACA:
+		fn = CLEAR_ACA;
+		break;
+	case ISCSI_TM_FUNC_CLEAR_TASK_SET:
+		fn = CLEAR_TASK_SET;
+		break;
+	case ISCSI_TM_FUNC_LOGICAL_UNIT_RESET:
+		fn = LOGICAL_UNIT_RESET;
+		break;
+	case ISCSI_TM_FUNC_TARGET_WARM_RESET:
+	case ISCSI_TM_FUNC_TARGET_COLD_RESET:
+	case ISCSI_TM_FUNC_TASK_REASSIGN:
+		err = ISCSI_TMF_RSP_NOT_SUPPORTED;
+		eprintf("unsupported TMF %d\n",
+			req_bhs->flags & ISCSI_FLAG_TM_FUNC_MASK);
+		break;
+	default:
+		err = ISCSI_TMF_RSP_REJECTED;
+		eprintf("unknown TMF %d\n",
+			req_bhs->flags & ISCSI_FLAG_TM_FUNC_MASK);
+	}
+
+	if (err)
+		task->result = err;
+	else
+		target_mgmt_request(session->target->tid, session->tsih,
+				    (unsigned long) task, fn, req_bhs->lun,
+				    req_bhs->rtt, 0);
+	return err;
+}
+
+static int iser_tm_done(struct mgmt_req *mreq)
+{
+	struct iser_task *task =
+		(struct iser_task *) (unsigned long) mreq->mid;
+	struct iser_conn *conn = task->conn;
+	struct iscsi_session *session = conn->h.session;
+	struct iscsi_tm_rsp *rsp_bhs =
+		(struct iscsi_tm_rsp *) task->rsp.bhs;
+
+	memset(rsp_bhs, 0, sizeof(*rsp_bhs));
+	rsp_bhs->opcode = ISCSI_OP_SCSI_TMFUNC_RSP;
+	rsp_bhs->flags = ISCSI_FLAG_CMD_FINAL;
+	rsp_bhs->itt = task->req.bhs->itt;
+	switch (mreq->result) {
+	case 0:
+		rsp_bhs->response = ISCSI_TMF_RSP_COMPLETE;
+		break;
+	case -EINVAL:
+		rsp_bhs->response = ISCSI_TMF_RSP_NOT_SUPPORTED;
+		break;
+	case -EEXIST:
+		/*
+		 * the command completed or we could not find it, so
+		 * we return "no task" here
+		 */
+		rsp_bhs->response = ISCSI_TMF_RSP_NO_TASK;
+		break;
+	default:
+		rsp_bhs->response = ISCSI_TMF_RSP_REJECTED;
+		break;
+	}
+
+	rsp_bhs->statsn = cpu_to_be32(conn->h.stat_sn++);
+	iser_set_rsp_cmd_sn(session, task->rsp.bhs);
+
+	// Must be zero according to RFC 3720, 10.6.3
+	task->rsp.ahssize = 0;
+	task->rsp.data.size = 0;
+
+	schedule_resp_tx(task, conn);
+
+	return 0;
+}
+
+static int iser_scsi_cmd_buf_alloc(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct membuf *mem;
+
+	// when a buffer needs to be allocated and it fails, leave the task
+	// on the list waiting until a new buffer becomes available
+	if (task->is_write) {
+		if (task->out_len > 0) {
+			if (task->rdma_rd_sz > 0) {
+				// ToDo: multiple RDMA-Read buffers
+				mem = iser_alloc_rdma_buf(conn, task->rdma_rd_sz);
+				if (mem == NULL) {
+					eprintf("free list empty, task:%p tag:0x%04lx\n",
+						task, (unsigned long)task->tag);
+					return -ENOMEM;
+				}
+				dprintf("out-buf:%p sz:%u rdma:%d\n",
+					mem->buf, mem->size, mem->rdma);
+				// ToDo: multiple RDMA-Read buffers
+				iser_task_add_out_rdma_buf(task, mem, task->unsol_sz);
+				schedule_rdma_read(task, conn);
+				return 0;
+			}
+			if (task->unsol_remains > 0)
+				return 0;
+		}
+	} // ToDo: bi-dir
+	else if (task->is_read) {
+		if (task->in_len > 0) {
+			mem = iser_alloc_rdma_buf(conn, task->rdma_wr_sz);
+			if (mem == NULL) {
+				eprintf("free list empty, task:%p tag:0x%04lx\n",
+					task, (unsigned long)task->tag);
+				return -ENOMEM;
+			}
+			dprintf("in-buf:%p sz:%u rdma:%d\n",
+				mem->buf, mem->size, mem->rdma);
+			// ToDo: multiple rdma write bufs
+			iser_task_add_in_rdma_buf(task, mem, 0);
+		}
+	}
+	schedule_task_iosubmit(task, conn);
+	return 0;
+}
+
+static int scsi_cmd_attr(unsigned int flags)
+{
+	int attr;
+
+	switch (flags & ISCSI_FLAG_CMD_ATTR_MASK) {
+	case ISCSI_ATTR_UNTAGGED:
+	case ISCSI_ATTR_SIMPLE:
+		attr = MSG_SIMPLE_TAG;
+		break;
+	case ISCSI_ATTR_HEAD_OF_QUEUE:
+		attr = MSG_HEAD_TAG;
+		break;
+	case ISCSI_ATTR_ORDERED:
+	default:
+		attr = MSG_ORDERED_TAG;
+	}
+	return attr;
+}
+
+static void iser_task_free_dout_tasks(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct iser_task *dout_task, *tnext;
+
+	list_for_each_entry_safe(dout_task, tnext, &task->dout_task_list, dout_task_list) {
+		list_del(&dout_task->dout_task_list);
+		iser_task_del_out_buf(task, &dout_task->req.data);
+		schedule_post_recv(dout_task, conn);
+	}
+}
+
+static void iser_scsi_cmd_iosubmit(struct iser_task *task)
+{
+	struct iscsi_cmd *req_bhs = (struct iscsi_cmd *) task->req.bhs;
+	struct scsi_cmd *scmd = &task->scmd;
+	struct iser_conn *conn = task->conn;
+	struct iscsi_session *session = conn->h.session;
+	struct membuf *data;
+
+	// ToDo: implement iser_task_handle_ahs(itask);
+
+	scmd->state = 0;
+	scsi_set_in_length(scmd, 0);
+	scsi_set_out_length(scmd, 0);
+
+	if (task->is_write) {
+		scsi_set_out_resid(scmd, 0);
+		/* It's either the last buffer, which is RDMA, or the only buffer */
+		data = list_entry(task->out_buf_list.prev, struct membuf, task_list);
+		// ToDo: multiple RDMA-Write buffers
+		if (task->out_buf_num > 1) {
+			unsigned char *ptr = data->buf;
+			struct membuf *cur_buf;
+
+			list_for_each_entry(cur_buf, &task->out_buf_list, task_list) {
+				if (cur_buf->rdma)
+					break;
+				memcpy(ptr, cur_buf->buf, cur_buf->size);
+				ptr += cur_buf->size;
+			}
+			iser_task_free_dout_tasks(task);
+		}
+
+		scsi_set_out_buffer(scmd, data->buf);
+		scsi_set_out_length(scmd, task->out_len);
+	}
+	if (task->is_read) {
+		scsi_set_in_resid(scmd, 0);
+		// ToDo: multiple RDMA-Read buffers
+		data = list_entry(task->in_buf_list.next, struct membuf, task_list);
+		scsi_set_in_buffer(scmd, data->buf);
+		scsi_set_in_length(scmd, task->in_len);
+	}
+
+	scmd->cmd_itn_id = session->tsih;
+	scmd->scb = req_bhs->cdb;
+	scmd->scb_len = sizeof(req_bhs->cdb);
+	memcpy(scmd->lun, req_bhs->lun, sizeof(scmd->lun));
+	scmd->attribute = scsi_cmd_attr(req_bhs->flags);
+	scmd->tag = task->tag;	//req_bhs->itt;
+	scmd->result = 0;
+	scmd->mreq = NULL;
+	scmd->sense_len = 0;
+
+	dprintf("task:%p tag:0x%04lx\n", task, (unsigned long)task->tag);
+
+	// ToDo: handle bidi ahs and allocations
+	// the problem is that there is only one buffer
+	// in the scsi command
+
+	target_cmd_queue(session->target->tid, scmd);
+}
+
+static void iser_rsp_set_read_resid(struct iscsi_cmd_rsp *rsp_bhs,
+				    int in_resid)
+{
+	if (in_resid > 0) {
+		rsp_bhs->flags |= ISCSI_FLAG_CMD_UNDERFLOW;
+		rsp_bhs->residual_count = cpu_to_be32((uint32_t)in_resid);
+	} else {
+		rsp_bhs->flags |= ISCSI_FLAG_CMD_OVERFLOW;
+		rsp_bhs->residual_count = cpu_to_be32(((uint32_t)-in_resid));
+	}
+	rsp_bhs->bi_residual_count = 0;
+}
+ else { + rsp_bhs->flags |= ISCSI_FLAG_CMD_OVERFLOW; + rsp_bhs->residual_count = cpu_to_be32(((uint32_t)-in_resid)); + } + rsp_bhs->bi_residual_count = 0; +} + +static void iser_rsp_set_bidir_resid(struct iscsi_cmd_rsp *rsp_bhs, + int in_resid) +{ + if (in_resid > 0) { + rsp_bhs->flags |= ISCSI_FLAG_CMD_BIDI_UNDERFLOW; + rsp_bhs->bi_residual_count = cpu_to_be32((uint32_t)in_resid); + } + else { + rsp_bhs->flags |= ISCSI_FLAG_CMD_BIDI_OVERFLOW; + rsp_bhs->bi_residual_count = cpu_to_be32(((uint32_t)-in_resid)); + } + rsp_bhs->residual_count = 0; +} + +static int iser_scsi_cmd_done(uint64_t nid, int result, + struct scsi_cmd *scmd) +{ + struct iser_task *task = container_of(scmd, struct iser_task, scmd); + struct iscsi_cmd_rsp *rsp_bhs = (struct iscsi_cmd_rsp *) task->rsp.bhs; + struct iser_conn *conn = task->conn; + struct iscsi_session *session = conn->h.session; + unsigned char sense_len = scmd->sense_len; + int in_resid; + + if (nid != scmd->cmd_itn_id) { + eprintf("unexpected itn_id in scsi command\n"); + exit(1); + } + + if (conn->h.state == STATE_CLOSE) { + /* Connection is closed, but its resources are not released + to allow receiving completion of such late tasks. + When all tasks are released, and connection refcnt + drops to zero, then all the resources can be freed. */ + iser_complete_task(task); + return 0; + } + + rsp_bhs->opcode = ISCSI_OP_SCSI_CMD_RSP; + rsp_bhs->flags = ISCSI_FLAG_CMD_FINAL; + rsp_bhs->response = ISCSI_STATUS_CMD_COMPLETED; + rsp_bhs->cmd_status = scsi_get_result(scmd); + *((uint64_t *) rsp_bhs->rsvd) = (uint64_t) 0; + rsp_bhs->itt = task->tag; + rsp_bhs->rsvd1 = 0; + rsp_bhs->statsn = cpu_to_be32(conn->h.stat_sn++); + iser_set_rsp_cmd_sn(session, task->rsp.bhs); + rsp_bhs->exp_datasn = 0; + + if (task->is_read) + { + in_resid = scsi_get_in_resid(scmd); + if (in_resid != 0) { + if (!task->is_write) + iser_rsp_set_read_resid(rsp_bhs, in_resid); + else + iser_rsp_set_bidir_resid(rsp_bhs, in_resid); + if (in_resid > 0) + task->rdma_wr_remains = task->in_len - in_resid; + } + //schedule_rdma_write(task, conn); + } + else { + rsp_bhs->bi_residual_count = 0; + rsp_bhs->residual_count = 0; + } + task->rsp.ahssize = 0; + task->rsp.data.size = 0; + + if (sense_len) { + struct iscsi_sense_data *sense = + (struct iscsi_sense_data *) task->rsp.data.buf; + sense->length = cpu_to_be16(sense_len); + memcpy(sense->data, scmd->sense_buffer, sense_len); + task->rsp.data.size = sense_len + sizeof(*sense); + } + dprintf("task:%p tag:0x%04lx status:%x statsn:%d flags:0x%x " + "in_len:%d out_len:%d rsd:%d birsd:%d\n", + task, (unsigned long)task->tag, rsp_bhs->cmd_status, + conn->h.stat_sn-1, rsp_bhs->flags, + scsi_get_in_length(scmd), scsi_get_out_length(scmd), + ntohl(rsp_bhs->residual_count), + ntohl(rsp_bhs->bi_residual_count)); + + schedule_resp_tx(task, conn); + + return 0; +} + +/* +static int iser_task_handle_ahs(struct iser_task *task) +{ + struct scsi_cmd *scmd = &task->scmd; + struct iscsi_conn_hdr *conn = task->conn; + struct iscsi_cmd *req_bhs = (struct iscsi_cmd *) &task->req.; + uint32_t data_len; + uint8_t *ahs; + int ahslen; + enum data_direction dir = scsi_get_data_dir(scmd); + + ahs = task->ahs; + ahslen = req_bhs->hlength * 4; + if (ahslen >= 4) { + struct iscsi_ecdb_ahdr *ahs_extcdb = (void *) ahs; + + if (ahs_extcdb->ahstype == ISCSI_AHSTYPE_CDB) { + int extcdb_len = ntohs(ahs_extcdb->ahslength) - 1; + unsigned char *p = (void *)task->extdata; + + if (4 + extcdb_len > ahslen) { + eprintf("AHS len:%d too short for extcdb %d\n", + ahslen, extcdb_len); + 
return -EINVAL; + } + if (extcdb_len + sizeof(req_bhs->cdb) > 260) { + eprintf("invalid extcdb len:%d\n", extcdb_len); + + return -EINVAL; + } + + memcpy(p, req_bhs->cdb, sizeof(req_bhs->cdb)); + memmove(p + sizeof(req_bhs->cdb), ahs_extcdb->ecdb, + extcdb_len); + + scmd->scb = p; + scmd->scb_len = sizeof(req_bhs->cdb) + extcdb_len; + + ahs += 4 + extcdb_len; + ahslen -= 4 + extcdb_len; + } + } + + if (dir == DATA_BIDIRECTIONAL && ahslen >= 8) { + struct iscsi_rlength_ahdr *ahs_bidi = (void *) ahs; + if (ahs_bidi->ahstype == ISCSI_AHSTYPE_RLENGTH) { + uint32_t in_length = ntohl(ahs_bidi->read_length); + + dprintf("bidi read len %u\n", in_length); + + if (in_length) { + uint32_t len; + void *buf; + + len = roundup(in_length, + conn->tp->data_padding); + buf = conn->tp->alloc_data_buf(conn, len); + if (!buf) + return -ENOMEM; + + scsi_set_in_buffer(scmd, buf); + scsi_set_in_length(scmd, in_length); + } + } + } +} +*/ + +static void iser_sched_buf_alloc(struct event_data *evt) +{ + struct iser_conn *conn = (struct iser_conn *) evt->data; + struct iser_task *task; + int err; + + while (!list_empty(&conn->buf_alloc_list)) { + task = list_first_entry(&conn->buf_alloc_list, + struct iser_task, + exec_list); + err = iser_scsi_cmd_buf_alloc(task); + if (err == -ENOMEM) + break; + list_del(&task->exec_list); + } +} + +static inline int task_batch_type(struct iser_task *task) +{ + struct iscsi_cmd *req_bhs = (struct iscsi_cmd *) task->req.bhs; + + switch (req_bhs->cdb[0]) { + case SYNCHRONIZE_CACHE: + case SYNCHRONIZE_CACHE_16: + case WRITE_6: + case WRITE_10: + case WRITE_12: + case WRITE_16: + case READ_6: + case READ_10: + case READ_12: + case READ_16: + return 1; + default: + return 0; + } +} + +static void iser_sched_iosubmit(struct event_data *evt) +{ + struct iser_conn *conn = (struct iser_conn *) evt->data; + struct iser_task *task = NULL; + + while (!list_empty(&conn->iosubmit_list)) { + task = list_first_entry(&conn->iosubmit_list, + struct iser_task, exec_list); + list_del(&task->exec_list); + iser_scsi_cmd_iosubmit(task); + } +} + +static void iser_sched_rdma_rd(struct event_data *evt) +{ + struct iser_conn *conn = (struct iser_conn *) evt->data; + struct iser_work_req *first_wr = NULL; + struct iser_task *prev_task = NULL; + struct iser_task *task = NULL; + int err; + + if (conn->h.state == STATE_CLOSE) { + dprintf("ignoring rdma_rd for closed conn\n"); + // ToDo: free all tasks and buffers + return; + } + + while (!list_empty(&conn->rdma_rd_list)) { + task = list_first_entry(&conn->rdma_rd_list, + struct iser_task, rdma_list); + list_del(&task->rdma_list); + + iser_prep_rdma_rd_send_req(task, NULL, 1); + if (first_wr == NULL) + first_wr = &task->rdmad; + else + prev_task->rdmad.send_wr.next = &task->rdmad.send_wr; + prev_task = task; + } + if (prev_task) { + prev_task->rdmad.send_wr.next = NULL; + // submit the chain of rdma-rd requests, start from the first + err = iser_post_send(conn, first_wr); + } + // ToDo: error handling +} + +static void iser_sched_tx(struct event_data *evt) +{ + struct iser_conn *conn = (struct iser_conn *) evt->data; + struct iser_work_req *first_wr = NULL; + struct iser_task *prev_task = NULL; + struct iser_task *task; + struct iser_work_req *cur_send_wr; + int err; + + if (conn->h.state == STATE_CLOSE) { + dprintf("ignoring tx for closed conn\n"); + return; + } + + while (!list_empty(&conn->resp_tx_list)) { + task = list_first_entry(&conn->resp_tx_list, + struct iser_task, + tx_list); + list_del(&task->tx_list); + list_add_tail(&task->tx_list, 
&conn->sent_list); + + if (task->is_read) { + iser_prep_rdma_wr_send_req(task, &task->txd, 0); + cur_send_wr = &task->rdmad; + } + else + cur_send_wr = &task->txd; + + if (prev_task == NULL) + first_wr = cur_send_wr; + else + prev_task->txd.send_wr.next = &cur_send_wr->send_wr; + + iser_prep_resp_send_req(task, NULL, 1); + prev_task = task; + } + if (prev_task) { + prev_task->txd.send_wr.next = NULL; + // submit the chain of rdma-wr & tx reqs, start from the first + err = iser_post_send(conn, first_wr); + // ToDo: error handling + } +} + +static void iser_sched_post_recv(struct event_data *evt) +{ + struct iser_conn *conn = (struct iser_conn *) evt->data; + struct iser_task *first_task = NULL; + struct iser_task *prev_task = NULL; + struct iser_task *task; + int num_recv_bufs = 0; + int err; + + if (conn->h.state == STATE_CLOSE) { + dprintf("ignoring post recv for closed conn\n"); + return; + } + + while (!list_empty(&conn->post_recv_list)) { + task = list_first_entry(&conn->post_recv_list, + struct iser_task, + recv_list); + list_del(&task->recv_list); + + if (prev_task == NULL) + first_task = task; + else + prev_task->rxd.recv_wr.next = &task->rxd.recv_wr; + + prev_task = task; + num_recv_bufs ++; + } + if (prev_task) { + prev_task->rxd.recv_wr.next = NULL; + // post the chain of recv buffers, start from the first + err = iser_post_recv(conn, first_task, num_recv_bufs); + // ToDo: error handling + } +} + +static int iser_scsi_cmd_rx(struct iser_task *task) +{ + struct iser_conn *conn = task->conn; + struct iscsi_session *session = conn->h.session; + struct iscsi_cmd *req_bhs = (struct iscsi_cmd *) task->req.bhs; + unsigned int flags = req_bhs->flags; + uint32_t imm_data_sz = ntoh24(req_bhs->dlength); + uint32_t xfer_sz = ntohl(req_bhs->data_length); + + task->is_read = flags & ISCSI_FLAG_CMD_READ; + task->is_write = flags & ISCSI_FLAG_CMD_WRITE; + + if (task->is_write) { + task->out_len = xfer_sz; + + /* add immediate data to the task */ + + + if (!conn->h.session_param[ISCSI_PARAM_INITIAL_R2T_EN].val) { + task->unsol_sz = conn->h.session_param[ISCSI_PARAM_FIRST_BURST].val; + if (task->out_len < task->unsol_sz) + task->unsol_sz = task->out_len; + } + else + task->unsol_sz = 0; + + if (conn->h.session_param[ISCSI_PARAM_IMM_DATA_EN].val) { + if (imm_data_sz > 0) { + if (task->unsol_sz == 0) + task->unsol_sz = imm_data_sz; + iser_task_add_out_pdu_buf(task, &task->req.data, 0); + } + } + else if (imm_data_sz > 0) { + eprintf("ImmediateData disabled but received\n"); + return -EINVAL; + } + + /* immediate data is the first chunk of the unsolicited data */ + task->unsol_remains = task->unsol_sz - imm_data_sz; + /* rdma-reads cover the entire solicited data */ + task->rdma_rd_sz = task->rdma_rd_remains = + task->out_len - task->unsol_sz; + // ToDo: multiple RDMA-Write buffers + // task->rdma_rd_offset = task->unsol_sz; + + task->in_len = task->rdma_wr_sz = task->rdma_wr_remains = 0; + + if (!task->is_read) + scsi_set_data_dir(&task->scmd, DATA_WRITE); + else + scsi_set_data_dir(&task->scmd, DATA_BIDIRECTIONAL); + } else { + task->in_len = task->rdma_wr_sz = + task->rdma_wr_remains = xfer_sz; + task->out_len = task->unsol_sz = task->unsol_remains = + task->rdma_rd_sz = task->rdma_rd_remains = 0; + + if (task->is_read) + scsi_set_data_dir(&task->scmd, DATA_READ); + else + scsi_set_data_dir(&task->scmd, DATA_NONE); + } + + list_add_tail(&task->session_list, &session->cmd_list); + + dprintf("task:%p tag:0x%04lx scsi_op:0x%x %s%s in_len:%d out_len:%d " + "imm_sz:%d unsol_sz:%d cmdsn:0x%x 
expcmdsn:0x%x\n", + task, (long unsigned)task->tag, req_bhs->cdb[0], + task->is_read ? "rd" : "", task->is_write ? "wr" : "", + task->in_len, task->out_len, imm_data_sz, task->unsol_sz, + ntohl(req_bhs->cmdsn), session->exp_cmd_sn); + + return 0; +} + +static int iser_data_out_rx(struct iser_task *dout_task) +{ + struct iser_conn *conn = dout_task->conn; + struct iscsi_session *session = conn->h.session; + struct iscsi_data *req_bhs = + (struct iscsi_data *) dout_task->req.bhs; + struct iser_task *task; + int err = 0; + + list_for_each_entry(task, &session->cmd_list, session_list) { + if (task->tag == req_bhs->itt) + goto found; + } + return -EINVAL; + +found: + iser_task_add_out_pdu_buf(task, &dout_task->req.data, + be32_to_cpu(req_bhs->offset)); + list_add_tail(&dout_task->dout_task_list, &task->dout_task_list); + + // ToDo: BUG!!! add an indication that it's data-out task so that + // it can be released when the buffer is released + + task->unsol_remains -= ntoh24(req_bhs->dlength); + + dprintf("task:%p tag:0x%04lx, dout taskptr:%p out_len:%d " + "uns_rem:%d rdma_rd_rem:%d expcmdsn:0x%x\n", + task, (long unsigned)task->tag, dout_task, task->out_len, + task->unsol_remains, task->rdma_rd_remains, + session->exp_cmd_sn); + + // ToDo: look at the task counters !!! + if (req_bhs->ttt == cpu_to_be32(ISCSI_RESERVED_TAG)) { + if (req_bhs->flags & ISCSI_FLAG_CMD_FINAL) { + if (!task_pending(task)) { + // ToDo: this condition is weird ... + if (task->rdma_rd_remains == 0 && task->unsol_remains == 0) + schedule_task_iosubmit(task,conn); + } + } + } else { + if (!(req_bhs->flags & ISCSI_FLAG_CMD_FINAL)) + return err; + + if (task->rdma_rd_remains == 0 && task->unsol_remains == 0) + schedule_task_iosubmit(task,conn); + } + return err; +} + +static void iser_pdu_copy(struct iser_pdu *dst_pdu, struct iser_pdu *src_pdu) +{ + memcpy(dst_pdu->iser_hdr, src_pdu->iser_hdr, sizeof(*src_pdu->iser_hdr)); + memcpy(dst_pdu->bhs, src_pdu->bhs, sizeof(*src_pdu->bhs)); + dst_pdu->ahssize = src_pdu->ahssize; + // ToDo ahs alloc and copy + + dst_pdu->data.size = src_pdu->data.size; + memcpy(dst_pdu->data.buf, src_pdu->data.buf, src_pdu->data.size); +} + +static void iser_login_rx(struct iser_task *task) +{ + struct iser_conn *conn = task->conn; + struct iser_task *tx_task = conn->login_tx_task; + + iser_login_exec(task); + iser_pdu_copy(&tx_task->rsp, &task->rsp); + + if (conn->login_phase != LOGIN_PHASE_LAST_SEND) + iser_post_recv(conn, task, 1); + else { + dprintf("transitioning to full-feature, no repost\n"); + } + iser_prep_resp_send_req(tx_task, NULL, 1); + iser_post_send(conn, &tx_task->txd); +} + +static int iser_nop_out_rx(struct iser_task *task) +{ + struct iser_conn *conn = task->conn; +// struct iscsi_session *session = conn->h.session; + struct iscsi_nopout *req_bhs = (struct iscsi_nopout *) task->req.bhs; + int err = 0; + + if (req_bhs->ttt != cpu_to_be32(ISCSI_RESERVED_TAG)) { + /* + * We don't request a NOP-Out by sending a NOP-In. + * See 10.18.2 in the draft 20. 
+		 */
+		eprintf("initiator bug\n");
+		err = -ISCSI_REASON_PROTOCOL_ERROR;
+		goto reject;
+	}
+	if (req_bhs->itt == cpu_to_be32(ISCSI_RESERVED_TAG)) {
+		if (req_bhs->opcode & ISCSI_OP_IMMEDIATE) {
+			dprintf("No response to Nop-Out\n");
+			iser_post_recv(conn, task, 1);
+			return -EAGAIN; // ToDo: fix the ret codes
+		} else {
+			eprintf("initiator bug\n");
+			err = -ISCSI_REASON_PROTOCOL_ERROR;
+			goto reject;
+		}
+	}
+
+	task->out_len = ntoh24(req_bhs->dlength);
+	dprintf("nop-out task:%p cmdsn:0x%x data_sz:%d\n",
+		task, ntohl(req_bhs->cmdsn), task->out_len);
+
+	return 0;
+
+reject: // ToDo: prepare and send reject pdu
+	/* iser_send_reject(task, ISCSI_REASON_INVALID_PDU_FIELD); */
+	return err;
+}
+
+static int iser_task_delivery(struct iser_task *task)
+{
+	int err;
+
+	switch (task->opcode) {
+	case ISCSI_OP_SCSI_CMD:
+		err = iser_scsi_cmd_buf_alloc(task);
+		if (err == -ENOMEM)
+			list_add_tail(&task->exec_list,
+				      &task->conn->buf_alloc_list);
+		break;
+	case ISCSI_OP_NOOP_OUT:
+		err = iser_nop_out_exec(task);
+		break;
+	case ISCSI_OP_LOGOUT:
+		err = iser_logout_exec(task);
+		break;
+	case ISCSI_OP_SCSI_TMFUNC:
+		err = iser_tm_exec(task);
+		break;
+	case ISCSI_OP_TEXT:
+		err = iser_text_exec(task);
+		break;
+	default:
+		eprintf("Internal error: Unexpected op:0x%x\n", task->opcode);
+		err = -EINVAL;
+		break;
+	}
+	return err;
+}
+
+static inline int cmdsn_cmp(uint32_t sn1, uint32_t sn2)
+{
+	if (sn1 == sn2)
+		return 0;
+
+	return ((int32_t)sn1 - (int32_t)sn2) > 0 ? 1 : -1;
+}
+
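For reference, cmdsn_cmp() implements serial-number arithmetic (the same idea
as RFC 1982): the signed difference of the two 32-bit CmdSNs decides the
order, so the comparison stays correct across wraparound as long as the
distance is far below 2^31, which ISER_MAX_QUEUE_CMD guarantees here. A
minimal standalone check of these semantics (illustrative values only):

  #include <assert.h>
  #include <stdint.h>

  static int cmdsn_cmp(uint32_t sn1, uint32_t sn2)
  {
          if (sn1 == sn2)
                  return 0;
          return ((int32_t)sn1 - (int32_t)sn2) > 0 ? 1 : -1;
  }

  int main(void)
  {
          assert(cmdsn_cmp(11, 10) == 1);   /* plain ordering */
          assert(cmdsn_cmp(10, 11) == -1);
          /* 0x00000002 follows 0xfffffffe across the 32-bit wrap */
          assert(cmdsn_cmp(0x00000002, 0xfffffffe) == 1);
          return 0;
  }

Strictly speaking, subtracting the casted values can overflow for distances
near 2^31; computing (int32_t)(sn1 - sn2) avoids that corner case, but the
command windows involved here never come close to it.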
+// queues the task according to cmd-sn, no exec here
+static int iser_queue_task(struct iscsi_session *session,
+			   struct iser_task *task)
+{
+	struct iscsi_hdr *req_bhs = (struct iscsi_hdr *) task->req.bhs;
+	int is_immediate = req_bhs->opcode & ISCSI_OP_IMMEDIATE;
+	/* in request PDUs, cmdsn occupies the statsn offset of struct iscsi_hdr */
+	uint32_t cmd_sn = be32_to_cpu(req_bhs->statsn);
+	struct list_head *cmp_entry;
+	int err;
+
+	if (is_immediate) {
+		dprintf("exec imm task task:%p cmd_sn:0x%x\n", task, cmd_sn);
+		err = iser_task_delivery(task);
+		if (!err || err == -ENOMEM)
+			return 0;
+		else
+			return err;
+	}
+
+	/* if the current command is the expected one, exec it
+	   and all others possibly accumulated on the queue */
+	while (session->exp_cmd_sn == cmd_sn) {
+		session->exp_cmd_sn++;
+		dprintf("exec task:%p cmd_sn:0x%x\n", task, cmd_sn);
+		err = iser_task_delivery(task);
+		if (err && err != -ENOMEM) {
+			// when no free buffers remain, the task will wait
+			// on queue, so it is not a real error, but we should
+			// not attempt to start other tasks until more
+			// memory becomes available
+			return err; // ToDo: what if there are more tasks in case of error
+		}
+
+		if (list_empty(&session->pending_cmd_list))
+			return 0;
+
+		task = list_first_entry(&session->pending_cmd_list,
+					struct iser_task, exec_list);
+		list_del(&task->exec_list);
+		clear_task_pending(task);
+		cmd_sn = be32_to_cpu(task->req.bhs->statsn);
+	}
+
+	/* cmd_sn > (exp_cmd_sn+ISER_MAX_QUEUE_CMD), i.e. beyond allowed window */
+	if (cmdsn_cmp(cmd_sn, session->exp_cmd_sn+ISER_MAX_QUEUE_CMD) == 1) {
+		eprintf("unexpected cmd_sn:0x%x, max:0x%x\n",
+			cmd_sn, session->exp_cmd_sn+ISER_MAX_QUEUE_CMD);
+		return -EINVAL;
+	}
+
+	/* insert the current task, ordered by cmd_sn */
+	list_for_each_prev(cmp_entry, &session->pending_cmd_list) {
+		struct iser_task *cmp_task;
+		uint32_t cmp_cmd_sn;
+		int cmp_res;
+
+		cmp_task = list_entry(cmp_entry, struct iser_task, exec_list);
+		cmp_cmd_sn = be32_to_cpu(cmp_task->req.bhs->statsn);
+		cmp_res = cmdsn_cmp(cmd_sn, cmp_cmd_sn);
+		if (cmp_res == 1) { /* cmd_sn > cmp_cmd_sn */
+			dprintf("inserted cmdsn:0x%x after cmdsn:0x%x\n",
+				cmd_sn, cmp_cmd_sn);
+			break;
+		}
+		else if (cmp_res == -1) { /* cmd_sn < cmp_cmd_sn */
+			dprintf("inserting cmdsn:0x%x skip cmdsn:0x%x\n",
+				cmd_sn, cmp_cmd_sn);
+			continue;
+		}
+		else { /* cmd_sn == cmp_cmd_sn */
+			eprintf("duplicate cmd_sn:0x%x, exp:%u\n",
+				cmd_sn, session->exp_cmd_sn);
+			return -EINVAL;
+		}
+	}
+	list_add(&task->exec_list, cmp_entry);
+	set_task_pending(task);
+	return 0;
+}
+
+static int iser_parse_req_headers(struct iser_task *task)
+{
+	struct iser_hdr *iser_hdr = task->req.iser_hdr;
+	struct iscsi_hdr *iscsi_hdr = task->req.bhs;
+	int err = -1;
+
+	switch (iser_hdr->flags & 0xF0) {
+	case ISCSI_CTRL:
+		if (iser_hdr->flags & ISER_RSV) {
+			task->rem_read_stag =
+				be32_to_cpu(iser_hdr->read_stag);
+			task->rem_read_va = be64_to_cpu(iser_hdr->read_va);
+			dprintf("task:%p rstag:0x%x va:0x%lx\n", task,
+				task->rem_read_stag, task->rem_read_va);
+		}
+		if (iser_hdr->flags & ISER_WSV) {
+			task->rem_write_stag =
+				be32_to_cpu(iser_hdr->write_stag);
+			task->rem_write_va =
+				be64_to_cpu(iser_hdr->write_va);
+			dprintf("task:%p wstag:0x%x va:0x%lx\n", task,
+				task->rem_write_stag, task->rem_write_va);
+		}
+		err = 0;
+		break;
+	case ISER_HELLO:
+		dprintf("iSER Hello message??\n");
+		break;
+	default:
+		eprintf("malformed iser iser_hdr, flags 0x%02x\n",
+			iser_hdr->flags);
+		break;
+	}
+
+	task->opcode = iscsi_hdr->opcode & ISCSI_OPCODE_MASK;
+	task->req.ahssize = iscsi_hdr->hlength * 4;
+	task->req.data.buf += task->req.ahssize;
+	task->req.data.size = ntoh24(iscsi_hdr->dlength);
+	task->req.data.rdma = 0;
+
+	return err;
+}
+
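The remote stag/VA pairs captured above are what the RDMA work requests are
later built from; iser_prep_rdma_rd_send_req() does the real work in this
patch. As a hedged sketch only, the essential ibv_send_wr fields for the
RDMA-Read that fetches solicited write data look roughly like this
(fill_rdma_rd_wr, lbuf and lkey are illustrative names, not part of the
patch; struct iser_task is from iser.h):

  #include <string.h>
  #include <stdint.h>
  #include <infiniband/verbs.h>

  static void fill_rdma_rd_wr(struct ibv_send_wr *wr, struct ibv_sge *sge,
                              struct iser_task *task,
                              void *lbuf, uint32_t lkey, uint32_t len)
  {
          sge->addr = (uintptr_t) lbuf;   /* registered local buffer */
          sge->length = len;
          sge->lkey = lkey;

          memset(wr, 0, sizeof(*wr));
          wr->opcode = IBV_WR_RDMA_READ;
          wr->send_flags = IBV_SEND_SIGNALED;
          wr->sg_list = sge;
          wr->num_sge = 1;
          /* the buffer the initiator advertised under ISER_RSV */
          wr->wr.rdma.rkey = task->rem_read_stag;
          wr->wr.rdma.remote_addr = task->rem_read_va;
  }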
+static void iser_rx_handler(struct iser_work_req *rxd)
+{
+	struct iser_task *task = rxd->task;
+	struct iser_conn *conn = task->conn;
+	int queue_task = 1;
+	int err = 0;
+
+	//--conn->recvl_posted;
+	iser_conn_put(conn);
+
+	if (conn->h.state == STATE_CLOSE)
+		return;
+
+	conn->h.exp_stat_sn = be32_to_cpu(task->req.bhs->exp_statsn);
+	task->tag = task->req.bhs->itt;
+	task->is_read = task->is_write = 0;
+
+	err = iser_parse_req_headers(task);
+	if (err)
+		goto out;
+
+	INIT_LIST_HEAD(&task->in_buf_list);
+	task->in_buf_num = 0;
+
+	INIT_LIST_HEAD(&task->out_buf_list);
+	task->out_buf_num = 0;
+
+	switch (task->opcode) {
+	case ISCSI_OP_LOGIN:
+		dprintf("login rx\n");
+		iser_login_rx(task);
+		queue_task = 0;
+		break;
+	case ISCSI_OP_SCSI_CMD:
+		err = iser_scsi_cmd_rx(task);
+		break;
+	case ISCSI_OP_SCSI_DATA_OUT:
+		err = iser_data_out_rx(task);
+		queue_task = 0;
+		break;
+	case ISCSI_OP_NOOP_OUT:
+		err = iser_nop_out_rx(task);
+		break;
+	case ISCSI_OP_LOGOUT:
+		dprintf("logout rx\n");
+		break;
+	case ISCSI_OP_SCSI_TMFUNC:
+		dprintf("tmfunc rx\n");
+		break;
+	case ISCSI_OP_TEXT:
+		dprintf("text rx\n");
+		break;
+	case ISCSI_OP_SNACK:
+		eprintf("Cannot handle SNACK yet\n");
+		err = -EINVAL;
+		break;
+	default:
+		eprintf("Unknown op 0x%x\n", task->opcode);
+		err = -EINVAL;
+		break;
+	}
+	if (!err && queue_task)
+		err = iser_queue_task(conn->h.session, task);
+out:
+	if (err) {
+		eprintf("task:%p conn:%p err:%d\n", task, &conn->h, err);
+		iser_conn_close(conn);
+	}
+}
+
+/*
+static void iser_send_chain_complete(struct iser_work_req *txd)
+{
+	struct iser_task *task = txd->task;
+
+	for (;;) {
+		if (txd->send_wr.opcode == IBV_WR_SEND) {
+			iser_complete_task(task);
+			iser_post_recv(task->conn, task);
+		}
+		else if (txd->send_wr.opcode != IBV_WR_RDMA_WRITE) {
+			eprintf("Unexpected send_wr.opcode:0x%x wr:%p\n",
+				txd->send_wr.opcode, &txd->send_wr);
+			exit(1);
+		}
+		if (!txd->send_wr.next)
+			break;
+
+		txd = container_of(txd->send_wr.next, struct iser_work_req,
+				   send_wr);
+		task = txd->task;
+	}
+}
+*/
+
+static void iser_tx_complete_handler(struct iser_work_req *txd)
+{
+	struct iser_task *task = txd->task;
+	struct iser_conn *conn = task->conn;
+
+	dprintf("task:%p tag:0x%04lx\n", task, (unsigned long)task->tag);
+
+	list_del(&task->tx_list); /* remove from conn->sent_list */
+	if (!task->unsolicited)
+		iser_complete_task(task);
+	else {
+		conn->nop_in_task = task;
+		return;
+	}
+
+	if (conn->h.state == STATE_CLOSE)
+		return;
+
+	if (conn->login_phase == LOGIN_PHASE_FF)
+		schedule_post_recv(task, conn);
+	else if (conn->login_phase == LOGIN_PHASE_LAST_SEND) {
+		dprintf("last login send completed, release, to ff\n");
+		iser_free_login_resources(conn);
+		conn->login_phase = LOGIN_PHASE_FF;
+		conn->h.state = STATE_FULL;
+	}
+}
+
+static void iser_rdma_wr_complete_handler(struct iser_work_req *rdmad)
+{
+	struct iser_task *task = rdmad->task;
+	struct iser_conn *conn = task->conn;
+
+	/* not expected here: rdma-wr is posted unsignaled (see
+	   iser_sched_tx), so treat a completion as an internal error */
+	eprintf("task:%p tag:0x%04lx\n", task, (unsigned long)task->tag);
+	exit(1);
+
+	if (conn->h.state == STATE_CLOSE)
+		return;
+
+	//iser_send_chain_complete(rdmad);
+}
+
+/*
+ * Called from CQ processing. Hands completed write data to iscsi.
+ */
+static void iser_rdma_rd_complete_handler(struct iser_work_req *rdmad)
+{
+	struct iser_task *task = rdmad->task;
+	struct iser_conn *conn = task->conn;
+
+	// ToDo: need iser_complete_task?
+
+	if (conn->h.state == STATE_CLOSE)
+		return;
+
+	task->rdma_rd_remains -= rdmad->sge.length;
+	dprintf("task:%p tag:0x%04lx, rems rdma:%d unsol:%d\n",
+		task, (unsigned long)task->tag, task->rdma_rd_remains,
+		task->unsol_remains);
+	if (task->rdma_rd_remains == 0 && task->unsol_remains == 0)
+		schedule_task_iosubmit(task, conn);
+}
+
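handle_wc() below recovers the posting-time context from the 64-bit wr_id.
The ptr_from_int64()/ptr_to_int64() helpers are not shown in this patch and
are assumed to be plain casts, along these lines:

  #include <stdint.h>

  static inline uint64_t ptr_to_int64(void *p)
  {
          return (uint64_t) (uintptr_t) p;
  }

  static inline void *ptr_from_int64(uint64_t v)
  {
          return (void *) (uintptr_t) v;
  }

Presumably every posted receive carries &task->rxd and every send/RDMA op
carries &task->txd or &task->rdmad in its wr_id, which is why all of the
completion paths can start from a struct iser_work_req pointer.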
+/*
+ * Deal with just one work completion.
+ */
+static void handle_wc(struct ibv_wc *wc)
+{
+	void *wr_id = ptr_from_int64(wc->wr_id);
+
+	switch (wc->opcode) {
+	case IBV_WC_SEND:
+		dprintf("tx complete, wr_id:%p len:%d\n", wr_id, wc->byte_len);
+		iser_tx_complete_handler(wr_id);
+		break;
+	case IBV_WC_RECV:
+		dprintf("rx cmd, wr_id:%p len:%d\n", wr_id, wc->byte_len);
+		iser_rx_handler(wr_id);
+		break;
+	case IBV_WC_RDMA_WRITE:
+		dprintf("rdma write done, wr_id:%p len:%d\n", wr_id, wc->byte_len);
+		iser_rdma_wr_complete_handler(wr_id);
+		break;
+	case IBV_WC_RDMA_READ:
+		dprintf("rdma read done, wr_id:%p len:%d\n", wr_id, wc->byte_len);
+		iser_rdma_rd_complete_handler(wr_id);
+		break;
+	default:
+		eprintf("unexpected opcode %d\n", wc->opcode);
+		exit(1);
+	}
+}
+
+static void handle_wc_flush(struct ibv_wc *wc)
+{
+	struct iser_work_req *wr = ptr_from_int64(wc->wr_id);
+	struct iser_task *task = wr->task;
+	struct iser_conn *conn = task->conn;
+
+	dprintf("conn:%p wr_id:0x%llx task:%p\n",
+		&conn->h, (unsigned long long) wc->wr_id, task);
+
+	if (conn->h.state == STATE_CLOSE) {
+		//--conn->recvl_posted;
+		iser_conn_put(conn);
+	} else {
+		eprintf("conn %p wr flush err\n", conn);
+		/* call disconnect now? */
+	}
+}
+
+static void handle_wc_error(struct ibv_wc *wc)
+{
+	struct iser_work_req *wr = ptr_from_int64(wc->wr_id);
+	struct iser_task *task = wr->task;
+	struct iser_conn *conn = task->conn;
+	int complete = 1;
+	char *op_str;
+
+	if (wr->recv) {
+		op_str = "recv";
+		complete = 0;
+	} else {
+		/* send_wr.opcode holds ibv_wr_opcode values (IBV_WR_*),
+		   not the ibv_wc_opcode (IBV_WC_*) ones */
+		switch (wr->send_wr.opcode) {
+		case IBV_WR_SEND:
+			op_str = "send";
+			break;
+		case IBV_WR_RDMA_WRITE:
+			op_str = "rdma_wr";
+			break;
+		case IBV_WR_RDMA_READ:
+			op_str = "rdma_rd";
+			break;
+		default:
+			op_str = "Unknown";
+			complete = 0;
+			break;
+		}
+	}
+	eprintf("conn:%p task:%p tag:0x%04lx op:%s WC status:%d wr_id:0x%llx\n",
+		&conn->h, task, (unsigned long)task->tag, op_str, wc->status,
+		(unsigned long long) wc->wr_id);
+
+	if (complete)
+		iser_complete_task(task);
+	iser_conn_close(conn);
+}
+
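iser_poll_cq() below deliberately polls one completion per ibv_poll_cq() call
and caps the batch at max_wc, so that tx progress can be scheduled in between.
For comparison, a hedged sketch of the batched variant (WC_BATCH and the
budget parameter are illustrative, not part of the patch):

  #include <infiniband/verbs.h>

  #define WC_BATCH 16

  /* drain up to "budget" completions, WC_BATCH per verbs call;
     returns the number handled, or a negative poll error */
  static int drain_cq(struct ibv_cq *cq, int budget)
  {
          struct ibv_wc wc[WC_BATCH];
          int n, i, total = 0;

          while (total < budget) {
                  n = ibv_poll_cq(cq, WC_BATCH, wc);
                  if (n < 0)
                          return n;       /* poll error */
                  if (n == 0)
                          break;          /* cq empty */
                  for (i = 0; i < n; i++) {
                          /* dispatch wc[i] as handle_wc() /
                             handle_wc_error() above do */
                  }
                  total += n;
          }
          return total;
  }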
+/*
+ * Could read as many entries as possible without blocking, but
+ * that just fills up a list of tasks. Instead pop out of here
+ * so that tx progress, like issuing rdma reads and writes, can
+ * happen periodically.
+ */
+static int iser_poll_cq(struct iser_device *dev, int max_wc)
+{
+	int err = 0, numwc = 0;
+	struct ibv_wc wc;
+
+	for (;;) {
+		err = ibv_poll_cq(dev->cq, 1, &wc);
+		if (err < 0) {
+			eprintf("ibv_poll_cq %d\n", err);
+			break;
+		} else if (err == 0) {
+			break;
+		}
+
+		VALGRIND_MAKE_MEM_DEFINED(&wc, sizeof(wc));
+		if (wc.status == IBV_WC_SUCCESS) {
+			handle_wc(&wc);
+			if (++numwc == max_wc) {
+				err = 1;
+				break;
+			}
+		} else if (wc.status == IBV_WC_WR_FLUSH_ERR)
+			handle_wc_flush(&wc);
+		else
+			handle_wc_error(&wc);
+	}
+	return err;
+}
+
+static int num_delayed_arm = 0;
+#define MAX_NUM_DELAYED_ARM 8
+
+static void iser_poll_cq_armable(struct iser_device *dev)
+{
+	int err;
+
+	err = iser_poll_cq(dev, MAX_POLL_WC);
+	if (err < 0)
+		exit(1);
+
+	if (err == 0) {
+		if (++num_delayed_arm == MAX_NUM_DELAYED_ARM) {
+			num_delayed_arm = 0;
+			/* no more completions on cq, arm the completion interrupts */
+			err = ibv_req_notify_cq(dev->cq, 0);
+			if (err) {
+				eprintf("ibv_req_notify_cq: %s\n", strerror(err));
+				exit(1);
+			}
+			dev->poll_sched.sched_handler = iser_sched_consume_cq;
+		}
+		else
+			dev->poll_sched.sched_handler = iser_sched_poll_cq;
+	} else
+		dev->poll_sched.sched_handler = iser_sched_poll_cq;
+
+	tgt_add_sched_event(&dev->poll_sched);
+}
+
+/* Scheduled to poll cq after a completion event has been
+   received and acknowledged, if no more completions are found
+   the interrupts are re-armed */
+static void iser_sched_poll_cq(struct event_data *tev)
+{
+	struct iser_device *dev = tev->data;
+	iser_poll_cq_armable(dev);
+}
+
+/* Scheduled to consume completion events that could arrive
+   after the cq had been seen empty but just before
+   the notification interrupts were re-armed.
+   Intended to consume those remaining completions only,
+   this function does not re-arm interrupts. */
+static void iser_sched_consume_cq(struct event_data *tev)
+{
+	struct iser_device *dev = tev->data;
+	int err;
+
+	err = iser_poll_cq(dev, MAX_POLL_WC);
+	if (err < 0)
+		exit(1);
+}
+
+/*
+ * Called from main event loop when a CQ notification is available.
+ */
+static void iser_cqe_handler(int fd __attribute__ ((unused)),
+			     int events __attribute__ ((unused)),
+			     void *data)
+{
+	struct iser_device *dev = data;
+	void *cq_context;
+	int err;
+
+	err = ibv_get_cq_event(dev->cq_channel, &dev->cq, &cq_context);
+	if (err != 0) {
+		eprintf("notification, but no CQ event\n");
+		exit(1);
+	}
+
+	ibv_ack_cq_events(dev->cq, 1);
+
+	/* if a poll was previously scheduled, remove it,
+	   as it will be scheduled when necessary */
+	if (dev->poll_sched.scheduled)
+		tgt_remove_sched_event(&dev->poll_sched);
+
+	iser_poll_cq_armable(dev);
+}
+
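The poll/re-arm dance above exists to close the classic race between seeing
the CQ empty and enabling notifications: a completion that lands in that
window would otherwise generate no event. Condensed into one function
(reusing drain_cq() from the sketch above, with an illustrative budget), the
canonical handshake is:

  static int cq_event_once(struct ibv_comp_channel *ch)
  {
          struct ibv_cq *cq;
          void *cq_ctx;

          if (ibv_get_cq_event(ch, &cq, &cq_ctx))
                  return -1;
          ibv_ack_cq_events(cq, 1);       /* must balance every event */

          if (ibv_req_notify_cq(cq, 0))   /* re-arm first ... */
                  return -1;

          /* ... then poll again: completions that slipped in between
             the last empty poll and the re-arm are picked up here */
          return drain_cq(cq, 64);
  }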
+/*
+ * Init entire iscsi transport. Begin listening for connections.
+ */
+static int iser_ib_init(void)
+{
+	int err;
+	struct sockaddr_in sock_addr;
+	short int port = iser_listen_port;
+
+	rdma_evt_channel = rdma_create_event_channel();
+	if (!rdma_evt_channel) {
+		eprintf("Failed to initialize RDMA; load kernel modules?\n");
+		return -1;
+	}
+
+	err = rdma_create_id(rdma_evt_channel, &cma_listen_id, NULL,
+			     RDMA_PS_TCP);
+	if (err) {
+		eprintf("rdma_create_id: %s\n", strerror(err));
+		return -1;
+	}
+
+	memset(&sock_addr, 0, sizeof(sock_addr));
+	sock_addr.sin_family = AF_INET;
+	sock_addr.sin_port = htons(port);
+	sock_addr.sin_addr.s_addr = INADDR_ANY;
+	err = rdma_bind_addr(cma_listen_id, (struct sockaddr *) &sock_addr);
+	if (err) {
+		if (err == -1)
+			eprintf("rdma_bind_addr -1: %m\n");
+		else
+			eprintf("rdma_bind_addr: %s\n", strerror(-err));
+		return -1;
+	}
+
+	/* 0 == maximum backlog */
+	err = rdma_listen(cma_listen_id, 0);
+	if (err) {
+		if (err == -1)
+			eprintf("rdma_listen -1: %m\n");
+		else
+			eprintf("rdma_listen: %s\n", strerror(-err));
+		return -1;
+	}
+
+	dprintf("listening for iser connections on port %d\n", port);
+	err = tgt_event_add(cma_listen_id->channel->fd, EPOLLIN,
+			    iser_handle_rdmacm, NULL);
+	if (err)
+		return err;
+
+	INIT_LIST_HEAD(&iser_dev_list);
+	INIT_LIST_HEAD(&iser_conn_list);
+	INIT_LIST_HEAD(&temp_conn);
+
+	return err;
+}
+
+#include <signal.h>
+#include <sys/time.h>
+
+#define ISER_TIMER_INT_SEC	5
+static struct itimerval iser_timer =
+	{{ISER_TIMER_INT_SEC, 0},
+	 {ISER_TIMER_INT_SEC, 0}};
+static int timer_fd[2] = {0, 0};
+static unsigned int ntimer = 0;
+
+static void iser_timer_sig_handler(int data)
+{
+	int err;
+
+	// ntimer++;
+	err = write(timer_fd[1], &ntimer, sizeof(ntimer));
+	if (err < 0) {
+		eprintf("Failed to write to pipe, %m\n");
+		return;
+	}
+}
+
+static void iser_timer_evt_handler(int fd, int events, void *data)
+{
+	struct iser_conn *conn;
+	struct iser_task *task;
+	unsigned int n;
+	int err;
+
+	err = read(timer_fd[0], &n, sizeof(n));
+	if (err < 0) {
+		eprintf("Failed to read from pipe, %m\n");
+		return;
+	}
+
+	list_for_each_entry(conn, &iser_conn_list, conn_list) {
+		if (conn->h.state != STATE_FULL)
+			continue;
+		task = conn->nop_in_task;
+		if (!task)
+			continue;
+		conn->nop_in_task = NULL;
+		iser_send_ping_nop_in(task);
+	}
+}
+
+static int iser_start_timer(void)
+{
+	struct sigaction s;
+	int err;
+
+	sigemptyset(&s.sa_mask);
+	sigaddset(&s.sa_mask, SIGALRM);
+	s.sa_flags = 0;
+	s.sa_handler = iser_timer_sig_handler;
+	err = sigaction(SIGALRM, &s, NULL);
+	if (err) {
+		eprintf("Failed to setup timer handler\n");
+		return err;
+	}
+
+	err = setitimer(ITIMER_REAL, &iser_timer, 0);
+	if (err) {
+		eprintf("Failed to set timer\n");
+		return err;
+	}
+
+	err = pipe(timer_fd);
+	if (err) {
+		eprintf("Failed to open timer pipe\n");
+		return err;
+	}
+
+	err = tgt_event_add(timer_fd[0], EPOLLIN, iser_timer_evt_handler, NULL);
+	if (err) {
+		eprintf("failed to add iser timer epoll event\n");
+		return err;
+	}
+
+	return 0;
+}
+
+static int iser_stop_timer(void)
+{
+	int err;
+
+	tgt_event_del(timer_fd[0]);
+
+	if (timer_fd[0] > 0)
+		close(timer_fd[0]);
+	if (timer_fd[1] > 0)
+		close(timer_fd[1]);
+
+	err = setitimer(ITIMER_REAL, 0, 0);
+	if (err)
+		eprintf("Failed to stop timer\n");
+
+	return err;
+}
+
+static int iser_init(int index, char *args)
+{
+	int err;
+
+	err = iser_ib_init();
+	if (err)
+		return err;
+
+	/* err = iser_start_timer();
+	if (err)
+		return err;
+	*/
+
+	return 0;
+}
+
+static void iser_exit(void)
+{
+	iser_stop_timer();
+}
+
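The SIGALRM-plus-pipe scheme above is the portable self-pipe pattern. On
Linux-only builds a timerfd would fold the NOP-In timer straight into the
tgtd event loop without signals; a hedged alternative sketch, not part of
this patch:

  #include <sys/timerfd.h>
  #include <string.h>
  #include <unistd.h>

  /* returns the timer fd to hand to tgt_event_add(), or -1 */
  static int start_nop_timer(int interval_sec)
  {
          struct itimerspec its;
          int fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);

          if (fd < 0)
                  return -1;

          memset(&its, 0, sizeof(its));
          its.it_interval.tv_sec = interval_sec;
          its.it_value.tv_sec = interval_sec;
          if (timerfd_settime(fd, 0, &its, NULL) < 0) {
                  close(fd);
                  return -1;
          }
          /* the event handler then read()s a uint64_t expiration count */
          return fd;
  }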
+static int iser_target_create(struct target *t)
+{
+	struct iscsi_target *target;
+	int err;
+
+	err = iscsi_target_create(t);
+	if (err)
+		return err;
+
+	target = target_find_by_id(t->tid);
+	assert(target != NULL);
+
+	/* force RDMA-only mode: InitialR2T=Yes, ImmediateData=No */
+	target->rdma = 1;
+	target->session_param[ISCSI_PARAM_INITIAL_R2T_EN].val = 1;
+	target->session_param[ISCSI_PARAM_IMM_DATA_EN].val = 0;
+
+	return 0;
+}
+
+static const char *lld_param_port = "port";
+
+static int iser_param_parser(char *p)
+{
+	char *q;
+
+	while (*p) {
+		if (!strncmp(p, lld_param_port, strlen(lld_param_port))) {
+			q = p + strlen(lld_param_port) + 1;
+			iser_listen_port = atoi(q);
+			eprintf("iser listen port:%d\n", iser_listen_port);
+		}
+
+		p += strcspn(p, ",");
+		if (*p == ',')
+			++p;
+	}
+	return 0;
+}
+
+static struct tgt_driver iser = {
+	.name = "iser",
+	.use_kernel = 0,
+	.init = iser_init,
+	.exit = iser_exit,
+	.target_create = iser_target_create,
+	.target_destroy = iscsi_target_destroy,
+
+	.update = iscsi_target_update,
+	.show = iscsi_target_show,
+	.cmd_end_notify = iser_scsi_cmd_done,
+	.mgmt_end_notify = iser_tm_done,
+	.transportid = iscsi_transportid,
+	.default_bst = "rdwr",
+};
+
+__attribute__ ((constructor))
+static void iser_driver_constructor(void)
+{
+	register_driver(&iser);
+
+	setup_param("iser", iser_param_parser);
+}
+
-- 
1.6.5