[stgt] [PATCH 2/4] iser, new implementation

Alexander Nezhinsky alexandern at Voltaire.COM
Wed Oct 13 15:31:37 CEST 2010


Signed-off-by: Alexander Nezhinsky <alexandern at voltaire.com>
---
 usr/iscsi/iser.c | 3526 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 3526 insertions(+), 0 deletions(-)
 create mode 100644 usr/iscsi/iser.c

diff --git a/usr/iscsi/iser.c b/usr/iscsi/iser.c
new file mode 100644
index 0000000..f742fb2
--- /dev/null
+++ b/usr/iscsi/iser.c
@@ -0,0 +1,3526 @@
+/*
+ * iSCSI extensions for RDMA (iSER) data path
+ *
+ * Copyright (C) 2007 Dennis Dalessandro (dennis at osc.edu)
+ * Copyright (C) 2007 Ananth Devulapalli (ananth at osc.edu)
+ * Copyright (C) 2007 Pete Wyckoff (pw at osc.edu)
+ * Copyright (C) 2010 Voltaire, Inc. All rights reserved.
+ * Copyright (C) 2010 Alexander Nezhinsky (alexandern at voltaire.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <assert.h>
+#include <netdb.h>
+#include <sys/epoll.h>
+#include <infiniband/verbs.h>
+#include <rdma/rdma_cma.h>
+
+#include "util.h"
+#include "iscsid.h"
+#include "target.h"
+#include "iser.h"
+#include "driver.h"
+#include "scsi.h"
+
+#if defined(HAVE_VALGRIND) && !defined(NDEBUG)
+#include <valgrind/memcheck.h>
+#else
+#define VALGRIND_MAKE_MEM_DEFINED(addr, len)
+#endif
+
+static struct iscsi_transport iscsi_iser;
+
+/* global, across all devices */
+static struct rdma_event_channel *rdma_evt_channel;
+static struct rdma_cm_id *cma_listen_id;
+
+/* accepted at RDMA layer, but not yet established */
+static struct list_head temp_conn;
+
+/* all devices */
+static struct list_head iser_dev_list;
+
+/* all iser connections */
+static struct list_head iser_conn_list;
+
+/*
+ * Pointer <-> 64-bit integer conversions (e.g. for wr_id and sge.addr
+ * values).  The expansions are fully parenthesized so that they compose
+ * safely inside larger expressions.
+ */
+#define uint64_from_ptr(p) ((uint64_t)(uintptr_t)(p))
+#define ptr_from_int64(p) ((void *)(unsigned long)(p))
+
+#define ISER_LISTEN_PORT 3260
+
+short int iser_listen_port = ISER_LISTEN_PORT;
+char *iser_portal_addr;
+
+/*
+ * Crazy hard-coded linux iser settings need 128 * 8 slots + slop, plus
+ * room for our rdmas and send requests.
+ */
+#define MAX_WQE 1800
+
+#define RDMA_TRANSFER_SIZE (512 * 1024)
+
+#define MAX_POLL_WC 32
+
+#define ISER_MAX_QUEUE_CMD      128     /* iSCSI cmd window size */
+
+/*
+ * Alignment helpers: MASK_BY_BIT(b) has the low b bits set,
+ * ALIGN_TO_BIT(x, b) rounds x up to a multiple of 2^b.
+ * Arguments are parenthesized so compound expressions expand correctly.
+ */
+#define MASK_BY_BIT(b)		((1UL << (b)) - 1)
+#define ALIGN_TO_BIT(x, b)	((((unsigned long)(x)) + MASK_BY_BIT(b)) & \
+				 ~MASK_BY_BIT(b))
+#define ALIGN_TO_32(x)		ALIGN_TO_BIT((x), 5)
+
+/*
+ * Sense data layout: a 16-bit length field followed directly by the
+ * data bytes (declared as a zero-length trailing array).
+ * NOTE(review): byte order of 'length' is not visible here - confirm at users.
+ */
+struct iscsi_sense_data {
+	uint16_t length;
+	uint8_t data[0];
+} __packed;
+
+/*
+ * Number of allocatable data buffers, each of this size.  Do at least 128
+ * for linux iser.  The membuf size is rounded up at initialization time
+ * to the hardware page size so that allocations for direct IO devices are
+ * aligned.
+ */
+static int membuf_num = 16 * ISER_MAX_QUEUE_CMD;
+static size_t membuf_size = RDMA_TRANSFER_SIZE;
+
+static int iser_conn_get(struct iser_conn *conn);
+static int iser_conn_getn(struct iser_conn *conn, int n);
+static void iser_conn_put(struct iser_conn *conn);
+
+static void iser_free_login_resources(struct iser_conn *conn);
+static void iser_free_ff_resources(struct iser_conn *conn);
+
+static void iser_handle_cq_event(int fd __attribute__ ((unused)),
+				 int events __attribute__ ((unused)),
+				 void *data);
+
+static void iser_handle_async_event(int fd __attribute__ ((unused)),
+				    int events __attribute__ ((unused)),
+				    void *data);
+
+static void iser_sched_poll_cq(struct event_data *tev);
+static void iser_sched_consume_cq(struct event_data *tev);
+static void iser_sched_tx(struct event_data *evt);
+static void iser_sched_post_recv(struct event_data *evt);
+static void iser_sched_buf_alloc(struct event_data *evt);
+static void iser_sched_iosubmit(struct event_data *evt);
+static void iser_sched_rdma_rd(struct event_data *evt);
+
+static void iser_login_rx(struct iser_task *task);
+
+static int iser_device_init(struct iser_device *dev);
+
+/*
+ * Translate an iser work-request opcode into a short printable name.
+ * Any value not listed explicitly is reported as "Unknown".
+ */
+static char *iser_ib_op_to_str(enum iser_ib_op_code iser_ib_op)
+{
+	switch (iser_ib_op) {
+	case ISER_IB_RECV:
+		return "recv";
+	case ISER_IB_SEND:
+		return "send";
+	case ISER_IB_RDMA_WRITE:
+		return "rdma_wr";
+	case ISER_IB_RDMA_READ:
+		return "rdma_rd";
+	default:
+		return "Unknown";
+	}
+}
+
+/*
+ * Lay a pdu descriptor over a raw buffer: the iser header comes first,
+ * the iSCSI basic header follows it, and both the AHS pointer and the
+ * data membuf start right after the basic header, with zero sizes.
+ */
+static void iser_pdu_init(struct iser_pdu *pdu, void *buf)
+{
+	void *bhs_pos = buf + sizeof(struct iser_hdr);
+	void *payload_pos = bhs_pos + sizeof(struct iscsi_hdr);
+
+	pdu->iser_hdr = (struct iser_hdr *) buf;
+	pdu->bhs = (struct iscsi_hdr *) bhs_pos;
+
+	/* no AHS and no data yet; both point just past the headers */
+	pdu->ahs = payload_pos;
+	pdu->ahssize = 0;
+
+	pdu->membuf.addr = payload_pos;
+	pdu->membuf.size = 0;
+}
+
+/*
+ * Prepare a receive work request: a single-sge recv wr covering
+ * buf[0..size) with the lkey of srmr; wr_id carries the descriptor
+ * address.
+ */
+static void iser_rxd_init(struct iser_work_req *rxd,
+			  struct iser_task *task,
+			  void *buf, unsigned size,
+			  struct ibv_mr *srmr)
+{
+	struct ibv_sge *sge = &rxd->sge;
+	struct ibv_recv_wr *wr = &rxd->recv_wr;
+
+	rxd->iser_ib_op = ISER_IB_RECV;
+	rxd->task = task;
+
+	sge->lkey = srmr->lkey;
+	sge->addr = uint64_from_ptr(buf);
+	sge->length = size;
+
+	wr->next = NULL;
+	wr->sg_list = sge;
+	wr->num_sge = 1;
+	wr->wr_id = uint64_from_ptr(rxd);
+}
+
+/*
+ * Prepare a send work request: a single-sge, signaled IBV_WR_SEND
+ * covering buf[0..size) with the lkey of srmr; wr_id carries the
+ * descriptor address.
+ */
+static void iser_txd_init(struct iser_work_req *txd,
+			  struct iser_task *task,
+			  void *buf, unsigned size,
+			  struct ibv_mr *srmr)
+{
+	struct ibv_sge *sge = &txd->sge;
+	struct ibv_send_wr *wr = &txd->send_wr;
+
+	txd->iser_ib_op = ISER_IB_SEND;
+	txd->task = task;
+
+	sge->lkey = srmr->lkey;
+	sge->addr = uint64_from_ptr(buf);
+	sge->length = size;
+
+	wr->opcode = IBV_WR_SEND;
+	wr->send_flags = IBV_SEND_SIGNALED;
+	wr->sg_list = sge;
+	wr->num_sge = 1;
+	wr->next = NULL;
+	wr->wr_id = uint64_from_ptr(txd);
+
+	INIT_LIST_HEAD(&txd->wr_list);
+}
+
+/*
+ * Prepare the fixed part of an RDMA work request.  The fields that
+ * depend on the transfer are left for the caller to fill in before
+ * posting:
+ *   rdmad->iser_ib_op, rdmad->send_wr.opcode,
+ *   rdmad->sge.addr, rdmad->sge.length,
+ *   rdmad->send_wr.wr.rdma.(remote_addr,rkey)
+ */
+static void iser_rdmad_init(struct iser_work_req *rdmad,
+			    struct iser_task *task,
+			    struct ibv_mr *srmr)
+{
+	struct ibv_send_wr *wr = &rdmad->send_wr;
+
+	rdmad->task = task;
+	rdmad->sge.lkey = srmr->lkey;
+
+	wr->send_flags = IBV_SEND_SIGNALED;
+	wr->next = NULL;
+	wr->num_sge = 1;
+	wr->sg_list = &rdmad->sge;
+	wr->wr_id = uint64_from_ptr(rdmad);
+
+	INIT_LIST_HEAD(&rdmad->wr_list);
+}
+
+/*
+ * Initialize a regular task: bind it to conn, map its pdu onto pdu_buf,
+ * set up the rx and tx descriptors over that same buffer (lkey from
+ * srmr) and the rdma descriptor with the device's membuf mr, and reset
+ * all of its list heads.
+ */
+static void iser_task_init(struct iser_task *task,
+			   struct iser_conn *conn,
+			   void *pdu_buf,
+			   unsigned long buf_size,
+			   struct ibv_mr *srmr)
+{
+	task->unsolicited = 0;
+	task->conn = conn;
+
+	/* data buffer lists */
+	INIT_LIST_HEAD(&task->in_buf_list);
+	INIT_LIST_HEAD(&task->out_buf_list);
+
+	/* membership in the connection processing queues */
+	INIT_LIST_HEAD(&task->exec_list);
+	INIT_LIST_HEAD(&task->rdma_list);
+	INIT_LIST_HEAD(&task->tx_list);
+	INIT_LIST_HEAD(&task->recv_list);
+
+	INIT_LIST_HEAD(&task->session_list);
+	INIT_LIST_HEAD(&task->dout_task_list);
+
+	iser_pdu_init(&task->pdu, pdu_buf);
+
+	/* rx and tx descriptors share the same pdu buffer */
+	iser_rxd_init(&task->rxd, task, pdu_buf, buf_size, srmr);
+	iser_txd_init(&task->txd, task, pdu_buf, buf_size, srmr);
+	iser_rdmad_init(&task->rdmad, task, conn->dev->membuf_mr);
+}
+
+/*
+ * Initialize a tx-only ("unsolicited") task: only the pdu mapping and
+ * the send descriptor are set up; the rx/rdma descriptors and the list
+ * heads are not touched here.
+ */
+static void iser_unsolicited_task_init(struct iser_task *task,
+				       struct iser_conn *conn,
+				       void *pdu_data_buf,
+				       unsigned long buf_size,
+				       struct ibv_mr *srmr)
+{
+	task->unsolicited = 1;
+	task->conn = conn;
+
+	iser_pdu_init(&task->pdu, pdu_data_buf);
+	iser_txd_init(&task->txd, task, pdu_data_buf, buf_size, srmr);
+}
+
+/*
+ * Insert data_buf into the task's out-buffer list, keeping the list
+ * sorted by ascending offset: the buffer goes in front of the first
+ * entry with a larger offset, or at the tail if there is none.
+ */
+static void iser_task_add_out_pdu_buf(struct iser_task *task,
+				      struct iser_membuf *data_buf,
+				      unsigned int offset)
+{
+	struct iser_membuf *pos;
+
+	data_buf->offset = offset;
+	task->out_buf_num++;
+
+	/* iterating an empty list is a no-op, so no separate emptiness test */
+	list_for_each_entry(pos, &task->out_buf_list, task_list) {
+		if (offset >= pos->offset)
+			continue;
+
+		dprintf("task:%p offset:%d size:%d data_buf:%p add before:%p\n",
+			task, offset, data_buf->size, data_buf->addr, pos->addr);
+		list_add_tail(&data_buf->task_list, &pos->task_list);
+		return;
+	}
+
+	dprintf("task:%p offset:%d size:%d data_buf:%p add last\n",
+		task, offset, data_buf->size, data_buf->addr);
+	list_add_tail(&data_buf->task_list, &task->out_buf_list);
+}
+
+/*
+ * Append data_buf to the tail of the task's out-buffer list and record
+ * its offset; the list is not re-sorted.
+ */
+static void iser_task_add_out_rdma_buf(struct iser_task *task,
+				       struct iser_membuf *data_buf,
+				       unsigned int offset)
+{
+	dprintf("task:%p offset:%d size:%d data_buf:%p add last\n",
+		task, offset, data_buf->size, data_buf->addr);
+
+	data_buf->offset = offset;
+	list_add_tail(&data_buf->task_list, &task->out_buf_list);
+	task->out_buf_num++;
+}
+
+/* Unlink data_buf from the task's out-buffer list and drop the count. */
+static void iser_task_del_out_buf(struct iser_task *task,
+				  struct iser_membuf *data_buf)
+{
+	dprintf("task:%p offset:%d size:%d data_buf:%p\n",
+		task, data_buf->offset, data_buf->size, data_buf->addr);
+
+	task->out_buf_num--;
+	list_del(&data_buf->task_list);
+}
+
+/*
+ * Append data_buf to the tail of the task's in-buffer list and record
+ * its offset.
+ */
+static void iser_task_add_in_rdma_buf(struct iser_task *task,
+				      struct iser_membuf *data_buf,
+				      unsigned int offset)
+{
+	/* offset is logged in decimal, like the other buffer list helpers */
+	dprintf("task:%p offset:%u size:%d data_buf:%p add last\n",
+		task, offset, data_buf->size, data_buf->addr);
+	data_buf->offset = offset;
+	task->in_buf_num++;
+	list_add_tail(&data_buf->task_list, &task->in_buf_list);
+}
+
+/* Unlink data_buf from the task's in-buffer list and drop the count. */
+static void iser_task_del_in_buf(struct iser_task *task,
+				 struct iser_membuf *data_buf)
+{
+	dprintf("task:%p offset:%d size:%d data_buf:%p\n",
+		task, data_buf->offset, data_buf->size, data_buf->addr);
+
+	task->in_buf_num--;
+	list_del(&data_buf->task_list);
+}
+
+/*
+ * Finalize the task's response pdu and its send work request.
+ *
+ * Fills in the AHS/data lengths of the iSCSI header, sizes the sge to
+ * cover the headers, the AHS and the data, resets the iser header to a
+ * plain control header and chains the request to next_wr (may be NULL).
+ * 'signaled' selects whether a completion is requested for this wr.
+ */
+static void iser_prep_resp_send_req(struct iser_task *task,
+				    struct iser_work_req *next_wr,
+				    int signaled)
+{
+	struct iser_pdu *pdu = &task->pdu;
+	struct iser_hdr *iser_hdr = pdu->iser_hdr;
+	struct iscsi_hdr *bhs = pdu->bhs;
+	struct iser_work_req *txd = &task->txd;
+
+	/* hlength is stored as ahssize / 4 */
+	bhs->hlength = pdu->ahssize / 4;
+	hton24(bhs->dlength, pdu->membuf.size);
+
+	txd->sge.length = ISER_HDRS_SZ;
+	txd->sge.length += pdu->ahssize;
+	txd->sge.length += pdu->membuf.size;
+
+	memset(iser_hdr, 0, sizeof(*iser_hdr));
+	iser_hdr->flags = ISCSI_CTRL;
+
+	txd->send_wr.next = (next_wr ? &next_wr->send_wr : NULL);
+	txd->send_wr.send_flags = (signaled ? IBV_SEND_SIGNALED : 0);
+
+	/* every field is separated by a blank so the log line stays parseable */
+	dprintf("task:%p wr_id:0x%"PRIx64 " tag:0x%04"PRIx64 " "
+		"dtbuf:%p dtsz:%u ahs_sz:%u stat:0x%x statsn:0x%x expcmdsn:0x%x\n",
+		task, txd->send_wr.wr_id, task->tag,
+		pdu->membuf.addr, pdu->membuf.size, pdu->ahssize,
+		bhs->rsvd2[1], ntohl(bhs->statsn), ntohl(bhs->exp_statsn));
+}
+
+/*
+ * Set up the task's rdma descriptor for an RDMA-Write out of the first
+ * buffer on in_buf_list.
+ *
+ * The request covers at most one buffer: rdma_wr_remains is reduced by
+ * the amount scheduled here (to zero if everything fits).  The request
+ * is chained to next_wr (may be NULL); 'signaled' selects whether a
+ * completion is requested.
+ */
+static void iser_prep_rdma_wr_send_req(struct iser_task *task,
+				       struct iser_work_req *next_wr,
+				       int signaled)
+{
+	struct iser_work_req *rdmad = &task->rdmad;
+	struct iser_membuf *rdma_buf;
+	uint64_t offset = 0; /* ToDo: multiple RDMA-Write buffers */
+
+	rdmad->iser_ib_op = ISER_IB_RDMA_WRITE;
+
+	/* RDMA-Write buffer is the only one on the list */
+	/* ToDo: multiple RDMA-Write buffers, use rdma_buf->offset */
+	rdma_buf = list_first_entry(&task->in_buf_list, struct iser_membuf, task_list);
+	rdmad->sge.addr = uint64_from_ptr(rdma_buf->addr);
+
+	if (likely(task->rdma_wr_remains <= rdma_buf->size)) {
+		rdmad->sge.length = task->rdma_wr_remains;
+		task->rdma_wr_remains = 0;
+	} else {
+		rdmad->sge.length = rdma_buf->size;
+		task->rdma_wr_remains -= rdmad->sge.length;
+	}
+
+	rdmad->send_wr.next = (next_wr ? &next_wr->send_wr : NULL);
+	rdmad->send_wr.opcode = IBV_WR_RDMA_WRITE;
+	rdmad->send_wr.send_flags = (signaled ? IBV_SEND_SIGNALED : 0);
+
+	rdmad->send_wr.wr.rdma.remote_addr = task->rem_read_va + offset;
+	/* offset += rdmad->sge.length // ToDo: multiple RDMA-Write buffers */
+	rdmad->send_wr.wr.rdma.rkey = task->rem_read_stag;
+
+	/* every field is separated by a blank so the log line stays parseable */
+	dprintf("task:%p tag:0x%04"PRIx64 " wr_id:0x%"PRIx64 " daddr:0x%"PRIx64 " dsz:%u "
+		"bufsz:%u rdma:%d lkey:%x raddr:%"PRIx64 " rkey:%x rems:%u\n",
+		task, task->tag, rdmad->send_wr.wr_id, rdmad->sge.addr,
+		rdmad->sge.length, rdma_buf->size, rdma_buf->rdma,
+		rdmad->sge.lkey, rdmad->send_wr.wr.rdma.remote_addr,
+		rdmad->send_wr.wr.rdma.rkey, task->rdma_wr_remains);
+}
+
+/*
+ * Set up the task's rdma descriptor for an RDMA-Read into the last
+ * buffer on out_buf_list, starting at that buffer's recorded offset.
+ *
+ * The request length is rdma_rd_sz, capped at the buffer size.  The
+ * request is chained to next_wr (may be NULL); 'signaled' selects
+ * whether a completion is requested.
+ */
+static void iser_prep_rdma_rd_send_req(struct iser_task *task,
+				       struct iser_work_req *next_wr,
+				       int signaled)
+{
+	struct iser_work_req *rdmad = &task->rdmad;
+	struct iser_membuf *rdma_buf;
+	uint64_t offset;
+	int cur_req_sz;
+
+	rdmad->iser_ib_op = ISER_IB_RDMA_READ;
+
+	/* RDMA-Read buffer is always put at the list's tail */
+	/* ToDo: multiple RDMA-Write buffers */
+	rdma_buf = container_of(task->out_buf_list.prev,
+				struct iser_membuf, task_list);
+	offset = rdma_buf->offset;
+	rdmad->sge.addr = uint64_from_ptr(rdma_buf->addr) + offset;
+
+	if (task->rdma_rd_sz <= rdma_buf->size)
+		cur_req_sz = task->rdma_rd_sz;
+	else
+		cur_req_sz = rdma_buf->size;
+
+	/* task->rdma_rd_offset += cur_req_sz; // ToDo: multiple RDMA-Write buffers */
+
+	rdmad->sge.length = cur_req_sz;
+
+	rdmad->send_wr.next = (next_wr ? &next_wr->send_wr : NULL);
+	rdmad->send_wr.opcode = IBV_WR_RDMA_READ;
+	rdmad->send_wr.send_flags = (signaled ? IBV_SEND_SIGNALED : 0);
+	rdmad->send_wr.wr.rdma.remote_addr =
+		(uint64_t) task->rem_write_va;
+		/* + (offset - task->unsol_sz); // ToDo: multiple RDMA-Write buffers */
+	rdmad->send_wr.wr.rdma.rkey =
+		(uint32_t) task->rem_write_stag;
+
+	/* every field is separated by a blank so the log line stays parseable */
+	dprintf("task:%p wr_id:0x%"PRIx64 " tag:0x%04"PRIx64 " daddr:0x%"PRIx64 " dsz:%u "
+		"bufsz:%u rdma:%d lkey:%x raddr:%"PRIx64 " rkey:%x rems:%u\n",
+		task, rdmad->send_wr.wr_id, task->tag, rdmad->sge.addr,
+		rdmad->sge.length, rdma_buf->size, rdma_buf->rdma,
+		rdmad->sge.lkey, rdmad->send_wr.wr.rdma.remote_addr,
+		rdmad->send_wr.wr.rdma.rkey, task->rdma_rd_sz);
+}
+
+/*
+ * Post a chain of send work requests starting at iser_send.
+ *
+ * num_send_reqs is the number of requests in the chain.  One connection
+ * reference is taken per request actually posted: all of them on
+ * success, or those preceding the failing request on error.
+ *
+ * Returns the ibv_post_send() result: 0 on success, an error code
+ * otherwise.
+ */
+static int iser_post_send(struct iser_conn *conn,
+			  struct iser_work_req *iser_send,
+			  int num_send_reqs)
+{
+	struct ibv_send_wr *bad_wr;
+	int err, nr_posted;
+
+	err = ibv_post_send(conn->qp_hndl, &iser_send->send_wr, &bad_wr);
+	if (likely(!err)) {
+		nr_posted = num_send_reqs;
+		dprintf("conn:%p posted:%d 1st wr:%p wr_id:0x%lx sge_sz:%u\n",
+			&conn->h, nr_posted, iser_send,
+			(unsigned long)iser_send->send_wr.wr_id,
+			iser_send->sge.length);
+	} else {
+		struct ibv_send_wr *wr;
+
+		/* count the requests accepted before the failing one; stop at
+		 * the end of the chain even if bad_wr is not part of it */
+		nr_posted = 0;
+		for (wr = &iser_send->send_wr; wr && wr != bad_wr; wr = wr->next)
+			nr_posted++;
+
+		/* the error is the return value, errno is not guaranteed to be set */
+		eprintf("%s, conn:%p posted:%d/%d 1st wr:%p wr_id:0x%lx sge_sz:%u\n",
+			strerror(err),
+			&conn->h, nr_posted, num_send_reqs, iser_send,
+			(unsigned long)iser_send->send_wr.wr_id,
+			iser_send->sge.length);
+	}
+
+	iser_conn_getn(conn, nr_posted);
+
+	return err;
+}
+
+/*
+ * Post a chain of receive work requests starting at task->rxd.
+ *
+ * num_recv_bufs is the number of requests in the chain.  One connection
+ * reference is taken per request actually posted: all of them on
+ * success, or those preceding the failing request on error.
+ *
+ * Returns the ibv_post_recv() result: 0 on success, an error code
+ * otherwise.
+ */
+static int iser_post_recv(struct iser_conn *conn,
+			  struct iser_task *task,
+			  int num_recv_bufs)
+{
+	struct ibv_recv_wr *bad_wr;
+	int err, nr_posted;
+
+	err = ibv_post_recv(conn->qp_hndl, &task->rxd.recv_wr, &bad_wr);
+	if (likely(!err)) {
+		nr_posted = num_recv_bufs;
+		dprintf("conn:%p posted:%d 1st task:%p "
+			"wr_id:0x%lx sge_sz:%u\n",
+			&conn->h, nr_posted, task,
+			(unsigned long)task->rxd.recv_wr.wr_id,
+			task->rxd.sge.length);
+	} else {
+		struct ibv_recv_wr *wr;
+
+		/* count the requests accepted before the failing one; stop at
+		 * the end of the chain even if bad_wr is not part of it */
+		nr_posted = 0;
+		for (wr = &task->rxd.recv_wr; wr && wr != bad_wr; wr = wr->next)
+			nr_posted++;
+
+		/* the error is the return value, errno is not guaranteed to be set */
+		eprintf("%s, conn:%p posted:%d/%d 1st task:%p "
+			"wr_id:0x%lx sge_sz:%u\n",
+			strerror(err),
+			&conn->h, nr_posted, num_recv_bufs, task,
+			(unsigned long)task->rxd.recv_wr.wr_id,
+			task->rxd.sge.length);
+	}
+
+	iser_conn_getn(conn, nr_posted);
+
+	return err;
+}
+
+/*
+ * Stamp the command window into a response header: exp_statsn takes the
+ * session's exp_cmd_sn and max_statsn the same value advanced by
+ * ISER_MAX_QUEUE_CMD, both big-endian.  Nothing is done when there is
+ * no session.
+ */
+static inline void iser_set_rsp_stat_sn(struct iscsi_session *session,
+					struct iscsi_hdr *rsp)
+{
+	if (!session)
+		return;
+
+	rsp->exp_statsn = cpu_to_be32(session->exp_cmd_sn);
+	rsp->max_statsn = cpu_to_be32(session->exp_cmd_sn +
+				      ISER_MAX_QUEUE_CMD);
+}
+
+/*
+ * Build the per-device pool of rdma data buffers.
+ *
+ * Rounds membuf_size up to the page size, allocates membuf_num buffers
+ * of that size as one region, registers the region with the device's PD
+ * and puts one descriptor per buffer on dev->membuf_free.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails, -1 if memory
+ * registration fails.
+ */
+static int iser_init_rdma_buf_pool(struct iser_device *dev)
+{
+	uint8_t *data_cur, *desc_cur;
+	size_t data_bytes, desc_bytes;
+	int idx;
+
+	membuf_size = roundup(membuf_size, pagesize);
+
+	data_bytes = membuf_num * membuf_size;
+	data_cur = valloc(data_bytes);
+	if (!data_cur) {
+		eprintf("malloc rdma pool sz:%zu failed\n", data_bytes);
+		return -ENOMEM;
+	}
+
+	desc_bytes = membuf_num * sizeof(struct iser_membuf);
+	desc_cur = malloc(desc_bytes);
+	if (!desc_cur) {
+		eprintf("malloc list_buf sz:%zu failed\n", desc_bytes);
+		free(data_cur);
+		return -ENOMEM;
+	}
+
+	/* One pool of registered memory per PD */
+	dev->membuf_mr = ibv_reg_mr(dev->pd, data_cur, data_bytes,
+				    IBV_ACCESS_LOCAL_WRITE);
+	if (!dev->membuf_mr) {
+		eprintf("ibv_reg_mr failed, %m\n");
+		free(data_cur);
+		free(desc_cur);
+		return -1;
+	}
+	dprintf("pool buf:%p list:%p mr:%p lkey:0x%x\n",
+		data_cur, desc_cur, dev->membuf_mr, dev->membuf_mr->lkey);
+
+	dev->membuf_regbuf = data_cur;
+	dev->membuf_listbuf = desc_cur;
+	INIT_LIST_HEAD(&dev->membuf_free);
+	INIT_LIST_HEAD(&dev->membuf_alloc);
+
+	/* carve the region: descriptor idx describes data buffer idx */
+	for (idx = 0; idx < membuf_num; idx++) {
+		struct iser_membuf *rdma_buf = (void *) desc_cur;
+
+		list_add_tail(&rdma_buf->pool_list, &dev->membuf_free);
+		INIT_LIST_HEAD(&rdma_buf->task_list);
+		rdma_buf->addr = data_cur;
+		rdma_buf->size = membuf_size;
+		rdma_buf->rdma = 1;
+
+		desc_cur += sizeof(*rdma_buf);
+		data_cur += membuf_size;
+	}
+
+	return 0;
+}
+
+/*
+ * Take the first buffer off the device's free list and move it to the
+ * allocated list.  If the free list is empty, the device is flagged as
+ * waiting for memory and NULL is returned.  'sz' is only logged.
+ */
+static struct iser_membuf *iser_alloc_rdma_buf(struct iser_conn *conn,
+					  size_t sz)
+{
+	struct iser_device *dev = conn->dev;
+	struct iser_membuf *rdma_buf;
+
+	if (unlikely(list_empty(&dev->membuf_free))) {
+		dev->waiting_for_mem = 1;
+		return NULL;
+	}
+
+	rdma_buf = list_first_entry(&dev->membuf_free,
+				    struct iser_membuf, pool_list);
+	list_del(&rdma_buf->pool_list);
+	list_add_tail(&rdma_buf->pool_list, &dev->membuf_alloc);
+
+	dprintf("alloc:%p sz:%zu\n", rdma_buf, sz);
+	return rdma_buf;
+}
+
+/*
+ * Return an rdma buffer to the device's free list.  NULL buffers and
+ * buffers not marked as rdma are ignored.  If the device was flagged as
+ * waiting for memory, the flag is cleared and this connection's
+ * buffer-allocation event is scheduled.
+ */
+static void iser_free_rdma_buf(struct iser_conn *conn, struct iser_membuf *rdma_buf)
+{
+	struct iser_device *dev;
+
+	if (unlikely(!(rdma_buf && rdma_buf->rdma)))
+		return;
+
+	dprintf("free %p\n", rdma_buf);
+
+	dev = conn->dev;
+	list_del(&rdma_buf->pool_list);
+	/* head insertion, so recently used buffers are reused first */
+	list_add(&rdma_buf->pool_list, &dev->membuf_free);
+
+	if (likely(!dev->waiting_for_mem))
+		return;
+
+	dev->waiting_for_mem = 0;
+	tgt_add_sched_event(&conn->sched_buf_alloc);
+}
+
+/*
+ * Detach every buffer from the task's out-buffer list; buffers marked
+ * as rdma are handed back to the pool.
+ */
+static void iser_task_free_out_bufs(struct iser_task *task)
+{
+	struct iser_membuf *cur, *tmp;
+
+	list_for_each_entry_safe(cur, tmp, &task->out_buf_list, task_list) {
+		iser_task_del_out_buf(task, cur);
+		if (!cur->rdma)
+			continue;
+		iser_free_rdma_buf(task->conn, cur);
+	}
+	assert(task->out_buf_num == 0);
+}
+
+/*
+ * Detach every buffer from the task's in-buffer list and hand it back
+ * to the rdma buffer pool.
+ */
+static void iser_task_free_in_bufs(struct iser_task *task)
+{
+	struct iser_membuf *cur, *tmp;
+
+	list_for_each_entry_safe(cur, tmp, &task->in_buf_list, task_list) {
+		iser_task_del_in_buf(task, cur);
+		iser_free_rdma_buf(task->conn, cur);
+	}
+	assert(task->in_buf_num == 0);
+}
+
+/*
+ * Finish a SCSI command task: report completion to the target core if
+ * the task is still marked as being there, unlink it from the session
+ * list and release its data buffers.  Tasks with any other opcode are
+ * only logged.
+ */
+static void iser_complete_task(struct iser_task *task)
+{
+	if (likely(task->opcode == ISCSI_OP_SCSI_CMD)) {
+		/* we are completing scsi cmd task, returning from target */
+		if (likely(task_in_scsi(task))) {
+			target_cmd_done(&task->scmd);
+			clear_task_in_scsi(task);
+		}
+
+		list_del(&task->session_list);
+
+		if (task->is_read)
+			iser_task_free_in_bufs(task);
+		/* ToDo: multiple out buffers also need the dout tasks freed */
+		if (task->is_write)
+			iser_task_free_out_bufs(task);
+
+		dprintf("task:%p\n", task);
+	} else {
+		dprintf("task:%p, non-cmd\n", task);
+	}
+}
+
+/* Printable name of a login phase; unlisted values give "Illegal". */
+static char *iser_conn_login_phase_name(enum iser_login_phase phase)
+{
+	char *name;
+
+	switch (phase) {
+	case LOGIN_PHASE_INIT:
+		name = "INIT";
+		break;
+	case LOGIN_PHASE_START:
+		name = "START";
+		break;
+	case LOGIN_PHASE_LAST_SEND:
+		name = "LAST_SEND";
+		break;
+	case LOGIN_PHASE_FF:
+		name = "FF";
+		break;
+	default:
+		name = "Illegal";
+		break;
+	}
+	return name;
+}
+
+/* Log the transition and store the new login phase in the connection. */
+static void iser_conn_login_phase_set(struct iser_conn *conn,
+				      enum iser_login_phase phase)
+{
+	enum iser_login_phase prev = conn->login_phase;
+
+	conn->login_phase = phase;
+	dprintf("conn:%p from:%s to:%s\n", &conn->h,
+		iser_conn_login_phase_name(prev),
+		iser_conn_login_phase_name(phase));
+}
+
+/*
+ * Called at accept time, builds resources just for login phase.
+ */
+
+#define NUM_LOGIN_TASKS 	2 /* one posted for req rx, one for reply tx */
+
+/*
+ * Allocates and registers one pdu buffer per login task, allocates and
+ * initializes the tasks, posts the first task for receive and keeps the
+ * second one as conn->login_tx_task.
+ *
+ * Returns 0 on success and non-zero on failure.  On every failure path
+ * whatever was allocated here has been released again.
+ */
+static int iser_alloc_login_resources(struct iser_conn *conn)
+{
+	unsigned long buf_size = ALIGN_TO_32(conn->rsize);
+	unsigned long pool_size = NUM_LOGIN_TASKS * buf_size;
+	struct iser_task *login_task[NUM_LOGIN_TASKS];
+	uint8_t *pdu_data_buf, *task_buf;
+	unsigned int i;
+	int err = 0;
+
+	dprintf("conn:%p login tasks num:%u, buf_sz:%lu (rx_sz:%u tx_sz:%u)\n",
+		&conn->h, NUM_LOGIN_TASKS, buf_size, conn->rsize, conn->ssize);
+
+	conn->login_data_pool = malloc(pool_size);
+	if (!conn->login_data_pool) {
+		eprintf("conn:%p malloc login_data_pool sz:%lu failed\n",
+			&conn->h, pool_size);
+		err = -1;
+		goto out;
+	}
+
+	conn->login_data_mr = ibv_reg_mr(conn->dev->pd, conn->login_data_pool,
+					 pool_size, IBV_ACCESS_LOCAL_WRITE);
+	if (!conn->login_data_mr) {
+		eprintf("conn:%p ibv_reg_mr login pool failed, %m\n", &conn->h);
+		free(conn->login_data_pool);
+		conn->login_data_pool = NULL;
+		err = -1;
+		goto out;
+	}
+
+	pool_size = NUM_LOGIN_TASKS * sizeof(struct iser_task);
+	conn->login_task_pool = malloc(pool_size);
+	if (!conn->login_task_pool) {
+		eprintf("conn:%p malloc login_task_pool sz:%lu failed\n",
+			&conn->h, pool_size);
+		ibv_dereg_mr(conn->login_data_mr);
+		conn->login_data_mr = NULL;
+		free(conn->login_data_pool);
+		conn->login_data_pool = NULL;
+		/* must report the failure, otherwise the caller sees success */
+		err = -1;
+		goto out;
+	}
+	memset(conn->login_task_pool, 0, pool_size);
+
+	/* from here on iser_free_login_resources() releases everything */
+	conn->login_res_alloc = 1;
+
+	pdu_data_buf = conn->login_data_pool;
+	task_buf = conn->login_task_pool;
+	for (i = 0; i < NUM_LOGIN_TASKS; i++) {
+		login_task[i] = (struct iser_task *) task_buf;
+
+		iser_task_init(login_task[i], conn,
+			       pdu_data_buf, buf_size,
+			       conn->login_data_mr);
+
+		task_buf += sizeof(struct iser_task);
+		pdu_data_buf += buf_size;
+	}
+
+	dprintf("post_recv login rx task:%p\n", login_task[0]);
+	err = iser_post_recv(conn, login_task[0], 1);
+	if (err) {
+		eprintf("conn:%p post_recv login rx-task failed\n", &conn->h);
+		iser_free_login_resources(conn);
+		goto out;
+	}
+	dprintf("saved login tx-task:%p\n", login_task[1]);
+	conn->login_tx_task = login_task[1];
+out:
+	return err;
+}
+
+/*
+ * When ready for full-feature mode, free login-phase resources.
+ */
+static void iser_free_login_resources(struct iser_conn *conn)
+{
+	int err;
+
+	/* nothing to do unless the login resources are fully allocated */
+	if (!conn->login_res_alloc)
+		return;
+
+	dprintf("conn:%p, login phase:%s\n", &conn->h,
+		iser_conn_login_phase_name(conn->login_phase));
+
+	/* release mr and free the lists; pointers are cleared so that no
+	 * stale reference to released memory is left in the connection */
+	if (conn->login_data_mr) {
+		err = ibv_dereg_mr(conn->login_data_mr);
+		if (err) /* the error is the return value, not errno */
+			eprintf("conn:%p ibv_dereg_mr failed, %s\n",
+				&conn->h, strerror(err));
+		conn->login_data_mr = NULL;
+	}
+	free(conn->login_data_pool);
+	conn->login_data_pool = NULL;
+	free(conn->login_task_pool);
+	conn->login_task_pool = NULL;
+	conn->login_tx_task = NULL;
+
+	conn->login_res_alloc = 0;
+}
+
+/*
+ * Ready for full feature, allocate resources.
+ */
+static int iser_alloc_ff_resources(struct iser_conn *conn)
+{
+	/* ToDo: need to fix the ISCSI_PARAM_INITIATOR_RDSL bug in initiator */
+	/* buf_size = ALIGN_TO_32(MAX(conn->rsize, conn->ssize)); */
+	unsigned long buf_size = ALIGN_TO_32(conn->rsize); /* includes headers */
+	unsigned long num_tasks = conn->max_outst_pdu + 2; /* all rx, text tx, nop-in*/
+	unsigned long alloc_sz;
+	uint8_t *pdu_data_buf, *task_buf;
+	struct iser_task *task;
+	unsigned int i;
+	int err = 0;
+
+	dprintf("conn:%p max_outst:%u buf_sz:%lu (ssize:%u rsize:%u)\n",
+		&conn->h, conn->max_outst_pdu, buf_size,
+		conn->ssize, conn->rsize);
+
+	/* one pdu buffer of buf_size bytes per task, in a single region */
+	alloc_sz = num_tasks * buf_size;
+	conn->pdu_data_pool = malloc(alloc_sz);
+	if (!conn->pdu_data_pool) {
+		eprintf("conn:%p malloc pdu_data_buf sz:%lu failed\n",
+			&conn->h, alloc_sz);
+		err = -1;
+		goto out;
+	}
+
+	conn->pdu_data_mr = ibv_reg_mr(conn->dev->pd, conn->pdu_data_pool,
+				       alloc_sz, IBV_ACCESS_LOCAL_WRITE);
+	if (!conn->pdu_data_mr) {
+		eprintf("conn:%p ibv_reg_mr pdu_data_pool failed, %m\n",
+			&conn->h);
+		free(conn->pdu_data_pool);
+		conn->pdu_data_pool = NULL;
+		err = -1;
+		goto out;
+	}
+
+	/* task descriptors, zeroed before use */
+	alloc_sz = num_tasks * sizeof(struct iser_task);
+	conn->task_pool = malloc(alloc_sz);
+	if (!conn->task_pool) {
+		eprintf("conn:%p malloc task_pool sz:%lu failed\n",
+			&conn->h, alloc_sz);
+		ibv_dereg_mr(conn->pdu_data_mr);
+		conn->pdu_data_mr = NULL;
+		free(conn->pdu_data_pool);
+		conn->pdu_data_pool = NULL;
+		err = -1;
+		goto out;
+	}
+	memset(conn->task_pool, 0, alloc_sz);
+
+	/* from here on iser_free_ff_resources() releases everything */
+	conn->ff_res_alloc = 1;
+
+	/* the first max_outst_pdu tasks are initialized and posted for receive */
+	pdu_data_buf = conn->pdu_data_pool;
+	task_buf = conn->task_pool;
+	for (i = 0; i < conn->max_outst_pdu; i++) {
+		task = (void *) task_buf;
+
+		iser_task_init(task, conn,
+			       pdu_data_buf, buf_size,
+			       conn->pdu_data_mr);
+
+		task_buf += sizeof(*task);
+		/* ToDo: need to fix the ISCSI_PARAM_INITIATOR_RDSL bug in initiator */
+		/* pdu_data_buf += conn->rsize + conn->ssize; */
+		pdu_data_buf += buf_size;
+
+		err = iser_post_recv(conn, task, 1);
+		if (err) {
+			eprintf("conn:%p post_recv (%d/%d) failed\n",
+				&conn->h, i, conn->max_outst_pdu);
+			iser_free_ff_resources(conn);
+			err = -1;
+			goto out;
+		}
+	}
+
+	/* the two remaining tasks are tx-only and are not posted for receive */
+
+	/* initialize unsolicited tx task: nop-in */
+	task = (void *) task_buf;
+	iser_unsolicited_task_init(task, conn, pdu_data_buf,
+				   buf_size, conn->pdu_data_mr);
+	conn->nop_in_task = task;
+	task_buf += sizeof(*task);
+	pdu_data_buf += buf_size;
+
+	/* initialize tx task: text/login */
+	task = (void *) task_buf;
+	iser_unsolicited_task_init(task, conn, pdu_data_buf,
+				   buf_size, conn->pdu_data_mr);
+	conn->text_tx_task = task;
+
+out:
+	return err;
+}
+
+/*
+ * On connection shutdown.
+ */
+static void iser_free_ff_resources(struct iser_conn *conn)
+{
+	int err;
+
+	/* nothing to do unless the full-feature resources are fully allocated */
+	if (!conn->ff_res_alloc)
+		return;
+
+	dprintf("conn:%p pdu_mr:%p pdu_pool:%p task_pool:%p\n",
+		&conn->h, conn->pdu_data_mr,
+		conn->pdu_data_pool, conn->task_pool);
+
+	/* release mr and free the lists; pointers are cleared so that no
+	 * stale reference to released memory is left in the connection */
+	if (conn->pdu_data_mr) {
+		err = ibv_dereg_mr(conn->pdu_data_mr);
+		if (err) /* the error is the return value, not errno */
+			eprintf("conn:%p ibv_dereg_mr failed, %s\n",
+				&conn->h, strerror(err));
+		conn->pdu_data_mr = NULL;
+	}
+	free(conn->pdu_data_pool);
+	conn->pdu_data_pool = NULL;
+	free(conn->task_pool);
+	conn->task_pool = NULL;
+	conn->nop_in_task = NULL;
+	conn->text_tx_task = NULL;
+
+	conn->ff_res_alloc = 0;
+}
+
+/*
+ * Allocate resources for this new connection.  Called after login, when
+ * final negotiated transfer parameters are known.
+ */
+int iser_login_complete(struct iscsi_connection *iscsi_conn)
+{
+	struct iser_conn *conn = ISER_CONN(iscsi_conn);
+	unsigned int irdsl, trdsl, outst_pdu, hdrsz;
+	int err = -1;
+
+	dprintf("entry\n");
+
+	/* one more send, then done; login resources are left until then */
+	iser_conn_login_phase_set(conn, LOGIN_PHASE_LAST_SEND);
+	/* NOTE(review): irdsl is read but only referenced by the disabled
+	 * ssize computation below */
+	irdsl = iscsi_conn->session_param[ISCSI_PARAM_INITIATOR_RDSL].val;
+	trdsl = iscsi_conn->session_param[ISCSI_PARAM_TARGET_RDSL].val;
+
+	/* ToDo: outstanding pdus num */
+	/* outst_pdu =
+		iscsi_conn->session_param[ISCSI_PARAM_MAX_OUTST_PDU].val; */
+
+	/* hack, ib/ulp/iser does not have this param, but reading the code
+	 * shows their formula for max tx dtos outstanding
+	 *    = cmds_max * (1 + dataouts) + rx_misc + tx_misc
+	 */
+#define ISER_INFLIGHT_DATAOUTS  0
+#define ISER_MAX_RX_MISC_PDUS   4
+#define ISER_MAX_TX_MISC_PDUS   6
+
+	/* if (outst_pdu == 0) // ToDo: outstanding pdus num */
+	outst_pdu =
+		3 * ISER_MAX_QUEUE_CMD * (1 + ISER_INFLIGHT_DATAOUTS) +
+		ISER_MAX_RX_MISC_PDUS + ISER_MAX_TX_MISC_PDUS;
+
+	/* RDSLs do not include headers. */
+	hdrsz = sizeof(struct iser_hdr) +
+		sizeof(struct iscsi_hdr) +
+		sizeof(struct iscsi_ecdb_ahdr) +
+		sizeof(struct iscsi_rlength_ahdr);
+
+	/* the receive size is never based on less than 1024 data bytes */
+	if (trdsl < 1024)
+		trdsl = 1024;
+	conn->rsize = hdrsz + trdsl;
+	/* ToDo: need to fix the ISCSI_PARAM_INITIATOR_RDSL bug in initiator */
+	/* conn->ssize = hdrsz + irdsl; */
+	conn->ssize = conn->rsize;
+
+	conn->max_outst_pdu = outst_pdu;
+	err = iser_alloc_ff_resources(conn);
+	if (err)
+		goto out;
+
+	/* How much data to grab in an RDMA operation, read or write */
+	/* ToDo: fix iscsi login code, broken handling of MAX_XMIT_DL */
+	iscsi_conn->session_param[ISCSI_PARAM_MAX_XMIT_DLENGTH].val =
+		RDMA_TRANSFER_SIZE;
+out:
+	return err;
+}
+
+/*
+ * Empty the connection's iosubmit list, completing every task found on
+ * it.  Always returns 0.
+ */
+static int iser_ib_clear_iosubmit_list(struct iser_conn *conn)
+{
+	struct list_head *head = &conn->iosubmit_list;
+
+	dprintf("start\n");
+	while (!list_empty(head)) {
+		struct iser_task *task =
+			list_first_entry(head, struct iser_task, exec_list);
+
+		list_del(&task->exec_list);
+		/* must free, task keeps rdma buffer */
+		iser_complete_task(task);
+	}
+	return 0;
+}
+
+/*
+ * Empty the connection's rdma-read list, completing every task found
+ * on it.  Always returns 0.
+ */
+static int iser_ib_clear_rdma_rd_list(struct iser_conn *conn)
+{
+	struct list_head *head = &conn->rdma_rd_list;
+
+	dprintf("start\n");
+	while (!list_empty(head)) {
+		struct iser_task *task =
+			list_first_entry(head, struct iser_task, rdma_list);
+
+		list_del(&task->rdma_list);
+		/* must free, task keeps rdma buffer */
+		iser_complete_task(task);
+	}
+	return 0;
+}
+
+/*
+ * Empty the connection's response tx list, completing every task found
+ * on it.  Always returns 0.
+ */
+static int iser_ib_clear_tx_list(struct iser_conn *conn)
+{
+	struct list_head *head = &conn->resp_tx_list;
+
+	dprintf("start\n");
+	while (!list_empty(head)) {
+		struct iser_task *task =
+			list_first_entry(head, struct iser_task, tx_list);
+
+		list_del(&task->tx_list);
+		/* must free, task keeps rdma buffer */
+		iser_complete_task(task);
+	}
+	return 0;
+}
+
+/*
+ * Unlink every task from the connection's sent list without completing
+ * it.  Always returns 0.
+ */
+static int iser_ib_clear_sent_list(struct iser_conn *conn)
+{
+	struct list_head *head = &conn->sent_list;
+
+	dprintf("start\n");
+	while (!list_empty(head)) {
+		struct iser_task *task =
+			list_first_entry(head, struct iser_task, tx_list);
+
+		/* don't free, future completion guaranteed */
+		list_del(&task->tx_list);
+	}
+	return 0;
+}
+
+/*
+ * Unlink every task from the connection's post-recv list without
+ * completing it.  Always returns 0.
+ */
+static int iser_ib_clear_post_recv_list(struct iser_conn *conn)
+{
+	struct list_head *head = &conn->post_recv_list;
+
+	dprintf("start\n");
+	while (!list_empty(head)) {
+		struct iser_task *task =
+			list_first_entry(head, struct iser_task, recv_list);
+
+		/* don't free, future completion guaranteed */
+		list_del(&task->recv_list);
+	}
+	return 0;
+}
+
+/*
+ * Put a connection into its initial state: zero reference count,
+ * STATE_INIT, default session parameters and empty work lists.
+ * Always returns 0.
+ */
+int iser_conn_init(struct iser_conn *conn)
+{
+	struct iscsi_connection *h = &conn->h;
+
+	h->refcount = 0;
+	h->state = STATE_INIT;
+	param_set_defaults(h->session_param, session_keys);
+	INIT_LIST_HEAD(&h->clist);
+
+	/* per-connection task queues (rdma_wr_list is not in use) */
+	INIT_LIST_HEAD(&conn->buf_alloc_list);
+	INIT_LIST_HEAD(&conn->rdma_rd_list);
+	INIT_LIST_HEAD(&conn->iosubmit_list);
+	INIT_LIST_HEAD(&conn->resp_tx_list);
+	INIT_LIST_HEAD(&conn->sent_list);
+	INIT_LIST_HEAD(&conn->post_recv_list);
+
+	return 0;
+}
+
+/*
+ * Start closing connection. Transfer IB QP to error state.
+ * This will be followed by WC error and buffers flush events.
+ * We also should expect DISCONNECTED and TIMEWAIT_EXIT events.
+ * Only after the draining is over we are sure to have reclaimed
+ * all buffers (and tasks). After the RDMA CM events are collected,
+ * the connection QP may be destroyed, and its number may be recycled.
+ */
+void iser_conn_close(struct iser_conn *conn)
+{
+	int err;
+
+	/* closing is done once; later calls are no-ops */
+	if (conn->h.state == STATE_CLOSE)
+		return;
+
+	dprintf("rdma_disconnect conn:%p\n", &conn->h);
+	err = rdma_disconnect(conn->cm_id);
+	if (err)
+		eprintf("conn:%p rdma_disconnect failed, %m\n", &conn->h);
+
+	list_del(&conn->conn_list);
+
+	/* no further scheduled work for this connection */
+	tgt_remove_sched_event(&conn->sched_buf_alloc);
+	tgt_remove_sched_event(&conn->sched_rdma_rd);
+	tgt_remove_sched_event(&conn->sched_iosubmit);
+	tgt_remove_sched_event(&conn->sched_tx);
+	tgt_remove_sched_event(&conn->sched_post_recv);
+
+	conn->h.state = STATE_CLOSE;
+	/* %p already prints the 0x prefix */
+	eprintf("conn:%p cm_id:%p state: CLOSE, refcnt:%d\n",
+		&conn->h, conn->cm_id, conn->h.refcount);
+}
+
+/*
+ * Release everything the connection owns and free the structure itself.
+ *
+ * The reference count must already be zero (asserted). In this file the
+ * function is reached from iser_sched_conn_free(), scheduled by
+ * iser_conn_put() when the last reference is dropped, and from the
+ * error path of iser_cm_connect_request().
+ *
+ * Note that conn->conn_list is NOT unlinked here; iser_conn_close()
+ * does that.
+ */
+void iser_conn_free(struct iser_conn *conn)
+{
+	int err;
+
+	dprintf("conn:%p refcnt:%d qp:%p cm_id:%p\n",
+		&conn->h, conn->h.refcount, conn->qp_hndl, conn->cm_id);
+
+	assert(conn->h.refcount == 0);
+
+	/* detach any tasks still sitting on the work lists */
+	iser_ib_clear_iosubmit_list(conn);
+	iser_ib_clear_rdma_rd_list(conn);
+	iser_ib_clear_tx_list(conn);
+	iser_ib_clear_sent_list(conn);
+	iser_ib_clear_post_recv_list(conn);
+
+	/* try to free unconditionally, resources freed only if necessary */
+	iser_free_login_resources(conn);
+	iser_free_ff_resources(conn);
+
+	if (conn->qp_hndl) {
+		err = ibv_destroy_qp(conn->qp_hndl);
+		if (err)
+			eprintf("conn:%p ibv_destroy_qp failed, %m\n", &conn->h);
+	}
+	if (conn->cm_id) {
+		err = rdma_destroy_id(conn->cm_id);
+		if (err)
+			eprintf("conn:%p rdma_destroy_id failed, %m\n", &conn->h);
+	}
+
+	/* delete from session; was put there by conn_add_to_session() */
+	list_del(&conn->h.clist);
+
+	/* free(NULL) is a no-op, so no guards are needed */
+	free(conn->h.initiator);
+
+	if (conn->h.session)
+		session_put(conn->h.session);
+
+	free(conn->peer_name);
+	free(conn->self_name);
+
+	conn->h.state = STATE_INIT;
+	/* log before free() so the pointer is never used once dangling */
+	dprintf("conn:%p freed\n", &conn->h);
+	free(conn);
+}
+
+/* Scheduled-event callback: evt->data carries the connection to free. */
+static void iser_sched_conn_free(struct event_data *evt)
+{
+	iser_conn_free((struct iser_conn *) evt->data);
+}
+
+/* Take one reference on the connection. Always returns 0. */
+int iser_conn_get(struct iser_conn *conn)
+{
+	/* TODO: check state */
+	conn->h.refcount += 1;
+	dprintf("refcnt:%d\n", conn->h.refcount);
+	return 0;
+}
+
+/* Take n references on the connection in one step. Always returns 0. */
+int iser_conn_getn(struct iser_conn *conn, int n)
+{
+	int before = conn->h.refcount;
+	int after = before + n;
+
+	dprintf("refcnt:%d + %d = %d\n", before, n, after);
+	conn->h.refcount = after;
+	return 0;
+}
+
+/*
+ * Drop one reference. When the count reaches zero the connection must
+ * already be in STATE_CLOSE (asserted); its release is then scheduled
+ * through the sched_conn_free event rather than done inline.
+ */
+void iser_conn_put(struct iser_conn *conn)
+{
+	int remaining;
+
+	conn->h.refcount -= 1;
+	remaining = conn->h.refcount;
+	dprintf("refcnt:%d\n", remaining);
+
+	if (unlikely(remaining == 0)) {
+		assert(conn->h.state == STATE_CLOSE);
+		tgt_add_sched_event(&conn->sched_conn_free);
+	}
+}
+
+/*
+ * Render a socket address as a numeric host string.
+ *
+ * On success returns 0 and stores a strdup()'ed string in *name, which
+ * the caller must free. Returns -EINVAL when name is NULL, -ENOMEM when
+ * the string cannot be duplicated, or the non-zero getnameinfo() error
+ * code when the conversion fails (*name is then left untouched).
+ */
+static int iser_get_host_name(struct sockaddr_storage *addr, char **name)
+{
+	int err;
+	char host[NI_MAXHOST];
+
+	if (name == NULL)
+		return -EINVAL;
+
+	err = getnameinfo((struct sockaddr *) addr, sizeof(*addr),
+			  host, sizeof(host), NULL, 0, NI_NUMERICHOST);
+	if (err) {
+		/* getnameinfo() reports failure through its return value,
+		   not errno, so decode it with gai_strerror() instead of %m */
+		eprintf("getnameinfo failed, %s\n", gai_strerror(err));
+		return err;
+	}
+
+	*name = strdup(host);
+	if (*name == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Format the peer's address description into buf (at most rest bytes).
+ * Returns the snprintf() result unchanged.
+ */
+static int iser_show(struct iscsi_connection *iscsi_conn, char *buf,
+		     int rest)
+{
+	struct iser_conn *iser = ISER_CONN(iscsi_conn);
+
+	return snprintf(buf, rest, "RDMA IP Address: %s", iser->peer_name);
+}
+
+/*
+ * Copy the local address of the connection into sa. *len is clamped to
+ * the size of the stored address before copying. Always returns 0.
+ */
+static int iser_getsockname(struct iscsi_connection *iscsi_conn,
+			    struct sockaddr *sa, socklen_t *len)
+{
+	struct iser_conn *iser = ISER_CONN(iscsi_conn);
+	const socklen_t stored_len = sizeof(iser->self_addr);
+
+	if (*len > stored_len)
+		*len = stored_len;
+	memcpy(sa, &iser->self_addr, *len);
+	return 0;
+}
+
+/*
+ * Copy the peer address of the connection into sa. *len is clamped to
+ * the size of the stored address before copying. Always returns 0.
+ */
+static int iser_getpeername(struct iscsi_connection *iscsi_conn,
+			    struct sockaddr *sa, socklen_t *len)
+{
+	struct iser_conn *iser = ISER_CONN(iscsi_conn);
+	const socklen_t stored_len = sizeof(iser->peer_addr);
+
+	if (*len > stored_len)
+		*len = stored_len;
+	memcpy(sa, &iser->peer_addr, *len);
+	return 0;
+}
+
+/*
+ * Handle RDMA_CM_EVENT_CONNECT_REQUEST.
+ *
+ * Finds (or creates) the iser device matching the verbs context of the
+ * incoming cm_id, allocates and initializes a connection, creates its
+ * QP, allocates the login-phase resources and accepts the connection.
+ * On any failure the request is rejected; a connection that was already
+ * allocated is unlinked from iser_conn_list and freed.
+ */
+static void iser_cm_connect_request(struct rdma_cm_event *ev)
+{
+	struct rdma_cm_id *cm_id = ev->id;
+	struct ibv_qp_init_attr qp_init_attr;
+	struct iser_conn *conn;
+	struct iser_device *dev;
+	int err, dev_found;
+
+	struct rdma_conn_param conn_param = {
+		.responder_resources = 1,
+		.initiator_depth = 1,
+		.retry_count = 5,
+	};
+
+	/* find device */
+	dev_found = 0;
+	list_for_each_entry(dev, &iser_dev_list, list) {
+		if (dev->ibv_ctxt == cm_id->verbs) {
+			dev_found = 1;
+			break;
+		}
+	}
+	if (!dev_found) {
+		dev = malloc(sizeof(*dev));
+		if (dev == NULL) {
+			eprintf("cm_id:%p malloc dev failed\n", cm_id);
+			goto reject;
+		}
+		dev->ibv_ctxt = cm_id->verbs;
+		err = iser_device_init(dev);
+		if (err) {
+			free(dev);
+			goto reject;
+		}
+	}
+
+	/* build a new connection structure */
+	conn = zalloc(sizeof(*conn));
+	if (!conn) {
+		eprintf("cm_id:%p malloc conn failed\n", cm_id);
+		goto reject;
+	}
+
+	err = iser_conn_init(conn);
+	if (err) {
+		free(conn);
+		goto reject;
+	}
+	dprintf("alloc conn:%p cm_id:%p\n", &conn->h, cm_id);
+	list_add(&conn->conn_list, &iser_conn_list);
+
+	/* relate iser and rdma connections */
+	conn->cm_id = cm_id;
+	cm_id->context = conn;
+
+	conn->dev = dev;
+
+	iser_conn_login_phase_set(conn, LOGIN_PHASE_START);
+
+	tgt_init_sched_event(&conn->sched_buf_alloc, iser_sched_buf_alloc,
+			     conn);
+	tgt_init_sched_event(&conn->sched_iosubmit, iser_sched_iosubmit,
+			     conn);
+	tgt_init_sched_event(&conn->sched_rdma_rd, iser_sched_rdma_rd,
+			     conn);
+	tgt_init_sched_event(&conn->sched_tx, iser_sched_tx,
+			     conn);
+	tgt_init_sched_event(&conn->sched_post_recv, iser_sched_post_recv,
+			     conn);
+	tgt_init_sched_event(&conn->sched_conn_free, iser_sched_conn_free,
+			     conn);
+
+	/* initiator is dst, target is src */
+	memcpy(&conn->peer_addr, &cm_id->route.addr.dst_addr,
+	       sizeof(conn->peer_addr));
+	err = iser_get_host_name(&conn->peer_addr, &conn->peer_name);
+	if (err)
+		conn->peer_name = strdup("Unresolved");
+
+	memcpy(&conn->self_addr, &cm_id->route.addr.src_addr,
+	       sizeof(conn->self_addr));
+	err = iser_get_host_name(&conn->self_addr, &conn->self_name);
+	if (err)
+		conn->self_name = strdup("Unresolved");
+
+	/* create qp */
+	memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+	qp_init_attr.qp_context = conn;
+	/* both send and recv to the same CQ */
+	qp_init_attr.send_cq = dev->cq;
+	qp_init_attr.recv_cq = dev->cq;
+	qp_init_attr.cap.max_send_wr = MAX_WQE;
+	qp_init_attr.cap.max_recv_wr = MAX_WQE;
+	qp_init_attr.cap.max_send_sge = 1;      /* scatter/gather entries */
+	qp_init_attr.cap.max_recv_sge = 1;
+	qp_init_attr.qp_type = IBV_QPT_RC;
+	/* only generate completion queue entries if requested */
+	qp_init_attr.sq_sig_all = 0;
+
+	err = rdma_create_qp(cm_id, dev->pd, &qp_init_attr);
+	if (err) {
+		eprintf("conn:%p cm_id:%p rdma_create_qp failed, %m\n",
+			&conn->h, cm_id);
+		goto free_conn;
+	}
+	conn->qp_hndl = cm_id->qp;
+	VALGRIND_MAKE_MEM_DEFINED(conn->qp_hndl, sizeof(*conn->qp_hndl));
+	dprintf("conn:%p cm_id:%p, created qp:%p\n", &conn->h, cm_id,
+		conn->qp_hndl);
+
+	/*
+	 * Post buffers for the login phase, only.
+	 */
+	conn->rsize =
+		sizeof(struct iser_hdr) +
+		sizeof(struct iscsi_hdr) +
+		sizeof(struct iscsi_ecdb_ahdr) +
+		sizeof(struct iscsi_rlength_ahdr) + 8192;
+	conn->ssize = conn->rsize;
+	err = iser_alloc_login_resources(conn);
+	if (err)
+		goto free_conn;
+
+	/* use the device limit, capped by what the initiator asked for */
+	conn_param.initiator_depth = dev->device_attr.max_qp_init_rd_atom;
+	if (conn_param.initiator_depth > ev->param.conn.initiator_depth)
+		conn_param.initiator_depth = ev->param.conn.initiator_depth;
+
+	/* Increment reference count to be able to wait for TIMEWAIT_EXIT
+	   when finalizing the disconnect process */
+	iser_conn_get(conn);
+
+	/* now we can actually accept the connection */
+	err = rdma_accept(conn->cm_id, &conn_param);
+	if (err) {
+		eprintf("conn:%p cm_id:%p rdma_accept failed, %m\n",
+			&conn->h, cm_id);
+		/* Undo the reference taken just above by hand:
+		   iser_conn_free() asserts refcount == 0, and iser_conn_put()
+		   cannot be used here because it expects STATE_CLOSE and
+		   would schedule a second free. */
+		conn->h.refcount--;
+		goto free_conn;
+	}
+
+	conn->h.tp = &iscsi_iser;
+	conn->h.state = STATE_START;
+	dprintf("conn:%p cm_id:%p, %s -> %s, accepted\n",
+		&conn->h, cm_id, conn->peer_name, conn->self_name);
+	return;
+
+free_conn:
+	/* The connection was linked on iser_conn_list above and
+	   iser_conn_close() never ran for it; iser_conn_free() does not
+	   unlink conn_list, so do it here to avoid a dangling entry. */
+	list_del(&conn->conn_list);
+
+	/* Reject while cm_id is still valid: iser_conn_free() below calls
+	   rdma_destroy_id() on it. */
+	err = rdma_reject(cm_id, NULL, 0);
+	if (err)
+		eprintf("cm_id:%p rdma_reject failed, %m\n", cm_id);
+	iser_conn_free(conn);
+	return;
+
+reject:
+	err = rdma_reject(cm_id, NULL, 0);
+	if (err)
+		eprintf("cm_id:%p rdma_reject failed, %m\n", cm_id);
+}
+
+/*
+ * Handle RDMA_CM_EVENT_ESTABLISHED for an accepted connection.
+ *
+ * A connection still in STATE_START is moved to STATE_READY. If it is
+ * already in STATE_READY, the login request stored in login_rx_task is
+ * processed now (per the log message, its handling had been delayed).
+ * Any other state is left untouched.
+ */
+static void iser_cm_conn_established(struct rdma_cm_event *ev)
+{
+	struct rdma_cm_id *cm_id = ev->id;
+	struct iser_conn *conn = cm_id->context;
+
+	switch (conn->h.state) {
+	case STATE_START:
+		conn->h.state = STATE_READY;
+		eprintf("conn:%p cm_id:%p, %s -> %s, established\n",
+			&conn->h, cm_id, conn->peer_name, conn->self_name);
+		break;
+	case STATE_READY:
+		eprintf("conn:%p cm_id:%p, %s -> %s, "
+			"execute delayed login_rx now\n",
+			&conn->h, cm_id, conn->peer_name, conn->self_name);
+		iser_login_rx(conn->login_rx_task);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Handle RDMA_CM_EVENT_DISCONNECTED or an equivalent event:
+ * log it and start closing the target's side of the connection.
+ */
+static void iser_cm_disconnected(struct rdma_cm_event *ev)
+{
+	struct iser_conn *conn = ev->id->context;
+	enum rdma_cm_event_type what = ev->event;
+
+	eprintf("conn:%p cm_id:%p event:%d, %s\n", &conn->h, ev->id,
+		what, rdma_event_str(what));
+	iser_conn_close(conn);
+}
+
+/*
+ * Handle RDMA_CM_EVENT_TIMEWAIT_EXIT, which is expected to be the last
+ * event in the lifecycle of a connection: it has been shut down and the
+ * network has cleared from the remaining in-flight messages.
+ */
+static void iser_cm_timewait_exit(struct rdma_cm_event *ev)
+{
+	struct iser_conn *conn = ev->id->context;
+
+	eprintf("conn:%p cm_id:%p\n", &conn->h, ev->id);
+
+	/* Drops the reference taken just before the connection was
+	   accepted; typically this is the last one, so the connection is
+	   released right away. */
+	iser_conn_put(conn);
+}
+
+/*
+ * Handle RDMA CM events.
+ *
+ * Fetches one event from rdma_evt_channel, dispatches it by type to the
+ * matching iser_cm_*() handler (or just logs it when it is ignored),
+ * and acknowledges the event afterwards. The fd, events and data
+ * arguments are unused.
+ */
+static void iser_handle_rdmacm(int fd __attribute__ ((unused)),
+			       int events __attribute__ ((unused)),
+			       void *data __attribute__ ((unused)))
+{
+	struct rdma_cm_event *ev;
+	enum rdma_cm_event_type ev_type;
+	int err;
+
+	err = rdma_get_cm_event(rdma_evt_channel, &ev);
+	if (err) {
+		eprintf("rdma_get_cm_event failed, %m\n");
+		return;
+	}
+
+	VALGRIND_MAKE_MEM_DEFINED(ev, sizeof(*ev));
+
+	/* keep a copy of the type: it is still needed for the log message
+	   after the event has been acked */
+	ev_type = ev->event;
+	switch (ev_type) {
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+		iser_cm_connect_request(ev);
+		break;
+
+	case RDMA_CM_EVENT_ESTABLISHED:
+		iser_cm_conn_established(ev);
+		break;
+
+	/* all of these are treated like a disconnect */
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+	case RDMA_CM_EVENT_REJECTED:
+	case RDMA_CM_EVENT_ADDR_CHANGE:
+	case RDMA_CM_EVENT_DISCONNECTED:
+		iser_cm_disconnected(ev);
+		break;
+
+	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+		iser_cm_timewait_exit(ev);
+		break;
+
+	case RDMA_CM_EVENT_MULTICAST_JOIN:
+	case RDMA_CM_EVENT_MULTICAST_ERROR:
+		eprintf("UD-related event:%d, %s - ignored\n", ev_type,
+			rdma_event_str(ev_type));
+		break;
+
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		eprintf("Unsupported event:%d, %s - ignored\n", ev_type,
+			rdma_event_str(ev_type));
+		break;
+
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+	case RDMA_CM_EVENT_ADDR_ERROR:
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+	case RDMA_CM_EVENT_CONNECT_RESPONSE:
+	case RDMA_CM_EVENT_UNREACHABLE:
+		eprintf("Active side event:%d, %s - ignored\n", ev_type,
+			rdma_event_str(ev_type));
+		break;
+
+	default:
+		eprintf("Illegal event:%d - ignored\n", ev_type);
+		break;
+	}
+
+	/* every event obtained from the channel is acked, handled or not */
+	err = rdma_ack_cm_event(ev);
+	if (err)
+		eprintf("ack cm event failed, %s\n", rdma_event_str(ev_type));
+}
+
+/*
+ * Queue the task (through its exec_list node) at the tail of the
+ * connection's iosubmit_list and make sure the iosubmit event is
+ * scheduled.
+ */
+static inline void schedule_task_iosubmit(struct iser_task *task,
+					  struct iser_conn *conn)
+{
+	list_add_tail(&task->exec_list, &conn->iosubmit_list);
+	tgt_add_sched_event(&conn->sched_iosubmit);
+
+	/* keep a space before "cmdsn": its leading 'c' would otherwise
+	   read as one more hex digit of the tag */
+	dprintf("task:%p tag:0x%04"PRIx64 " cmdsn:0x%x\n",
+		task, task->tag, task->cmd_sn);
+}
+
+/*
+ * Queue the task (through its rdma_list node) at the tail of the
+ * connection's rdma_rd_list and make sure the rdma-read event is
+ * scheduled.
+ */
+static inline void schedule_rdma_read(struct iser_task *task,
+				      struct iser_conn *conn)
+{
+	list_add_tail(&task->rdma_list, &conn->rdma_rd_list);
+	tgt_add_sched_event(&conn->sched_rdma_rd);
+
+	/* keep a space before "cmdsn": its leading 'c' would otherwise
+	   read as one more hex digit of the tag */
+	dprintf("task:%p tag:0x%04"PRIx64 " cmdsn:0x%x\n",
+		task, task->tag, task->cmd_sn);
+}
+
+/*
+ * Queue the task (through its tx_list node) at the tail of the
+ * connection's resp_tx_list and make sure the tx event is scheduled.
+ */
+static inline void schedule_resp_tx(struct iser_task *task,
+				    struct iser_conn *conn)
+{
+	list_add_tail(&task->tx_list, &conn->resp_tx_list);
+	tgt_add_sched_event(&conn->sched_tx);
+
+	/* keep a space before "cmdsn": its leading 'c' would otherwise
+	   read as one more hex digit of the tag */
+	dprintf("task:%p tag:0x%04"PRIx64 " cmdsn:0x%x\n",
+		task, task->tag, task->cmd_sn);
+}
+
+/*
+ * Queue the task (through its recv_list node) at the tail of the
+ * connection's post_recv_list and make sure the post-recv event is
+ * scheduled.
+ */
+static inline void schedule_post_recv(struct iser_task *task,
+				      struct iser_conn *conn)
+{
+	list_add_tail(&task->recv_list, &conn->post_recv_list);
+	tgt_add_sched_event(&conn->sched_post_recv);
+
+	/* keep a space before "cmdsn": its leading 'c' would otherwise
+	   read as one more hex digit of the tag */
+	dprintf("task:%p tag:0x%04"PRIx64 " cmdsn:0x%x\n",
+		task, task->tag, task->cmd_sn);
+}
+
+/*
+ * Execute a logout request: build the logout response in the task's
+ * BHS, queue it on the connection's resp_tx_list and run the tx
+ * handler immediately instead of waiting for the scheduled event.
+ * Always returns 0.
+ */
+static int iser_logout_exec(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct iscsi_session *session = conn->h.session;
+	struct iscsi_logout_rsp *rsp_bhs =
+		(struct iscsi_logout_rsp *) task->pdu.bhs;
+
+	/* the response is built in place, over the request header */
+	memset(rsp_bhs, 0, BHS_SIZE);
+	rsp_bhs->opcode = ISCSI_OP_LOGOUT_RSP;
+	rsp_bhs->flags = ISCSI_FLAG_CMD_FINAL;
+	rsp_bhs->response = ISCSI_LOGOUT_SUCCESS;
+	rsp_bhs->itt = task->tag;
+	rsp_bhs->statsn = cpu_to_be32(conn->h.stat_sn++);
+
+	/* a non-immediate logout carrying the expected CmdSN advances it */
+	if (session->exp_cmd_sn == task->cmd_sn && !task->is_immediate)
+		session->exp_cmd_sn++;
+	iser_set_rsp_stat_sn(session, task->pdu.bhs);
+
+	/* no AHS and no data segment in the response */
+	task->pdu.ahssize = 0;
+	task->pdu.membuf.size = 0;
+
+	dprintf("rsp task:%p op:%0x itt:%0x statsn:%0x\n",
+		task, (unsigned int)rsp_bhs->opcode,
+		(unsigned int)rsp_bhs->itt,
+		(unsigned int)rsp_bhs->statsn);
+
+	/* add the logout resp to tx list, and force all scheduled tx now */
+	list_add_tail(&task->tx_list, &conn->resp_tx_list);
+	/* keep a space before "cmdsn": its leading 'c' would otherwise
+	   read as one more hex digit of the tag */
+	dprintf("add to tx list, logout resp task:%p tag:0x%04"PRIx64 " cmdsn:0x%x\n",
+		task, task->tag, task->cmd_sn);
+	tgt_remove_sched_event(&conn->sched_tx);
+	iser_sched_tx(&conn->sched_tx);
+
+	return 0;
+}
+
+/*
+ * Answer a NOP-Out: turn the task's BHS into a NOP-In carrying the
+ * same ITT and the NOP-Out's data, then queue it for transmission.
+ * StatSN is advanced only when the ITT is not the reserved tag.
+ * Always returns 0.
+ */
+static int iser_nop_out_exec(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct iscsi_nopin *nopin = (struct iscsi_nopin *) task->pdu.bhs;
+
+	/* fixed header fields */
+	nopin->opcode = ISCSI_OP_NOOP_IN;
+	nopin->flags = ISCSI_FLAG_CMD_FINAL;
+	nopin->rsvd2 = 0;
+	nopin->rsvd3 = 0;
+	memset(nopin->lun, 0, sizeof(nopin->lun));
+
+	/* tags */
+	nopin->itt = task->tag;
+	nopin->ttt = ISCSI_RESERVED_TAG;
+
+	/* sequence numbers */
+	nopin->statsn = cpu_to_be32(conn->h.stat_sn);
+	if (task->tag != ISCSI_RESERVED_TAG)
+		conn->h.stat_sn++;
+	iser_set_rsp_stat_sn(conn->h.session, task->pdu.bhs);
+
+	memset(nopin->rsvd4, 0, sizeof(nopin->rsvd4));
+
+	/* no AHS; ping back nop-out data */
+	task->pdu.ahssize = 0;
+	task->pdu.membuf.size = task->out_len;
+
+	schedule_resp_tx(task, conn);
+	return 0;
+}
+
+/*
+ * Build a NOP-In with both ITT and TTT set to the reserved tag and no
+ * data in the given task, and queue it for transmission. StatSN is
+ * written into the PDU but not advanced. Always returns 0.
+ */
+static int iser_send_ping_nop_in(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct iscsi_nopin *nopin = (struct iscsi_nopin *) task->pdu.bhs;
+
+	/* mark the task itself as a NOP-In with the reserved tag */
+	task->opcode = ISCSI_OP_NOOP_IN;
+	task->tag = ISCSI_RESERVED_TAG;
+
+	/* fixed header fields */
+	nopin->opcode = ISCSI_OP_NOOP_IN;
+	nopin->flags = ISCSI_FLAG_CMD_FINAL;
+	nopin->rsvd2 = 0;
+	nopin->rsvd3 = 0;
+	memset(nopin->lun, 0, sizeof(nopin->lun));
+
+	/* tags */
+	nopin->itt = ISCSI_RESERVED_TAG;
+	nopin->ttt = ISCSI_RESERVED_TAG;
+
+	/* sequence numbers */
+	nopin->statsn = cpu_to_be32(conn->h.stat_sn);
+	iser_set_rsp_stat_sn(conn->h.session, task->pdu.bhs);
+
+	memset(nopin->rsvd4, 0, sizeof(nopin->rsvd4));
+
+	/* header only: no AHS, no data segment */
+	task->pdu.ahssize = 0;
+	task->pdu.membuf.size = 0;
+
+	dprintf("task:%p conn:%p\n", task, &conn->h);
+
+	schedule_resp_tx(task, conn);
+	return 0;
+}
+
+/*
+static int iser_send_reject(struct iser_task *task, uint8 reason)
+{
+	struct iser_conn *conn = task->conn;
+	struct iscsi_session *session = conn->h.session;
+	struct iscsi_hdr *req = (struct iscsi_hdr *) task->req.bhs;
+	struct iscsi_reject *rsp = (struct iscsi_reject *) task->rsp.bhs;
+
+	memset(rsp, 0, BHS_SIZE);
+	rsp->opcode = ISCSI_OP_REJECT;
+	rsp->flags = ISCSI_FLAG_CMD_FINAL;
+	rsp->reason = reason;
+	hton24(rsp->dlength, sizeof(struct iser_hdr));
+	rsp->ffffffff = 0xffffffff;
+	rsp->itt = req->itt;
+	rsp->statsn = cpu_to_be32(conn->h.stat_sn++);
+	iser_set_rsp_stat_sn(session, task->rsp.bhs);
+
+	task->rsp.ahssize = 0;
+	task->rsp.membuf.addr = task->req.bhs;
+	task->rsp.membuf.size = sizeof(struct iser_hdr);
+
+	schedule_resp_tx(task, conn);
+	return 0;
+}
+*/
+
+/*
+ * Execute a task management function request.
+ *
+ * The iSCSI TMF code found in the request flags is translated to the
+ * target core's function code and handed to target_mgmt_request().
+ * Unsupported or unknown functions are not forwarded: the iSCSI TMF
+ * response code is stored in task->result and also returned.
+ *
+ * Returns 0 when the request was forwarded to the target core,
+ * otherwise ISCSI_TMF_RSP_NOT_SUPPORTED or ISCSI_TMF_RSP_REJECTED.
+ */
+static int iser_tm_exec(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct iscsi_session *session = conn->h.session;
+	struct iscsi_tm *req_bhs = (struct iscsi_tm *) task->pdu.bhs;
+	int fn = 0;
+	int err = 0;
+
+	/* the last sequence number printed is exp_statsn, label it so */
+	eprintf("conn:%p TM itt:0x%x cmdsn:0x%x "
+		"ref.itt:0x%x ref.cmdsn:0x%x "
+		"exp.statsn:0x%x "
+		"lun:0x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+		&conn->h,
+		be32_to_cpu(req_bhs->itt), be32_to_cpu(req_bhs->cmdsn),
+		be32_to_cpu(req_bhs->rtt), be32_to_cpu(req_bhs->refcmdsn),
+		be32_to_cpu(req_bhs->exp_statsn),
+		req_bhs->lun[0], req_bhs->lun[1], req_bhs->lun[2], req_bhs->lun[3],
+		req_bhs->lun[4], req_bhs->lun[5], req_bhs->lun[6], req_bhs->lun[7]);
+
+	switch (req_bhs->flags & ISCSI_FLAG_TM_FUNC_MASK) {
+	case ISCSI_TM_FUNC_ABORT_TASK:
+		fn = ABORT_TASK;
+		break;
+	case ISCSI_TM_FUNC_ABORT_TASK_SET:
+		fn = ABORT_TASK_SET;
+		break;
+	case ISCSI_TM_FUNC_CLEAR_ACA:
+		/* was mapped to CLEAR_TASK_SET (and vice versa) by mistake */
+		fn = CLEAR_ACA;
+		break;
+	case ISCSI_TM_FUNC_CLEAR_TASK_SET:
+		fn = CLEAR_TASK_SET;
+		break;
+	case ISCSI_TM_FUNC_LOGICAL_UNIT_RESET:
+		fn = LOGICAL_UNIT_RESET;
+		break;
+	case ISCSI_TM_FUNC_TARGET_WARM_RESET:
+	case ISCSI_TM_FUNC_TARGET_COLD_RESET:
+	case ISCSI_TM_FUNC_TASK_REASSIGN:
+		err = ISCSI_TMF_RSP_NOT_SUPPORTED;
+		eprintf("unsupported TMF %d\n",
+			req_bhs->flags & ISCSI_FLAG_TM_FUNC_MASK);
+		break;
+	default:
+		err = ISCSI_TMF_RSP_REJECTED;
+		eprintf("unknown TMF %d\n",
+			req_bhs->flags & ISCSI_FLAG_TM_FUNC_MASK);
+		break;
+	}
+
+	if (err)
+		task->result = err;
+	else {
+		int ret;
+		ret = target_mgmt_request(session->target->tid, session->tsih,
+					  (unsigned long) task, fn, req_bhs->lun,
+					  req_bhs->rtt, 0);
+		set_task_in_scsi(task);
+		switch (ret) {
+		case MGMT_REQ_QUEUED:
+			/* still owned by the target core */
+			break;
+		case MGMT_REQ_FAILED:
+		case MGMT_REQ_DONE:
+			clear_task_in_scsi(task);
+			break;
+		}
+	}
+	return err;
+}
+
+/*
+ * Completion callback for a task management request. mreq->mid carries
+ * the originating task; its BHS is rewritten into a TMF response whose
+ * response code is derived from mreq->result, and the task is queued
+ * for transmission. Always returns 0.
+ */
+static int iser_tm_done(struct mgmt_req *mreq)
+{
+	struct iser_task *task = (struct iser_task *) (unsigned long) mreq->mid;
+	struct iser_conn *conn = task->conn;
+	struct iscsi_session *session = conn->h.session;
+	struct iscsi_tm_rsp *tm_rsp = (struct iscsi_tm_rsp *) task->pdu.bhs;
+
+	memset(tm_rsp, 0, sizeof(*tm_rsp));
+	tm_rsp->opcode = ISCSI_OP_SCSI_TMFUNC_RSP;
+	tm_rsp->flags = ISCSI_FLAG_CMD_FINAL;
+	tm_rsp->itt = task->tag;
+
+	if (mreq->result == 0)
+		tm_rsp->response = ISCSI_TMF_RSP_COMPLETE;
+	else if (mreq->result == -EINVAL)
+		tm_rsp->response = ISCSI_TMF_RSP_NOT_SUPPORTED;
+	else if (mreq->result == -EEXIST)
+		/* the command completed or could not be found,
+		   so "no task" is returned here */
+		tm_rsp->response = ISCSI_TMF_RSP_NO_TASK;
+	else
+		tm_rsp->response = ISCSI_TMF_RSP_REJECTED;
+
+	tm_rsp->statsn = cpu_to_be32(conn->h.stat_sn++);
+	iser_set_rsp_stat_sn(session, task->pdu.bhs);
+
+	/* Must be zero according to 10.6.3 */
+	task->pdu.ahssize = 0;
+	task->pdu.membuf.size = 0;
+
+	schedule_resp_tx(task, conn);
+
+	return 0;
+}
+
+/*
+ * Allocate the RDMA data buffer a SCSI command task needs and schedule
+ * its next processing step.
+ *
+ *  - write task with data to fetch (rdma_rd_sz > 0): one RDMA buffer is
+ *    allocated, attached to the task and an RDMA-Read is scheduled;
+ *  - write task with nothing to fetch but unsol_remains > 0: nothing is
+ *    scheduled here;
+ *  - read task with in_len > 0: one RDMA buffer is allocated and
+ *    attached, then the task is scheduled for I/O submission;
+ *  - anything else goes straight to I/O submission.
+ *
+ * Returns 0 on success, -E2BIG when the transfer does not fit into a
+ * single buffer of membuf_size, -ENOMEM when no RDMA buffer is
+ * available.
+ */
+static int iser_scsi_cmd_buf_alloc(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct iser_membuf *rdma_buf;
+	size_t rdma_sz;
+
+	/* when a buffer needs to be allocated and it fails, leave task on
+	   the list waiting until a new buffer becomes available */
+	if (task->is_write) {
+		if (task->out_len > 0) {
+			rdma_sz = task->rdma_rd_sz;
+			if (rdma_sz > 0) {
+				/* ToDo: multiple RDMA-Read buffers */
+				if (unlikely(rdma_sz > membuf_size)) {
+					eprintf("conn:%p task:%p tag:0x%04"PRIx64 ", "
+						"rdma-rd size:%zu too big\n",
+						&conn->h, task, task->tag,
+						rdma_sz);
+					return -E2BIG;
+				}
+				rdma_buf = iser_alloc_rdma_buf(conn, rdma_sz);
+				if (unlikely(rdma_buf == NULL)) {
+					eprintf("conn:%p task:%p tag:0x%04"PRIx64 ", "
+						"free list empty\n",
+						&conn->h, task, task->tag);
+					return -ENOMEM;
+				}
+				dprintf("out-buf:%p sz:%u rdma:%d\n",
+					rdma_buf->addr, rdma_buf->size, rdma_buf->rdma);
+				/* ToDo: multiple RDMA-Read buffers */
+				/* NOTE(review): unsol_sz is presumably the offset
+				   at which the RDMA data follows the unsolicited
+				   data - confirm against the helper */
+				iser_task_add_out_rdma_buf(task, rdma_buf,
+							   task->unsol_sz);
+				schedule_rdma_read(task, conn);
+				return 0;
+			}
+			/* nothing to RDMA-Read, but unsolicited data is still
+			   outstanding: do not submit the command yet */
+			if (task->unsol_remains > 0)
+				return 0;
+		}
+	} /* ToDo: bi-dir */
+	else if (task->is_read) {
+		if (task->in_len > 0) {
+			rdma_sz = task->rdma_wr_sz;
+			if (unlikely(rdma_sz > membuf_size)) {
+				eprintf("conn:%p task:%p tag:0x%04"PRIx64 ", "
+					"rdma-wr size:%zu too big\n",
+					&conn->h, task, task->tag,
+					rdma_sz);
+				return -E2BIG;
+			}
+			rdma_buf = iser_alloc_rdma_buf(conn, rdma_sz);
+			if (unlikely(rdma_buf == NULL)) {
+				eprintf("conn:%p task:%p tag:0x%04"PRIx64 ", "
+					"free list empty\n",
+					&conn->h, task, task->tag);
+				return -ENOMEM;
+			}
+			dprintf("in-buf:%p sz:%u rdma:%d\n",
+				rdma_buf->addr, rdma_buf->size, rdma_buf->rdma);
+			/* ToDo: multiple rdma write bufs */
+			iser_task_add_in_rdma_buf(task, rdma_buf, 0);
+		}
+	}
+	/* every path that reaches this point is ready for I/O submission */
+	schedule_task_iosubmit(task, conn);
+	return 0;
+}
+
+/*
+ * Translate the task attribute bits of the iSCSI command flags into the
+ * matching MSG_*_TAG value. Untagged and simple both map to
+ * MSG_SIMPLE_TAG; ordered and any unrecognized value map to
+ * MSG_ORDERED_TAG.
+ */
+static int scsi_cmd_attr(unsigned int flags)
+{
+	unsigned int task_attr = flags & ISCSI_FLAG_CMD_ATTR_MASK;
+
+	if (task_attr == ISCSI_ATTR_UNTAGGED ||
+	    task_attr == ISCSI_ATTR_SIMPLE)
+		return MSG_SIMPLE_TAG;
+
+	if (task_attr == ISCSI_ATTR_HEAD_OF_QUEUE)
+		return MSG_HEAD_TAG;
+
+	/* ISCSI_ATTR_ORDERED and everything else */
+	return MSG_ORDERED_TAG;
+}
+
+/*
+ * Give back all Data-Out tasks collected on task->dout_task_list:
+ * each one is unlinked, its PDU buffer is removed from the parent
+ * task's out buffers, and it is queued for re-posting as a receive.
+ */
+static void iser_task_free_dout_tasks(struct iser_task *task)
+{
+	struct iser_task *cur, *lookahead;
+	struct iser_conn *conn = task->conn;
+
+	/* the _safe variant is needed: entries are unlinked while walking */
+	list_for_each_entry_safe(cur, lookahead, &task->dout_task_list,
+				 dout_task_list) {
+		list_del(&cur->dout_task_list);
+		iser_task_del_out_buf(task, &cur->pdu.membuf);
+		schedule_post_recv(cur, conn);
+	}
+}
+
+/*
+ * Fill in the task's scsi_cmd from the iSCSI command PDU and the data
+ * buffers attached to the task, then queue it to the target core.
+ *
+ * For a write with more than one out buffer, the contents of the
+ * non-RDMA buffers are first copied to the start of the last buffer on
+ * the list, so the command sees one contiguous out buffer, and the
+ * Data-Out tasks are given back.
+ */
+static void iser_scsi_cmd_iosubmit(struct iser_task *task)
+{
+	struct iscsi_cmd *req_bhs = (struct iscsi_cmd *) task->pdu.bhs;
+	struct scsi_cmd *scmd = &task->scmd;
+	struct iser_conn *conn = task->conn;
+	struct iscsi_session *session = conn->h.session;
+	struct iser_membuf *data_buf;
+
+	/* ToDo: implement iser_task_handle_ahs(itask); */
+
+	scmd->state = 0;
+	scsi_set_in_length(scmd, 0);
+	scsi_set_out_length(scmd, 0);
+
+	if (task->is_write) {
+		scsi_set_out_resid(scmd, 0);
+		/* It's either the last buffer, which is RDMA,
+		   or the only buffer */
+		data_buf = list_entry(task->out_buf_list.prev,
+				      struct iser_membuf, task_list);
+		/* ToDo: multiple RDMA-Write buffers */
+		if (task->out_buf_num > 1) {
+			unsigned char *ptr = data_buf->addr;
+			struct iser_membuf *cur_buf;
+
+			/* gather the leading non-RDMA buffers into the
+			   front of data_buf, stopping at the RDMA buffer */
+			list_for_each_entry(cur_buf, &task->out_buf_list, task_list) {
+				if (cur_buf->rdma)
+					break;
+				memcpy(ptr, cur_buf->addr, cur_buf->size);
+				ptr += cur_buf->size;
+			}
+			iser_task_free_dout_tasks(task);
+		}
+
+		scsi_set_out_buffer(scmd, data_buf->addr);
+		scsi_set_out_length(scmd, task->out_len);
+	}
+	if (task->is_read) {
+		scsi_set_in_resid(scmd, 0);
+		/* ToDo: multiple RDMA-Read buffers */
+		data_buf = list_entry(task->in_buf_list.next,
+				      struct iser_membuf, task_list);
+		scsi_set_in_buffer(scmd, data_buf->addr);
+		scsi_set_in_length(scmd, task->in_len);
+	}
+
+	/* the CDB is referenced in place inside the request header */
+	scmd->cmd_itn_id = session->tsih;
+	scmd->scb = req_bhs->cdb;
+	scmd->scb_len = sizeof(req_bhs->cdb);
+	memcpy(scmd->lun, req_bhs->lun, sizeof(scmd->lun));
+	scmd->attribute = scsi_cmd_attr(req_bhs->flags);
+	scmd->tag = task->tag;
+	scmd->result = 0;
+	scmd->mreq = NULL;
+	scmd->sense_len = 0;
+
+	dprintf("task:%p tag:0x%04"PRIx64 "\n", task, task->tag);
+
+	/* ToDo: handle bidi ahs and allocations
+	   the problem is that there is only one buffer in scsi command */
+
+	set_task_in_scsi(task);
+	target_cmd_queue(session->target->tid, scmd);
+}
+
+/*
+ * Record a read residual in a SCSI response header: a positive
+ * in_resid is flagged as underflow, anything else as overflow, and its
+ * magnitude is stored big-endian in residual_count. The bidirectional
+ * residual count is cleared.
+ */
+static void iser_rsp_set_read_resid(struct iscsi_cmd_rsp *rsp_bhs,
+				    int in_resid)
+{
+	uint32_t magnitude;
+
+	if (in_resid > 0) {
+		rsp_bhs->flags |= ISCSI_FLAG_CMD_UNDERFLOW;
+		magnitude = (uint32_t)in_resid;
+	} else {
+		rsp_bhs->flags |= ISCSI_FLAG_CMD_OVERFLOW;
+		magnitude = (uint32_t)-in_resid;
+	}
+	rsp_bhs->residual_count = cpu_to_be32(magnitude);
+	rsp_bhs->bi_residual_count = 0;
+}
+
+/*
+ * Record the read residual of a bidirectional command in a SCSI
+ * response header: a positive in_resid is flagged as bidi underflow,
+ * anything else as bidi overflow, and its magnitude is stored
+ * big-endian in bi_residual_count. The plain residual count is cleared.
+ */
+static void iser_rsp_set_bidir_resid(struct iscsi_cmd_rsp *rsp_bhs,
+				     int in_resid)
+{
+	uint32_t magnitude;
+
+	if (in_resid > 0) {
+		rsp_bhs->flags |= ISCSI_FLAG_CMD_BIDI_UNDERFLOW;
+		magnitude = (uint32_t)in_resid;
+	} else {
+		rsp_bhs->flags |= ISCSI_FLAG_CMD_BIDI_OVERFLOW;
+		magnitude = (uint32_t)-in_resid;
+	}
+	rsp_bhs->bi_residual_count = cpu_to_be32(magnitude);
+	rsp_bhs->residual_count = 0;
+}
+
+/*
+ * Completion callback for a SCSI command: the owning task is recovered
+ * from scmd, its BHS is rewritten into a SCSI response (status,
+ * residuals, optional sense data) and the task is queued for
+ * transmission. If the connection is no longer in STATE_FULL the task
+ * is just completed and no response is built. Always returns 0.
+ */
+static int iser_scsi_cmd_done(uint64_t nid, int result,
+			      struct scsi_cmd *scmd)
+{
+	struct iser_task *task = container_of(scmd, struct iser_task, scmd);
+	struct iscsi_cmd_rsp *rsp_bhs =
+		(struct iscsi_cmd_rsp *) task->pdu.bhs;
+	struct iser_conn *conn = task->conn;
+	struct iscsi_session *session = conn->h.session;
+	unsigned char sense_len = scmd->sense_len;
+	int in_resid;
+
+	assert(nid == scmd->cmd_itn_id);
+
+	if (unlikely(conn->h.state != STATE_FULL)) {
+		/* Connection is closed, but its resources are not released
+		   to allow receiving completion of such late tasks.
+		   When all tasks are released, and connection refcnt
+		   drops to zero, then all the resources can be freed. */
+		iser_complete_task(task);
+		return 0;
+	}
+
+	/* the response header is built in place, over the request header */
+	rsp_bhs->opcode = ISCSI_OP_SCSI_CMD_RSP;
+	rsp_bhs->flags = ISCSI_FLAG_CMD_FINAL;
+	rsp_bhs->response = ISCSI_STATUS_CMD_COMPLETED;
+	rsp_bhs->cmd_status = scsi_get_result(scmd);
+	*((uint64_t *) rsp_bhs->rsvd) = (uint64_t) 0;
+	rsp_bhs->itt = task->tag;
+	rsp_bhs->rsvd1 = 0;
+	rsp_bhs->statsn = cpu_to_be32(conn->h.stat_sn++);
+	iser_set_rsp_stat_sn(session, task->pdu.bhs);
+	rsp_bhs->exp_datasn = 0;
+
+	if (task->is_read) {
+		in_resid = scsi_get_in_resid(scmd);
+		if (in_resid != 0) {
+			if (!task->is_write)
+				iser_rsp_set_read_resid(rsp_bhs, in_resid);
+			else
+				iser_rsp_set_bidir_resid(rsp_bhs, in_resid);
+			/* on underflow only the data actually produced is
+			   left to be RDMA-written */
+			if (in_resid > 0)
+				task->rdma_wr_remains = task->in_len - in_resid;
+		}
+		/* schedule_rdma_write(task, conn); // ToDo: need this? */
+	} else {
+		rsp_bhs->bi_residual_count = 0;
+		rsp_bhs->residual_count = 0;
+	}
+	/* no AHS; the data segment stays empty unless sense data follows */
+	task->pdu.ahssize = 0;
+	task->pdu.membuf.size = 0;
+
+	if (unlikely(sense_len > 0)) {
+		struct iscsi_sense_data *sense =
+		    (struct iscsi_sense_data *) task->pdu.membuf.addr;
+		sense->length = cpu_to_be16(sense_len);
+
+		/* ToDo: need to fix the ISCSI_PARAM_INITIATOR_RDSL bug in initiator */
+		assert(sense_len + 2 <= conn->rsize);
+
+		memcpy(sense->data, scmd->sense_buffer, sense_len);
+		task->pdu.membuf.size = sense_len + sizeof(*sense);
+	}
+	dprintf("task:%p tag:0x%04"PRIx64 "status:%x statsn:%d flags:0x%x "
+		"in_len:%d out_len:%d rsd:%d birsd:%d\n",
+		task, task->tag, rsp_bhs->cmd_status,
+		conn->h.stat_sn-1, rsp_bhs->flags,
+		scsi_get_in_length(scmd), scsi_get_out_length(scmd),
+		ntohl(rsp_bhs->residual_count),
+		ntohl(rsp_bhs->bi_residual_count));
+
+	schedule_resp_tx(task, conn);
+
+	return 0;
+}
+
+/*
+static int iser_task_handle_ahs(struct iser_task *task)
+{
+	struct scsi_cmd *scmd = &task->scmd;
+	struct iscsi_connection *conn = task->conn;
+	struct iscsi_cmd *req_bhs = (struct iscsi_cmd *) &task->req.;
+	uint32_t data_len;
+	uint8_t *ahs;
+	int ahslen;
+	enum data_direction dir = scsi_get_data_dir(scmd);
+
+	ahs = task->ahs;
+	ahslen = req_bhs->hlength * 4;
+	if (ahslen >= 4) {
+		struct iscsi_ecdb_ahdr *ahs_extcdb = (void *) ahs;
+
+		if (ahs_extcdb->ahstype == ISCSI_AHSTYPE_CDB) {
+			int extcdb_len = ntohs(ahs_extcdb->ahslength) - 1;
+			unsigned char *p = (void *)task->extdata;
+
+			if (4 + extcdb_len > ahslen) {
+				eprintf("AHS len:%d too short for extcdb %d\n",
+					ahslen, extcdb_len);
+				return -EINVAL;
+			}
+			if (extcdb_len + sizeof(req_bhs->cdb) > 260) {
+				eprintf("invalid extcdb len:%d\n", extcdb_len);
+
+				return -EINVAL;
+			}
+
+			memcpy(p, req_bhs->cdb, sizeof(req_bhs->cdb));
+			memmove(p + sizeof(req_bhs->cdb), ahs_extcdb->ecdb,
+				extcdb_len);
+
+			scmd->scb = p;
+			scmd->scb_len = sizeof(req_bhs->cdb) + extcdb_len;
+
+			ahs += 4 + extcdb_len;
+			ahslen -= 4 + extcdb_len;
+		}
+	}
+
+	if (dir == DATA_BIDIRECTIONAL && ahslen >= 8) {
+		struct iscsi_rlength_ahdr *ahs_bidi = (void *) ahs;
+		if (ahs_bidi->ahstype == ISCSI_AHSTYPE_RLENGTH) {
+			uint32_t in_length = ntohl(ahs_bidi->read_length);
+
+			dprintf("bidi read len %u\n", in_length);
+
+			if (in_length) {
+				uint32_t len;
+				void *buf;
+
+				len = roundup(in_length,
+					      conn->tp->data_padding);
+				buf = conn->tp->alloc_data_buf(conn, len);
+				if (!buf)
+					return -ENOMEM;
+
+				scsi_set_in_buffer(scmd, buf);
+				scsi_set_in_length(scmd, in_length);
+			}
+		}
+	}
+}
+*/
+
+/*
+ * Scheduled-event callback: retry buffer allocation for the tasks
+ * waiting on the connection's buf_alloc_list, in order.
+ *
+ * Processing stops at the first task that cannot be served; that task
+ * stays at the head of the list. Any failure other than -ENOMEM also
+ * closes the connection.
+ */
+static void iser_sched_buf_alloc(struct event_data *evt)
+{
+	struct iser_conn *conn = (struct iser_conn *) evt->data;
+	struct iser_task *task;
+	int err;
+
+	while (!list_empty(&conn->buf_alloc_list)) {
+		task = list_first_entry(&conn->buf_alloc_list,
+					struct iser_task,
+					exec_list);
+		/* Unlink BEFORE allocating: on success
+		   iser_scsi_cmd_buf_alloc() may queue the task on
+		   iosubmit_list through the very same exec_list node.
+		   Unlinking afterwards would remove it from iosubmit_list
+		   instead and leave buf_alloc_list corrupted. */
+		list_del(&task->exec_list);
+		err = iser_scsi_cmd_buf_alloc(task);
+		if (unlikely(err)) {
+			/* not served: keep it first in line for the next try */
+			list_add(&task->exec_list, &conn->buf_alloc_list);
+			if (err != -ENOMEM)
+				iser_conn_close(conn);
+			break;
+		}
+	}
+}
+
+/*
+ * Return 1 when the command's opcode (first CDB byte) is one of the
+ * READ, WRITE or SYNCHRONIZE CACHE variants listed below, 0 otherwise.
+ */
+static inline int task_batch_type(struct iser_task *task)
+{
+	struct iscsi_cmd *cmd_hdr = (struct iscsi_cmd *) task->pdu.bhs;
+	int batched = 0;
+
+	switch (cmd_hdr->cdb[0]) {
+	case READ_6:
+	case READ_10:
+	case READ_12:
+	case READ_16:
+	case WRITE_6:
+	case WRITE_10:
+	case WRITE_12:
+	case WRITE_16:
+	case SYNCHRONIZE_CACHE:
+	case SYNCHRONIZE_CACHE_16:
+		batched = 1;
+		break;
+	default:
+		break;
+	}
+	return batched;
+}
+
+/*
+ * Scheduled-event callback: drain the connection's iosubmit_list,
+ * submitting each task to the SCSI layer in queue order.
+ */
+static void iser_sched_iosubmit(struct event_data *evt)
+{
+	struct iser_conn *conn = (struct iser_conn *) evt->data;
+
+	for (;;) {
+		struct iser_task *next;
+
+		if (list_empty(&conn->iosubmit_list))
+			break;
+		next = list_first_entry(&conn->iosubmit_list,
+					struct iser_task, exec_list);
+		/* unlink first, then hand the task over */
+		list_del(&next->exec_list);
+		iser_scsi_cmd_iosubmit(next);
+	}
+}
+
+/*
+ * Scheduled-event callback: post RDMA-Read requests for all tasks on
+ * the connection's rdma_rd_list.
+ *
+ * The tasks are taken off the list, their rdmad work requests are
+ * linked into a single chain and the chain is posted with one
+ * iser_post_send() call. Nothing is done unless the connection is in
+ * STATE_FULL.
+ */
+static void iser_sched_rdma_rd(struct event_data *evt)
+{
+	struct iser_conn *conn = (struct iser_conn *) evt->data;
+	struct iser_work_req *first_wr = NULL;
+	struct iser_task *prev_task = NULL;
+	struct iser_task *task = NULL;
+	int num_reqs = 0;
+	int err;
+
+	if (unlikely(conn->h.state != STATE_FULL)) {
+		dprintf("conn:%p closing, ignoring rdma_rd\n", conn);
+		/* ToDo: free all tasks and buffers */
+		return;
+	}
+
+	while (!list_empty(&conn->rdma_rd_list)) {
+		task = list_first_entry(&conn->rdma_rd_list,
+					struct iser_task, rdma_list);
+		list_del(&task->rdma_list);
+
+		iser_prep_rdma_rd_send_req(task, NULL, 1);
+		/* remember the head of the chain, or append to its tail */
+		if (first_wr == NULL)
+			first_wr = &task->rdmad;
+		else
+			prev_task->rdmad.send_wr.next = &task->rdmad.send_wr;
+		prev_task = task;
+		num_reqs++;
+	}
+	if (prev_task) {
+		/* terminate the chain at the last request */
+		prev_task->rdmad.send_wr.next = NULL;
+		/* submit the chain of rdma-rd requests, start from the first */
+		err = iser_post_send(conn, first_wr, num_reqs);
+
+		/* ToDo: error handling */
+		/* (err is stored but not examined at present) */
+	}
+}
+
+/*
+ * Scheduled-event callback: send the responses of all tasks on the
+ * connection's resp_tx_list.
+ *
+ * Each task is moved to sent_list and its response work request (txd)
+ * is appended to one chain. For a read task with data still to be
+ * written (rdma_wr_remains != 0) an RDMA-Write work request (rdmad) is
+ * placed in the chain ahead of its response. The whole chain is posted
+ * with one iser_post_send() call. Nothing is done when the connection
+ * is in STATE_CLOSE or STATE_EXIT.
+ */
+static void iser_sched_tx(struct event_data *evt)
+{
+	struct iser_conn *conn = (struct iser_conn *) evt->data;
+	struct iser_work_req *first_wr = NULL;
+	struct iser_task *prev_task = NULL;
+	struct iser_task *task;
+	struct iser_work_req *cur_send_wr;
+	int num_reqs = 0;
+	int err;
+
+	if (unlikely(conn->h.state == STATE_CLOSE ||
+		     conn->h.state == STATE_EXIT)) {
+		dprintf("conn:%p closing, ignoring tx\n", conn);
+		return;
+	}
+
+	while (!list_empty(&conn->resp_tx_list)) {
+		task = list_first_entry(&conn->resp_tx_list,
+					struct iser_task,
+					tx_list);
+		list_del(&task->tx_list);
+		list_add_tail(&task->tx_list, &conn->sent_list);
+
+		if (task->is_read && task->rdma_wr_remains) {
+			/* NOTE(review): &task->txd is presumably the request
+			   the rdma-wr is linked to as its successor - confirm
+			   against iser_prep_rdma_wr_send_req() */
+			iser_prep_rdma_wr_send_req(task, &task->txd, 1);
+			cur_send_wr = &task->rdmad;
+			/* counts the rdma-wr; the response is counted below */
+			num_reqs++;
+		} else
+			cur_send_wr = &task->txd;
+
+		/* this task's first request either starts the chain or
+		   follows the previous task's response */
+		if (prev_task == NULL)
+			first_wr = cur_send_wr;
+		else
+			prev_task->txd.send_wr.next = &cur_send_wr->send_wr;
+
+		iser_prep_resp_send_req(task, NULL, 1);
+		prev_task = task;
+		num_reqs++;
+	}
+	if (prev_task) {
+		/* terminate the chain at the last response */
+		prev_task->txd.send_wr.next = NULL;
+		/* submit the chain of rdma-wr & tx reqs, start from the first */
+		err = iser_post_send(conn, first_wr, num_reqs);
+
+		/* ToDo: error handling */
+		/* (err is stored but not examined at present) */
+	}
+}
+
+/*
+ * Scheduled handler: drains conn->post_recv_list, links the receive work
+ * requests of the tasks taken from it into one chain and posts the chain
+ * with a single iser_post_recv() call.
+ * Nothing is done unless the connection is in STATE_FULL.
+ */
+static void iser_sched_post_recv(struct event_data *evt)
+{
+	struct iser_conn *conn = (struct iser_conn *) evt->data;
+	struct iser_task *head = NULL;
+	struct iser_task *tail = NULL;
+	struct iser_task *cur;
+	int nbufs = 0;
+	int err;
+
+	if (unlikely(conn->h.state != STATE_FULL)) {
+		dprintf("conn:%p closing, ignoring post recv\n", conn);
+		return;
+	}
+
+	for (; !list_empty(&conn->post_recv_list); tail = cur, nbufs++) {
+		cur = list_first_entry(&conn->post_recv_list,
+				       struct iser_task,
+				       recv_list);
+		list_del(&cur->recv_list);
+
+		if (tail)
+			tail->rxd.recv_wr.next = &cur->rxd.recv_wr;
+		else
+			head = cur;
+	}
+	if (!tail)
+		return; /* the list was empty, nothing to post */
+
+	tail->rxd.recv_wr.next = NULL;
+	/* post the chain of recv buffers, start from the first */
+	err = iser_post_recv(conn, head, nbufs);
+
+	/* ToDo: error handling */
+}
+
+/*
+ * Process a received SCSI Command PDU.
+ *
+ * Derives the read/write direction from the BHS flags, fills in the
+ * task's byte counters (unsolicited data, rdma-read, rdma-write), sets
+ * the scsi data direction and appends the task to session->cmd_list.
+ *
+ * Returns 0 on success, or -EINVAL when immediate data is received
+ * although ImmediateData is disabled; in that case the task is not
+ * added to session->cmd_list.
+ */
+static int iser_scsi_cmd_rx(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct iscsi_session *session = conn->h.session;
+	struct iscsi_cmd *req_bhs = (struct iscsi_cmd *) task->pdu.bhs;
+	unsigned int flags = req_bhs->flags;
+	uint32_t imm_data_sz = ntoh24(req_bhs->dlength);
+	uint32_t xfer_sz = ntohl(req_bhs->data_length);
+	int err = 0;
+
+	task->is_read = flags & ISCSI_FLAG_CMD_READ;
+	task->is_write = flags & ISCSI_FLAG_CMD_WRITE;
+
+	if (task->is_write) {
+		task->out_len = xfer_sz;
+
+		/* without InitialR2T the unsolicited part is limited by
+		   FirstBurstLength and by the total transfer length */
+		if (!conn->h.session_param[ISCSI_PARAM_INITIAL_R2T_EN].val) {
+			task->unsol_sz = conn->h.session_param[ISCSI_PARAM_FIRST_BURST].val;
+			if (task->out_len < task->unsol_sz)
+				task->unsol_sz = task->out_len;
+		} else
+			task->unsol_sz = 0;
+
+		/* add immediate data to the task */
+		if (conn->h.session_param[ISCSI_PARAM_IMM_DATA_EN].val) {
+			if (imm_data_sz > 0) {
+				if (task->unsol_sz == 0)
+					task->unsol_sz = imm_data_sz;
+				iser_task_add_out_pdu_buf(task, &task->pdu.membuf, 0);
+			}
+		} else if (imm_data_sz > 0) {
+			eprintf("ImmediateData disabled but received\n");
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* immediate data is the first chunk of the unsolicited data */
+		/* NOTE(review): if imm_data_sz can exceed unsol_sz (e.g. a small
+		   FirstBurstLength) this subtraction goes below zero - confirm
+		   the field types and whether the case is rejected elsewhere */
+		task->unsol_remains = task->unsol_sz - imm_data_sz;
+		/* rdma-reads cover the entire solicited data */
+		task->rdma_rd_sz = task->out_len - task->unsol_sz;
+		task->rdma_rd_remains = task->rdma_rd_sz;
+
+		/* ToDo: multiple RDMA-Write buffers */
+		/* task->rdma_rd_offset = task->unsol_sz; */
+
+		/* no read-direction byte counts are set up for a write command,
+		   even when it is bidirectional */
+		task->in_len = 0;
+		task->rdma_wr_sz = 0;
+		task->rdma_wr_remains = 0;
+
+		if (!task->is_read)
+			scsi_set_data_dir(&task->scmd, DATA_WRITE);
+		else
+			scsi_set_data_dir(&task->scmd, DATA_BIDIRECTIONAL);
+	} else {
+		/* read or no-data command: the whole transfer length is
+		   accounted as rdma-write, nothing is expected from the peer */
+		task->in_len = xfer_sz;
+		task->rdma_wr_sz = xfer_sz;
+		task->rdma_wr_remains = xfer_sz;
+		task->out_len = 0;
+		task->unsol_sz = 0;
+		task->unsol_remains = 0;
+		task->rdma_rd_sz = 0;
+		task->rdma_rd_remains = 0;
+
+		if (task->is_read)
+			scsi_set_data_dir(&task->scmd, DATA_READ);
+		else
+			scsi_set_data_dir(&task->scmd, DATA_NONE);
+	}
+
+	list_add_tail(&task->session_list, &session->cmd_list);
+out:
+	/* the summary is logged on the error path as well */
+	dprintf("task:%p tag:0x%04"PRIx64 "scsi_op:0x%x %s%s in_len:%d out_len:%d "
+		"imm_sz:%d unsol_sz:%d cmdsn:0x%x expcmdsn:0x%x\n",
+		task, task->tag, req_bhs->cdb[0],
+		task->is_read ? "rd" : "", task->is_write ? "wr" : "",
+		task->in_len, task->out_len, imm_data_sz, task->unsol_sz,
+		task->cmd_sn, session->exp_cmd_sn);
+
+	return err;
+}
+
+/*
+ * Process a received Data-Out PDU.
+ *
+ * Looks up, in session->cmd_list, the command task whose tag equals the
+ * ITT of the PDU; returns -EINVAL when there is none. Otherwise the PDU
+ * buffer is attached to that command at the offset given in the PDU,
+ * dout_task is linked on the command's dout_task_list, unsol_remains is
+ * reduced by the PDU data length, and the command is scheduled for
+ * submission once the final PDU arrived and no data remains outstanding.
+ * Returns 0 in all those cases.
+ */
+static int iser_data_out_rx(struct iser_task *dout_task)
+{
+	struct iser_conn *conn = dout_task->conn;
+	struct iscsi_session *session = conn->h.session;
+	struct iscsi_data *req_bhs =
+	    (struct iscsi_data *) dout_task->pdu.bhs;
+	struct iser_task *task;
+	int err = 0;
+
+	/* both sides of the comparison are in wire byte order: task->tag is
+	   assigned from the raw itt field when the headers are parsed */
+	list_for_each_entry(task, &session->cmd_list, session_list) {
+		if (task->tag == req_bhs->itt)
+			goto found;
+	}
+	return -EINVAL;
+
+found:
+	iser_task_add_out_pdu_buf(task, &dout_task->pdu.membuf,
+				  be32_to_cpu(req_bhs->offset));
+	list_add_tail(&dout_task->dout_task_list, &task->dout_task_list);
+
+	/* ToDo: BUG!!! add an indication that it's data-out task so that
+		it can be released when the buffer is released */
+
+	task->unsol_remains -= ntoh24(req_bhs->dlength);
+
+	dprintf("task:%p tag:0x%04"PRIx64 ", dout taskptr:%p out_len:%d "
+		"uns_rem:%d rdma_rd_rem:%d expcmdsn:0x%x\n",
+		task, task->tag, dout_task, task->out_len,
+		task->unsol_remains, task->rdma_rd_remains,
+		session->exp_cmd_sn);
+
+	/* ToDo: look at the task counters !!! */
+	if (req_bhs->ttt == cpu_to_be32(ISCSI_RESERVED_TAG)) {
+		/* reserved TTT: unsolicited data; submit only if the command is
+		   not still waiting on the pending (out-of-order) queue */
+		if (req_bhs->flags & ISCSI_FLAG_CMD_FINAL) {
+			if (!task_pending(task)) {
+				/* ToDo: this condition is weird ... */
+				if (task->rdma_rd_remains == 0 && task->unsol_remains == 0)
+					schedule_task_iosubmit(task, conn);
+			}
+		}
+	} else {
+		/* valid TTT: wait for the PDU carrying the final flag */
+		if (!(req_bhs->flags & ISCSI_FLAG_CMD_FINAL))
+			return err;
+
+		if (task->rdma_rd_remains == 0 && task->unsol_remains == 0)
+			schedule_task_iosubmit(task, conn);
+	}
+	return err;
+}
+
+/*
+ * Handle a received Login request.
+ *
+ * rx_task normally comes straight from the rx handler. Because of the
+ * connection establishment event-data race, the first login task may
+ * arrive while the connection is still in STATE_START; it is then saved
+ * in conn->login_rx_task (state becomes STATE_READY) so that it can be
+ * injected afterwards.
+ */
+static void iser_login_rx(struct iser_task *rx_task)
+{
+	struct iser_conn *conn = rx_task->conn;
+	struct iser_task *rsp_task = conn->login_tx_task;
+	struct iscsi_login_rsp *login_rsp =
+		(struct iscsi_login_rsp *)rsp_task->pdu.bhs;
+
+	if (conn->h.state == STATE_START) {
+		eprintf("conn:%p, not established yet, delaying login_rx\n",
+			&conn->h);
+		conn->login_rx_task = rx_task;
+		conn->h.state = STATE_READY;
+		return;
+	}
+
+	iser_login_exec(&conn->h, &rx_task->pdu, &rsp_task->pdu);
+	if (login_rsp->status_class != 0) {
+		eprintf("conn:%p, login failed, class:%0x detail:%0x\n",
+			&conn->h, login_rsp->status_class,
+			login_rsp->status_detail);
+	}
+
+	/* the rx buffer is re-posted unless this was the last login exchange */
+	if (conn->login_phase != LOGIN_PHASE_LAST_SEND)
+		iser_post_recv(conn, rx_task, 1);
+	else
+		dprintf("transitioning to full-feature, no repost\n");
+
+	schedule_resp_tx(rsp_task, conn);
+}
+
+/*
+ * Validate a received NOP-Out PDU.
+ *
+ * Returns 0 when the PDU is to be processed further (task->out_len is
+ * set to its data length), -EAGAIN when no response is needed (the rx
+ * buffer has been re-posted), or -ISCSI_REASON_PROTOCOL_ERROR when the
+ * tags or the immediate flag are inconsistent.
+ */
+static int iser_nop_out_rx(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct iscsi_nopout *nop =
+		(struct iscsi_nopout *) task->pdu.bhs;
+
+	if (nop->ttt != cpu_to_be32(ISCSI_RESERVED_TAG)) {
+		/*
+		 * When sending NOP-In, we don't request a NOP-Out
+		 * by sending a valid Target Transfer Tag.
+		 * See iser_send_ping_nop_in() and 10.18.2 in the draft 20.
+		 */
+		eprintf("conn:%p task:%p initiator bug, ttt not reserved\n",
+			&conn->h, task);
+		/* ToDo: prepare and send reject pdu */
+		/* iser_send_reject(task, ISCSI_REASON_INVALID_PDU_FIELD); */
+		return -ISCSI_REASON_PROTOCOL_ERROR;
+	}
+
+	if (nop->itt == cpu_to_be32(ISCSI_RESERVED_TAG)) {
+		if (!(nop->opcode & ISCSI_OP_IMMEDIATE)) {
+			eprintf("conn:%p task:%p initiator bug, "
+				"itt reserved with no imm. flag\n", &conn->h, task);
+			/* ToDo: prepare and send reject pdu */
+			return -ISCSI_REASON_PROTOCOL_ERROR;
+		}
+
+		dprintf("No response to Nop-Out\n");
+		iser_post_recv(conn, task, 1);
+		return -EAGAIN; /* ToDo: fix the ret codes */
+	}
+
+	task->out_len = ntoh24(nop->dlength);
+	dprintf("conn:%p nop-out task:%p cmdsn:0x%x data_sz:%d\n",
+		&conn->h, task, task->cmd_sn, task->out_len);
+
+	return 0;
+}
+
+/*
+ * Dispatch a task to the executor matching its iSCSI opcode and return
+ * that executor's result; -EINVAL for an opcode with no executor.
+ * A SCSI command whose buffer allocation fails with -ENOMEM is appended
+ * to conn->buf_alloc_list, and -ENOMEM is still returned.
+ */
+static int iser_task_delivery(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	int ret;
+
+	switch (task->opcode) {
+	case ISCSI_OP_SCSI_CMD:
+		ret = iser_scsi_cmd_buf_alloc(task);
+		if (unlikely(ret == -ENOMEM))
+			list_add_tail(&task->exec_list,
+				      &task->conn->buf_alloc_list);
+		return ret;
+	case ISCSI_OP_NOOP_OUT:
+		return iser_nop_out_exec(task);
+	case ISCSI_OP_LOGOUT:
+		return iser_logout_exec(task);
+	case ISCSI_OP_SCSI_TMFUNC:
+		return iser_tm_exec(task);
+	case ISCSI_OP_TEXT:
+		return iser_text_exec(&conn->h, &task->pdu,
+				      &conn->text_tx_task->pdu);
+	default:
+		break;
+	}
+
+	eprintf("Internal error: Unexpected op:0x%x\n", task->opcode);
+	return -EINVAL;
+}
+
+/*
+ * Compare two command sequence numbers using wrap-around (serial
+ * number) arithmetic.
+ *
+ * Returns 0 when they are equal, 1 when sn1 is ahead of sn2 and -1
+ * otherwise. The difference is computed in unsigned arithmetic, where it
+ * wraps modulo 2^32, and only then reinterpreted as signed; subtracting
+ * two int32_t values directly could overflow, which is undefined
+ * behavior.
+ */
+static inline int cmdsn_cmp(uint32_t sn1, uint32_t sn2)
+{
+	if (sn1 == sn2)
+		return 0;
+
+	return (int32_t)(sn1 - sn2) > 0 ? 1 : -1;
+}
+
+/*
+ * Deliver or queue a received task according to its cmd_sn.
+ *
+ * - An immediate task is delivered right away, regardless of cmd_sn.
+ * - A task carrying the expected cmd_sn is delivered, exp_cmd_sn is
+ *   advanced, and tasks waiting at the head of
+ *   session->pending_cmd_list are delivered as long as they continue
+ *   the sequence.
+ * - Any other task is inserted into session->pending_cmd_list, which is
+ *   kept ordered by cmd_sn.
+ *
+ * Returns 0 on success (-ENOMEM from delivery is not treated as an
+ * error), the delivery error otherwise, or -EINVAL for a cmd_sn that is
+ * beyond the allowed window or duplicates a pending one.
+ */
+static int iser_queue_task(struct iscsi_session *session,
+			   struct iser_task *task)
+{
+	uint32_t cmd_sn = task->cmd_sn;
+	struct list_head *cmp_entry;
+	int err;
+
+	if (unlikely(task->is_immediate)) {
+		dprintf("exec imm task task:%p tag:0x%0" PRIx64 "cmd_sn:0x%x\n",
+			task, task->tag, cmd_sn);
+		err = iser_task_delivery(task);
+		if (likely(!err || err == -ENOMEM))
+			return 0;
+		else
+			return err;
+	}
+
+	/* if the current command is the expected one, exec it
+	   and all others possibly acumulated on the queue */
+	while (session->exp_cmd_sn == cmd_sn) {
+		session->exp_cmd_sn++;
+		dprintf("exec task:%p cmd_sn:0x%x\n", task, cmd_sn);
+		err = iser_task_delivery(task);
+		if (unlikely(err && err != -ENOMEM)) {
+			/* when no free buffers remains, the task will wait
+			   on queue, so it is not a real error, but we should
+			   not attempt to start other tasks until more
+			   memory becomes available */
+			return err;
+			/* ToDo: what if there are more tasks in case of error */
+		}
+
+		if (list_empty(&session->pending_cmd_list))
+			return 0;
+
+		/* take the pending task with the lowest cmd_sn; if it does not
+		   continue the sequence the loop ends and the task is put
+		   back by the ordered insertion below */
+		task = list_first_entry(&session->pending_cmd_list,
+					struct iser_task, exec_list);
+		list_del(&task->exec_list);
+		clear_task_pending(task);
+		/* for a request PDU this header field carries the CmdSN; it is
+		   the field task->cmd_sn is filled from when headers are parsed */
+		cmd_sn = be32_to_cpu(task->pdu.bhs->statsn);
+	}
+
+	/* cmd_sn > (exp_cmd_sn+ISER_MAX_QUEUE_CMD), i.e. beyond allowed window */
+	if (cmdsn_cmp(cmd_sn, session->exp_cmd_sn+ISER_MAX_QUEUE_CMD) == 1) {
+		eprintf("unexpected cmd_sn:0x%x, max:0x%x\n",
+			cmd_sn, session->exp_cmd_sn+ISER_MAX_QUEUE_CMD);
+		return -EINVAL;
+	}
+
+	/* insert the current task, ordered by cmd_sn */
+	/* the list is walked from its tail; when no entry with a smaller
+	   cmd_sn is found cmp_entry ends up being the list head, so the
+	   task is inserted at the front */
+	list_for_each_prev(cmp_entry, &session->pending_cmd_list) {
+		struct iser_task *cmp_task;
+		uint32_t cmp_cmd_sn;
+		int cmp_res;
+
+		cmp_task = list_entry(cmp_entry, struct iser_task, exec_list);
+		cmp_cmd_sn = cmp_task->cmd_sn;
+
+		cmp_res = cmdsn_cmp(cmd_sn, cmp_cmd_sn);
+		if (cmp_res == 1) { /* cmd_sn > cmp_cmd_sn */
+			dprintf("inserted cmdsn:0x%x after cmdsn:0x%x\n",
+				cmd_sn, cmp_cmd_sn);
+			break;
+		} else if (cmp_res == -1) { /* cmd_sn < cmp_cmd_sn */
+			dprintf("inserting cmdsn:0x%x skip cmdsn:0x%x\n",
+				cmd_sn, cmp_cmd_sn);
+			continue;
+		} else { /* cmd_sn == cmp_cmd_sn */
+			eprintf("duplicate cmd_sn:0x%x, exp:%u\n",
+				cmd_sn, session->exp_cmd_sn);
+			return -EINVAL;
+		}
+	}
+	list_add(&task->exec_list, cmp_entry);
+	set_task_pending(task);
+	return 0;
+}
+
+/*
+ * Parse the iSER header and the iSCSI basic header segment of a
+ * received PDU into the task.
+ *
+ * Returns 0 for an iSCSI control-type iSER header, -1 for an iSER Hello
+ * or a malformed header. The iSCSI header fields are copied into the
+ * task in every case.
+ */
+static int iser_parse_req_headers(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	struct iser_hdr *ihdr = task->pdu.iser_hdr;
+	struct iscsi_hdr *bhs = task->pdu.bhs;
+	int hdr_type = ihdr->flags & 0xF0;
+	int ret = -1;
+
+	if (hdr_type == ISCSI_CTRL) {
+		/* remote stag/va are present only when the matching flag is set */
+		if (ihdr->flags & ISER_RSV) {
+			task->rem_read_stag = be32_to_cpu(ihdr->read_stag);
+			task->rem_read_va = be64_to_cpu(ihdr->read_va);
+			dprintf("task:%p rstag:0x%x va:0x%"PRIx64 "\n", task,
+				task->rem_read_stag, task->rem_read_va);
+		}
+		if (ihdr->flags & ISER_WSV) {
+			task->rem_write_stag = be32_to_cpu(ihdr->write_stag);
+			task->rem_write_va = be64_to_cpu(ihdr->write_va);
+			dprintf("task:%p wstag:0x%x va:0x%"PRIx64 "\n", task,
+				task->rem_write_stag, task->rem_write_va);
+		}
+		ret = 0;
+	} else if (hdr_type == ISER_HELLO) {
+		dprintf("iSER Hello message??\n");
+	} else {
+		eprintf("malformed iser iser_hdr, flags 0x%02x\n",
+			ihdr->flags);
+	}
+
+	task->tag = bhs->itt;
+	task->cmd_sn = be32_to_cpu(bhs->statsn);
+	task->opcode = bhs->opcode & ISCSI_OPCODE_MASK;
+	task->is_immediate = bhs->opcode & ISCSI_OP_IMMEDIATE ? 1 : 0;
+	task->is_read = 0; /* valid for cmds only */
+	task->is_write = 0; /* valid for cmds only */
+
+	/* the data buffer starts right after the additional header segments */
+	task->pdu.ahssize = bhs->hlength * 4;
+	task->pdu.membuf.addr += task->pdu.ahssize;
+	task->pdu.membuf.size = ntoh24(bhs->dlength);
+	task->pdu.membuf.rdma = 0;
+
+	conn->h.exp_stat_sn = be32_to_cpu(bhs->exp_statsn);
+
+	return ret;
+}
+
+/*
+ * Handle a PDU received while the connection is not in full-feature
+ * phase.
+ *
+ * During the login states only Login PDUs are accepted; they are passed
+ * to iser_login_rx(). Any other opcode in those states is a protocol
+ * violation and yields -EINVAL, which the caller uses to close the
+ * connection. PDUs received in any other state are ignored.
+ *
+ * Returns 0, or -EINVAL as described above.
+ */
+static int iser_rx_handler_non_ff(struct iser_task *task)
+{
+	struct iser_conn *conn = task->conn;
+	int err = 0;
+
+	switch (conn->h.state) {
+	case STATE_START:
+	case STATE_READY:
+	case STATE_SECURITY:
+	case STATE_SECURITY_AUTH:
+	case STATE_SECURITY_DONE:
+	case STATE_SECURITY_LOGIN:
+	case STATE_SECURITY_FULL:
+	case STATE_LOGIN:
+	case STATE_LOGIN_FULL:
+		if (task->opcode == ISCSI_OP_LOGIN) {
+			dprintf("login rx, conn:%p\n", conn);
+			iser_login_rx(task);
+		} else {
+			eprintf("non-login pdu during login phase, "
+				"conn:%p opcode:0x%0x\n",
+				conn, task->opcode);
+			err = -EINVAL;
+		}
+		break;
+	case STATE_EXIT:
+	case STATE_CLOSE:
+		dprintf("ignored rx, while conn:%p closing\n", conn);
+		break;
+	default:
+		dprintf("ignored rx, conn:%p unexpected state:%d\n",
+			conn, conn->h.state);
+		break;
+	}
+	/* propagate the error instead of discarding it */
+	return err;
+}
+
+/*
+ * Completion handler for a posted receive: parses the headers of the
+ * received PDU and processes it according to the connection state and
+ * the iSCSI opcode. Any error closes the connection.
+ */
+static void iser_rx_handler(struct iser_work_req *rxd)
+{
+	struct iser_task *task = rxd->task;
+	struct iser_conn *conn = task->conn;
+	int queue_task = 1;
+	int err = 0;
+
+	/* NOTE(review): presumably drops the reference taken when the recv
+	   was posted; conn is still used below, so another reference is
+	   assumed to keep it alive - confirm */
+	iser_conn_put(conn);
+
+	err = iser_parse_req_headers(task);
+	if (unlikely(err))
+		goto out;
+
+	if (unlikely(conn->h.state != STATE_FULL)) {
+		err = iser_rx_handler_non_ff(task);
+		goto out;
+	}
+
+	/* full-feature phase: start with empty data buffer lists */
+	INIT_LIST_HEAD(&task->in_buf_list);
+	task->in_buf_num = 0;
+	INIT_LIST_HEAD(&task->out_buf_list);
+	task->out_buf_num = 0;
+
+	if (likely(task->opcode == ISCSI_OP_SCSI_CMD))
+		err = iser_scsi_cmd_rx(task);
+	else {
+		switch (task->opcode) {
+		case ISCSI_OP_SCSI_DATA_OUT:
+			err = iser_data_out_rx(task);
+			/* data-out belongs to an already received command,
+			   it is not ordered by cmd_sn */
+			queue_task = 0;
+			break;
+		case ISCSI_OP_NOOP_OUT:
+			err = iser_nop_out_rx(task);
+			break;
+		case ISCSI_OP_LOGOUT:
+			dprintf("logout rx\n");
+			break;
+		case ISCSI_OP_SCSI_TMFUNC:
+			dprintf("tmfunc rx\n");
+			break;
+		case ISCSI_OP_TEXT:
+			dprintf("text rx\n");
+			break;
+		case ISCSI_OP_SNACK:
+			eprintf("Cannot handle SNACK yet\n");
+			err = -EINVAL;
+			break;
+		default:
+			eprintf("Unknown op 0x%x\n", task->opcode);
+			err = -EINVAL;
+			break;
+		}
+	}
+	/* NOTE(review): iser_nop_out_rx() returns -EAGAIN when no response
+	   is needed; as written that value also reaches the error path
+	   below and closes the connection - confirm this is intended */
+	if (likely(!err && queue_task))
+		err = iser_queue_task(conn->h.session, task);
+out:
+	if (unlikely(err)) {
+		eprintf("conn:%p task:%p err:%d, closing\n",
+			&conn->h, task, err);
+		iser_conn_close(conn);
+	}
+}
+
+/*
+ * Advance the connection state after a login-phase response has been
+ * sent: STATE_EXIT -> STATE_CLOSE, STATE_SECURITY_LOGIN -> STATE_LOGIN,
+ * and STATE_SECURITY_FULL / STATE_LOGIN_FULL -> STATE_FULL, in which
+ * case the login resources are released as well. Other states are left
+ * untouched.
+ */
+static void iser_login_tx_complete(struct iser_conn *conn)
+{
+	if (conn->h.state == STATE_EXIT) {
+		conn->h.state = STATE_CLOSE;
+	} else if (conn->h.state == STATE_SECURITY_LOGIN) {
+		conn->h.state = STATE_LOGIN;
+	} else if (conn->h.state == STATE_SECURITY_FULL ||
+		   conn->h.state == STATE_LOGIN_FULL) {
+		dprintf("conn:%p, last login send completed\n", conn);
+		conn->h.state = STATE_FULL;
+		iser_conn_login_phase_set(conn, LOGIN_PHASE_FF);
+		iser_free_login_resources(conn);
+	}
+}
+
+/*
+ * Completion handler for a sent response PDU.
+ * The task is taken off conn->sent_list. An unsolicited task is parked
+ * in conn->nop_in_task for reuse; any other task is completed and then
+ * either its rx buffer is scheduled for re-posting (full-feature phase)
+ * or the login state machine is advanced.
+ */
+static void iser_tx_complete_handler(struct iser_work_req *txd)
+{
+	struct iser_task *task = txd->task;
+	struct iser_conn *conn = task->conn;
+
+	dprintf("conn:%p task:%p tag:0x%04"PRIx64 "\n",
+		&conn->h, task, task->tag);
+	iser_conn_put(conn);
+
+	list_del(&task->tx_list); /* remove from conn->sent_list */
+
+	if (unlikely(task->unsolicited)) {
+		conn->nop_in_task = task;
+		return;
+	}
+
+	iser_complete_task(task);
+
+	if (likely(conn->h.state == STATE_FULL))
+		schedule_post_recv(task, conn);
+	else
+		iser_login_tx_complete(conn);
+}
+
+/*
+ * Completion handler for an RDMA-Write: logs the completion and drops a
+ * connection reference. The task stays on conn->sent_list; it is removed
+ * by iser_tx_complete_handler(), because an rdma-wr is always followed
+ * by the tx of the response.
+ */
+static void iser_rdma_wr_complete_handler(struct iser_work_req *rdmad)
+{
+	struct iser_task *task = rdmad->task;
+	struct iser_conn *conn = task->conn;
+
+	dprintf("conn:%p task:%p tag:0x%04"PRIx64 "\n",
+		&conn->h, task, task->tag);
+
+	iser_conn_put(conn);
+}
+
+/*
+ * Completion handler for an RDMA-Read: subtracts the length that was
+ * read from task->rdma_rd_remains and, in full-feature phase, schedules
+ * the task for submission once neither rdma-read nor unsolicited data
+ * remains outstanding.
+ */
+static void iser_rdma_rd_complete_handler(struct iser_work_req *rdmad)
+{
+	struct iser_task *task = rdmad->task;
+	struct iser_conn *conn = task->conn;
+
+	task->rdma_rd_remains -= rdmad->sge.length;
+	dprintf("conn:%p task:%p tag:0x%04"PRIx64 ", rems rdma:%d unsol:%d\n",
+		&conn->h, task, task->tag, task->rdma_rd_remains,
+		task->unsol_remains);
+	iser_conn_put(conn);
+
+	/* no need to remove from a list, was removed before rdma-rd */
+
+	if (likely(conn->h.state == STATE_FULL) &&
+	    task->rdma_rd_remains == 0 && task->unsol_remains == 0)
+		schedule_task_iosubmit(task, conn);
+}
+
+/*
+ * Deal with just one successful work completion: the work request is
+ * recovered from wc->wr_id and passed to the handler matching its
+ * operation type. An unknown operation type is logged and, when the
+ * request has a task with a connection, that connection is closed.
+ */
+static void handle_wc(struct ibv_wc *wc)
+{
+	struct iser_work_req *req = ptr_from_int64(wc->wr_id);
+
+	dprintf("%s complete, wr_id:%p len:%d\n",
+		iser_ib_op_to_str(req->iser_ib_op), req, wc->byte_len);
+
+	switch (req->iser_ib_op) {
+	case ISER_IB_RECV:
+		iser_rx_handler(req);
+		return;
+	case ISER_IB_SEND:
+		iser_tx_complete_handler(req);
+		return;
+	case ISER_IB_RDMA_WRITE:
+		iser_rdma_wr_complete_handler(req);
+		return;
+	case ISER_IB_RDMA_READ:
+		iser_rdma_rd_complete_handler(req);
+		return;
+	default: {
+		struct iser_task *task = req->task;
+		struct iser_conn *conn = NULL;
+
+		if (task)
+			conn = task->conn;
+		eprintf("unexpected req op:%d, wc op%d, wc::%p "
+			"wr_id:%p task:%p conn:%p\n",
+			req->iser_ib_op, wc->opcode, wc, req, task, conn);
+		if (conn)
+			iser_conn_close(conn);
+		return;
+	}
+	}
+}
+
+/*
+ * Deal with one failed work completion.
+ *
+ * The failure is logged (flush errors only at debug level), the task is
+ * completed where no further completion is expected for it, and the
+ * connection, if any, has a reference dropped and is closed.
+ * The request may have no task attached; in that case nothing is
+ * dereferenced through the missing task.
+ */
+static void handle_wc_error(struct ibv_wc *wc)
+{
+	struct iser_work_req *req = ptr_from_int64(wc->wr_id);
+	struct iser_task *task = req->task;
+	struct iser_conn *conn = task ? task->conn : NULL;
+	/* values that are safe to log even when task or conn is missing */
+	void *conn_h = conn ? (void *) &conn->h : NULL;
+	uint64_t tag = task ? task->tag : 0;
+
+	if (wc->status != IBV_WC_WR_FLUSH_ERR)
+		eprintf("conn:%p task:%p tag:0x%04"PRIx64 "wr_id:0x%p op:%s "
+			"err:%s vendor_err:0x%0x\n",
+			conn_h, task, tag, req,
+			iser_ib_op_to_str(req->iser_ib_op),
+			ibv_wc_status_str(wc->status), wc->vendor_err);
+	else
+		dprintf("conn:%p task:%p tag:0x%04"PRIx64 "wr_id:0x%p op:%s "
+			"err:%s vendor_err:0x%0x\n",
+			conn_h, task, tag, req,
+			iser_ib_op_to_str(req->iser_ib_op),
+			ibv_wc_status_str(wc->status), wc->vendor_err);
+
+	switch (req->iser_ib_op) {
+	case ISER_IB_SEND:
+		/* in both read and write tasks SEND is last,
+		   the task should be completed now */
+	case ISER_IB_RDMA_READ:
+		/* RDMA-RD is sent separately, and Response
+		   is to be SENT after its completion, so if RDMA-RD fails,
+		   task to be completed now */
+		if (task)
+			iser_complete_task(task);
+		break;
+	case ISER_IB_RECV:
+		/* this should be the Flush, no task has been created yet */
+	case ISER_IB_RDMA_WRITE:
+		/* RDMA-WR and SEND response of a READ task
+		   are sent together, so when receiving RDMA-WR error,
+		   wait until SEND error arrives to complete the task */
+		break;
+	default:
+		eprintf("unexpected opcode %d, "
+			"wc:%p wr_id:%p task:%p conn:%p\n",
+			wc->opcode, wc, req, task, conn);
+		break;
+	}
+
+	if (conn) {
+		iser_conn_put(conn);
+		iser_conn_close(conn);
+	}
+}
+
+/*
+ * Poll the device cq, one completion at a time, handling each one, until
+ * the cq is empty, polling fails or max_wc completions were handled.
+ *
+ * Could read as many entries as possible without blocking, but
+ * that just fills up a list of tasks.  Instead pop out of here
+ * so that tx progress, like issuing rdma reads and writes, can
+ * happen periodically.
+ *
+ * Returns 0 when the cq was found empty, a negative value when
+ * ibv_poll_cq() failed, and 1 when max_wc completions were handled.
+ */
+static int iser_poll_cq(struct iser_device *dev, int max_wc)
+{
+	struct ibv_wc wc;
+	int handled = 0;
+	int ret;
+
+	do {
+		ret = ibv_poll_cq(dev->cq, 1, &wc);
+		if (ret == 0) /* no completions retrieved */
+			return 0;
+
+		if (unlikely(ret < 0)) {
+			eprintf("ibv_poll_cq failed\n");
+			return ret;
+		}
+
+		VALGRIND_MAKE_MEM_DEFINED(&wc, sizeof(wc));
+		if (unlikely(wc.status != IBV_WC_SUCCESS))
+			handle_wc_error(&wc);
+		else
+			handle_wc(&wc);
+	} while (++handled != max_wc);
+
+	return 1;
+}
+
+/* incremented by iser_poll_cq_armable() each time it finds the cq empty,
+   reset to zero by iser_rearm_completions() */
+static int num_delayed_arm;
+/* value of num_delayed_arm at which completion notifications are re-armed */
+#define MAX_NUM_DELAYED_ARM 16
+
+/*
+ * Request a notification for the next completion on the device cq,
+ * schedule iser_sched_consume_cq() to pick up completions that may have
+ * slipped in meanwhile, and reset the delayed-arm counter.
+ */
+static void iser_rearm_completions(struct iser_device *dev)
+{
+	if (unlikely(ibv_req_notify_cq(dev->cq, 0) != 0))
+		eprintf("ibv_req_notify_cq failed\n");
+
+	dev->poll_sched.sched_handler = iser_sched_consume_cq;
+	tgt_add_sched_event(&dev->poll_sched);
+
+	num_delayed_arm = 0;
+}
+
+/*
+ * Poll the cq once (up to MAX_POLL_WC completions) and decide how to
+ * continue: re-arm the completion notifications when polling failed or
+ * when the cq has been found empty MAX_NUM_DELAYED_ARM times, otherwise
+ * schedule another polling round.
+ */
+static void iser_poll_cq_armable(struct iser_device *dev)
+{
+	int rc = iser_poll_cq(dev, MAX_POLL_WC);
+
+	if (unlikely(rc < 0)) {
+		iser_rearm_completions(dev);
+		return;
+	}
+
+	if (rc == 0) {
+		num_delayed_arm++;
+		if (num_delayed_arm == MAX_NUM_DELAYED_ARM) {
+			/* no more completions on cq, give up and arm the
+			   interrupts */
+			iser_rearm_completions(dev);
+			return;
+		}
+	}
+
+	dev->poll_sched.sched_handler = iser_sched_poll_cq;
+	tgt_add_sched_event(&dev->poll_sched);
+}
+
+/*
+ * iser_sched_consume_cq() is scheduled to consume completions that could
+ * arrive after the cq had been seen empty, but just before the
+ * interrupts were re-armed.
+ * It is meant to consume those remaining completions only: it does not
+ * re-arm the interrupts, it just polls the cq until it is empty. As the
+ * number of completions polled at a time is always limited, it may have
+ * to reschedule itself a few times.
+ * New completions may occur during this process and raise an interrupt.
+ * Some of them may get processed here, which is fine, because there is
+ * no need to wait for the interrupt event. When the interrupt
+ * notification arrives, its handler removes the scheduled event and
+ * calls iser_poll_cq_armable(), so that the polling cycle resumes
+ * normally.
+ */
+static void iser_sched_consume_cq(struct event_data *tev)
+{
+	struct iser_device *dev = tev->data;
+
+	if (iser_poll_cq(dev, MAX_POLL_WC) <= 0)
+		return; /* cq empty or polling failed, stop here */
+
+	dev->poll_sched.sched_handler = iser_sched_consume_cq;
+	tgt_add_sched_event(&dev->poll_sched);
+}
+
+/*
+ * Scheduled to poll the cq after a completion event has been received
+ * and acknowledged; the interrupts are re-armed when no more completions
+ * are found.
+ */
+static void iser_sched_poll_cq(struct event_data *tev)
+{
+	iser_poll_cq_armable((struct iser_device *) tev->data);
+}
+
+/*
+ * Called from main event loop when a CQ notification is available:
+ * the event is retrieved and acknowledged, a previously scheduled poll
+ * is cancelled, and the cq is polled.
+ */
+static void iser_handle_cq_event(int fd __attribute__ ((unused)),
+				 int events __attribute__ ((unused)),
+				 void *data)
+{
+	struct iser_device *dev = data;
+	void *ev_ctx;
+
+	if (unlikely(ibv_get_cq_event(dev->cq_channel, &dev->cq,
+				      &ev_ctx) != 0)) {
+		/* Just print the log message, if that was a serious problem,
+		   it will express itself elsewhere */
+		eprintf("failed to retrieve CQ event, cq:%p\n", dev->cq);
+		return;
+	}
+
+	ibv_ack_cq_events(dev->cq, 1);
+
+	/* a previously scheduled poll is removed here, polling is
+	   rescheduled when necessary */
+	if (dev->poll_sched.scheduled)
+		tgt_remove_sched_event(&dev->poll_sched);
+
+	iser_poll_cq_armable(dev);
+}
+
+/*
+ * Called from main event loop when async event is available.
+ * The event is retrieved, logged according to the object it relates to
+ * (QP, CQ, SRQ, port or the HCA itself) and acknowledged. On a
+ * "communication established" QP event the rdma cm is notified as well.
+ */
+static void iser_handle_async_event(int fd __attribute__ ((unused)),
+				    int events __attribute__ ((unused)),
+				    void *data)
+{
+	struct iser_device *dev = data;
+	char *dev_name = dev->ibv_ctxt->device->name;
+	struct ibv_async_event async_event;
+	struct iser_conn *conn;
+	const char *evt_name;
+
+	if (ibv_get_async_event(dev->ibv_ctxt, &async_event)) {
+		eprintf("ibv_get_async_event failed\n");
+		return;
+	}
+	evt_name = ibv_event_type_str(async_event.event_type);
+
+	switch (async_event.event_type) {
+	/* QP-related events */
+	case IBV_EVENT_COMM_EST:
+	case IBV_EVENT_QP_FATAL:
+	case IBV_EVENT_QP_REQ_ERR:
+	case IBV_EVENT_QP_ACCESS_ERR:
+	case IBV_EVENT_SQ_DRAINED:
+	case IBV_EVENT_PATH_MIG:
+	case IBV_EVENT_PATH_MIG_ERR:
+	case IBV_EVENT_QP_LAST_WQE_REACHED:
+		conn = async_event.element.qp->qp_context;
+		eprintf("conn:0x%p cm_id:0x%p dev:%s, QP evt: %s\n",
+			&conn->h, conn->cm_id, dev_name, evt_name);
+		/* force "connection established" event */
+		if (async_event.event_type == IBV_EVENT_COMM_EST)
+			rdma_notify(conn->cm_id, IBV_EVENT_COMM_EST);
+		break;
+
+	/* CQ-related events */
+	case IBV_EVENT_CQ_ERR:
+		eprintf("dev:%s CQ evt: %s\n", dev_name, evt_name);
+		break;
+
+	/* SRQ events */
+	case IBV_EVENT_SRQ_ERR:
+	case IBV_EVENT_SRQ_LIMIT_REACHED:
+		eprintf("dev:%s SRQ evt: %s\n", dev_name, evt_name);
+		break;
+
+	/* Port events */
+	case IBV_EVENT_PORT_ACTIVE:
+	case IBV_EVENT_PORT_ERR:
+	case IBV_EVENT_LID_CHANGE:
+	case IBV_EVENT_PKEY_CHANGE:
+	case IBV_EVENT_SM_CHANGE:
+	case IBV_EVENT_CLIENT_REREGISTER:
+		eprintf("dev:%s port:%d evt: %s\n",
+			dev_name, async_event.element.port_num, evt_name);
+		break;
+
+	/* HCA events */
+	case IBV_EVENT_DEVICE_FATAL:
+		eprintf("dev:%s HCA evt: %s\n", dev_name, evt_name);
+		break;
+
+	default:
+		eprintf("dev:%s evt: %s\n", dev_name, evt_name);
+		break;
+	}
+
+	ibv_ack_async_event(&async_event);
+}
+
+/*
+ * First time a new connection is received on an RDMA device, record
+ * it and build a PD and static memory.
+ *
+ * Allocates the protection domain and the rdma buffer pool, creates the
+ * completion channel and a cq sized to the device maximum, arms the cq
+ * notifications, registers the cq and async event fds with the event
+ * loop and finally adds the device to iser_dev_list.
+ *
+ * Returns 0 on success, a non-zero value otherwise.
+ * ToDo: resources acquired before a failure are not released.
+ */
+static int iser_device_init(struct iser_device *dev)
+{
+	int cqe_num;
+	int err = -1;
+
+	dprintf("dev %p\n", dev);
+	dev->pd = ibv_alloc_pd(dev->ibv_ctxt);
+	if (dev->pd == NULL) {
+		eprintf("ibv_alloc_pd failed\n");
+		goto out;
+	}
+
+	err = iser_init_rdma_buf_pool(dev);
+	if (err) {
+		eprintf("iser_init_rdma_buf_pool failed\n");
+		goto out;
+	}
+
+	/* ibv_query_device() reports failure with a non-zero errno value,
+	   so any non-zero result must be treated as an error; otherwise
+	   device_attr would be used without having been filled in */
+	err = ibv_query_device(dev->ibv_ctxt, &dev->device_attr);
+	if (err) {
+		eprintf("ibv_query_device failed, %s\n",
+			strerror(err < 0 ? -err : err));
+		err = -1;
+		goto out;
+	}
+	cqe_num = dev->device_attr.max_cqe;
+	dprintf("max %d CQEs\n", cqe_num);
+
+	err = -1;
+	dev->cq_channel = ibv_create_comp_channel(dev->ibv_ctxt);
+	if (dev->cq_channel == NULL) {
+		eprintf("ibv_create_comp_channel failed\n");
+		goto out;
+	}
+
+	dev->cq = ibv_create_cq(dev->ibv_ctxt, cqe_num, NULL,
+				dev->cq_channel, 0);
+	if (dev->cq == NULL) {
+		eprintf("ibv_create_cq failed\n");
+		goto out;
+	}
+	dprintf("dev->cq:%p\n", dev->cq);
+
+	tgt_init_sched_event(&dev->poll_sched, iser_sched_poll_cq, dev);
+
+	err = ibv_req_notify_cq(dev->cq, 0);
+	if (err) {
+		eprintf("ibv_req_notify failed, %m\n");
+		goto out;
+	}
+
+	err = tgt_event_add(dev->cq_channel->fd, EPOLLIN,
+			    iser_handle_cq_event, dev);
+	if (err) {
+		eprintf("tgt_event_add failed, %m\n");
+		goto out;
+	}
+
+	err = tgt_event_add(dev->ibv_ctxt->async_fd, EPOLLIN,
+			    iser_handle_async_event, dev);
+	if (err) {
+		eprintf("tgt_event_add failed, %m\n");
+		goto out;
+	}
+
+	list_add_tail(&dev->list, &iser_dev_list);
+
+out:
+	return err;
+}
+
+/*
+ * Init entire iscsi transport.  Begin listening for connections.
+ *
+ * Creates the rdma cm event channel and the listening cm id, binds it to
+ * iser_listen_port on any local address, starts listening and registers
+ * the channel fd with the event loop. The global device and connection
+ * lists are initialized once all of that has succeeded.
+ *
+ * Returns 0 on success, non-zero otherwise.
+ */
+static int iser_ib_init(void)
+{
+	struct sockaddr_in listen_addr;
+	short int port = iser_listen_port;
+	int err;
+
+	rdma_evt_channel = rdma_create_event_channel();
+	if (rdma_evt_channel == NULL) {
+		eprintf("Failed to initialize RDMA; load kernel modules?\n");
+		return -1;
+	}
+
+	if (rdma_create_id(rdma_evt_channel, &cma_listen_id, NULL,
+			   RDMA_PS_TCP)) {
+		eprintf("rdma_create_id failed, %m\n");
+		return -1;
+	}
+
+	memset(&listen_addr, 0, sizeof(listen_addr));
+	listen_addr.sin_family = AF_INET;
+	listen_addr.sin_addr.s_addr = INADDR_ANY;
+	listen_addr.sin_port = htons(port);
+
+	err = rdma_bind_addr(cma_listen_id,
+			     (struct sockaddr *) &listen_addr);
+	if (err == -1) {
+		eprintf("rdma_bind_addr -1: %m\n");
+		return -1;
+	} else if (err) {
+		eprintf("rdma_bind_addr: %s\n", strerror(-err));
+		return -1;
+	}
+
+	/* 0 == maximum backlog */
+	err = rdma_listen(cma_listen_id, 0);
+	if (err == -1) {
+		eprintf("rdma_listen -1: %m\n");
+		return -1;
+	} else if (err) {
+		eprintf("rdma_listen: %s\n", strerror(-err));
+		return -1;
+	}
+
+	dprintf("listening for iser connections on port %d\n", port);
+	err = tgt_event_add(cma_listen_id->channel->fd, EPOLLIN,
+			    iser_handle_rdmacm, NULL);
+	if (err)
+		return err;
+
+	INIT_LIST_HEAD(&iser_dev_list);
+	INIT_LIST_HEAD(&iser_conn_list);
+	INIT_LIST_HEAD(&temp_conn);
+
+	return 0;
+}
+
+#include <signal.h>
+#include <sys/time.h>
+
+/* non-zero: iser_init() starts the periodic timer that sends ping
+   NOP-Ins; set by the "nop" parameter ("on"/"off"), on by default */
+static int iser_send_nop = 1;
+
+/* period of the timer, in seconds */
+#define ISER_TIMER_INT_SEC      5
+/* both the reload interval and the first expiration use the same period */
+static struct itimerval iser_timer = {
+	{ISER_TIMER_INT_SEC, 0},
+	{ISER_TIMER_INT_SEC, 0}
+};
+/* non-zero between iser_start_timer() and iser_stop_timer() */
+static int iser_timer_started;
+/* pipe carrying timer ticks from the SIGALRM handler, which writes to
+   timer_fd[1], to the event loop, which reads from timer_fd[0] */
+static int timer_fd[2] = {0, 0};
+
+/*
+ * SIGALRM handler: forwards the timer tick to the event loop by writing
+ * one word to the timer pipe.
+ *
+ * Only async-signal-safe calls are made here, so nothing is logged. A
+ * failed write is ignored on purpose, it only means that this tick is
+ * skipped. errno is preserved for the code that was interrupted.
+ */
+static void iser_timer_sig_handler(int data)
+{
+	unsigned int n = 0;
+	int saved_errno = errno;
+
+	(void) data; /* signal number, not needed */
+
+	if (write(timer_fd[1], &n, sizeof(n)) < 0) {
+		/* tick lost, the next one arrives after the timer period */
+	}
+
+	errno = saved_errno;
+}
+
+/*
+ * Event loop handler for the timer pipe: consumes one tick and sends a
+ * ping NOP-In on every connection that is in full-feature phase and has
+ * a NOP-In task available. The task is detached from the connection
+ * before it is sent.
+ */
+static void iser_timer_evt_handler(int fd, int events, void *data)
+{
+	struct iser_conn *conn;
+	struct iser_task *nop_task;
+	unsigned int tick;
+
+	if (read(timer_fd[0], &tick, sizeof(tick)) < 0) {
+		eprintf("Failed to read from pipe, %m\n");
+		return;
+	}
+
+	list_for_each_entry(conn, &iser_conn_list, conn_list) {
+		if (conn->h.state == STATE_FULL && conn->nop_in_task) {
+			nop_task = conn->nop_in_task;
+			conn->nop_in_task = NULL;
+			iser_send_ping_nop_in(nop_task);
+		}
+	}
+}
+
+/*
+ * Start the periodic timer that drives the ping NOP-Ins.
+ *
+ * The pipe and its event loop registration are set up first; the
+ * SIGALRM handler and the interval timer are armed only afterwards, so
+ * that the handler never runs before the pipe it writes to exists.
+ * On failure everything done so far is undone and the timer stays
+ * marked as not started.
+ *
+ * Returns 0 on success or when the timer is already running, non-zero
+ * otherwise.
+ */
+static int iser_start_timer(void)
+{
+	struct sigaction s;
+	int err;
+
+	if (iser_timer_started)
+		return 0;
+
+	err = pipe(timer_fd);
+	if (err) {
+		eprintf("Failed to open timer pipe\n");
+		return err;
+	}
+
+	err = tgt_event_add(timer_fd[0], EPOLLIN,
+			    iser_timer_evt_handler, NULL);
+	if (err) {
+		eprintf("failed to add iser timer epoll event, fd:%d\n",
+			timer_fd[0]);
+		goto close_pipe;
+	}
+
+	sigemptyset(&s.sa_mask);
+	sigaddset(&s.sa_mask, SIGALRM);
+	s.sa_flags = 0;
+	s.sa_handler = iser_timer_sig_handler;
+	err = sigaction(SIGALRM, &s, NULL);
+	if (err) {
+		eprintf("Failed to setup timer handler\n");
+		goto del_event;
+	}
+
+	/* if arming fails no SIGALRM is generated, so the handler
+	   installed above is left in place, it is never invoked */
+	err = setitimer(ITIMER_REAL, &iser_timer, NULL);
+	if (err) {
+		eprintf("Failed to set timer\n");
+		goto del_event;
+	}
+
+	iser_timer_started = 1;
+	dprintf("started, timeout: %d\n", ISER_TIMER_INT_SEC);
+
+	return 0;
+
+del_event:
+	tgt_event_del(timer_fd[0]);
+close_pipe:
+	close(timer_fd[0]);
+	close(timer_fd[1]);
+	timer_fd[0] = -1;
+	timer_fd[1] = -1;
+	return err;
+}
+
+/*
+ * Stop the periodic timer started by iser_start_timer().
+ *
+ * The interval timer is disarmed before the pipe is torn down, so that
+ * the SIGALRM handler does not write to a descriptor that has been
+ * closed and possibly reused. The descriptors are invalidated after
+ * being closed.
+ *
+ * Returns 0 on success or when the timer is not running, otherwise the
+ * setitimer() error.
+ */
+static int iser_stop_timer(void)
+{
+	/* an all-zero value disarms the timer */
+	struct itimerval disarm = {
+		{0, 0},
+		{0, 0}
+	};
+	int err;
+
+	if (!iser_timer_started)
+		return 0;
+	iser_timer_started = 0;
+
+	err = setitimer(ITIMER_REAL, &disarm, NULL);
+	if (err)
+		eprintf("Failed to stop timer\n");
+
+	tgt_event_del(timer_fd[0]);
+
+	if (timer_fd[0] > 0)
+		close(timer_fd[0]);
+	if (timer_fd[1] > 0)
+		close(timer_fd[1]);
+	timer_fd[0] = -1;
+	timer_fd[1] = -1;
+
+	dprintf("stopped\n");
+
+	return err;
+}
+
+/*
+ * Driver init callback: brings up the rdma transport and, when sending
+ * of NOP-Ins is enabled, starts the periodic timer.
+ * Returns 0 on success, or the error of the step that failed.
+ */
+static int iser_init(int index, char *args)
+{
+	int err = iser_ib_init();
+
+	if (err)
+		return err;
+
+	if (!iser_send_nop)
+		return 0;
+
+	return iser_start_timer();
+}
+
+/* Driver exit callback: stops the periodic timer; the result of
+   iser_stop_timer() is not checked */
+static void iser_exit(void)
+{
+	iser_stop_timer();
+}
+
+/*
+ * Driver target_create callback: creates the iscsi target and then marks
+ * it as an rdma target, with InitialR2T enabled and ImmediateData
+ * disabled in its session parameters.
+ * Returns 0 on success, or the error of iscsi_target_create().
+ */
+static int iser_target_create(struct target *t)
+{
+	struct iscsi_target *target;
+	int err = iscsi_target_create(t);
+
+	if (err)
+		return err;
+
+	/* the target has just been created, the lookup must succeed */
+	target = target_find_by_id(t->tid);
+	assert(target != NULL);
+
+	target->session_param[ISCSI_PARAM_IMM_DATA_EN].val = 0;
+	target->session_param[ISCSI_PARAM_INITIAL_R2T_EN].val = 1;
+	target->rdma = 1;
+
+	return 0;
+}
+
+/* parameter names and values understood by iser_param_parser() */
+static const char *lld_param_port = "port";
+static const char *lld_param_nop = "nop";
+static const char *lld_param_on = "on";
+static const char *lld_param_off = "off";
+
+/*
+ * Return the value part of the parameter token starting at p, whose name
+ * is name_len characters long. The single separator character following
+ * the name is stepped over, unless the token ends right after the name;
+ * an empty value is returned then, rather than a pointer beyond the end
+ * of the token.
+ */
+static char *iser_param_value(char *p, size_t name_len)
+{
+	p += name_len;
+	if (*p != '\0' && *p != ',')
+		p++;
+	return p;
+}
+
+/*
+ * Parse the comma-separated list of iser driver parameters:
+ *   port<sep><number>  - listening port, 0..65535
+ *   nop<sep>on|off     - enable/disable the periodic NOP-In
+ *
+ * Returns 0 on success, -EINVAL on an unknown parameter or an invalid
+ * value; parsing stops at the first error.
+ */
+static int iser_param_parser(char *p)
+{
+	int err = 0;
+	char *q;
+
+	while (*p) {
+		if (!strncmp(p, lld_param_port, strlen(lld_param_port))) {
+			long port_val;
+			char *end;
+
+			q = iser_param_value(p, strlen(lld_param_port));
+			/* errno must be cleared, strtol() sets it only on
+			   failure and it may hold a stale value */
+			errno = 0;
+			port_val = strtol(q, &end, 10);
+			if (errno || end == q ||
+			    (*end != '\0' && *end != ',') ||
+			    port_val < 0 || port_val > 65535) {
+				eprintf("invalid port supplied\n");
+				err = -EINVAL;
+				break;
+			}
+			/* kept as a short int, as it was stored before */
+			iser_listen_port = (short int) port_val;
+		} else if (!strncmp(p, lld_param_nop, strlen(lld_param_nop))) {
+			q = iser_param_value(p, strlen(lld_param_nop));
+			if (!strncmp(q, lld_param_on, strlen(lld_param_on)))
+				iser_send_nop = 1;
+			else
+			if (!strncmp(q, lld_param_off, strlen(lld_param_off)))
+				iser_send_nop = 0;
+			else {
+				eprintf("unsupported value for param:%s\n",
+					lld_param_nop);
+				err = -EINVAL;
+				break;
+			}
+		} else {
+			dprintf("unsupported param:%s\n", p);
+			err = -EINVAL;
+			break;
+		}
+
+		/* advance to the next comma-separated token */
+		p += strcspn(p, ",");
+		if (*p == ',')
+			++p;
+	}
+	return err;
+}
+
+/* iscsi transport descriptor of iSER: an rdma transport with data
+   padding enabled and its own endpoint show/getsockname/getpeername
+   callbacks.
+   NOTE(review): nothing in this part of the file references it - confirm
+   where it is registered */
+static struct iscsi_transport iscsi_iser = {
+	.name   		= "iser",
+	.rdma   		= 1,
+	.data_padding   	= 1,
+	.ep_show		= iser_show,
+	.ep_getsockname 	= iser_getsockname,
+	.ep_getpeername 	= iser_getpeername,
+};
+
+/* tgt driver descriptor of iSER, registered by iser_driver_constructor().
+   Init, exit, target creation and the completion notifications are
+   iser-specific; target destroy/update/show and transportid use the
+   generic iscsi implementations */
+static struct tgt_driver iser = {
+	.name   		= "iser",
+	.use_kernel     	= 0,
+	.init   		= iser_init,
+	.exit   		= iser_exit,
+	.target_create  	= iser_target_create,
+	.target_destroy 	= iscsi_target_destroy,
+
+	.update 		= iscsi_target_update,
+	.show 			= iscsi_target_show,
+	.cmd_end_notify 	= iser_scsi_cmd_done,
+	.mgmt_end_notify	= iser_tm_done,
+	.transportid    	= iscsi_transportid,
+	.default_bst    	= "rdwr",
+};
+
+/* Runs automatically before main() (constructor attribute): registers
+   the iser driver and the parser of its "iser" parameters */
+__attribute__ ((constructor))
+static void iser_driver_constructor(void)
+{
+	register_driver(&iser);
+
+	setup_param("iser", iser_param_parser);
+}
+
--
1.7.1
--
To unsubscribe from this list: send the line "unsubscribe stgt" in
the body of a message to majordomo at vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



More information about the stgt mailing list