[Stgt-devel] [PATCH 20/20] iser wait for rdma completion

Pete Wyckoff pw
Tue Oct 16 17:21:27 CEST 2007


If we have just sent a DATA_IN, do not do anything else until the RDMA
completion arrives.  Sending the response back to the initiator before
the RDMA has completed could be disastrous.  The completion handler will
put the task back on the tx list.

Note: this patch is very likely unneeded.  A local RDMA Write completion
means nothing by itself.  It is enough to rely on the fact that the Send
goes out after the RDMA Write operations; there is no need to wait for
them to complete, only to ensure that they have all gone out.

Signed-off-by: Pete Wyckoff <pw at osc.edu>
---
 usr/iscsi/iscsi_rdma.c |   21 +++++++++++++++++----
 usr/iscsi/iscsid.c     |   19 ++++++++++++++-----
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/usr/iscsi/iscsi_rdma.c b/usr/iscsi/iscsi_rdma.c
index f4c6c23..ab4b3f2 100644
--- a/usr/iscsi/iscsi_rdma.c
+++ b/usr/iscsi/iscsi_rdma.c
@@ -94,6 +94,7 @@ struct rdmalist {
 	struct ibv_send_wr wr;
 	struct list_head list;
 	struct iscsi_task *task;  /* to get iser_task for remote stag and va */
+	int final_rdma;
 };
 
 /*
@@ -917,6 +918,14 @@ static void handle_wc(struct ibv_wc *wc)
 
 		iscsi_rdma_event_modify(ci->iscsi_conn, EPOLLIN | EPOLLOUT);
 		list_add(&rdmal->list, &ci->rdmal);
+
+		/* now let it transmit the final response, as we know
+		 * the RDMAs have completed */
+		if (rdmal->final_rdma) {
+			struct iscsi_task *task = rdmal->task;
+			list_add_tail(&task->c_list, &task->conn->tx_clist);
+		}
+
 		if (waiting_rdma_slot) {
 			waiting_rdma_slot = 0;
 			num_tx_ready = 1;
@@ -1374,7 +1383,8 @@ static void iscsi_iser_write_end(struct iscsi_connection *iscsi_conn)
  */
 static int iser_post_rdma_wr(struct conn_info *ci, struct iscsi_task *task,
 			     void *buf, ssize_t size, int op,
-			     uint64_t remote_va, uint32_t remote_rkey)
+			     uint64_t remote_va, uint32_t remote_rkey,
+			     int final_rdma)
 {
 	int ret;
 	struct rdmalist *rdmal;
@@ -1395,6 +1405,8 @@ static int iser_post_rdma_wr(struct conn_info *ci, struct iscsi_task *task,
 	rdmal->wr.wr.rdma.remote_addr = remote_va;
 	rdmal->wr.wr.rdma.rkey = remote_rkey;
 
+	rdmal->final_rdma = final_rdma;
+
 	ret = ibv_post_send(ci->qp_hndl, &rdmal->wr, &bad_wr);
 	if (ret)
 		eprintf("ibv_post_send ret %d\n", ret);
@@ -1423,7 +1435,7 @@ static int iscsi_rdma_rdma_read(struct iscsi_connection *conn)
 		(unsigned long long) itask->rem_write_va);
 
 	ret = iser_post_rdma_wr(ci, task, buf, len, IBV_WR_RDMA_READ,
-				itask->rem_write_va, itask->rem_write_stag);
+				itask->rem_write_va, itask->rem_write_stag, 0);
 	if (ret < 0)
 		return ret;
 
@@ -1448,6 +1460,7 @@ static int iscsi_rdma_rdma_write(struct iscsi_connection *conn)
 	struct iser_task *itask = task->trans_data;
 	struct iscsi_pdu *rsp = &conn->rsp;
 	struct iscsi_data_rsp *datain = (struct iscsi_data_rsp *) &rsp->bhs;
+	int final_rdma = (task->offset == task->len);
 	uint32_t offset;
 	int ret;
 
@@ -1458,7 +1471,7 @@ static int iscsi_rdma_rdma_write(struct iscsi_connection *conn)
 
 	ret = iser_post_rdma_wr(ci, task, rsp->data, rsp->datasize,
 				IBV_WR_RDMA_WRITE, itask->rem_read_va + offset,
-				itask->rem_read_stag);
+				itask->rem_read_stag, final_rdma);
 	if (ret < 0)
 		return ret;
 
@@ -1467,7 +1480,7 @@ static int iscsi_rdma_rdma_write(struct iscsi_connection *conn)
 	 * rdma to finish before sending the completion.  Then we'll stick
 	 * ourselves back on the list.
 	 */
-	if (task->offset == task->len) {
+	if (final_rdma) {
 		iscsi_rdma_event_modify(ci->iscsi_conn, EPOLLIN);
 	} else {
 		/* poke ourselves to do the next rdma */
diff --git a/usr/iscsi/iscsid.c b/usr/iscsi/iscsid.c
index 7f882fa..3479ee7 100644
--- a/usr/iscsi/iscsid.c
+++ b/usr/iscsi/iscsid.c
@@ -1708,12 +1708,21 @@ static int iscsi_scsi_cmd_tx_done(struct iscsi_connection *conn)
 	case ISCSI_OP_R2T:
 		break;
 	case ISCSI_OP_SCSI_DATA_IN:
-		if (task->offset < task->len || task->result != 0
-		   || task->dir == BIDIRECTIONAL || conn->tp->rdma) {
-			dprintf("more data or sense or bidir %x\n", hdr->itt);
-			list_add_tail(&task->c_list, &task->conn->tx_clist);
-			return 0;
+		if (conn->tp->rdma) {
+			/* keep sending RDMA writes, but wait until they
+			 * are done before sending final response */
+			if (task->offset < task->len)
+			    list_add_tail(&task->c_list, &task->conn->tx_clist);
+		} else {
+			if (task->offset < task->len || task->result != 0
+			   || task->dir == BIDIRECTIONAL) {
+				dprintf("more data or sense or bidir %x\n",
+					hdr->itt);
+				list_add_tail(&task->c_list,
+					      &task->conn->tx_clist);
+			}
 		}
+		break;
 	case ISCSI_OP_SCSI_CMD_RSP:
 		iscsi_free_cmd_task(task);
 		break;
-- 
1.5.3.4




More information about the stgt mailing list