[Stgt-devel] iSER multiple readers

Robin Humble robin.humble+stgt
Sat Feb 9 17:39:04 CET 2008


On Fri, Feb 08, 2008 at 02:11:33PM -0500, Pete Wyckoff wrote:
>robin.humble+stgt at anu.edu.au wrote on Thu, 07 Feb 2008 21:07 -0500:
>> I think I'm seeing iSER read corruption problems.
>> a) in stock centos5.1 when not using kernel 2.6.22.6 I get read
>>    corruption with just a single reader
>> b) every kernel/OS/ofed combination that I've tried when there are
>>    multiple simultaneous readers
...
>> the easiest way I can reproduce it is:
>>  initiator side - write data:
>>    lmdd if=internal of=/dev/sdc opat=1 bs=1M count=1000
>>  target side - check that the file is ok (it is):
>>    lmdd of=internal if=/mnt/ramdisk/file ipat=1 bs=1M count=1000 mismatch=1
>>  initiator side - read and check data (is sometimes ok):
>>    lmdd of=internal if=/dev/sdc ipat=1 bs=1M count=1000 mismatch=1
>>  initiator side - read data with 2 processes at once (always fails):
>>    lmdd of=internal if=/dev/sdc ipat=1 bs=1M count=1000 mismatch=1 &
>>    lmdd of=internal if=/dev/sdc ipat=1 bs=1M count=1000 mismatch=1 &
>> 
>> I'm using the kernel git tree of stgt.
>> I don't see any problems when using TCP IPoIB.
>
>I've tried this and a few variations but can't find any problems.
>That's unfortunate.  To debug it, perhaps you can investigate the
>mismatched data that comes back and see if you can discern a
>pattern.  Like if it is always at 4k boundaries, or always at 512k
>boundaries, that could help us to narrow it down.

a few runs of
  lmdd of=internal if=/dev/sdc bs=1M count=7000 ipat=1 mismatch=1
give:
 off=116000000 want=6f80000 got=6fa1000
 off=518000000 want=1eec0000 got=1eee1000
 off=12000000 want=c40000 got=c5d000
 off=627000000 want=256e0000 got=256ee000
 off=344000000 want=148b6000 got=148c0000
 off=163000000 want=9c40000 got=9c5b000
 off=11000000 want=b40000 got=b47000
 off=514000000 want=1eb20000 got=1eb21000
 off=28000000 want=1b80000 got=1b93000
 off=78000000 want=4b3d000 got=4b41000
 off=70000000 want=4360000 got=4381000
 off=0 want=e0000 got=fb000
 off=20000000 want=13e0000 got=13fa000
so always on MB boundaries?

a few tests show that it's pretty hard to get mismatches at around
bs=384 and below.

with bs=512
  lmdd of=internal if=/dev/sdc bs=512 count=7000000 ipat=1 mismatch=1
I get
 off=1010024448 want=3c33c000 got=3c350000
 off=1693302784 want=64edc000 got=64eea000
 off=45203456 want=2b1c000 got=2b27000
 off=289783808 want=1145c000 got=11460000
 off=507494400 want=1e3fc000 got=1e40f000
 off=282181632 want=10d1c000 got=10d30000
 off=334217216 want=13ebc000 got=13ebe000
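
fwiw, the got-want deltas above all come out as multiples of 0x1000,
i.e. 4 KiB, which might be the kind of pattern you were after. a
throwaway checker along these lines (just a sketch, nothing in the
tree) makes it easy to eyeball - feed it the hex want/got pairs from
lmdd and it prints each delta and its alignment:

/* quick hack: print got-want deltas and their alignment.
 * assumes the values are hex, as lmdd prints them. */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	int i;

	for (i = 1; i + 1 < argc; i += 2) {
		unsigned long want = strtoul(argv[i], NULL, 16);
		unsigned long got = strtoul(argv[i + 1], NULL, 16);
		unsigned long d = got - want;

		printf("want=%lx got=%lx delta=%lx 4k=%s 64k=%s 1M=%s\n",
		       want, got, d,
		       (d & 0xfff) ? "no" : "yes",
		       (d & 0xffff) ? "no" : "yes",
		       (d & 0xfffff) ? "no" : "yes");
	}
	return 0;
}

e.g. save it as chk.c (or whatever) and:
  gcc -o chk chk.c && ./chk 6f80000 6fa1000 c40000 c5d000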

>You had another corruption issue a long time ago that I thought was
>related to the response message getting in front of the RDMA.  But
>IB guys insist that this is not possible.  I had a patch that I very
>much did not like that delayed the final response message until the
>target saw the local completions for its RDMAs.  This never went in.
>It is dated 16 oct 2007.  In case your notes or mail archives lead
>you to believe this current read corruption is similar.

as always, thanks for looking into this so quickly!

so with ye olde
  [PATCH 20/20] iser wait for rdma completion
applied, single and multiple readers with stock centos5.1 kernels
and userland now work ok. odd.

is there any way to check more definitively whether the ordering is
getting messed up with my hardware/OS/OFED combo? perhaps some sort
of micro verbs/rdma benchmark that would convince the IB guys one way
or the other?
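
something along these lines is what I have in mind (rough sketch only,
and the names are placeholders - it assumes an already-connected RC QP
and buffers registered on both sides, which the real test would have
to set up first): post an RDMA WRITE of a known pattern and then a
SEND on the same QP without waiting for the WRITE's local completion,
and have the peer check whether the pattern is fully visible when the
SEND arrives. that's the same ordering tgt is relying on today.

/* posting side of a verbs ordering micro-test - sketch only.
 * assumes an established RC QP "qp", a registered local buffer
 * "buf"/"mr" pre-filled with a known pattern, and the peer's
 * "remote_va"/"remote_rkey" exchanged out of band. */
#include <stdint.h>
#include <string.h>
#include <infiniband/verbs.h>

static int post_write_then_send(struct ibv_qp *qp, struct ibv_mr *mr,
				void *buf, uint32_t len,
				uint64_t remote_va, uint32_t remote_rkey)
{
	struct ibv_sge sge = {
		.addr	= (uintptr_t) buf,
		.length	= len,
		.lkey	= mr->lkey,
	};
	struct ibv_send_wr wr, *bad_wr;
	int ret;

	/* 1. RDMA WRITE of the test pattern to the peer's buffer */
	memset(&wr, 0, sizeof(wr));
	wr.wr_id = 1;
	wr.sg_list = &sge;
	wr.num_sge = 1;
	wr.opcode = IBV_WR_RDMA_WRITE;
	wr.send_flags = IBV_SEND_SIGNALED;
	wr.wr.rdma.remote_addr = remote_va;
	wr.wr.rdma.rkey = remote_rkey;
	ret = ibv_post_send(qp, &wr, &bad_wr);
	if (ret)
		return ret;

	/* 2. zero-length SEND posted immediately, without waiting for
	 * the WRITE's local completion - the same thing tgt does.  the
	 * receiver compares its buffer against the pattern the moment
	 * this message shows up on its recv CQ. */
	memset(&wr, 0, sizeof(wr));
	wr.wr_id = 2;
	wr.opcode = IBV_WR_SEND;
	wr.send_flags = IBV_SEND_SIGNALED;
	return ibv_post_send(qp, &wr, &bad_wr);
}

if that ever shows stale data on the receive side after the SEND has
arrived, the ordering argument is settled; if it never does, the
problem is presumably somewhere in tgt/iser itself.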

I've attached an updated version of the patch that applies to the
current tree.

cheers,
robin
-------------- next part --------------
--- tgt/usr/iscsi/iscsi_rdma.c	2008-01-23 12:50:27.000000000 +1100
+++ tgt+iserCompletePatch/usr/iscsi/iscsi_rdma.c	2008-02-10 02:17:06.000000000 +1100
@@ -94,6 +94,7 @@
 	struct ibv_send_wr wr;
 	struct list_head list;
 	struct iscsi_task *task;  /* to get iser_task for remote stag and va */
+	int final_rdma;
 };
 
 /*
@@ -938,6 +939,14 @@
 
 		iscsi_rdma_event_modify(conn, EPOLLIN | EPOLLOUT);
 		list_add(&rdmal->list, &ci->rdmal);
+
+		/* now let it transmit the final response, as we know
+		 * the RDMAs have completed */
+		if (rdmal->final_rdma) {
+			struct iscsi_task *task = rdmal->task;
+			list_add_tail(&task->c_list, &task->conn->tx_clist);
+		}
+
 		if (waiting_rdma_slot) {
 			waiting_rdma_slot = 0;
 			num_tx_ready = 1;
@@ -1408,7 +1417,8 @@
  */
 static int iser_post_rdma_wr(struct conn_info *ci, struct iscsi_task *task,
 			     void *buf, ssize_t size, int op,
-			     uint64_t remote_va, uint32_t remote_rkey)
+			     uint64_t remote_va, uint32_t remote_rkey,
+			     int final_rdma)
 {
 	int ret;
 	struct rdmalist *rdmal;
@@ -1429,6 +1439,8 @@
 	rdmal->wr.wr.rdma.remote_addr = remote_va;
 	rdmal->wr.wr.rdma.rkey = remote_rkey;
 
+	rdmal->final_rdma = final_rdma;
+
 	ret = ibv_post_send(ci->qp_hndl, &rdmal->wr, &bad_wr);
 	if (ret)
 		eprintf("ibv_post_send ret %d\n", ret);
@@ -1457,7 +1469,7 @@
 		(unsigned long long) itask->rem_write_va);
 
 	ret = iser_post_rdma_wr(ci, task, buf, len, IBV_WR_RDMA_READ,
-				itask->rem_write_va, itask->rem_write_stag);
+				itask->rem_write_va, itask->rem_write_stag, 0);
 	if (ret < 0)
 		return ret;
 
@@ -1482,6 +1494,7 @@
 	struct iser_task *itask = ISER_TASK(task);
 	struct iscsi_pdu *rsp = &conn->rsp;
 	struct iscsi_data_rsp *datain = (struct iscsi_data_rsp *) &rsp->bhs;
+	int final_rdma = (task->offset == task->len);
 	uint32_t offset;
 	int ret;
 
@@ -1492,7 +1505,7 @@
 
 	ret = iser_post_rdma_wr(ci, task, rsp->data, rsp->datasize,
 				IBV_WR_RDMA_WRITE, itask->rem_read_va + offset,
-				itask->rem_read_stag);
+				itask->rem_read_stag, final_rdma);
 	if (ret < 0)
 		return ret;
 
@@ -1501,7 +1514,7 @@
 	 * rdma to finish before sending the completion.  Then we'll stick
 	 * ourselves back on the list.
 	 */
-	if (task->offset == task->len) {
+	if (final_rdma) {
 		iscsi_rdma_event_modify(conn, EPOLLIN);
 	} else {
 		/* poke ourselves to do the next rdma */
--- tgt/usr/iscsi/iscsid.c	2008-01-23 12:50:27.000000000 +1100
+++ tgt+iserCompletePatch/usr/iscsi/iscsid.c	2008-02-10 02:47:18.000000000 +1100
@@ -1700,14 +1700,20 @@
 	case ISCSI_OP_R2T:
 		break;
 	case ISCSI_OP_SCSI_DATA_IN:
-		if (task->offset < task->len ||
-		    scsi_get_result(&task->scmd) != SAM_STAT_GOOD ||
-		    scsi_get_data_dir(&task->scmd) == DATA_BIDIRECTIONAL ||
-		    conn->tp->rdma) {
-			dprintf("more data or sense or bidir %x\n", hdr->itt);
-			list_add_tail(&task->c_list, &task->conn->tx_clist);
-			return 0;
-		}
+		if (conn->tp->rdma) {
+			/* keep sending RDMA writes, but wait until they
+			 * are done before sending final response */
+			if (task->offset < task->len)
+				list_add_tail(&task->c_list, &task->conn->tx_clist);
+		} else {
+			if (task->offset < task->len ||
+			    scsi_get_result(&task->scmd) != SAM_STAT_GOOD ||
+			    scsi_get_data_dir(&task->scmd) == DATA_BIDIRECTIONAL) {
+				dprintf("more data or sense or bidir %x\n", hdr->itt);
+				list_add_tail(&task->c_list, &task->conn->tx_clist);
+			}
+		}
+		break;
 	case ISCSI_OP_SCSI_CMD_RSP:
 		iscsi_free_cmd_task(task);
 		break;


