[Stgt-devel] Tuning iSER for performance

Erez Zilber erezz
Thu Mar 6 11:10:17 CET 2008


> It would be interesting to isolate the IO effects from the thread
> switching to see where it all goes.  I recall a lot of it goes to
> non-IO overhead.  A really-synchronous of bs_rdwr follows.  It may
> not build against the current tree; it's old.

Here's a patch that works with the current version of stgt. 

commit 421d1ed5ede16dfd45d86921fcdaaac5db436e23
Author: Erez Zilber <erezz at voltaire.com>
Date:   Thu Mar 6 10:46:33 2008 +0200

    bs-rdwr-sync
    
    New file bs_rdwr_sync.c is similar in spirit to bs_rdwr.c but without all
    the threading.  For simpler debugging of core iscsi.

diff --git a/usr/Makefile b/usr/Makefile
index 13c77d2..bc489b5 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -11,7 +11,7 @@ CFLAGS += -DISCSI
 TGTD_OBJS += $(addprefix iscsi/, conn.o param.o session.o \
 		iscsid.o target.o chap.o transport.o iscsi_tcp.o \
 		isns.o libcrc32c.o)
-TGTD_OBJS += bs_rdwr.o bs_aio.o
+TGTD_OBJS += bs_rdwr.o bs_rdwr_sync.o bs_aio.o
 
 LIBS += -lcrypto
 ifneq ($(ISCSI_RDMA),)
diff --git a/usr/bs_rdwr_sync.c b/usr/bs_rdwr_sync.c
new file mode 100644
index 0000000..daa2398
--- /dev/null
+++ b/usr/bs_rdwr_sync.c
@@ -0,0 +1,204 @@
+/*
+ * Synchronous I/O file backing store routine, based on an old bs_rdwr.
+ *
+ * Copyright (C) 2006-2007 FUJITA Tomonori <tomof at acm.org>
+ * Copyright (C) 2006-2007 Mike Christie <michaelc at cs.wisc.edu>
+ * Copyright (C) 2006-2007 Pete Wyckoff <pw at osc.edu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+#define _XOPEN_SOURCE 500
+
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+
+#include <linux/fs.h>
+#include <sys/epoll.h>
+
+#include "list.h"
+#include "util.h"
+#include "tgtd.h"
+#include "scsi.h"
+
+static int bs_rdwr_sync_open(struct scsi_lu *lu, char *path, int *fd,
+			     uint64_t *size)
+{
+	*fd = backed_file_open(path, O_RDWR | O_LARGEFILE, size);
+	if (*fd < 0)
+		return *fd;
+	return 0;
+}
+
+static void bs_rdwr_sync_close(struct scsi_lu *lu)
+{
+	close(lu->fd);
+}
+
+static void set_medium_error(int *result, uint8_t *key, uint16_t *asc)
+{
+	*result = SAM_STAT_CHECK_CONDITION;
+	*key = MEDIUM_ERROR;
+	*asc = ASC_READ_ERROR;
+}
+
+static void bs_sync_sync_range(struct scsi_cmd *cmd, uint32_t length,
+			       int *result, uint8_t *key, uint16_t *asc)
+{
+	int ret;
+	unsigned int flags = SYNC_FILE_RANGE_WAIT_BEFORE| SYNC_FILE_RANGE_WRITE;
+
+	ret = __sync_file_range(cmd->dev->fd, cmd->offset, length, flags);
+	if (ret)
+		set_medium_error(result, key, asc);
+}
+
+static int bs_rdwr_sync_cmd_submit(struct scsi_cmd *cmd)
+{
+	int ret, fd = cmd->dev->fd;
+	uint32_t length;
+	int result = SAM_STAT_GOOD;
+	uint8_t key;
+	uint16_t asc;
+
+	ret = length = 0;
+	key = asc = 0;
+
+	switch (cmd->scb[0])
+	{
+	case SYNCHRONIZE_CACHE:
+	case SYNCHRONIZE_CACHE_16:
+		/* TODO */
+		length = (cmd->scb[0] == SYNCHRONIZE_CACHE) ? 0 : 0;
+
+		if (cmd->scb[1] & 0x2) {
+			result = SAM_STAT_CHECK_CONDITION;
+			key = ILLEGAL_REQUEST;
+			asc = ASC_INVALID_FIELD_IN_CDB;
+		} else
+			bs_sync_sync_range(cmd, length, &result, &key, &asc);
+		break;
+	case WRITE_6:
+	case WRITE_10:
+	case WRITE_12:
+	case WRITE_16:
+		length = scsi_get_out_length(cmd);
+		ret = pwrite64(fd, scsi_get_out_buffer(cmd), length,
+			       cmd->offset);
+		if (ret == length) {
+			if ((cmd->scb[0] != WRITE_6) && (cmd->scb[1] & 0x8))
+				bs_sync_sync_range(cmd, length, &result, &key,
+						   &asc);
+		} else
+			set_medium_error(&result, &key, &asc);
+
+		break;
+	case READ_6:
+	case READ_10:
+	case READ_12:
+	case READ_16:
+		length = scsi_get_in_length(cmd);
+		ret = pread64(fd, scsi_get_in_buffer(cmd), length,
+			      cmd->offset);
+
+		if (ret != length)
+			set_medium_error(&result, &key, &asc);
+		break;
+	default:
+		break;
+	}
+
+	dprintf("io done %p %x %d %u\n", cmd, cmd->scb[0], ret, length);
+
+	scsi_set_result(cmd, result);
+
+	if (result != SAM_STAT_GOOD) {
+		eprintf("io error %p %x %d %d %" PRIu64 ", %m\n",
+			cmd, cmd->scb[0], ret, length, cmd->offset);
+		sense_data_build(cmd, key, asc);
+	}
+
+	return 0;
+}
+
+/*static int bs_rdwr_sync_cmd_submit(struct scsi_cmd *cmd)
+{
+	struct scsi_lu *lu = cmd->dev;
+	int ret = 0, fd = lu->fd;
+	uint32_t length = 0;
+
+	dprintf("rw %d len %u off %llu cdb %02x\n", cmd->rw, cmd->len,
+		(unsigned long long) cmd->offset, cmd->scb[0]);
+
+	switch (cmd->scb[0]) {
+	case SYNCHRONIZE_CACHE:
+	case SYNCHRONIZE_CACHE_16:
+		ret = fsync(fd);
+		break;
+	case WRITE_6:
+	case WRITE_10:
+	case WRITE_12:
+	case WRITE_16:
+		length = scsi_get_out_length(cmd);
+		ret = pwrite64(fd, scsi_get_out_buffer(cmd), length,
+			       cmd->offset);
+		break;
+	case READ_6:
+	case READ_10:
+	case READ_12:
+	case READ_16:
+		length = scsi_get_in_length(cmd);
+		ret = pread64(fd, scsi_get_in_buffer(cmd), length, cmd->offset);
+		break;
+	default:
+		break;
+	}
+
+	if (ret == length) {
+		scsi_set_result(cmd, SAM_STAT_GOOD);
+	} else {
+		eprintf("io error %p %x %d %d %" PRIu64 ", %m\n",
+			cmd, cmd->scb[0], ret, length, cmd->offset);
+		scsi_set_result(cmd, SAM_STAT_CHECK_CONDITION);
+		sense_data_build(cmd, MEDIUM_ERROR, ASC_READ_ERROR);
+	}
+
+	return 0;
+}*/
+
+static int bs_rdwr_sync_cmd_done(struct scsi_cmd *cmd)
+{
+	return 0;
+}
+
+static struct backingstore_template rdwr_sync_bst = {
+	.bs_name		= "rdwr_sync",
+	.bs_datasize		= 0,
+	.bs_open		= bs_rdwr_sync_open,
+	.bs_close		= bs_rdwr_sync_close,
+	.bs_cmd_submit		= bs_rdwr_sync_cmd_submit,
+	.bs_cmd_done		= bs_rdwr_sync_cmd_done,
+};
+
+__attribute__((constructor)) static void bs_rdwr_sync_constructor(void)
+{
+	register_backingstore_template(&rdwr_sync_bst);
+}
diff --git a/usr/iscsi/iscsid.c b/usr/iscsi/iscsid.c
index 490a743..7ab4c8f 100644
--- a/usr/iscsi/iscsid.c
+++ b/usr/iscsi/iscsid.c
@@ -2153,7 +2153,7 @@ static struct tgt_driver iscsi = {
 	.show			= iscsi_target_show,
 	.cmd_end_notify		= iscsi_scsi_cmd_done,
 	.mgmt_end_notify	= iscsi_tm_done,
-	.default_bst		= "rdwr",
+	.default_bst		= "rdwr_sync",
 };
 
 __attribute__((constructor)) static void iscsi_driver_constructor(void)

Now, the performance is even lower (~460 MB/sec with rdwr_sync compared to ~670 MB/sec with rdwr). I've noticed that it takes a lot of time between target_cmd_queue (time = 663673) & iscsi_task_tx_start (669209).

I don't understand something in the behavior of iscsi_task_tx_start (this may be related to the long time mentioned above): when it is called, it handles only the 1st task in conn->tx_clist. Why doesn't it try to handle all tasks on the list? What happens is that after bs completes is work, it takes a lot of time until iscsi_task_tx_start is called for that task. iscsi_task_tx_start *is* called immediately, but it handles the 1st task only (so the current task has to wait for this thread to wake up multiple times until it will be handled). Can anyone explain this design?

Erez



More information about the stgt mailing list