[stgt] [PATCH 3/3] iscsi: use pthread per target for tcp

FUJITA Tomonori fujita.tomonori at lab.ntt.co.jp
Thu Jun 10 03:19:29 CEST 2010


Currently, the main process handles SCSI protocol processing (and
network I/O for iSCSI), and four I/O threads run per LUN to handle
disk I/O.

The current model doesn't scale with the number of targets if you have
a fast network (10GbE) and fast disk drives (SSDs).

With this patch, we use one pthread per target for iSCSI/TCP (not iSER).

Each target's pthread handles SCSI protocol processing and network I/O
for that target, and four I/O threads run per LUN to handle disk I/O.

Note that the pthread-per-target model is enabled only if tgt uses
signalfd (and the iSER transport has not been initialized). Even with
the main process model, tgt is much faster on signalfd-capable kernels.
Linux 2.6.22 or newer is strongly recommended.

Signed-off-by: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>
---
 usr/bs.c               |   55 +++++++++++++++++++++---------------
 usr/iscsi/conn.c       |    5 ++-
 usr/iscsi/iscsi_rdma.c |    8 +++++
 usr/iscsi/iscsi_tcp.c  |   56 ++++++++++++++++++++++++++++++++++--
 usr/iscsi/iscsid.c     |    9 +++++-
 usr/iscsi/iscsid.h     |    9 ++++++
 usr/iscsi/target.c     |   73 ++++++++++++++++++++++++++++++++++++++++++++++-
 usr/iscsi/transport.h  |    3 ++
 usr/target.h           |    2 +
 usr/tgtd.h             |    8 +++++
 10 files changed, 197 insertions(+), 31 deletions(-)

diff --git a/usr/bs.c b/usr/bs.c
index e74cc13..a29a5f4 100644
--- a/usr/bs.c
+++ b/usr/bs.c
@@ -34,14 +34,14 @@
 
 #include "list.h"
 #include "tgtd.h"
+#include "target.h"
 #include "tgtadm_error.h"
 #include "util.h"
 #include "bs_thread.h"
 
 static LIST_HEAD(bst_list);
 
-static LIST_HEAD(finished_list);
-static pthread_mutex_t finished_lock;
+struct bs_finish bs_finish, *bsf = &bs_finish;
 
 int sig_fd = -1;
 
@@ -87,15 +87,15 @@ retry:
 		goto out;
 	}
 
-	pthread_mutex_lock(&finished_lock);
+	pthread_mutex_lock(&bsf->finished_lock);
 retest:
-	if (list_empty(&finished_list)) {
-		pthread_cond_wait(&finished_cond, &finished_lock);
+	if (list_empty(&bsf->finished_list)) {
+		pthread_cond_wait(&finished_cond, &bsf->finished_lock);
 		goto retest;
 	}
 
-	while (!list_empty(&finished_list)) {
-		cmd = list_first_entry(&finished_list,
+	while (!list_empty(&bsf->finished_list)) {
+		cmd = list_first_entry(&bsf->finished_list,
 				 struct scsi_cmd, bs_list);
 
 		dprintf("found %p\n", cmd);
@@ -104,7 +104,7 @@ retest:
 		list_add_tail(&cmd->bs_list, &ack_list);
 	}
 
-	pthread_mutex_unlock(&finished_lock);
+	pthread_mutex_unlock(&bsf->finished_lock);
 
 	nr = 1;
 rewrite:
@@ -154,9 +154,10 @@ rewrite:
 	}
 }
 
-static void bs_sig_request_done(int fd, int events, void *data)
+void bs_sig_request_done(int fd, int events, void *data)
 {
 	int ret;
+	struct bs_finish *b = data;
 	struct scsi_cmd *cmd;
 	struct signalfd_siginfo siginfo[16];
 	LIST_HEAD(list);
@@ -166,9 +167,9 @@ static void bs_sig_request_done(int fd, int events, void *data)
 		return;
 	}
 
-	pthread_mutex_lock(&finished_lock);
-	list_splice_init(&finished_list, &list);
-	pthread_mutex_unlock(&finished_lock);
+	pthread_mutex_lock(&b->finished_lock);
+	list_splice_init(&b->finished_list, &list);
+	pthread_mutex_unlock(&b->finished_lock);
 
 	while (!list_empty(&list)) {
 		cmd = list_first_entry(&list, struct scsi_cmd, bs_list);
@@ -184,6 +185,7 @@ static void *bs_thread_worker_fn(void *arg)
 	struct bs_thread_info *info = arg;
 	struct scsi_cmd *cmd;
 	sigset_t set;
+	struct bs_finish *tbsf;
 
 	sigfillset(&set);
 	sigprocmask(SIG_BLOCK, &set, NULL);
@@ -207,16 +209,24 @@ static void *bs_thread_worker_fn(void *arg)
 		cmd = list_first_entry(&info->pending_list,
 				       struct scsi_cmd, bs_list);
 
+
+		if (cmd->c_target->bsf)
+			tbsf = cmd->c_target->bsf;
+		else
+			tbsf = bsf;
+
 		list_del(&cmd->bs_list);
 		pthread_mutex_unlock(&info->pending_lock);
 
 		info->request_fn(cmd);
 
-		pthread_mutex_lock(&finished_lock);
-		list_add_tail(&cmd->bs_list, &finished_list);
-		pthread_mutex_unlock(&finished_lock);
+		pthread_mutex_lock(&tbsf->finished_lock);
+		list_add_tail(&cmd->bs_list, &tbsf->finished_list);
+		pthread_mutex_unlock(&tbsf->finished_lock);
 
-		if (sig_fd < 0)
+		if (cmd->c_target->bsf)
+			pthread_kill(cmd->c_target->bsf->thread, SIGUSR2);
+		else if (sig_fd < 0)
 			pthread_cond_signal(&finished_cond);
 		else
 			kill(getpid(), SIGUSR2);
@@ -225,13 +235,11 @@ static void *bs_thread_worker_fn(void *arg)
 	pthread_exit(NULL);
 }
 
-static int bs_init_signalfd(void)
+static int bs_init_signalfd(struct bs_finish *b)
 {
 	sigset_t mask;
 	int ret;
 
-	pthread_mutex_init(&finished_lock, NULL);
-
 	sigemptyset(&mask);
 	sigaddset(&mask, SIGUSR2);
 	sigprocmask(SIG_BLOCK, &mask, NULL);
@@ -240,7 +248,7 @@ static int bs_init_signalfd(void)
 	if (sig_fd < 0)
 		return 1;
 
-	ret = tgt_event_add(sig_fd, EPOLLIN, bs_sig_request_done, NULL);
+	ret = tgt_event_add(sig_fd, EPOLLIN, bs_sig_request_done, b);
 	if (ret < 0) {
 		close (sig_fd);
 		sig_fd = -1;
@@ -256,7 +264,6 @@ static int bs_init_notify_thread(void)
 	int ret;
 
 	pthread_cond_init(&finished_cond, NULL);
-	pthread_mutex_init(&finished_lock, NULL);
 
 	ret = pipe(command_fd);
 	if (ret) {
@@ -298,7 +305,6 @@ close_command_fd:
 	close(command_fd[1]);
 destroy_cond_mutex:
 	pthread_cond_destroy(&finished_cond);
-	pthread_mutex_destroy(&finished_lock);
 
 	return 1;
 }
@@ -307,7 +313,10 @@ int bs_init(void)
 {
 	int ret;
 
-	ret = bs_init_signalfd();
+	pthread_mutex_init(&bsf->finished_lock, NULL);
+	INIT_LIST_HEAD(&bsf->finished_list);
+
+	ret = bs_init_signalfd(bsf);
 	if (!ret) {
 		eprintf("use signalfd notification\n");
 		return 0;
diff --git a/usr/iscsi/conn.c b/usr/iscsi/conn.c
index ba7a58f..d8601e1 100644
--- a/usr/iscsi/conn.c
+++ b/usr/iscsi/conn.c
@@ -23,6 +23,7 @@
 #include <string.h>
 #include <errno.h>
 #include <sys/stat.h>
+#include <sys/epoll.h>
 
 #include "iscsid.h"
 #include "tgtd.h"
@@ -231,7 +232,9 @@ int conn_close_force(uint32_t tid, uint64_t sid, uint32_t cid)
 			list_for_each_entry(conn, &session->conn_list, clist) {
 				if (conn->cid == cid) {
 					eprintf("close %" PRIx64 " %u\n", sid, cid);
-					conn_close(conn);
+					conn->state = STATE_CLOSE;
+					conn->tp->ep_event_modify(conn,
+								  EPOLLIN|EPOLLOUT|EPOLLERR);
 					return TGTADM_SUCCESS;
 				}
 			}
diff --git a/usr/iscsi/iscsi_rdma.c b/usr/iscsi/iscsi_rdma.c
index 63edebf..115d774 100644
--- a/usr/iscsi/iscsi_rdma.c
+++ b/usr/iscsi/iscsi_rdma.c
@@ -1194,6 +1194,8 @@ static int iscsi_rdma_init(void)
 	INIT_LIST_HEAD(&iser_conn_list);
 	INIT_LIST_HEAD(&temp_conn);
 
+	iscsi_rdma_enabled = 1;
+
 	return ret;
 }
 
@@ -1246,6 +1248,11 @@ static int iscsi_rdma_login_complete(struct iscsi_connection *conn)
 	return ret;
 }
 
+static void iscsi_rdma_nexus_init(struct iscsi_connection *conn)
+{
+	conn->tp->ep_event_modify(conn, EPOLLIN);
+}
+
 /*
  * Copy the remote va and stag that were temporarily saved in conn_info.
  */
@@ -1725,6 +1732,7 @@ static struct iscsi_transport iscsi_iser = {
 	.data_padding		= 1,
 	.ep_init		= iscsi_rdma_init,
 	.ep_login_complete	= iscsi_rdma_login_complete,
+	.ep_nexus_init		= iscsi_rdma_nexus_init,
 	.alloc_task		= iscsi_iser_alloc_task,
 	.free_task		= iscsi_iser_free_task,
 	.ep_read		= iscsi_iser_read,
diff --git a/usr/iscsi/iscsi_tcp.c b/usr/iscsi/iscsi_tcp.c
index 8fc145f..d1edd84 100644
--- a/usr/iscsi/iscsi_tcp.c
+++ b/usr/iscsi/iscsi_tcp.c
@@ -31,6 +31,7 @@
 #include <netinet/tcp.h>
 #include <sys/epoll.h>
 #include <sys/socket.h>
+#include <pthread.h>
 
 #include "iscsid.h"
 #include "tgtd.h"
@@ -43,6 +44,7 @@ static struct iscsi_transport iscsi_tcp;
 
 struct iscsi_tcp_connection {
 	int fd;
+	int pthread;
 
 	struct iscsi_connection iscsi_conn;
 };
@@ -153,6 +155,7 @@ out:
 static void iscsi_tcp_event_handler(int fd, int events, void *data)
 {
 	struct iscsi_connection *conn = (struct iscsi_connection *) data;
+	struct iscsi_tcp_connection *tcp_conn = TCP_CONN(conn);
 
 	if (events & EPOLLIN)
 		iscsi_rx_handler(conn);
@@ -165,7 +168,19 @@ static void iscsi_tcp_event_handler(int fd, int events, void *data)
 
 	if (conn->state == STATE_CLOSE) {
 		dprintf("connection closed %p\n", conn);
-		conn_close(conn);
+		if (tcp_conn->pthread) {
+			struct iscsi_target *target = conn->session->target;
+
+			pthread_mutex_lock(&target->event_lock);
+			do_tgt_event_del(target->efd, &target->events_list,
+					 tcp_conn->fd);
+			pthread_mutex_unlock(&target->event_lock);
+			/* let the main thread handle this */
+			tcp_conn->pthread = 0;
+			tgt_event_modify(tcp_conn->fd, EPOLLIN|EPOLLOUT|EPOLLERR);
+		} else {
+			conn_close(conn);
+		}
 	}
 }
 
@@ -263,6 +278,29 @@ static int iscsi_tcp_conn_login_complete(struct iscsi_connection *conn)
 	return 0;
 }
 
+static void iscsi_tcp_conn_nexus_init(struct iscsi_connection *conn)
+{
+	struct iscsi_tcp_connection *tcp_conn = TCP_CONN(conn);
+	struct iscsi_target *target = conn->session->target;
+
+	if (iscsi_pthread_per_target()) {
+		/* remove the conn from the main thread. */
+		conn->tp->ep_event_modify(conn, 0);
+
+		pthread_mutex_lock(&target->event_lock);
+
+		do_tgt_event_add(target->efd, &target->events_list,
+				 tcp_conn->fd, EPOLLIN,
+				 iscsi_tcp_event_handler, conn);
+
+		pthread_mutex_unlock(&target->event_lock);
+
+		tcp_conn->pthread = 1;
+	}
+
+	conn->tp->ep_event_modify(conn, EPOLLIN);
+}
+
 static size_t iscsi_tcp_read(struct iscsi_connection *conn, void *buf,
 			     size_t nbytes)
 {
@@ -336,9 +374,18 @@ static void iscsi_event_modify(struct iscsi_connection *conn, int events)
 	struct iscsi_tcp_connection *tcp_conn = TCP_CONN(conn);
 	int ret;
 
-	ret = tgt_event_modify(tcp_conn->fd, events);
-	if (ret)
-		eprintf("tgt_event_modify failed\n");
+	if (tcp_conn->pthread) {
+		struct iscsi_target *target = conn->session->target;
+
+		pthread_mutex_lock(&target->event_lock);
+		do_tgt_event_modify(target->efd, &target->events_list,
+				    tcp_conn->fd, events);
+		pthread_mutex_unlock(&target->event_lock);
+	} else {
+		ret = tgt_event_modify(tcp_conn->fd, events);
+		if (ret)
+			eprintf("tgt_event_modify failed\n");
+	}
 }
 
 static struct iscsi_task *iscsi_tcp_alloc_task(struct iscsi_connection *conn,
@@ -391,6 +438,7 @@ static struct iscsi_transport iscsi_tcp = {
 	.ep_init		= iscsi_tcp_init,
 	.ep_exit		= iscsi_tcp_exit,
 	.ep_login_complete	= iscsi_tcp_conn_login_complete,
+	.ep_nexus_init		= iscsi_tcp_conn_nexus_init,
 	.alloc_task		= iscsi_tcp_alloc_task,
 	.free_task		= iscsi_tcp_free_task,
 	.ep_read		= iscsi_tcp_read,
diff --git a/usr/iscsi/iscsid.c b/usr/iscsi/iscsid.c
index dcca384..b4e0969 100644
--- a/usr/iscsi/iscsid.c
+++ b/usr/iscsi/iscsid.c
@@ -73,6 +73,13 @@ enum {
 	IOSTATE_TX_END,
 };
 
+int iscsi_rdma_enabled;
+
+int iscsi_pthread_per_target(void)
+{
+	return sig_fd >= 0 && !iscsi_rdma_enabled;
+}
+
 void conn_read_pdu(struct iscsi_connection *conn)
 {
 	conn->rx_iostate = IOSTATE_RX_BHS;
@@ -2224,7 +2231,7 @@ finish:
 		else {
 			conn->state = STATE_SCSI;
 			conn_read_pdu(conn);
-			conn->tp->ep_event_modify(conn, EPOLLIN);
+			conn->tp->ep_nexus_init(conn);
 		}
 		break;
 	case STATE_EXIT:
diff --git a/usr/iscsi/iscsid.h b/usr/iscsi/iscsid.h
index 6b982cb..1e70d81 100644
--- a/usr/iscsi/iscsid.h
+++ b/usr/iscsi/iscsid.h
@@ -244,6 +244,13 @@ struct iscsi_target {
 	int nr_sessions;
 
 	struct list_head isns_list;
+
+	int efd;
+	pthread_mutex_t event_lock;
+	struct list_head events_list;
+
+	struct bs_finish bsfin;
+	int stop_pthread;
 };
 
 enum task_flags {
@@ -310,6 +317,8 @@ extern int iscsi_target_show(int mode, int tid, uint64_t sid, uint32_t cid,
 int iscsi_target_update(int mode, int op, int tid, uint64_t sid, uint64_t lun,
 			uint32_t cid, char *name);
 
+int iscsi_pthread_per_target(void);
+
 /* param.c */
 int param_index_by_name(char *name, struct iscsi_key *keys);
 
diff --git a/usr/iscsi/target.c b/usr/iscsi/target.c
index c6ac031..b547626 100644
--- a/usr/iscsi/target.c
+++ b/usr/iscsi/target.c
@@ -25,6 +25,7 @@
 #include <unistd.h>
 #include <netdb.h>
 #include <sys/stat.h>
+#include <sys/epoll.h>
 #include <sys/un.h>
 #include <netinet/in.h>
 #include <sys/socket.h>
@@ -32,10 +33,12 @@
 #include <netinet/tcp.h>
 #include <netinet/ip.h>
 #include <arpa/inet.h>
+#include <pthread.h>
 #include "iscsid.h"
 #include "tgtadm.h"
 #include "tgtd.h"
 #include "target.h"
+#include "util.h"
 
 LIST_HEAD(iscsi_targets_list);
 
@@ -252,12 +255,63 @@ void iscsi_target_destroy(int tid)
 	}
 
 	list_del(&target->tlist);
+
+	pthread_mutex_init(&target->event_lock, NULL);
+
+	if (target->bsfin.thread) {
+		target->stop_pthread = 1;
+		pthread_kill(target->bsfin.thread, SIGUSR2);
+
+		pthread_join(target->bsfin.thread, NULL);
+		pthread_mutex_destroy(&target->bsfin.finished_lock);
+	}
+
+	close(target->efd);
 	free(target);
 	isns_target_deregister(tgt_targetname(tid));
 
 	return;
 }
 
+static void *iscsi_thread_fn(void *arg)
+{
+	struct iscsi_target *t = arg;
+	struct epoll_event events[1024];
+	struct event_data *tev;
+	sigset_t mask;
+	int nevent, i;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGUSR2);
+	pthread_sigmask(SIG_BLOCK, &mask, NULL);
+
+	pthread_mutex_lock(&t->event_lock);
+
+	do_tgt_event_add(t->efd, &t->events_list, sig_fd, EPOLLIN,
+			 bs_sig_request_done, &t->bsfin);
+
+	pthread_mutex_unlock(&t->event_lock);
+
+retry:
+	nevent = epoll_wait(t->efd, events, ARRAY_SIZE(events), 1000);
+	if (nevent < 0) {
+		if (errno != EINTR) {
+			eprintf("%m\n");
+			exit(1);
+		}
+	} else if (nevent) {
+		for (i = 0; i < nevent; i++) {
+			tev = (struct event_data *) events[i].data.ptr;
+			tev->handler(tev->fd, events[i].events, tev->data);
+		}
+	}
+
+	if (!t->stop_pthread)
+		goto retry;
+
+	pthread_exit(NULL);
+}
+
 int iscsi_target_create(struct target *t)
 {
 	int tid = t->tid;
@@ -288,11 +342,15 @@ int iscsi_target_create(struct target *t)
 		[ISCSI_PARAM_MAX_OUTST_PDU] =  {0, 0},  /* not in open-iscsi */
 	};
 
-	target = malloc(sizeof(*target));
+	target = zalloc(sizeof(*target));
 	if (!target)
 		return -ENOMEM;
 
-	memset(target, 0, sizeof(*target));
+	target->efd = epoll_create(128);
+	if (target->efd < 0) {
+		free(target);
+		return -EINVAL;
+	}
 
 	memcpy(target->session_param, default_tgt_session_param,
 	       sizeof(target->session_param));
@@ -300,10 +358,21 @@ int iscsi_target_create(struct target *t)
 	INIT_LIST_HEAD(&target->tlist);
 	INIT_LIST_HEAD(&target->sessions_list);
 	INIT_LIST_HEAD(&target->isns_list);
+	INIT_LIST_HEAD(&target->events_list);
 	target->tid = tid;
 	list_add_tail(&target->tlist, &iscsi_targets_list);
 
 	isns_target_register(tgt_targetname(tid));
+
+	if (iscsi_pthread_per_target()) {
+		pthread_create(&target->bsfin.thread, NULL, iscsi_thread_fn, target);
+
+		pthread_mutex_init(&target->bsfin.finished_lock, NULL);
+		INIT_LIST_HEAD(&target->bsfin.finished_list);
+		t->bsf = &target->bsfin;
+		eprintf("create thread %u\n", (unsigned)target->bsfin.thread);
+	}
+
 	return 0;
 }
 
diff --git a/usr/iscsi/transport.h b/usr/iscsi/transport.h
index 92a6f0a..c94b86b 100644
--- a/usr/iscsi/transport.h
+++ b/usr/iscsi/transport.h
@@ -4,6 +4,8 @@
 #include <sys/socket.h>
 #include "list.h"
 
+extern int iscsi_rdma_enabled;
+
 struct iscsi_connection;
 struct iscsi_task;
 
@@ -17,6 +19,7 @@ struct iscsi_transport {
 	int (*ep_init) (void);
 	void (*ep_exit) (void);
 	int (*ep_login_complete)(struct iscsi_connection *conn);
+	void (*ep_nexus_init)(struct iscsi_connection *conn);
 	struct iscsi_task *(*alloc_task)(struct iscsi_connection *conn,
 					 size_t ext_len);
 	void (*free_task)(struct iscsi_task *task);
diff --git a/usr/target.h b/usr/target.h
index 9283431..4607fc4 100644
--- a/usr/target.h
+++ b/usr/target.h
@@ -39,6 +39,8 @@ struct target {
 	struct list_head acl_list;
 
 	struct tgt_account account;
+
+	struct bs_finish *bsf;
 };
 
 struct it_nexus {
diff --git a/usr/tgtd.h b/usr/tgtd.h
index 79d9c88..b8541c8 100644
--- a/usr/tgtd.h
+++ b/usr/tgtd.h
@@ -334,6 +334,14 @@ struct event_data {
 
 extern int sig_fd;
 
+struct bs_finish {
+	struct list_head finished_list;
+	pthread_mutex_t finished_lock;
+	pthread_t thread;
+};
+
+void bs_sig_request_done(int fd, int events, void *data);
+
 int do_tgt_event_add(int efd, struct list_head *list, int fd, int events,
 		     event_handler_t handler, void *data);
 void do_tgt_event_del(int efd, struct list_head *list, int fd);
-- 
1.6.5

--
To unsubscribe from this list: send the line "unsubscribe stgt" in
the body of a message to majordomo at vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



More information about the stgt mailing list