[stgt] tgtd exits in iscsi_tx_handler during heavy I/Os

Kiefer Chang zapchang at gmail.com
Wed May 18 04:45:36 CEST 2011


Dear Tomonori,


The symptom happens when doing heavy I/O.
I will apply the patch and try to reproduce the problem as soon as possible.

By the way, if a connection is closed and a new connection is accept,
the previous released fd will be reused (accept function return the
same fd just like the previous-closed connection).
The code will then try to zalloc a new tcp_conn structure and use the
iscsi_conn in it.
The tcp_conn's address allocated will be the same memory address as
previous closed-connection.
The behavior is correct, but I just wondering if this may be related...

Thanks a lot!

--
Kiefer Chang

2011/5/18 FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>:
> On Sat, 14 May 2011 15:24:37 +0800
> Kiefer Chang <zapchang at gmail.com> wrote:
>
>> tgtd (1.0.12, no RDMA) is running in a virtual machines and we use
>> files on Lustre file system as backing stores.
>> Every target is using a file on Lustre as its LUN 1.
>> The OS is based on CentOS 5.4, kernel version is 2.6.18-164.
>>
>> There are about 15~ targets for this I/O tests.
>> (they are logged in by 1 physical node, and the disks are attached to
>> 3 virtual machines).
>> I/O test is vdbench file system I/O (performing in the VM).
>>
>> We also running tests to create/delete other targets repeatedly at the
>> same time.
>
> You see the problem during creating/deleting targets?
>
> Or you see the problem during heavy I/Os without creating/deleting
> targets?
>
>
>> After some period of time, the tgtd exits with the following message:
>> (Please see get the completed log file from
>> http://dl.dropbox.com/u/8354750/messages.kiefer.gz)
>>
>>
>> May 12 17:55:36 localhost tgtd: conn_close(101) connection closed, 0x105c6ca8 3
>> May 12 17:55:36 localhost tgtd: conn_close(107) sesson 0x10882630 1
>> May 12 17:55:39 localhost tgtd: conn_close(90) already closed 0x105c6ca8 2
>> May 12 17:56:47 localhost tgtd: abort_task_set(1149) found 40000009 0
>> May 12 17:56:47 localhost tgtd: abort_task_set(1149) found 0 0
>> May 12 17:56:47 localhost tgtd: abort_cmd(1125) found 40000045 e
>> May 12 17:57:08 localhost tgtd: conn_close(101) connection closed, 0x1074c4c8 3
>> May 12 17:57:08 localhost tgtd: conn_close(107) sesson 0x108a5d90 1
>> May 12 17:57:10 localhost tgtd: iscsi_tx_handler(2244) error 2 22
>> May 12 17:57:10 localhost tgtd: tgtd logger exits abnormally, pid:3296
>>
>>
>> The symptom is not always reproduce-able, sometimes it happens twice a
>> day, sometimes it runs well for server days.
>>
>> iscsi_tx_handler(2244) error 2 22
>>
>> is not the only exit message, we also saw:
>>
>> iscsi_tx_handler(2244) error 0 0
>>
>> By searching the mailing list we found two threads discussing
>> iscsi_tx_handler exit issue, but there are no further information in
>> those threads.
>> We suspect this is a TMF issues just as one thread says, but we still
>> can't assure.
>
> I also suspect that you hit TMF bugs.
>
>
> Can you try the following patch with the latest git version?
>
> It's not a fix but disables TMF so we could see if TMF causes the
> issue or not.
>
>
> Thanks,
>
> diff --git a/usr/bs_thread.h b/usr/bs_thread.h
> index d460032..14456a7 100644
> --- a/usr/bs_thread.h
> +++ b/usr/bs_thread.h
> @@ -29,3 +29,4 @@ extern int bs_thread_open(struct bs_thread_info *info, request_func_t *rfn,
>  extern void bs_thread_close(struct bs_thread_info *info);
>  extern int bs_thread_cmd_submit(struct scsi_cmd *cmd);
>
> +extern void bs_wait_one_completion(void);
> diff --git a/usr/iscsi/conn.c b/usr/iscsi/conn.c
> index 53e719e..f268bfc 100644
> --- a/usr/iscsi/conn.c
> +++ b/usr/iscsi/conn.c
> @@ -115,14 +115,14 @@ void conn_close(struct iscsi_connection *conn)
>                if (task->conn != conn)
>                        continue;
>
> -               eprintf("Forcing release of pending task %p %" PRIx64 "\n",
> -                       task, task->tag);
> +               eprintf("Forcing release of pending task %p %" PRIx64 " %u\n",
> +                       task, task->tag, conn->refcount);
>                list_del(&task->c_list);
>                iscsi_free_task(task);
>        }
>
>        if (conn->tx_task) {
> -               dprintf("Add current tx task to the tx list for removal "
> +               eprintf("Add current tx task to the tx list for removal "
>                        "%p %" PRIx64 "\n",
>                        conn->tx_task, conn->tx_task->tag);
>                list_add(&conn->tx_task->c_list, &conn->tx_clist);
> @@ -134,8 +134,8 @@ void conn_close(struct iscsi_connection *conn)
>
>                op = task->req.opcode & ISCSI_OPCODE_MASK;
>
> -               eprintf("Forcing release of tx task %p %" PRIx64 " %x\n",
> -                       task, task->tag, op);
> +               eprintf("Forcing release of tx task %p %" PRIx64 " %x %u\n",
> +                       task, task->tag, op, conn->refcount);
>                switch (op) {
>                case ISCSI_OP_SCSI_CMD:
>                        /*
> @@ -155,14 +155,14 @@ void conn_close(struct iscsi_connection *conn)
>                        iscsi_free_task(task);
>                        break;
>                default:
> -                       eprintf("%x\n", op);
> +                       eprintf("unknow op %x\n", op);
>                        break;
>                }
>        }
>
>        if (conn->rx_task) {
> -               eprintf("Forcing release of rx task %p %" PRIx64 "\n",
> -                       conn->rx_task, conn->rx_task->tag);
> +               eprintf("Forcing release of rx task %p %" PRIx64 " %u\n",
> +                       conn->rx_task, conn->rx_task->tag, conn->refcount);
>                iscsi_free_task(conn->rx_task);
>        }
>        conn->rx_task = NULL;
> @@ -173,10 +173,23 @@ void conn_close(struct iscsi_connection *conn)
>                 * This task is in SCSI. We need to wait for I/O
>                 * completion.
>                 */
> +               eprintf("release task %p %" PRIx64 " flag %lx, %u\n",
> +                       task, task->tag, task->flags, conn->refcount);
> +
>                if (task_in_scsi(task))
>                        continue;
>                iscsi_free_task(task);
>        }
> +
> +       eprintf("%p %u\n", conn, conn->refcount);
> +
> +       while (conn->refcount != 1) {
> +               struct timeval t;
> +               t.tv_sec = 1;
> +               t.tv_usec = 0;
> +               select(0, NULL, NULL, NULL, &t);
> +               eprintf("%p %u\n", conn, conn->refcount);
> +       }
>  done:
>        conn_put(conn);
>  }
> diff --git a/usr/iscsi/iscsi_tcp.c b/usr/iscsi/iscsi_tcp.c
> index e87bbf1..a5a225b 100644
> --- a/usr/iscsi/iscsi_tcp.c
> +++ b/usr/iscsi/iscsi_tcp.c
> @@ -164,7 +164,7 @@ static void iscsi_tcp_event_handler(int fd, int events, void *data)
>                iscsi_tx_handler(conn);
>
>        if (conn->state == STATE_CLOSE) {
> -               dprintf("connection closed %p\n", conn);
> +               eprintf("connection closed %p\n", conn);
>                conn_close(conn);
>        }
>  }
> diff --git a/usr/iscsi/iscsid.c b/usr/iscsi/iscsid.c
> index 7666381..9528880 100644
> --- a/usr/iscsi/iscsid.c
> +++ b/usr/iscsi/iscsid.c
> @@ -1207,6 +1207,8 @@ static int iscsi_scsi_cmd_done(uint64_t nid, int result, struct scsi_cmd *scmd)
>         * task got reassinged to another connection.
>         */
>        if (task->conn->state == STATE_CLOSE) {
> +               eprintf("finish a task on a closed conn, %p %u\n",
> +                       task, task->conn->refcount);
>                iscsi_free_cmd_task(task);
>                return 0;
>        }
> @@ -1396,6 +1398,8 @@ static int iscsi_tm_execute(struct iscsi_task *task)
>        struct iscsi_tm *req = (struct iscsi_tm *) &task->req;
>        int fn = 0, err = 0;
>
> +       eprintf("%x\n", req->flags & ISCSI_FLAG_TM_FUNC_MASK);
> +
>        switch (req->flags & ISCSI_FLAG_TM_FUNC_MASK) {
>        case ISCSI_TM_FUNC_ABORT_TASK:
>                fn = ABORT_TASK;
> @@ -1424,6 +1428,8 @@ static int iscsi_tm_execute(struct iscsi_task *task)
>                        req->flags & ISCSI_FLAG_TM_FUNC_MASK);
>        }
>
> +       err = ISCSI_TMF_RSP_NOT_SUPPORTED;
> +
>        if (err)
>                task->result = err;
>        else {
>
--
To unsubscribe from this list: send the line "unsubscribe stgt" in
the body of a message to majordomo at vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



More information about the stgt mailing list