[stgt] tgtd segfault during heavy I/O

Kiefer Chang zapchang at gmail.com
Tue Jul 19 08:57:45 CEST 2011


Dear FUJITA-san,

Forget to mention, signalfd doesn't work in my platform.
pthread notification is used by default.

2011/7/19 Kiefer Chang <zapchang at gmail.com>:
> Hi FUJITA-san,
>
> tgtd is blocked in the while loop after dealing with first abort_task_cmd:
>
> Jul 19 10:19:56 node tgtd: iscsi_tcp_release(398) conn=0x168765c8 fd=337 closed
> Jul 19 10:19:57 node tgtd: iscsi_tcp_event_handler(167) connection
> closed 0x1687a8b8
> Jul 19 10:19:57 node tgtd: conn_close(101) connection closed, 0x1687a8b8 1
> Jul 19 10:19:57 node tgtd: iscsi_tcp_release(398) conn=0x1687a8b8 fd=338 closed
> Jul 19 10:24:17 node tgtd: iscsi_tm_execute(1407) 1
> Jul 19 10:24:17 node tgtd: abort_task_set(1153) found f0000048 0
> Jul 19 10:24:17 node tgtd: abort_cmd(1126) found f0000048 6
> Jul 19 10:24:32 node tgtd: iscsi_tcp_event_handler(167) connection
> closed 0x166af268
> Jul 19 10:24:32 node tgtd: conn_close(101) connection closed, 0x166af268 3
> Jul 19 10:24:32 node tgtd: conn_close(107) sesson 0x166d9580 1
> Jul 19 10:24:32 node tgtd: conn_close(177) release task 0x1730aad0 0 flag 2, 3
> Jul 19 10:24:32 node tgtd: conn_close(177) release task 0x171ff570
> f0000048 flag 2, 3
> Jul 19 10:24:32 node tgtd: conn_close(184) 0x166af268 3
> Jul 19 10:24:32 node tgtd: conn_close(190) 0x166af268 3
> Jul 19 10:24:32 node tgtd: bs_thread_request_done(132) wrong wakeup
> Jul 19 10:24:33 node tgtd: conn_close(190) 0x166af268 3
> Jul 19 10:24:35 node last message repeated 4 times
> Jul 19 10:24:35 node tgtd: iscsi_scsi_cmd_done(1217) finish a task on
> a closed conn, 0x171ff570 3
> Jul 19 10:24:35 node tgtd: tgt_event_modify(213) Cannot find event 246
> Jul 19 10:24:35 node tgtd: iscsi_event_modify(448) tgt_event_modify failed
> Jul 19 10:24:35 node tgtd: conn_close(190) 0x166af268 2
> Jul 19 10:24:35 node tgtd: bs_thread_request_done(132) wrong wakeup
> Jul 19 10:24:36 node tgtd: conn_close(190) 0x166af268 2
> Jul 19 10:24:36 node tgtd: bs_thread_request_done(132) wrong wakeup
> Jul 19 10:24:36 node tgtd: conn_close(190) 0x166af268 2
> Jul 19 10:24:36 node tgtd: bs_thread_request_done(132) wrong wakeup
> Jul 19 10:24:37 node tgtd: conn_close(190) 0x166af268 2
> Jul 19 10:24:37 node tgtd: bs_thread_request_done(132) wrong wakeup
> Jul 19 10:24:37 node tgtd: conn_close(190) 0x166af268 2
> Jul 19 10:24:37 node tgtd: bs_thread_request_done(132) wrong wakeup
>
> Full log can be downloaded from:
> http://dl.dropbox.com/u/8354750/tgtd/20110719/messages.gz
>
>
> Thanks.
>
>
> 2011/7/19 FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>:
>> On Mon, 18 Jul 2011 20:26:07 +0800
>> Kiefer Chang <zapchang at gmail.com> wrote:
>>
>>> I applied the patch. The symptom still exists.
>>> I add 2 print statements in the patch. I can't find it in the log
>>> after tgtd segfault.
>>> Please download the log if you need:
>>> http://dl.dropbox.com/u/8354750/tgtd/20110718/messages.gz
>>
>> Thanks for the testing. I'll investigate this cause but meanwhile can
>> you try the following patch?
>>
>> diff --git a/usr/bs.c b/usr/bs.c
>> index d72d090..8f56aee 100644
>> --- a/usr/bs.c
>> +++ b/usr/bs.c
>> @@ -122,7 +122,7 @@ out:
>>        pthread_exit(NULL);
>>  }
>>
>> -static void bs_thread_request_done(int fd, int events, void *data)
>> +void bs_thread_request_done(int fd, int events, void *data)
>>  {
>>        struct scsi_cmd *cmd;
>>        int nr_events, ret;
>> @@ -230,6 +230,8 @@ static int bs_init_signalfd(void)
>>        sigset_t mask;
>>        int ret;
>>
>> +       return 1;
>> +
>>        pthread_mutex_init(&finished_lock, NULL);
>>
>>        sigemptyset(&mask);
>> @@ -270,6 +272,7 @@ static int bs_init_notify_thread(void)
>>                goto close_command_fd;
>>        }
>>
>> +       set_non_blocking(done_fd[0]);
>>        ret = tgt_event_add(done_fd[0], EPOLLIN, bs_thread_request_done, NULL);
>>        if (ret) {
>>                eprintf("failed to add epoll event\n");
>> diff --git a/usr/bs_thread.h b/usr/bs_thread.h
>> index d460032..14456a7 100644
>> --- a/usr/bs_thread.h
>> +++ b/usr/bs_thread.h
>> @@ -29,3 +29,4 @@ extern int bs_thread_open(struct bs_thread_info *info, request_func_t *rfn,
>>  extern void bs_thread_close(struct bs_thread_info *info);
>>  extern int bs_thread_cmd_submit(struct scsi_cmd *cmd);
>>
>> +extern void bs_wait_one_completion(void);
>> diff --git a/usr/iscsi/conn.c b/usr/iscsi/conn.c
>> index 53e719e..928a8a1 100644
>> --- a/usr/iscsi/conn.c
>> +++ b/usr/iscsi/conn.c
>> @@ -115,14 +115,14 @@ void conn_close(struct iscsi_connection *conn)
>>                if (task->conn != conn)
>>                        continue;
>>
>> -               eprintf("Forcing release of pending task %p %" PRIx64 "\n",
>> -                       task, task->tag);
>> +               eprintf("Forcing release of pending task %p %" PRIx64 " %u\n",
>> +                       task, task->tag, conn->refcount);
>>                list_del(&task->c_list);
>>                iscsi_free_task(task);
>>        }
>>
>>        if (conn->tx_task) {
>> -               dprintf("Add current tx task to the tx list for removal "
>> +               eprintf("Add current tx task to the tx list for removal "
>>                        "%p %" PRIx64 "\n",
>>                        conn->tx_task, conn->tx_task->tag);
>>                list_add(&conn->tx_task->c_list, &conn->tx_clist);
>> @@ -134,8 +134,8 @@ void conn_close(struct iscsi_connection *conn)
>>
>>                op = task->req.opcode & ISCSI_OPCODE_MASK;
>>
>> -               eprintf("Forcing release of tx task %p %" PRIx64 " %x\n",
>> -                       task, task->tag, op);
>> +               eprintf("Forcing release of tx task %p %" PRIx64 " %x %u\n",
>> +                       task, task->tag, op, conn->refcount);
>>                switch (op) {
>>                case ISCSI_OP_SCSI_CMD:
>>                        /*
>> @@ -155,14 +155,14 @@ void conn_close(struct iscsi_connection *conn)
>>                        iscsi_free_task(task);
>>                        break;
>>                default:
>> -                       eprintf("%x\n", op);
>> +                       eprintf("unknow op %x\n", op);
>>                        break;
>>                }
>>        }
>>
>>        if (conn->rx_task) {
>> -               eprintf("Forcing release of rx task %p %" PRIx64 "\n",
>> -                       conn->rx_task, conn->rx_task->tag);
>> +               eprintf("Forcing release of rx task %p %" PRIx64 " %u\n",
>> +                       conn->rx_task, conn->rx_task->tag, conn->refcount);
>>                iscsi_free_task(conn->rx_task);
>>        }
>>        conn->rx_task = NULL;
>> @@ -173,10 +173,24 @@ void conn_close(struct iscsi_connection *conn)
>>                 * This task is in SCSI. We need to wait for I/O
>>                 * completion.
>>                 */
>> +               eprintf("release task %p %" PRIx64 " flag %lx, %u\n",
>> +                       task, task->tag, task->flags, conn->refcount);
>> +
>>                if (task_in_scsi(task))
>>                        continue;
>>                iscsi_free_task(task);
>>        }
>> +
>> +       eprintf("%p %u\n", conn, conn->refcount);
>> +
>> +       while (conn->refcount != 1) {
>> +               struct timeval t;
>> +               t.tv_sec = 1;
>> +               t.tv_usec = 0;
>> +               eprintf("%p %u\n", conn, conn->refcount);
>> +               bs_thread_request_done(0, 0, NULL);
>> +               select(0, NULL, NULL, NULL, &t);
>> +       }
>>  done:
>>        conn_put(conn);
>>  }
>> diff --git a/usr/iscsi/iscsi_tcp.c b/usr/iscsi/iscsi_tcp.c
>> index 1988b2f..c61faa1 100644
>> --- a/usr/iscsi/iscsi_tcp.c
>> +++ b/usr/iscsi/iscsi_tcp.c
>> @@ -164,7 +164,7 @@ static void iscsi_tcp_event_handler(int fd, int events, void *data)
>>                iscsi_tx_handler(conn);
>>
>>        if (conn->state == STATE_CLOSE) {
>> -               dprintf("connection closed %p\n", conn);
>> +               eprintf("connection closed %p\n", conn);
>>                conn_close(conn);
>>        }
>>  }
>> diff --git a/usr/iscsi/iscsid.c b/usr/iscsi/iscsid.c
>> index 22a21cc..59f88c4 100644
>> --- a/usr/iscsi/iscsid.c
>> +++ b/usr/iscsi/iscsid.c
>> @@ -1209,6 +1209,12 @@ static int iscsi_scsi_cmd_done(uint64_t nid, int result, struct scsi_cmd *scmd)
>>        struct iscsi_task *task = ITASK(scmd);
>>        uint32_t read_len = scsi_get_in_length(scmd);
>>
>> +       if (result == TASK_ABORTED) {
>> +               list_del(&task->c_hlist);
>> +               iscsi_free_task(task);
>> +               return 0;
>> +       }
>> +
>>        /*
>>         * Since the connection is closed we just free the task.
>>         * We could delay the closing of the conn in some cases and send
>> @@ -1216,6 +1222,8 @@ static int iscsi_scsi_cmd_done(uint64_t nid, int result, struct scsi_cmd *scmd)
>>         * task got reassinged to another connection.
>>         */
>>        if (task->conn->state == STATE_CLOSE) {
>> +               eprintf("finish a task on a closed conn, %p %u\n",
>> +                       task, task->conn->refcount);
>>                iscsi_free_cmd_task(task);
>>                return 0;
>>        }
>> @@ -1405,6 +1413,8 @@ static int iscsi_tm_execute(struct iscsi_task *task)
>>        struct iscsi_tm *req = (struct iscsi_tm *) &task->req;
>>        int fn = 0, err = 0;
>>
>> +       eprintf("%x\n", req->flags & ISCSI_FLAG_TM_FUNC_MASK);
>> +
>>        switch (req->flags & ISCSI_FLAG_TM_FUNC_MASK) {
>>        case ISCSI_TM_FUNC_ABORT_TASK:
>>                fn = ABORT_TASK;
>> @@ -1412,15 +1422,11 @@ static int iscsi_tm_execute(struct iscsi_task *task)
>>        case ISCSI_TM_FUNC_ABORT_TASK_SET:
>>                fn = ABORT_TASK_SET;
>>                break;
>> -       case ISCSI_TM_FUNC_CLEAR_ACA:
>> -               fn = CLEAR_TASK_SET;
>> -               break;
>> -       case ISCSI_TM_FUNC_CLEAR_TASK_SET:
>> -               fn = CLEAR_ACA;
>> -               break;
>>        case ISCSI_TM_FUNC_LOGICAL_UNIT_RESET:
>>                fn = LOGICAL_UNIT_RESET;
>>                break;
>> +       case ISCSI_TM_FUNC_CLEAR_ACA:
>> +       case ISCSI_TM_FUNC_CLEAR_TASK_SET:
>>        case ISCSI_TM_FUNC_TARGET_WARM_RESET:
>>        case ISCSI_TM_FUNC_TARGET_COLD_RESET:
>>        case ISCSI_TM_FUNC_TASK_REASSIGN:
>> diff --git a/usr/target.c b/usr/target.c
>> index 5f04cf9..3e3d28b 100644
>> --- a/usr/target.c
>> +++ b/usr/target.c
>> @@ -1127,7 +1127,10 @@ static int abort_cmd(struct target* target, struct mgmt_req *mreq,
>>                cmd->mreq = mreq;
>>                err = -EBUSY;
>>        } else {
>> -               cmd->dev->cmd_done(target, cmd);
>> +               eprintf("found %" PRIx64 " %lx\n", cmd->tag, cmd->state);
>> +               cmd_hlist_remove(cmd);
>> +               list_del(&cmd->qlist);
>> +
>>                target_cmd_io_done(cmd, TASK_ABORTED);
>>        }
>>        return err;
>>
>
--
To unsubscribe from this list: send the line "unsubscribe stgt" in
the body of a message to majordomo at vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



More information about the stgt mailing list