[stgt] tgtd segfault during heavy I/O

Kiefer Chang zapchang at gmail.com
Wed Jul 20 13:08:59 CEST 2011


Hi fujita-san,

Here is the result, unfortunately neither the core dump or directly
attaching gdb to tgtd can't show right code trace.
system log:
http://dl.dropbox.com/u/8354750/tgtd/20110720/messages.gz

backtrace (tgtd built with make DEBUG=1):
Core was generated by `./tgtd'.
Program terminated with signal 11, Segmentation fault.
#0  0x0000000000000000 in ?? ()
(gdb) bt
#0  0x0000000000000000 in ?? ()
#1  0x00000000004177d1 in event_loop () at tgtd.c:400
#2  0x0000000000417e82 in main (argc=1, argv=0x7fff192e2bc8) at tgtd.c:574


By the way, if I use O_DIRECT flag on all lun's backing store, I found
tgtd won't segfault during the same I/O test (1 day).
I can't see any abort task command in the log.
I can't understand why...Shouldn't lower performance cause command
timeout much easily?



2011/7/20 FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>:
> On Tue, 19 Jul 2011 14:57:45 +0800
> Kiefer Chang <zapchang at gmail.com> wrote:
>
>> Forget to mention, signalfd doesn't work in my platform.
>> pthread notification is used by default.
>
> It should be fine. However, after reading your log, I found another
> bug.
>
> Can you try the following patch (against the latest git)?
>
> Thanks a lot for the testing.
>
> diff --git a/usr/bs.c b/usr/bs.c
> index d72d090..8f56aee 100644
> --- a/usr/bs.c
> +++ b/usr/bs.c
> @@ -122,7 +122,7 @@ out:
>        pthread_exit(NULL);
>  }
>
> -static void bs_thread_request_done(int fd, int events, void *data)
> +void bs_thread_request_done(int fd, int events, void *data)
>  {
>        struct scsi_cmd *cmd;
>        int nr_events, ret;
> @@ -230,6 +230,8 @@ static int bs_init_signalfd(void)
>        sigset_t mask;
>        int ret;
>
> +       return 1;
> +
>        pthread_mutex_init(&finished_lock, NULL);
>
>        sigemptyset(&mask);
> @@ -270,6 +272,7 @@ static int bs_init_notify_thread(void)
>                goto close_command_fd;
>        }
>
> +       set_non_blocking(done_fd[0]);
>        ret = tgt_event_add(done_fd[0], EPOLLIN, bs_thread_request_done, NULL);
>        if (ret) {
>                eprintf("failed to add epoll event\n");
> diff --git a/usr/bs_thread.h b/usr/bs_thread.h
> index d460032..14456a7 100644
> --- a/usr/bs_thread.h
> +++ b/usr/bs_thread.h
> @@ -29,3 +29,4 @@ extern int bs_thread_open(struct bs_thread_info *info, request_func_t *rfn,
>  extern void bs_thread_close(struct bs_thread_info *info);
>  extern int bs_thread_cmd_submit(struct scsi_cmd *cmd);
>
> +extern void bs_wait_one_completion(void);
> diff --git a/usr/iscsi/conn.c b/usr/iscsi/conn.c
> index 53e719e..3836655 100644
> --- a/usr/iscsi/conn.c
> +++ b/usr/iscsi/conn.c
> @@ -115,14 +115,14 @@ void conn_close(struct iscsi_connection *conn)
>                if (task->conn != conn)
>                        continue;
>
> -               eprintf("Forcing release of pending task %p %" PRIx64 "\n",
> -                       task, task->tag);
> +               eprintf("Forcing release of pending task %p %" PRIx64 " %u\n",
> +                       task, task->tag, conn->refcount);
>                list_del(&task->c_list);
>                iscsi_free_task(task);
>        }
>
>        if (conn->tx_task) {
> -               dprintf("Add current tx task to the tx list for removal "
> +               eprintf("Add current tx task to the tx list for removal "
>                        "%p %" PRIx64 "\n",
>                        conn->tx_task, conn->tx_task->tag);
>                list_add(&conn->tx_task->c_list, &conn->tx_clist);
> @@ -134,8 +134,8 @@ void conn_close(struct iscsi_connection *conn)
>
>                op = task->req.opcode & ISCSI_OPCODE_MASK;
>
> -               eprintf("Forcing release of tx task %p %" PRIx64 " %x\n",
> -                       task, task->tag, op);
> +               eprintf("Forcing release of tx task %p %" PRIx64 " %x %u\n",
> +                       task, task->tag, op, conn->refcount);
>                switch (op) {
>                case ISCSI_OP_SCSI_CMD:
>                        /*
> @@ -155,14 +155,14 @@ void conn_close(struct iscsi_connection *conn)
>                        iscsi_free_task(task);
>                        break;
>                default:
> -                       eprintf("%x\n", op);
> +                       eprintf("unknow op %x\n", op);
>                        break;
>                }
>        }
>
>        if (conn->rx_task) {
> -               eprintf("Forcing release of rx task %p %" PRIx64 "\n",
> -                       conn->rx_task, conn->rx_task->tag);
> +               eprintf("Forcing release of rx task %p %" PRIx64 " %u\n",
> +                       conn->rx_task, conn->rx_task->tag, conn->refcount);
>                iscsi_free_task(conn->rx_task);
>        }
>        conn->rx_task = NULL;
> @@ -173,10 +173,25 @@ void conn_close(struct iscsi_connection *conn)
>                 * This task is in SCSI. We need to wait for I/O
>                 * completion.
>                 */
> +               eprintf("release task %p %x %" PRIx64 " flag %lx, %u\n",
> +                       task, task->req.opcode & ISCSI_OPCODE_MASK,
> +                       task->tag, task->flags, conn->refcount);
> +
>                if (task_in_scsi(task))
>                        continue;
>                iscsi_free_task(task);
>        }
> +
> +       eprintf("%p %u\n", conn, conn->refcount);
> +
> +       while (conn->refcount != 1) {
> +               struct timeval t;
> +               t.tv_sec = 1;
> +               t.tv_usec = 0;
> +               eprintf("%p %u\n", conn, conn->refcount);
> +               bs_thread_request_done(0, 0, NULL);
> +               select(0, NULL, NULL, NULL, &t);
> +       }
>  done:
>        conn_put(conn);
>  }
> diff --git a/usr/iscsi/iscsi_tcp.c b/usr/iscsi/iscsi_tcp.c
> index 1988b2f..c61faa1 100644
> --- a/usr/iscsi/iscsi_tcp.c
> +++ b/usr/iscsi/iscsi_tcp.c
> @@ -164,7 +164,7 @@ static void iscsi_tcp_event_handler(int fd, int events, void *data)
>                iscsi_tx_handler(conn);
>
>        if (conn->state == STATE_CLOSE) {
> -               dprintf("connection closed %p\n", conn);
> +               eprintf("connection closed %p\n", conn);
>                conn_close(conn);
>        }
>  }
> diff --git a/usr/iscsi/iscsid.c b/usr/iscsi/iscsid.c
> index 22a21cc..4cde178 100644
> --- a/usr/iscsi/iscsid.c
> +++ b/usr/iscsi/iscsid.c
> @@ -1209,6 +1209,12 @@ static int iscsi_scsi_cmd_done(uint64_t nid, int result, struct scsi_cmd *scmd)
>        struct iscsi_task *task = ITASK(scmd);
>        uint32_t read_len = scsi_get_in_length(scmd);
>
> +       if (result == TASK_ABORTED) {
> +               list_del(&task->c_hlist);
> +               iscsi_free_task(task);
> +               return 0;
> +       }
> +
>        /*
>         * Since the connection is closed we just free the task.
>         * We could delay the closing of the conn in some cases and send
> @@ -1216,6 +1222,8 @@ static int iscsi_scsi_cmd_done(uint64_t nid, int result, struct scsi_cmd *scmd)
>         * task got reassinged to another connection.
>         */
>        if (task->conn->state == STATE_CLOSE) {
> +               eprintf("finish a task on a closed conn, %p %u\n",
> +                       task, task->conn->refcount);
>                iscsi_free_cmd_task(task);
>                return 0;
>        }
> @@ -1376,6 +1384,13 @@ static int iscsi_tm_done(struct mgmt_req *mreq)
>
>        task = (struct iscsi_task *) (unsigned long) mreq->mid;
>
> +       if (task->conn->state == STATE_CLOSE) {
> +               eprintf("finish a tm task on a closed conn, %p %u\n",
> +                       task, task->conn->refcount);
> +               iscsi_free_task(task);
> +               return 0;
> +       }
> +
>        switch (mreq->result) {
>        case 0:
>                task->result = ISCSI_TMF_RSP_COMPLETE;
> @@ -1405,6 +1420,8 @@ static int iscsi_tm_execute(struct iscsi_task *task)
>        struct iscsi_tm *req = (struct iscsi_tm *) &task->req;
>        int fn = 0, err = 0;
>
> +       eprintf("%x\n", req->flags & ISCSI_FLAG_TM_FUNC_MASK);
> +
>        switch (req->flags & ISCSI_FLAG_TM_FUNC_MASK) {
>        case ISCSI_TM_FUNC_ABORT_TASK:
>                fn = ABORT_TASK;
> @@ -1412,15 +1429,11 @@ static int iscsi_tm_execute(struct iscsi_task *task)
>        case ISCSI_TM_FUNC_ABORT_TASK_SET:
>                fn = ABORT_TASK_SET;
>                break;
> -       case ISCSI_TM_FUNC_CLEAR_ACA:
> -               fn = CLEAR_TASK_SET;
> -               break;
> -       case ISCSI_TM_FUNC_CLEAR_TASK_SET:
> -               fn = CLEAR_ACA;
> -               break;
>        case ISCSI_TM_FUNC_LOGICAL_UNIT_RESET:
>                fn = LOGICAL_UNIT_RESET;
>                break;
> +       case ISCSI_TM_FUNC_CLEAR_ACA:
> +       case ISCSI_TM_FUNC_CLEAR_TASK_SET:
>        case ISCSI_TM_FUNC_TARGET_WARM_RESET:
>        case ISCSI_TM_FUNC_TARGET_COLD_RESET:
>        case ISCSI_TM_FUNC_TASK_REASSIGN:
> diff --git a/usr/target.c b/usr/target.c
> index 5f04cf9..5afce53 100644
> --- a/usr/target.c
> +++ b/usr/target.c
> @@ -1101,6 +1101,8 @@ void target_cmd_done(struct scsi_cmd *cmd)
>        struct mgmt_req *mreq;
>
>        mreq = cmd->mreq;
> +       if (mreq)
> +               eprintf("%p %d\n", mreq, mreq->busy);
>        if (mreq && !--mreq->busy) {
>                mreq->result = mreq->function == ABORT_TASK ? -EEXIST : 0;
>                mreq->itn_id = cmd->cmd_itn_id;
> @@ -1127,7 +1129,10 @@ static int abort_cmd(struct target* target, struct mgmt_req *mreq,
>                cmd->mreq = mreq;
>                err = -EBUSY;
>        } else {
> -               cmd->dev->cmd_done(target, cmd);
> +               eprintf("found %" PRIx64 " %lx\n", cmd->tag, cmd->state);
> +               cmd_hlist_remove(cmd);
> +               list_del(&cmd->qlist);
> +
>                target_cmd_io_done(cmd, TASK_ABORTED);
>        }
>        return err;
> @@ -1140,7 +1145,7 @@ static int abort_task_set(struct mgmt_req *mreq, struct target* target,
>        struct it_nexus *itn;
>        int i, err, count = 0;
>
> -       eprintf("found %" PRIx64 " %d\n", tag, all);
> +       eprintf("aborting %" PRIx64 " %d, %p\n", tag, all, mreq);
>
>        list_for_each_entry(itn, &target->it_nexus_list, nexus_siblings) {
>                for (i = 0; i < ARRAY_SIZE(itn->cmd_hash_list); i++) {
>
>
--
To unsubscribe from this list: send the line "unsubscribe stgt" in
the body of a message to majordomo at vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



More information about the stgt mailing list