This is a deep dive into the NVMe kernel driver [1] to try to debug a timeout on my machine after I upgraded to Mint 22.
SMART
Starting with the issue: dmesg showed the timeout and where the kernel stalled during boot.
[ 0.817343] nvme nvme0: pci function 10000:e1:00.0
[ 0.817353] pcieport 10000:e0:1c.4: can't derive routing for PCI INT A
[ 0.817356] nvme 10000:e1:00.0: PCI INT A: not connected
[ 0.834306] nvme nvme0: 8/0/0 default/read/poll queues
[ 0.839964] nvme0n1: p1 p2
[ 31.777624] nvme nvme0: I/O tag 641 (c281) opcode 0x1 (I/O Cmd) QID 8 timeout, aborting req_op:WRITE(1) size:4096
[ 61.985629] nvme nvme0: I/O tag 641 (c281) opcode 0x1 (I/O Cmd) QID 8 timeout, reset controller
[ 123.940522] nvme nvme0: Abort status: 0x371
My first step was to check SMART data on the drive [2], which didn't show any issues with the NVMe. So, let's move on.
=== START OF SMART DATA SECTION ===
SMART overall-health self-assessment test result: PASSED
SMART/Health Information (NVMe Log 0x02)
Critical Warning: 0x00
Temperature: 27 Celsius
Available Spare: 100%
Available Spare Threshold: 3%
Percentage Used: 2%
Data Units Read: 8,205,799 [4.20 TB]
Data Units Written: 7,511,214 [3.84 TB]
Host Read Commands: 101,356,931
Host Write Commands: 109,136,421
Controller Busy Time: 2,095
Power Cycles: 348
Power On Hours: 436
Unsafe Shutdowns: 87
Media and Data Integrity Errors: 0
Error Information Log Entries: 419
Warning Comp. Temperature Time: 0
Critical Comp. Temperature Time: 0
Temperature Sensor 1: 27 Celsius
Thermal Temp. 1 Transition Count: 1
Thermal Temp. 1 Total Time: 84
Error Information (NVMe Log 0x01, 16 of 64 entries)
No Errors Logged
Self-test Log (NVMe Log 0x06)
Self-test status: No self-test in progress
No Self-tests Logged
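The health table above is the NVMe SMART / Health Information log page (log ID 0x02), presumably gathered with smartctl. For reference, a minimal sketch of pulling the same log directly through the NVMe admin passthrough ioctl, assuming /dev/nvme0 and root, and decoding only a few of the fields shown above:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
    uint8_t log[512] = {0};                 /* SMART/Health log page is 512 bytes */
    struct nvme_admin_cmd cmd = {
        .opcode   = 0x02,                   /* Get Log Page */
        .nsid     = 0xffffffff,             /* controller-wide */
        .addr     = (uint64_t)(uintptr_t)log,
        .data_len = sizeof(log),
        /* CDW10: log ID 0x02 in the low byte, NUMDL = 512/4 - 1 dwords */
        .cdw10    = 0x02 | (((sizeof(log) / 4) - 1) << 16),
    };

    int fd = open("/dev/nvme0", O_RDONLY);
    if (fd < 0) { perror("open /dev/nvme0"); return 1; }

    if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) < 0) {
        perror("NVME_IOCTL_ADMIN_CMD");
        return 1;
    }

    /* Byte offsets from the SMART/Health log layout in the NVMe spec */
    uint16_t temp_k = log[1] | (log[2] << 8);       /* composite temperature, Kelvin */
    printf("critical warning : 0x%02x\n", log[0]);
    printf("temperature      : %d C\n", temp_k - 273);
    printf("available spare  : %u%%\n", (unsigned)log[3]);
    printf("percentage used  : %u%%\n", (unsigned)log[5]);

    close(fd);
    return 0;
}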
NVMe PCIe driver
Looking at the NVMe PCIe driver operations, pci_driver in drivers/nvme/host/pci.c:
static struct pci_driver nvme_driver = {
    .name = "nvme",
    .id_table = nvme_id_table,
    .probe = nvme_probe,
    .remove = nvme_remove,
    .shutdown = nvme_shutdown,
    .driver = {
        .probe_type = PROBE_PREFER_ASYNCHRONOUS,
#ifdef CONFIG_PM_SLEEP
        .pm = &nvme_dev_pm_ops,
#endif
    },
    .sriov_configure = pci_sriov_configure_simple,
    .err_handler = &nvme_err_handler,
};
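For context, nvme_driver is handed to the PCI core when the module loads; the usual registration pattern looks like the sketch below (not the exact nvme code, which does extra setup). Note .probe_type = PROBE_PREFER_ASYNCHRONOUS above: probing runs asynchronously from the rest of boot, which is why the nvme messages interleave with other init output.

#include <linux/module.h>
#include <linux/pci.h>

static int __init nvme_init(void)
{
    /* hand nvme_driver to the PCI core; nvme_probe runs for each matching device */
    return pci_register_driver(&nvme_driver);
}

static void __exit nvme_exit(void)
{
    pci_unregister_driver(&nvme_driver);
}

module_init(nvme_init);
module_exit(nvme_exit);

nvme_probe is the .probe callback the PCI core invokes once the controller is matched: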
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
    struct nvme_dev *dev;
    int result = -ENOMEM;

    dev = nvme_pci_alloc_dev(pdev, id);
    if (IS_ERR(dev))
        return PTR_ERR(dev);

    result = nvme_add_ctrl(&dev->ctrl);
    if (result)
        goto out_put_ctrl;

    result = nvme_dev_map(dev);
    if (result)
        goto out_uninit_ctrl;

    result = nvme_setup_prp_pools(dev);
    if (result)
        goto out_dev_unmap;

    result = nvme_pci_alloc_iod_mempool(dev);
    if (result)
        goto out_release_prp_pools;

    dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
    ...
    ...
    result = nvme_setup_host_mem(dev);
    if (result < 0)
        goto out_disable;

    result = nvme_setup_io_queues(dev);

nvme_setup_io_queues() then logs the default/read/poll queue split we saw in dmesg:

static int nvme_setup_io_queues(struct nvme_dev *dev)
{
    ...
    ...
    dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
             dev->io_queues[HCTX_TYPE_DEFAULT],
             dev->io_queues[HCTX_TYPE_READ],
             dev->io_queues[HCTX_TYPE_POLL]);
}
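That dev_info() is where the "8/0/0 default/read/poll queues" line in dmesg comes from. The resulting blk-mq hardware contexts are also visible in sysfs; a quick count, assuming the disk is nvme0n1, which should match the 8 default queues reported above:

#include <stdio.h>
#include <dirent.h>
#include <ctype.h>

int main(void)
{
    /* one subdirectory per blk-mq hardware context */
    const char *path = "/sys/block/nvme0n1/mq";
    DIR *dir = opendir(path);
    if (!dir) { perror(path); return 1; }

    int hctx = 0;
    struct dirent *de;
    while ((de = readdir(dir)) != NULL)
        if (isdigit((unsigned char)de->d_name[0]))
            hctx++;
    printf("%s: %d hardware queues\n", path, hctx);

    closedir(dir);
    return 0;
}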
NVMe controller operations
Called from nvme_pci_alloc_dev, nvme_init_ctrl is defined in drivers/nvme/host/core.c; this is where the controller gets its nvme_pci_ctrl_ops:

ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
                     quirks);
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
    .name = "pcie",
    .module = THIS_MODULE,
    .flags = NVME_F_METADATA_SUPPORTED,
    .dev_attr_groups = nvme_pci_dev_attr_groups,
    .reg_read32 = nvme_pci_reg_read32,
    .reg_write32 = nvme_pci_reg_write32,
    .reg_read64 = nvme_pci_reg_read64,
    .free_ctrl = nvme_pci_free_ctrl,
    .submit_async_event = nvme_pci_submit_async_event,
    .subsystem_reset = nvme_pci_subsystem_reset,
    .get_address = nvme_pci_get_address,
    .print_device_info = nvme_pci_print_device_info,
    .supports_pci_p2pdma = nvme_pci_supports_pci_p2pdma,
};
nvme_timeout is part of the driver's blk_mq_ops, which implement blk-mq [4]:
static const struct blk_mq_ops nvme_mq_ops = {
    .queue_rq = nvme_queue_rq,
    .queue_rqs = nvme_queue_rqs,
    .complete = nvme_pci_complete_rq,
    .commit_rqs = nvme_commit_rqs,
    .init_hctx = nvme_init_hctx,
    .init_request = nvme_pci_init_request,
    .map_queues = nvme_pci_map_queues,
    .timeout = nvme_timeout,
    .poll = nvme_poll,
};
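For each request it dispatches, blk-mq arms a timer; when the timer expires it invokes the driver's .timeout callback, which is how nvme_timeout ends up running. The timeout for the disk's request queue is exposed in sysfs and defaults to 30 seconds for NVMe, which lines up with the roughly 30 second gaps in the dmesg timestamps above. A quick way to read it, assuming nvme0n1:

#include <stdio.h>

int main(void)
{
    /* blk-mq request timeout for this request queue, in milliseconds */
    const char *path = "/sys/block/nvme0n1/queue/io_timeout";
    FILE *f = fopen(path, "r");
    if (!f) { perror(path); return 1; }

    unsigned int ms = 0;
    if (fscanf(f, "%u", &ms) == 1)
        printf("%s: %u ms\n", path, ms);

    fclose(f);
    return 0;
}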
Error Trace
Going through the errors and cross-referencing them to the source code.
The first error comes from nvme_timeout in drivers/nvme/host/pci.c:
[ 34.341698] nvme nvme0: I/O tag 705 (72c1) opcode 0x2 (I/O Cmd) QID 7 timeout, aborting req_op:READ(0) size:131072
[ 34.341727] nvme nvme0: I/O tag 706 (72c2) opcode 0x2 (I/O Cmd) QID 7 timeout, aborting req_op:READ(0) size:131072
[ 38.245587] nvme nvme0: I/O tag 640 (c280) opcode 0x1 (I/O Cmd) QID 3 timeout, aborting req_op:WRITE(1) size:65536
[ 38.245614] nvme nvme0: I/O tag 641 (2281) opcode 0x1 (I/O Cmd) QID 3 timeout, aborting req_op:WRITE(1) size:12288
dev_warn(nvmeq->dev->ctrl.device,
         "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, aborting req_op:%s(%u) size:%u\n",
         req->tag, nvme_cid(req), opcode, nvme_get_opcode_str(opcode),
         nvmeq->qid, blk_op_str(req_op(req)), req_op(req),
         blk_rq_bytes(req));

iod->aborted = true;

cmd.abort.opcode = nvme_admin_abort_cmd;
cmd.abort.cid = nvme_cid(req);
cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
The second error appears when nvme_timeout is called again later and the iod was already aborted by the first call:
[ 64.549689] nvme nvme0: I/O tag 705 (72c1) opcode 0x2 (I/O Cmd) QID 7 timeout, reset controller
if (!nvmeq->qid || iod->aborted) {
    dev_warn(dev->ctrl.device,
             "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
             req->tag, nvme_cid(req), opcode,
             nvme_opcode_str(nvmeq->qid, opcode), nvmeq->qid);
    nvme_req(req)->flags |= NVME_REQ_CANCELLED;
    goto disable;
}
Then we get these errors: the "I/O error, dev nvme0n1, ..." lines come from blk_print_req_error() in block/blk-mq.c, complaining about the failed I/O operations (the "nvme0n1: I/O Cmd ..." line above each of them is logged by the NVMe core).
[ 128.040083] nvme0n1: I/O Cmd(0x2) @ LBA 249192208, 256 blocks, I/O Error (sct 0x3 / sc 0x71)
[ 128.040096] I/O error, dev nvme0n1, sector 249192208 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[ 128.040108] nvme0n1: I/O Cmd(0x2) @ LBA 249191952, 256 blocks, I/O Error (sct 0x3 / sc 0x71)
[ 128.040112] I/O error, dev nvme0n1, sector 249191952 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
static void blk_print_req_error(struct request *req, blk_status_t status)
{
    printk_ratelimited(KERN_ERR
        "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
        "phys_seg %u prio class %u\n",
        blk_status_to_str(status),
        req->q->disk ? req->q->disk->disk_name : "?",
        blk_rq_pos(req), (__force u32)req_op(req),
        blk_op_str(req_op(req)),
        (__force u32)(req->cmd_flags & ~REQ_OP_MASK),
        req->nr_phys_segments,
        IOPRIO_PRIO_CLASS(req->ioprio));
}
The last errors come from abort_endio in drivers/nvme/host/pci.c:
[ 128.040146] nvme nvme0: Abort status: 0x371
[ 128.040151] nvme nvme0: Abort status: 0x371
[ 128.040155] nvme nvme0: Abort status: 0x371
[ 128.040158] nvme nvme0: Abort status: 0x371
static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error)
{
    struct nvme_queue *nvmeq = req->mq_hctx->driver_data;

    dev_warn(nvmeq->dev->ctrl.device,
             "Abort status: 0x%x", nvme_req(req)->status);
    atomic_inc(&nvmeq->dev->ctrl.abort_limit);
    blk_mq_free_request(req);
    return RQ_END_IO_NONE;
}
abort_endio was set as the completion callback in nvme_timeout when initializing abort_req:
abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
                                 BLK_MQ_REQ_NOWAIT);
if (IS_ERR(abort_req)) {
    atomic_inc(&dev->ctrl.abort_limit);
    return BLK_EH_RESET_TIMER;
}

nvme_init_request(abort_req, &cmd);
abort_req->end_io = abort_endio;
abort_req->end_io_data = NULL;
blk_execute_rq_nowait(abort_req, false);
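A side note on the status values: the "Abort status: 0x371" here and the "sct 0x3 / sc 0x71" on the failed reads are the same thing. In the kernel's encoding the low 8 bits are the Status Code and the next 3 bits the Status Code Type, and 0x371 is NVME_SC_HOST_ABORTED_CMD from include/linux/nvme.h, a status the host stamps on a command it gave up on itself:

#include <stdio.h>

int main(void)
{
    /* NVMe status as the kernel keeps it in nvme_req(req)->status:
     * the CQE status field shifted right by one (phase bit dropped). */
    unsigned int status = 0x371;             /* value from the dmesg output above */

    unsigned int sc  = status & 0xff;        /* Status Code */
    unsigned int sct = (status >> 8) & 0x7;  /* Status Code Type */
    unsigned int dnr = (status >> 14) & 0x1; /* Do Not Retry */

    /* 0x371 is NVME_SC_HOST_ABORTED_CMD in include/linux/nvme.h */
    printf("status 0x%03x: sct 0x%x, sc 0x%x, dnr %u\n", status, sct, sc, dnr);
    return 0;
}

In other words, these are the host cancelling its own commands once the controller stopped responding, not the SSD reporting media errors, which is consistent with the clean SMART log above.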
At this point, I think we have a double-fault kind of situation, where nvme_timeout gets called multiple times while the kernel tries to access the NVMe.
So, why is the NVMe timing out?
NVMe notes
CSTS definition from the NVMe specification [6]:
3.1.6 Offset 1Ch: CSTS – Controller Status
Bit Type Reset Description
31:05 RO 0 Reserved
04 RW1C HwInit NVM Subsystem Reset Occurred (NSSRO): The initial value of this field is '1' if the last occurrence of an NVM Subsystem Reset occurred while power was applied to the NVM subsystem. The initial value of this field is '0' following an NVM Subsystem Reset due to application of power to the NVM subsystem. This field is only valid if the controller supports the NVM Subsystem Reset feature defined in section 7.3.1 as indicated by CAP.NSSRS set to '1'. The reset value of this field is '0' if an NVM Subsystem Reset causes activation of a new firmware image.
03:02 RO 0 Shutdown Status (…)
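The controller registers can be inspected from user space by mmap'ing BAR0 through sysfs (roughly what nvme-cli's show-regs does). A minimal sketch, assuming the PCI address 10000:e1:00.0 from the dmesg above and root privileges:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
    /* BAR0 of the controller, path taken from the dmesg output above */
    const char *bar0 = "/sys/bus/pci/devices/10000:e1:00.0/resource0";
    int fd = open(bar0, O_RDONLY);
    if (fd < 0) { perror(bar0); return 1; }

    void *regs = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
    if (regs == MAP_FAILED) { perror("mmap"); return 1; }

    /* CSTS lives at offset 1Ch of the controller register space */
    uint32_t csts = *(volatile uint32_t *)((uint8_t *)regs + 0x1c);
    printf("CSTS = 0x%08x (RDY=%u CFS=%u SHST=%u NSSRO=%u)\n",
           csts, csts & 1, (csts >> 1) & 1, (csts >> 2) & 3, (csts >> 4) & 1);

    munmap(regs, 4096);
    close(fd);
    return 0;
}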
Workaround 1
Changing io_timeout to 1 second shortens the initial timeouts, but there is still the delay until the final abort.
sudo vim /etc/default/grub
add nvme_core.io_timeout=1 to the boot parameters (GRUB_CMDLINE_LINUX_DEFAULT)
sudo update-grub
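After a reboot the new value can be double-checked through sysfs; a tiny sketch, assuming the usual module parameter and disk paths:

#include <stdio.h>

static void show(const char *path, const char *unit)
{
    FILE *f = fopen(path, "r");
    unsigned int v = 0;
    if (!f) { perror(path); return; }
    if (fscanf(f, "%u", &v) == 1)
        printf("%s: %u %s\n", path, v, unit);
    fclose(f);
}

int main(void)
{
    /* nvme_core.io_timeout=1 should be reflected in both places after reboot */
    show("/sys/module/nvme_core/parameters/io_timeout", "s");
    show("/sys/block/nvme0n1/queue/io_timeout", "ms");
    return 0;
}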