This is a deep dive into the kernel's NVMe driver to try to debug a timeout on my machine after I upgraded to Mint 22.

SMART

Starting with the issue: dmesg shows the timeouts and where the kernel stalled during boot.

[    0.817343] nvme nvme0: pci function 10000:e1:00.0
[    0.817353] pcieport 10000:e0:1c.4: can't derive routing for PCI INT A
[    0.817356] nvme 10000:e1:00.0: PCI INT A: not connected
[    0.834306] nvme nvme0: 8/0/0 default/read/poll queues
[    0.839964]  nvme0n1: p1 p2

[   31.777624] nvme nvme0: I/O tag 641 (c281) opcode 0x1 (I/O Cmd) QID 8 timeout, aborting req_op:WRITE(1) size:4096
[   61.985629] nvme nvme0: I/O tag 641 (c281) opcode 0x1 (I/O Cmd) QID 8 timeout, reset controller
[  123.940522] nvme nvme0: Abort status: 0x371

My first step was to pull the SMART data off the drive with smartctl, which didn't show any issues with the NVMe itself. So, let's move on.

=== START OF SMART DATA SECTION ===
SMART overall-health self-assessment test result: PASSED

SMART/Health Information (NVMe Log 0x02)
Critical Warning:                   0x00
Temperature:                        27 Celsius
Available Spare:                    100%
Available Spare Threshold:          3%
Percentage Used:                    2%
Data Units Read:                    8,205,799 [4.20 TB]
Data Units Written:                 7,511,214 [3.84 TB]
Host Read Commands:                 101,356,931
Host Write Commands:                109,136,421
Controller Busy Time:               2,095
Power Cycles:                       348
Power On Hours:                     436
Unsafe Shutdowns:                   87
Media and Data Integrity Errors:    0
Error Information Log Entries:      419
Warning  Comp. Temperature Time:    0
Critical Comp. Temperature Time:    0
Temperature Sensor 1:               27 Celsius
Thermal Temp. 1 Transition Count:   1
Thermal Temp. 1 Total Time:         84

Error Information (NVMe Log 0x01, 16 of 64 entries)
No Errors Logged

Self-test Log (NVMe Log 0x06)
Self-test status: No self-test in progress
No Self-tests Logged
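For reference, smartctl reads this health log with an NVMe admin Get Log Page command (opcode 0x02, log id 0x02). A minimal user-space sketch using the kernel's admin passthrough ioctl; field offsets are from the NVMe spec, error handling is trimmed, a little-endian host is assumed, and it needs root:

#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	uint8_t log[512];
	int fd = open("/dev/nvme0", O_RDONLY);
	struct nvme_admin_cmd cmd = {
		.opcode   = 0x02,                 /* Get Log Page */
		.nsid     = 0xffffffff,           /* controller-wide */
		.addr     = (uint64_t)(uintptr_t)log,
		.data_len = sizeof(log),
		/* cdw10: NUMD-1 (dword count) in bits 31:16, log id 0x02 (SMART) in 7:0 */
		.cdw10    = ((sizeof(log) / 4 - 1) << 16) | 0x02,
	};

	if (fd < 0 || ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) != 0)
		return 1;

	uint16_t ktemp;
	memcpy(&ktemp, &log[1], 2);               /* composite temperature, Kelvin */
	printf("critical warning: 0x%02x\n", log[0]);
	printf("temperature:      %d C\n", ktemp - 273);
	printf("percentage used:  %u%%\n", log[5]);
	return 0;
}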

NVMe PCIe driver

Looking at the NVMe PCIe driver's pci_driver operations in drivers/nvme/host/pci.c:

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= nvme_remove,
	.shutdown	= nvme_shutdown,
	.driver		= {
		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
#ifdef CONFIG_PM_SLEEP
		.pm		= &nvme_dev_pm_ops,
#endif
	},
	.sriov_configure = pci_sriov_configure_simple,
	.err_handler	= &nvme_err_handler,
};
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct nvme_dev *dev;
	int result = -ENOMEM;

	dev = nvme_pci_alloc_dev(pdev, id);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	result = nvme_add_ctrl(&dev->ctrl);
	if (result)
		goto out_put_ctrl;

	result = nvme_dev_map(dev);
	if (result)
		goto out_uninit_ctrl;

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto out_dev_unmap;

	result = nvme_pci_alloc_iod_mempool(dev);
	if (result)
		goto out_release_prp_pools;

	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));

    ...
    ...

	result = nvme_setup_host_mem(dev);
	if (result < 0)
		goto out_disable;

	result = nvme_setup_io_queues(dev);

nvme_setup_io_queues is what logs the default/read/poll queue counts we saw in dmesg:

static int nvme_setup_io_queues(struct nvme_dev *dev)
{
    ...
    ...
	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
					dev->io_queues[HCTX_TYPE_DEFAULT],
					dev->io_queues[HCTX_TYPE_READ],
					dev->io_queues[HCTX_TYPE_POLL]);
}

NVMe controller operations

Called from nvme_pci_alloc_dev, nvme_init_ctrl is defined in drivers/nvme/host/core.c and receives the PCI-specific controller ops:

	ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
			     quirks);
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
	.name			= "pcie",
	.module			= THIS_MODULE,
	.flags			= NVME_F_METADATA_SUPPORTED,
	.dev_attr_groups	= nvme_pci_dev_attr_groups,
	.reg_read32		= nvme_pci_reg_read32,
	.reg_write32		= nvme_pci_reg_write32,
	.reg_read64		= nvme_pci_reg_read64,
	.free_ctrl		= nvme_pci_free_ctrl,
	.submit_async_event	= nvme_pci_submit_async_event,
	.subsystem_reset	= nvme_pci_subsystem_reset,
	.get_address		= nvme_pci_get_address,
	.print_device_info	= nvme_pci_print_device_info,
	.supports_pci_p2pdma	= nvme_pci_supports_pci_p2pdma,
};
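These register accessors are thin wrappers over the memory-mapped PCI BAR; reg_read32, roughly as it appears in drivers/nvme/host/pci.c, is just a readl of the mapped register (this matters later when the reset path polls CSTS):

static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
	/* dev->bar is the ioremap'd controller register window */
	*val = readl(to_nvme_dev(ctrl)->bar + off);
	return 0;
}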

nvme_timeout is part of blk_mq_ops, the driver's implementation of the blk-mq (multi-queue block layer) interface:

static const struct blk_mq_ops nvme_mq_ops = {
	.queue_rq	= nvme_queue_rq,
	.queue_rqs	= nvme_queue_rqs,
	.complete	= nvme_pci_complete_rq,
	.commit_rqs	= nvme_commit_rqs,
	.init_hctx	= nvme_init_hctx,
	.init_request	= nvme_pci_init_request,
	.map_queues	= nvme_pci_map_queues,
	.timeout	= nvme_timeout,
	.poll		= nvme_poll,
};
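This .timeout hook ties everything below together: blk-mq arms a timer for each dispatched request and calls the handler when it expires; the handler returns BLK_EH_RESET_TIMER to re-arm the timer and keep waiting, or BLK_EH_DONE once the driver owns completing the request. A toy handler mirroring the two-stage strategy nvme_timeout uses (hypothetical names, not kernel code; the enum values are the real ones from include/linux/blk-mq.h):

#include <linux/blk-mq.h>

struct toy_ctx {
	bool aborted;				/* per-request driver data */
};

static void toy_send_abort(struct request *req);	/* hypothetical helper */
static void toy_reset_controller(struct request *req);	/* hypothetical helper */

static enum blk_eh_timer_return toy_timeout(struct request *req)
{
	struct toy_ctx *ctx = blk_mq_rq_to_pdu(req);

	if (!ctx->aborted) {
		/* first expiry: try a targeted abort and re-arm the timer */
		ctx->aborted = true;
		toy_send_abort(req);
		return BLK_EH_RESET_TIMER;
	}
	/* second expiry: the abort went nowhere, escalate to a reset */
	toy_reset_controller(req);
	return BLK_EH_DONE;
}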

Error Trace

Now let's go through the errors and cross-reference them with the source code.

The first error comes from nvme_timeout in drivers/nvme/host/pci.c.

[   34.341698] nvme nvme0: I/O tag 705 (72c1) opcode 0x2 (I/O Cmd) QID 7 timeout, aborting req_op:READ(0) size:131072
[   34.341727] nvme nvme0: I/O tag 706 (72c2) opcode 0x2 (I/O Cmd) QID 7 timeout, aborting req_op:READ(0) size:131072
[   38.245587] nvme nvme0: I/O tag 640 (c280) opcode 0x1 (I/O Cmd) QID 3 timeout, aborting req_op:WRITE(1) size:65536
[   38.245614] nvme nvme0: I/O tag 641 (2281) opcode 0x1 (I/O Cmd) QID 3 timeout, aborting req_op:WRITE(1) size:12288
	dev_warn(nvmeq->dev->ctrl.device,
		 "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, aborting req_op:%s(%u) size:%u\n",
		 req->tag, nvme_cid(req), opcode, nvme_get_opcode_str(opcode),
		 nvmeq->qid, blk_op_str(req_op(req)), req_op(req),
		 blk_rq_bytes(req));
	iod->aborted = true;

	cmd.abort.opcode = nvme_admin_abort_cmd;
	cmd.abort.cid = nvme_cid(req);
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
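As a sanity check, the tag and cid in these messages are consistent with each other: the driver builds the command id as the blk-mq tag in the low 12 bits plus a generation counter in the upper bits, to catch stale or corrupted completions. A hypothetical user-space decoder (not kernel code; the 12-bit split is an assumption, but it matches every log line above):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint16_t cid = 0x72c1;			/* from the log above */

	/* low 12 bits: blk-mq tag; upper bits: generation counter */
	printf("tag=%u genctr=%u\n", cid & 0xfff, cid >> 12);
	return 0;				/* prints: tag=705 genctr=7 */
}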

The second error comes when nvme_timeout is called again for the same request: iod->aborted was already set on the first pass (the same branch is taken for admin-queue timeouts, where qid is 0), so the driver escalates to a controller reset.

[   64.549689] nvme nvme0: I/O tag 705 (72c1) opcode 0x2 (I/O Cmd) QID 7 timeout, reset controller
	if (!nvmeq->qid || iod->aborted) {
		dev_warn(dev->ctrl.device,
			 "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
			 req->tag, nvme_cid(req), opcode,
			 nvme_opcode_str(nvmeq->qid, opcode), nvmeq->qid);
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		goto disable;
	}

Then we get these errors from blk_print_req_error in block/blk-mq.c, which complains about the failed I/O operations:

[  128.040083] nvme0n1: I/O Cmd(0x2) @ LBA 249192208, 256 blocks, I/O Error (sct 0x3 / sc 0x71) 
[  128.040096] I/O error, dev nvme0n1, sector 249192208 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0
[  128.040108] nvme0n1: I/O Cmd(0x2) @ LBA 249191952, 256 blocks, I/O Error (sct 0x3 / sc 0x71) 
[  128.040112] I/O error, dev nvme0n1, sector 249191952 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 0

static void blk_print_req_error(struct request *req, blk_status_t status)
{
	printk_ratelimited(KERN_ERR
		"%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
		"phys_seg %u prio class %u\n",
		blk_status_to_str(status),
		req->q->disk ? req->q->disk->disk_name : "?",
		blk_rq_pos(req), (__force u32)req_op(req),
		blk_op_str(req_op(req)),
		(__force u32)(req->cmd_flags & ~REQ_OP_MASK),
		req->nr_phys_segments,
		IOPRIO_PRIO_CLASS(req->ioprio));
}

The last errors come from abort_endio in drivers/nvme/host/pci.c:

[  128.040146] nvme nvme0: Abort status: 0x371
[  128.040151] nvme nvme0: Abort status: 0x371
[  128.040155] nvme nvme0: Abort status: 0x371
[  128.040158] nvme nvme0: Abort status: 0x371
static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error)
{
	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;

	dev_warn(nvmeq->dev->ctrl.device,
		 "Abort status: 0x%x", nvme_req(req)->status);
	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
	blk_mq_free_request(req);
	return RQ_END_IO_NONE;
}
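Decoding 0x371 explains the picture: it's status code type 0x3 (path-related) with status code 0x71, the kernel's synthetic "host aborted command" status, and the same sct 0x3 / sc 0x71 was attached to the failed reads above. nvme_cancel_request assigns it when the reset path cancels everything still in flight, so even the abort commands never got a real completion from the controller. From include/linux/nvme.h (abridged):

	/*
	 * Path-related status codes (status code type 0x3); the host uses
	 * these for commands it completed itself.
	 */
	NVME_SC_HOST_PATH_ERROR		= 0x370,
	NVME_SC_HOST_ABORTED_CMD	= 0x371,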

abort_endio is set as the completion callback in nvme_timeout when abort_req is initialized:

	abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
					 BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(abort_req)) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}
	nvme_init_request(abort_req, &cmd);

	abort_req->end_io = abort_endio;
	abort_req->end_io_data = NULL;
	blk_execute_rq_nowait(abort_req, false);
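Slightly earlier in nvme_timeout, the driver also rations these aborts against the controller's advertised Abort Command Limit; when no slots are left it just re-arms the timer, which is another way the same request can time out repeatedly. From the same function (abridged, from memory of the 6.x source):

	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}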

At this point, I think we have a double-fault kind of situation: nvme_timeout keeps firing as the kernel tries to access the NVMe, the aborts it sends never complete either, and everything in flight is finally cancelled when the controller is reset.

So, why is the NVMe timing out?

NVMe notes

The CSTS definition from the NVMe specification:

3.1.6 Offset 1Ch: CSTS – Controller Status

Bits   Type  Reset   Description
31:05  RO    0       Reserved
04     RW1C  HwInit  NVM Subsystem Reset Occurred (NSSRO): The initial value of
                     this field is '1' if the last occurrence of an NVM Subsystem
                     Reset occurred while power was applied to the NVM subsystem.
                     The initial value of this field is '0' following an NVM
                     Subsystem Reset due to application of power to the NVM
                     subsystem. This field is only valid if the controller
                     supports the NVM Subsystem Reset feature defined in section
                     7.3.1 as indicated by CAP.NSSRS set to '1'. The reset value
                     of this field is '0' if an NVM Subsystem Reset causes
                     activation of a new firmware image.
03:02  RO    0       Shutdown Status (SHST): indicates the status of shutdown
                     processing initiated by the host (00b: normal operation,
                     01b: shutdown processing occurring, 10b: shutdown
                     processing complete).
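CSTS matters here because the reset path polls it: nvme_disable_ctrl and nvme_enable_ctrl wait for the controller to acknowledge state changes through this register, using the reg_read32 op shown earlier. The constants live in include/linux/nvme.h, and the wait loop, simplified from nvme_wait_ready in drivers/nvme/host/core.c, looks roughly like this:

enum {
	NVME_REG_CSTS	= 0x001c,	/* Controller Status */
};

enum {
	NVME_CSTS_RDY	= 1 << 0,
	NVME_CSTS_CFS	= 1 << 1,	/* Controller Fatal Status */
	NVME_CSTS_NSSRO	= 1 << 4,	/* NVM Subsystem Reset Occurred */
};

	/* simplified: poll CSTS until (csts & mask) == val or CAP.TO expires */
	while (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts) == 0) {
		if ((csts & mask) == val)
			break;
		if (csts == ~0)		/* device gone / bus error */
			return -ENODEV;
		msleep(100);
		if (time_after(jiffies, timeout))
			return -ETIMEDOUT;
	}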

Workaround 1

nvme_core.io_timeout is in seconds (the default is 30, which matches the ~30 s spacing of the timeout messages above). Changing it to 1 reduces the first round of timeouts, but there is still the long delay before the final abort.

sudo vim /etc/default/grub
# append nvme_core.io_timeout=1 to the boot parameters, e.g.:
#   GRUB_CMDLINE_LINUX_DEFAULT="quiet splash nvme_core.io_timeout=1"
sudo update-grub
# after a reboot, verify with:
cat /sys/module/nvme_core/parameters/io_timeout