This post is about how QEMU emulates PCI devices for the guest kernel. Honestly, I took the scenic road here: reading the code, setting breakpoints in GDB at interesting places, and working my way through the stack traces. Disclaimer: this is a very short post on a complicated topic and a huge code base (QEMU), but it is a good starting point for digging deeper into the QEMU code (more specifically, PCI emulation in QEMU).

Compiling QEMU for GDB

sudo apt install libpixman-1-dev libglib2.0-dev

mkdir build
cd build

../configure --target-list=x86_64-softmmu --enable-debug

Chipset Emulation Initialization

QEMU supports several chipsets. The oldest one is the i440FX, whose host bridge is emulated in hw/pci-host/i440fx.c. The entry point for emulating that chipset is pc_init1, which (as the trace below shows) is defined in hw/i386/pc_piix.c. This is a stack trace starting from main all the way down (or up :)) to pc_init1.

#0  pc_init1 (machine=0x555556c88400, host_type=0x5555561acef9 "i440FX-pcihost", pci_type=0x5555561acef2 "i440FX") at ../hw/i386/pc_piix.c:93
#1  0x0000555555bd9563 in pc_init_v8_0 (machine=0x555556c88400) at ../hw/i386/pc_piix.c:464
#2  0x00005555558f0a19 in machine_run_board_init (machine=0x555556c88400, mem_path=0x0, errp=0x5555569bc2e0 <error_fatal>) at ../hw/core/machine.c:1408
#3  0x0000555555b0df44 in qemu_init_board () at ../softmmu/vl.c:2513
#4  0x0000555555b0e1e5 in qmp_x_exit_preconfig (errp=0x5555569bc2e0 <error_fatal>) at ../softmmu/vl.c:2609
#5  0x0000555555b10a97 in qemu_init (argc=10, argv=0x7fffffffdea8) at ../softmmu/vl.c:3612
#6  0x000055555585c6cb in main (argc=10, argv=0x7fffffffdea8) at ../softmmu/main.c:47

In the next few snippets, I will walk through the stack trace above, starting with qmp_x_exit_preconfig.

void qmp_x_exit_preconfig(Error **errp)
{
    if (phase_check(PHASE_MACHINE_INITIALIZED)) {
        error_setg(errp, "The command is permitted only before machine initialization");
        return;
    }

    qemu_init_board();
    qemu_create_cli_devices();
    qemu_machine_creation_done();

Which calls qemu_init_board

static void qemu_init_board(void)
{

    /* From here on we enter MACHINE_PHASE_INITIALIZED.  */
    machine_run_board_init(current_machine, mem_path, &error_fatal);
    ....
}

machine_run_board_init does a few things, but the most important part here is machine_class->init(machine);

    accel_init_interfaces(ACCEL_GET_CLASS(machine->accelerator));
    machine_class->init(machine);
    phase_advance(PHASE_MACHINE_INITIAL

Which eventually calls pc_init1. The interesting part of pc_init1 is the call to i440fx_init:

        pci_bus = i440fx_init(pci_type,
                              i440fx_host,
                              system_memory, system_io, machine->ram_size,
                              x86ms->below_4g_mem_size,
                              x86ms->above_4g_mem_size,
                              pci_memory, ram_memory);

i440fx_init creates the root host bridge and populates the devices.

PCIBus *i440fx_init(const char *pci_type,
                    DeviceState *dev,
                    MemoryRegion *address_space_mem,
                    MemoryRegion *address_space_io,
                    ram_addr_t ram_size,
                    ram_addr_t below_4g_mem_size,
                    ram_addr_t above_4g_mem_size,
                    MemoryRegion *pci_address_space,
                    MemoryRegion *ram_memory)
{
    PCIBus *b;
    PCIDevice *d;
    PCIHostState *s;
    PCII440FXState *f;
    unsigned i;
    I440FXState *i440fx;

    s = PCI_HOST_BRIDGE(dev);
    b = pci_root_bus_new(dev, NULL, pci_address_space,
                         address_space_io, 0, TYPE_PCI_BUS);
    s->bus = b;
    object_property_add_child(qdev_get_machine(), "i440fx", OBJECT(dev));
    sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);

    d = pci_create_simple(b, 0, pci_type);
    f = I440FX_PCI_DEVICE(d);

pci_root_bus_new eventually calls pci_root_bus_internal_init, which initializes the root bus and registers its host bridge.

/*
 * Fill in the root-bus fields of @bus and hand @parent to
 * pci_host_bus_register().
 *
 * @bus:               bus structure being initialized
 * @parent:            device passed on to pci_host_bus_register()
 * @address_space_mem: region stored as the bus's memory address space
 * @address_space_io:  region stored as the bus's I/O address space
 * @devfn_min:         stored in bus->devfn_min; its function part
 *                     (PCI_FUNC) must be 0
 */
static void pci_root_bus_internal_init(PCIBus *bus, DeviceState *parent,
                                       MemoryRegion *address_space_mem,
                                       MemoryRegion *address_space_io,
                                       uint8_t devfn_min)
{
    /* Only a devfn whose function number is 0 is accepted. */
    assert(PCI_FUNC(devfn_min) == 0);
    bus->devfn_min = devfn_min;
    /* Start with an empty reserved-slot mask. */
    bus->slot_reserved_mask = 0x0;
    bus->address_space_mem = address_space_mem;
    bus->address_space_io = address_space_io;
    /* Mark this bus as a root bus. */
    bus->flags |= PCI_BUS_IS_ROOT;

    /* host bridge */
    QLIST_INIT(&bus->child);

    pci_host_bus_register(parent);
}

Side note: i440fx_init calls pci_create_simple to create devices, which goes through the QEMU PCI subsystem and eventually calls back into i440fx_realize for these devices.

#0  i440fx_realize (dev=0x555556e6b620, errp=0x7fffffffd5c0) at ../hw/pci-host/i440fx.c:233
#1  0x0000555555a072ea in pci_qdev_realize (qdev=0x555556e6b620, errp=0x7fffffffd640) at ../hw/pci/pci.c:2098
#2  0x0000555555e16107 in device_set_realized (obj=0x555556e6b620, value=true, errp=0x7fffffffd750) at ../hw/core/qdev.c:510
#3  0x0000555555e202e3 in property_set_bool (obj=0x555556e6b620, v=0x555556e6fbb0, name=0x555556208211 "realized", opaque=0x555556a3da90, errp=0x7fffffffd750) at ../qom/object.c:2285
#4  0x0000555555e1e1da in object_property_set (obj=0x555556e6b620, name=0x555556208211 "realized", v=0x555556e6fbb0, errp=0x7fffffffd750) at ../qom/object.c:1420
#5  0x0000555555e22829 in object_property_set_qobject (obj=0x555556e6b620, name=0x555556208211 "realized", value=0x555556e6b220, errp=0x5555569bc2e0 <error_fatal>) at ../qom/qom-qobject.c:28
#6  0x0000555555e1e559 in object_property_set_bool (obj=0x555556e6b620, name=0x555556208211 "realized", value=true, errp=0x5555569bc2e0 <error_fatal>) at ../qom/object.c:1489
#7  0x0000555555e15895 in qdev_realize (dev=0x555556e6b620, bus=0x555556e33b40, errp=0x5555569bc2e0 <error_fatal>) at ../hw/core/qdev.c:292
#8  0x0000555555e158c6 in qdev_realize_and_unref (dev=0x555556e6b620, bus=0x555556e33b40, errp=0x5555569bc2e0 <error_fatal>) at ../hw/core/qdev.c:299
#9  0x0000555555a075f9 in pci_realize_and_unref (dev=0x555556e6b620, bus=0x555556e33b40, errp=0x5555569bc2e0 <error_fatal>) at ../hw/pci/pci.c:2167
#10 0x0000555555a07649 in pci_create_simple_multifunction (bus=0x555556e33b40, devfn=0, multifunction=false, name=0x5555561acef2 "i440FX") at ../hw/pci/pci.c:2175
#11 0x0000555555a07681 in pci_create_simple (bus=0x555556e33b40, devfn=0, name=0x5555561acef2 "i440FX") at ../hw/pci/pci.c:2181
#12 0x0000555555a2158b in i440fx_init
    (pci_type=0x5555561acef2 "i440FX", dev=0x555556d32620, address_space_mem=0x555556c6dc00, address_space_io=0x555556bbc2e0, ram_size=2147483648, below_4g_mem_size=2147483648, above_4g_mem_size=0, pci_address_space=0x555556b6d800, ram_m4
#13 0x0000555555bd8a72 in pc_init1 (machine=0x555556c88400, host_type=0x5555561acef9 "i440FX-pcihost", pci_type=0x5555561acef2 "i440FX") at ../hw/i386/pc_piix.c:227
#14 0x0000555555bd9563 in pc_init_v8_0 (machine=0x555556c88400) at ../hw/i386/pc_piix.c:464
#15 0x00005555558f0a19 in machine_run_board_init (machine=0x555556c88400, mem_path=0x0, errp=0x5555569bc2e0 <error_fatal>) at ../hw/core/machine.c:1408
#16 0x0000555555b0df44 in qemu_init_board () at ../softmmu/vl.c:2513
#17 0x0000555555b0e1e5 in qmp_x_exit_preconfig (errp=0x5555569bc2e0 <error_fatal>) at ../softmmu/vl.c:2609
#18 0x0000555555b10a97 in qemu_init (argc=10, argv=0x7fffffffdea8) at ../softmmu/vl.c:3612
#19 0x000055555585c6cb in main (argc=10, argv=0x7fffffffdea8) at ../softmmu/main.c:47

Run time

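During guest kernel execution, QEMU traps I/O and memory accesses in order to emulate them for the guest kernel. I found the i440fx_write_config callback in i440fx.c, which handles writes to the device's PCI configuration space; in the trace below it is reached from an I/O port write (helper_outl).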

#0  i440fx_write_config (dev=0x555556e6b620, address=88, val=858992640, len=4) at ../hw/pci-host/i440fx.c:88
#1  0x0000555555a0acf1 in pci_host_config_write_common (pci_dev=0x555556e6b620, addr=88, limit=256, val=858992640, len=4) at ../hw/pci/pci_host.c:85
#2  0x0000555555a0af0f in pci_data_write (s=0x555556e33b40, addr=2147483736, val=858992640, len=4) at ../hw/pci/pci_host.c:127
#3  0x0000555555a0b0b6 in pci_host_data_write (opaque=0x555556d32620, addr=0, val=858992640, len=4) at ../hw/pci/pci_host.c:177
#4  0x0000555555d58118 in memory_region_write_accessor (mr=0x555556d32a60, addr=0, value=0x7ffff6b08068, size=4, shift=0, mask=4294967295, attrs=...) at ../softmmu/memory.c:493
#5  0x0000555555d58366 in access_with_adjusted_size (addr=0, value=0x7ffff6b08068, size=4, access_size_min=1, access_size_max=4, access_fn=0x555555d5801e <memory_region_write_accessor>, mr=0x555556d32a60, attrs=...)
    at ../softmmu/memory.c:555
#6  0x0000555555d5b549 in memory_region_dispatch_write (mr=0x555556d32a60, addr=0, data=858992640, op=MO_32, attrs=...) at ../softmmu/memory.c:1515
#7  0x0000555555d6ad14 in address_space_stl_internal (as=0x55555699e120 <address_space_io>, addr=3324, val=858992640, attrs=..., result=0x0, endian=DEVICE_NATIVE_ENDIAN) at /home/aa/Downloads/sources/qemu/memory_ldst.c.inc:319
#8  0x0000555555d6ae0f in address_space_stl (as=0x55555699e120 <address_space_io>, addr=3324, val=858992640, attrs=..., result=0x0) at /home/aa/Downloads/sources/qemu/memory_ldst.c.inc:350
#9  0x0000555555bc3bb8 in helper_outl (env=0x555556cb1470, port=3324, data=858992640) at ../target/i386/tcg/sysemu/misc_helper.c:55
#10 0x00007fff772b11a0 in code_gen_buffer ()
#11 0x0000555555dd8a3d in cpu_tb_exec (cpu=0x555556caf080, itb=0x7fffb395cfc0, tb_exit=0x7ffff6b086f8) at ../accel/tcg/cpu-exec.c:460
#12 0x0000555555dd97cf in cpu_loop_exec_tb (cpu=0x555556caf080, tb=0x7fffb395cfc0, pc=4294884085, last_tb=0x7ffff6b08708, tb_exit=0x7ffff6b086f8) at ../accel/tcg/cpu-exec.c:894
#13 0x0000555555dd9aeb in cpu_exec_loop (cpu=0x555556caf080, sc=0x7ffff6b08780) at ../accel/tcg/cpu-exec.c:1005
#14 0x0000555555dd9c39 in cpu_exec_setjmp (cpu=0x555556caf080, sc=0x7ffff6b08780) at ../accel/tcg/cpu-exec.c:1037
#15 0x0000555555dd9ccf in cpu_exec (cpu=0x555556caf080) at ../accel/tcg/cpu-exec.c:1063
#16 0x0000555555e08813 in tcg_cpus_exec (cpu=0x555556caf080) at ../accel/tcg/tcg-accel-ops.c:81
#17 0x0000555555e08efb in mttcg_cpu_thread_fn (arg=0x555556caf080) at ../accel/tcg/tcg-accel-ops-mttcg.c:95
#18 0x00005555560021bb in qemu_thread_start (args=0x555556d07620) at ../util/qemu-thread-posix.c:541
#19 0x00007ffff77b8b43 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#20 0x00007ffff784aa00 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

i440fx_write_config calls pci_default_write_config

/*
 * Configuration-space write callback for the i440FX PCI device.
 *
 * Forwards the write to pci_default_write_config() and then, if the
 * written range touches the PAM range or the SMRAM byte, calls
 * i440fx_update_memory_mappings().
 *
 * @dev:     device being written
 * @address: offset of the write in configuration space
 * @val:     value being written
 * @len:     length of the write in bytes
 */
static void i440fx_write_config(PCIDevice *dev,
                                uint32_t address, uint32_t val, int len)
{
    PCII440FXState *d = I440FX_PCI_DEVICE(dev);

    /* XXX: implement SMRAM.D_LOCK */
    /* Perform the write first, so the remap below runs after it. */
    pci_default_write_config(dev, address, val, len);
    /*
     * Remap only when [address, address + len) overlaps the
     * I440FX_PAM / I440FX_PAM_SIZE range or covers I440FX_SMRAM.
     */
    if (ranges_overlap(address, len, I440FX_PAM, I440FX_PAM_SIZE) ||
        range_covers_byte(address, len, I440FX_SMRAM)) {
        i440fx_update_memory_mappings(d);
    }
}

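pci_default_write_config updates the device's emulated configuration space. The register being addressed is selected beforehand through pci_host_config_write, which stores the value written by the guest in the host bridge's config_reg. For completeness, I copied pci_host_config_read as well.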

/*
 * Write handler that stores @val in the host bridge's config_reg.
 *
 * Only a 4-byte write at offset 0 is accepted; any other combination of
 * @addr and @len is silently ignored.
 *
 * @opaque: the PCIHostState of the host bridge
 * @addr:   offset of the access
 * @val:    value written by the guest
 * @len:    access size in bytes
 */
static void pci_host_config_write(void *opaque, hwaddr addr,
                                  uint64_t val, unsigned len)
{
    PCIHostState *s = opaque;

    PCI_DPRINTF("%s addr " HWADDR_FMT_plx " len %d val %"PRIx64"\n",
                __func__, addr, len, val);
    /* Ignore anything that is not a 4-byte access at offset 0. */
    if (addr != 0 || len != 4) {
        return;
    }
    s->config_reg = val;
}

/*
 * Read handler that returns the host bridge's config_reg.
 *
 * @addr and @len are only used in the debug print; the stored value is
 * returned whatever the offset or size of the access.
 *
 * @opaque: the PCIHostState of the host bridge
 * @addr:   offset of the access
 * @len:    access size in bytes
 *
 * Returns: the current value of s->config_reg.
 */
static uint64_t pci_host_config_read(void *opaque, hwaddr addr,
                                     unsigned len)
{
    PCIHostState *s = opaque;
    uint32_t val = s->config_reg;

    PCI_DPRINTF("%s addr " HWADDR_FMT_plx " len %d val %"PRIx32"\n",
                __func__, addr, len, val);
    return val;
}

To sum up, there are two stages here:

  • Board initialization, where the PCI topology is created, starting with the host bridge and followed by the other buses and devices.
  • Run time, where QEMU (while doing code translation using the wonderful TCG) traps memory/IO operations and redirects them to the devices created during the initialization above.

That’s it.