This is a quick write-up about SRIOV and virtual functions emulation in Qemu. From link
SR-IOV is a specification that allows a single Peripheral Component Interconnect Express (PCIe) physical device under a single root port to appear as multiple separate physical devices to the hypervisor or the guest operating system.
SR-IOV uses physical functions (PFs) and virtual functions (VFs) to manage global functions for the SR-IOV devices. PFs are full PCIe functions that are capable of configuring and managing the SR-IOV functionality. It is possible to configure or control PCIe devices using PFs, and the PF has full ability to move data in and out of the device. VFs are lightweight PCIe functions that support data flowing but have a restricted set of configuration resources.
Device emulation igb Link to heading
Let’s Have a look at an example of Network device that supports SRIOV. In hw/net/igb.c
, igb is one of the Intel NIC’s that supports SRIOV. The emulation calls these functions to init the SRIOV capability and BAR
pcie_sriov_pf_init(pci_dev, IGB_CAP_SRIOV_OFFSET, "igbvf",
IGB_82576_VF_DEV_ID, IGB_MAX_VF_FUNCTIONS, IGB_MAX_VF_FUNCTIONS,
IGB_VF_OFFSET, IGB_VF_STRIDE);
pcie_sriov_pf_init_vf_bar(pci_dev, 0,
PCI_BASE_ADDRESS_MEM_TYPE_64 | PCI_BASE_ADDRESS_MEM_PREFETCH,
16 * KiB);
pcie_sriov_pf_init_vf_bar(pci_dev, 3,
PCI_BASE_ADDRESS_MEM_TYPE_64 | PCI_BASE_ADDRESS_MEM_PREFETCH,
16 * KiB);
sriov subsystem Link to heading
In hw/pci/pcie_sriov.c
, pcie_sriov_pf_init
sets the configuration spaces for SRIOV capability.
void pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
const char *vfname, uint16_t vf_dev_id,
uint16_t init_vfs, uint16_t total_vfs,
uint16_t vf_offset, uint16_t vf_stride)
{
uint8_t *cfg = dev->config + offset;
uint8_t *wmask;
pcie_add_capability(dev, PCI_EXT_CAP_ID_SRIOV, 1,
offset, PCI_EXT_CAP_SRIOV_SIZEOF);
dev->exp.sriov_cap = offset;
dev->exp.sriov_pf.num_vfs = 0;
dev->exp.sriov_pf.vfname = g_strdup(vfname);
dev->exp.sriov_pf.vf = NULL;
pci_set_word(cfg + PCI_SRIOV_VF_OFFSET, vf_offset);
pci_set_word(cfg + PCI_SRIOV_VF_STRIDE, vf_stride);
pci_set_word(cfg + PCI_SRIOV_SUP_PGSIZE, SRIOV_SUP_PGSIZE_MINREQ);
pci_set_word(cfg + PCI_SRIOV_SYS_PGSIZE, 0x1);
pci_set_word(cfg + PCI_SRIOV_VF_DID, vf_dev_id);
pci_set_word(cfg + PCI_SRIOV_INITIAL_VF, init_vfs);
pci_set_word(cfg + PCI_SRIOV_TOTAL_VF, total_vfs);
pci_set_word(cfg + PCI_SRIOV_NUM_VF, 0);
wmask = dev->wmask + offset;
pci_set_word(wmask + PCI_SRIOV_CTRL,
PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE | PCI_SRIOV_CTRL_ARI);
pci_set_word(wmask + PCI_SRIOV_NUM_VF, 0xffff);
pci_set_word(wmask + PCI_SRIOV_SYS_PGSIZE, 0x553);
qdev_prop_set_bit(&dev->qdev, "multifunction", true);
}
pcie_sriov_pf_init_vf_bar
is called with devices to set BAR mask and configuration.
void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num,
uint8_t type, dma_addr_t size)
{
...
...
wmask = ~(size - 1);
addr = sriov_cap + PCI_SRIOV_BAR + region_num * 4;
pci_set_long(dev->config + addr, type);
if (!(type & PCI_BASE_ADDRESS_SPACE_IO) &&
type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
pci_set_quad(dev->wmask + addr, wmask);
pci_set_quad(dev->cmask + addr, ~0ULL);
} else {
pci_set_long(dev->wmask + addr, wmask & 0xffffffff);
pci_set_long(dev->cmask + addr, 0xffffffff);
}
dev->exp.sriov_pf.vf_bar_type[region_num] = type;
}
pci methods accessing VF address space Link to heading
In hw/pci/pci.c
, pci_config_get_bar_addr
is called by the generic pci subsystem. But the virtual function has special handling.
static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg,
uint8_t type, pcibus_t size)
{
pcibus_t new_addr;
if (!pci_is_vf(d)) {
...
...
} else {
PCIDevice *pf = d->exp.sriov_vf.pf;
uint16_t sriov_cap = pf->exp.sriov_cap;
int bar = sriov_cap + PCI_SRIOV_BAR + reg * 4;
uint16_t vf_offset =
pci_get_word(pf->config + sriov_cap + PCI_SRIOV_VF_OFFSET);
uint16_t vf_stride =
pci_get_word(pf->config + sriov_cap + PCI_SRIOV_VF_STRIDE);
uint32_t vf_num = (d->devfn - (pf->devfn + vf_offset)) / vf_stride;
if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
new_addr = pci_get_quad(pf->config + bar);
} else {
new_addr = pci_get_long(pf->config + bar);
}
new_addr += vf_num * size;
}
...
...
}
Enable and disable devices Link to heading
Still in pci subsystem, In pci_default_write_config
, sriov had to be handled differently by calling pcie_sriov_config_writ
.
void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int l)
{
...
...
pcie_sriov_config_write(d, addr, val_in, l);
}
The config enables and disables the device by calling register_vfs
and unregister_vfs
.
void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
uint32_t val, int len)
{
uint32_t off;
uint16_t sriov_cap = dev->exp.sriov_cap;
if (!sriov_cap || address < sriov_cap) {
return;
}
off = address - sriov_cap;
if (off >= PCI_EXT_CAP_SRIOV_SIZEOF) {
return;
}
trace_sriov_config_write(dev->name, PCI_SLOT(dev->devfn),
PCI_FUNC(dev->devfn), off, val, len);
if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) {
if (dev->exp.sriov_pf.num_vfs) {
if (!(val & PCI_SRIOV_CTRL_VFE)) {
unregister_vfs(dev);
}
} else {
if (val & PCI_SRIOV_CTRL_VFE) {
register_vfs(dev);
}
}
}
}
In register_vfs
and unregister_vfs
, the virtual functions are enabled and disabled.
static PCIDevice *register_vf(PCIDevice *pf, int devfn, const char *name,
uint16_t vf_num)
{
PCIDevice *dev = pci_new(devfn, name);
dev->exp.sriov_vf.pf = pf;
dev->exp.sriov_vf.vf_number = vf_num;
PCIBus *bus = pci_get_bus(pf);
Error *local_err = NULL;
...
...
qdev_realize(&dev->qdev, &bus->qbus, &local_err);
/* set vid/did according to sr/iov spec - they are not used */
pci_config_set_vendor_id(dev->config, 0xffff);
pci_config_set_device_id(dev->config, 0xffff);
return dev;
}
static void register_vfs(PCIDevice *dev)
{
uint16_t sriov_cap = dev->exp.sriov_cap;
uint16_t vf_offset = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_VF_OFFSET);
uint16_t vf_stride = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_VF_STRIDE);
int32_t devfn = dev->devfn + vf_offset;
num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
dev->exp.sriov_pf.vf = g_new(PCIDevice *, num_vfs);
trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn),
PCI_FUNC(dev->devfn), num_vfs);
for (i = 0; i < num_vfs; i++) {
dev->exp.sriov_pf.vf[i] = register_vf(dev, devfn,
dev->exp.sriov_pf.vfname, i);
if (!dev->exp.sriov_pf.vf[i]) {
num_vfs = i;
break;
}
devfn += vf_stride;
}
dev->exp.sriov_pf.num_vfs = num_vfs;
}
static void unregister_vfs(PCIDevice *dev)
{
...
...
uint16_t num_vfs = dev->exp.sriov_pf.num_vfs;
trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn),
PCI_FUNC(dev->devfn), num_vfs);
for (i = 0; i < num_vfs; i++) {
PCIDevice *vf = dev->exp.sriov_pf.vf[i];
object_property_set_bool(OBJECT(vf), "realized", false, &local_err);
...
...
}
g_free(dev->exp.sriov_pf.vf);
dev->exp.sriov_pf.vf = NULL;
dev->exp.sriov_pf.num_vfs = 0;
pci_set_word(dev->config + dev->exp.sriov_cap + PCI_SRIOV_NUM_VF, 0);
}
multifunction and virtual functions Link to heading
static void pci_init_multifunction(PCIBus *bus, PCIDevice *dev, Error **errp)
{
uint8_t slot = PCI_SLOT(dev->devfn);
uint8_t func;
if (dev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
}
/*
* With SR/IOV and ARI, a device at function 0 need not be a multifunction
* device, as it may just be a VF that ended up with function 0 in
* the legacy PCI interpretation. Avoid failing in such cases:
*/
if (pci_is_vf(dev) &&
dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
return;
}