On Wed, 2015-05-27 at 10:46 +0800, Chen Fan wrote:
we introduce a has_bus_reset capability to sign the vfio
devices if support host bus reset.
Signed-off-by: Chen Fan <address@hidden>
---
hw/vfio/pci.c | 123
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 123 insertions(+)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index f4e7855..5934fd7 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -33,6 +33,7 @@
#include "hw/pci/msix.h"
#include "hw/pci/pci.h"
#include "hw/pci/pci_bridge.h"
+#include "hw/pci/pci_bus.h"
#include "qemu-common.h"
#include "qemu/error-report.h"
#include "qemu/event_notifier.h"
@@ -170,6 +171,7 @@ typedef struct VFIOPCIDevice {
bool req_enabled;
bool has_flr;
bool has_pm_reset;
+ bool has_bus_reset;
I still think that caching this is a bad idea, there's no point at which
we can blindly assume the capability is still present.
bool rom_read_failed;
} VFIOPCIDevice;
@@ -203,6 +205,7 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
uint32_t val, int len);
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
+static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev);
/*
* Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -2853,6 +2856,20 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev, int pos,
uint16_t size)
dev_iter = pci_bridge_get_device(dev_iter->bus);
}
+ /*
+ * Don't check bus reset capability when device is enabled during
+ * qemu machine creation, which is done by machine init function.
+ */
+ if (DEVICE(vdev)->hotplugged) {
+ vfio_check_host_bus_reset(vdev);
+ if (!vdev->has_bus_reset) {
+ error_report("vfio: Cannot enable AER for device %s, "
+ "which is not support host bus reset.",
"which does not support host bus reset."
+ vdev->vbasedev.name);
+ goto error;
+ }
+ }
+
errcap = vfio_pci_read_config(pdev, pdev->exp.aer_cap + PCI_ERR_CAP, 4);
/*
* The ability to record multiple headers is depending on
@@ -3678,6 +3695,112 @@ static void vfio_setup_resetfn(VFIOPCIDevice *vdev)
}
}
+struct VfioDeviceFind {
We use VFIOFooBar for all other camel case definitions, much like PCIBus
and PCIDevice below.
+ PCIBus *pbus;
+ PCIDevice *pdev;
+ bool found;
+};
+
+static void find_devices(PCIBus *bus, void *opaque)
+{
+ struct VfioDeviceFind *find = opaque;
+ int i;
+
+ if (find->found == true) {
if (find->found) {...
+ return;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(bus->devices); i++) {
+ if (!bus->devices[i]) {
+ continue;
+ }
+
+ if (bus->devices[i] == find->pdev) {
+ find->pbus = bus;
+ find->found = true;
+ break;
+ }
+ }
+}
+
+static void vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
+{
+ PCIBus *bus = vdev->pdev.bus;
+ struct vfio_pci_hot_reset_info *info = NULL;
+ struct vfio_pci_dependent_device *devices;
+ VFIOGroup *group;
+ int ret, i;
+ bool has_bus_reset = false;
+
+ ret = vfio_get_hot_reset_info(vdev, &info);
+ if (ret < 0) {
if (ret) {...
+ goto out;
+ }
+
+ /* List all affected devices by bus reset */
+ devices = &info->devices[0];
+
+ /* Verify that we have all the groups required */
+ for (i = 0; i < info->count; i++) {
+ PCIHostDeviceAddress host;
+ VFIOPCIDevice *tmp;
+ VFIODevice *vbasedev_iter;
+
+ host.domain = devices[i].segment;
+ host.bus = devices[i].bus;
+ host.slot = PCI_SLOT(devices[i].devfn);
+ host.function = PCI_FUNC(devices[i].devfn);
+
+ /* Skip the current device */
+ if (vfio_pci_host_match(&host, &vdev->host)) {
+ continue;
+ }
+
+ /* Ensure we own the group of the affected device */
+ QLIST_FOREACH(group, &vfio_group_list, next) {
+ if (group->groupid == devices[i].group_id) {
+ break;
+ }
+ }
+
+ if (!group) {
+ goto out;
+ }
+
+ /* Ensure affected devices for reset under the same bus */
+ QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
+ if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
+ continue;
+ }
+ tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
+ if (vfio_pci_host_match(&host, &tmp->host)) {
+ struct VfioDeviceFind find = { .pdev = &tmp->pdev, .found =
false };
+
+ pci_for_each_bus(bus, find_devices, &find);
+ if (!find.found) {
+ goto out;
+ }
+ /*
+ * When the check device is hotplugged to a higher bus again,
+ * which would influence the affected device which enable aer
+ * below the bus.
+ */
+ if (tmp->features & VFIO_FEATURE_ENABLE_AER &&
+ find.pbus != bus) {
+ goto out;
+ }
I think what you're trying to do here is assume that if a reset of A
affects B, then a reset of B affects A, and if B is on a subordinate bus
from A, then our configuration is broken, right? Can we really
guarantee that assumption? If we had a physical topology that mirrored
this virtual topology, that wouldn't necessarily be true. For instance,
if A was a function of a multi-function device where another function
was a PCIe upstream switch, B could be subordinate to that switch, so a
reset of A affects B, but a reset of B doesn't affect A.
+ break;
+ }
+ }
+ }
+
+ has_bus_reset = true;
+
+out:
+ vdev->has_bus_reset = has_bus_reset;
I don't see any value is storing this, it can't be trusted at any point
in the future. I think that any time we add a device or think about
forwarding an AER message to the guest, we need to do a validation pass,
testing the entire topology.
To do that we'd iterate through every device in every group for PCI
devices that support AER. For each one, get the hot reset info for the
affected device list. For each affected device, if it's attached to the
VM, it needs to be on or below the bus of the target device.
Additionally, we should test all of the non-AER supporting vfio-pci
devices on or below the target bus to verify they have a reset
mechanism. Run that function for each vfio-pci device as it's added,
regardless of whether it's hotplug. If the test fails, fail the initfn
for the current device. Also run the test prior to AER injection, if it
fails demote the AER injection to a machine halt as we have now.