On 03/22/2016 05:40 AM, Alex Williamson wrote:
On Mon, 21 Mar 2016 18:08:42 +0800
Cao jin <caoj.f...@cn.fujitsu.com> wrote:
From: Chen Fan <chen.fan.f...@cn.fujitsu.com>
when boot up a VM that assigning vfio devices with aer enabled, we
must check the vfio device whether support host bus reset. because
when one error occur. OS driver always recover the device by do a
bus reset, in order to recover the vfio device, qemu must able to do
a host bus reset to recover the device to default status. and for all
affected devices by the bus reset. we must check them whether all
are assigned to the VM and on the same virtual bus. meanwhile, for
simply done, the devices which don't affected by the host bus reset
are not allowed to assign to the same virtual bus.
Rewording/expansion suggestion:
When assigning a vfio device with AER enabled, we must check whether
the device supports a host bus reset (ie. hot reset) as this may be
used by the guest OS in order to recover the device from an AER
error. QEMU must therefore have the ability to perform a physical
host bus reset using the existing vfio APIs in response to a virtual
bus reset in the VM. A physical bus reset affects all of the devices
on the host bus, therefore we place a few simplifying configuration
restriction on the VM:
- All physical devices affected by a bus reset must be assigned to
the VM with AER enabled on each and be configured on the same
virtual bus in the VM.
- No devices unaffected by the bus reset, be they physical, emulated,
or paravirtual may be configured on the same virtual bus as a
device supporting AER signaling through vfio.
In other words users wishing to enable AER on a multifunction device
need to assign all functions of the device to the same virtual bus
and enable AER support for each device. The easiest way to
accomplish this is to identity map the physical functions to virtual
functions with multifunction enabled on the virtual device.
nice description. thanks :)
Signed-off-by: Chen Fan <chen.fan.f...@cn.fujitsu.com>
---
hw/vfio/pci.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
hw/vfio/pci.h | 1 +
2 files changed, 205 insertions(+), 1 deletion(-)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 8842b7f..dce3b6d 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -38,6 +38,10 @@
static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
+#define HOST_CMP_FUNC_MASK (1 << 0)
+static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name,
+ uint8_t mask);
+
This would of course go back to a _slot version of the function as
you've had in previous versions.
/*
* Disabling BAR mmaping can be slow, but toggling it around INTx can
* also be a huge overhead. We try to get the best of both worlds by
@@ -1877,6 +1881,185 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev,
uint8_t pos)
return 0;
}
+static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp)
+{
+ PCIBus *bus = vdev->pdev.bus;
+ struct vfio_pci_hot_reset_info *info = NULL;
+ struct vfio_pci_dependent_device *devices;
+ VFIOGroup *group;
+ int ret, i, devfn;
+
+ ret = vfio_get_hot_reset_info(vdev, &info);
+ if (ret) {
+ error_setg(errp, "vfio: Cannot enable AER for device %s,"
+ " device does not support hot reset.",
+ vdev->vbasedev.name);
+ return;
+ }
+
+ /* List all affected devices by bus reset */
+ devices = &info->devices[0];
+
+ /* Verify that we have all the groups required */
+ for (i = 0; i < info->count; i++) {
+ PCIHostDeviceAddress host;
+ VFIOPCIDevice *tmp;
+ VFIODevice *vbasedev_iter;
+ bool found = false;
+
+ host.domain = devices[i].segment;
+ host.bus = devices[i].bus;
+ host.slot = PCI_SLOT(devices[i].devfn);
+ host.function = PCI_FUNC(devices[i].devfn);
+
+ /* Skip the current device */
+ if (vfio_pci_host_match(&host, vdev->vbasedev.name, 0)) {
+ continue;
+ }
+
+ /* Ensure we own the group of the affected device */
+ QLIST_FOREACH(group, &vfio_group_list, next) {
+ if (group->groupid == devices[i].group_id) {
+ break;
+ }
+ }
+
+ if (!group) {
+ error_setg(errp, "vfio: Cannot enable AER for device %s, "
+ "depends on group %d which is not owned.",
+ vdev->vbasedev.name, devices[i].group_id);
+ goto out;
+ }
+
+ /* Ensure affected devices for reset on the same bus */
+ QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
+ if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
+ continue;
+ }
+ tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
+ if (vfio_pci_host_match(&host, tmp->vbasedev.name, 0)) {
+ /*
+ * AER errors may be broadcast to all functions of a multi-
+ * function endpoint. If any of those sibling functions are
+ * also assigned, they need to have AER enabled or else an
+ * error may continue to cause a vm_stop condition. IOW,
+ * AER setup of this function would be pointless.
+ */
+ if (vfio_pci_host_match(&host, vdev->vbasedev.name,
+ HOST_CMP_FUNC_MASK) &&
+ !(tmp->features & VFIO_FEATURE_ENABLE_AER)) {
+ error_setg(errp, "vfio: Cannot enable AER for device %s, on
same slot"
+ " the dependent device %s which does not enable
AER.",
+ vdev->vbasedev.name, tmp->vbasedev.name);
+ goto out;
+ }
+
+ if (tmp->pdev.bus != bus) {
+ error_setg(errp, "vfio: Cannot enable AER for device %s, "
+ "the dependent device %s is not on the same
bus",
+ vdev->vbasedev.name, tmp->vbasedev.name);
+ goto out;
+ }
+ found = true;
+ break;
+ }
+ }
+
+ /* Ensure all affected devices assigned to VM */
+ if (!found) {
+ error_setg(errp, "vfio: Cannot enable AER for device %s, "
+ "the dependent device %04x:%02x:%02x.%x "
+ "is not assigned to VM.",
+ vdev->vbasedev.name, host.domain, host.bus,
+ host.slot, host.function);
+ goto out;
+ }
+ }
+
+ /*
+ * To make things simply, functions must are combined
+ * in the same way as on the host. no other irrelative
+ * functions on the virtual same bus was required.
+ */
Suggestion:
The above code verified that all devices affected by a bus reset
exist on the same bus in the VM. To further simplify, we also
require that there are no additional devices beyond those existing on
the VM bus.
+ for (devfn = 0; devfn < 8; devfn++) {
What about ARI? Don't we potentially need to walk up to 255 here if
the parent bridge device has ARI enabled? Should we always walk up
through 255 to prevent users from using non-zero slot numbers for other
devices?
can we use the function pcie_cap_is_arifwd_enabled to check the bridge
whether enable ARI?, if don't enable, we can simple scan the slot 0,
otherwise,
we should walk up to 255.
+ VFIOPCIDevice *tmp;
+ PCIDevice *dev;
+ bool found = false;
+
+ dev = pci_find_device(bus, pci_bus_num(bus),
+ PCI_DEVFN(PCI_SLOT(vdev->pdev.devfn), devfn));
+
+ if (!dev) {
+ continue;
+ }
+
+ if (!object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
+ error_setg(errp, "vfio: Cannot enable AER for device %s, "
+ "only support all dependent devices on the same bus, "
+ "%s is not one of the dependent devices.",
+ vdev->vbasedev.name, dev->name);
"vfio: Cannot enable AER for device %s, device %s cannot be configured
on the same virtual bus"
Is dev->name sufficiently descriptive? Perhaps print the conflicting
PCI address as well.
+ goto out;
+ }
+
+ tmp = DO_UPCAST(VFIOPCIDevice, pdev, dev);
+ for (i = 0; i < info->count; i++) {
+ PCIHostDeviceAddress host;
+
+ host.domain = devices[i].segment;
+ host.bus = devices[i].bus;
+ host.slot = PCI_SLOT(devices[i].devfn);
+ host.function = PCI_FUNC(devices[i].devfn);
+
+ if (vfio_pci_host_match(&host, tmp->vbasedev.name, 0)) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ error_setg(errp, "vfio: Cannot enable AER for device %s, "
+ "only support all dependent devices on the same bus, "
+ "%s is not one of the dependent devices.",
+ vdev->vbasedev.name, tmp->vbasedev.name);
Update this comment similar to above too.
+ goto out;
+ }
+ }
+
+out:
+ g_free(info);
+ return;
+}
+
+static void vfio_aer_check_host_bus_reset(Error **errp)
+{
+ VFIOGroup *group;
+ VFIODevice *vbasedev;
+ VFIOPCIDevice *vdev;
+ Error *local_err = NULL;
+
+ /* Check All vfio-pci devices if have bus reset capability */
+ QLIST_FOREACH(group, &vfio_group_list, next) {
+ QLIST_FOREACH(vbasedev, &group->device_list, next) {
+ if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
+ continue;
+ }
+ vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+ if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
+ if (!pci_get_function_0(&vdev->pdev)) {
+ continue;
+ }
Is this function_0 test really necessary? It seems like it's only
relevant to the hotplug case, in which case it should be included in
the next patch, but even there I don't see a path where we'll get here
when function 0 is not populated. Is this perhaps leftover from
previous versions? It actually seems like it's opening a gap where
function 0 could be a non-vfio-pci device (or non-existent) and bypass
our validation check here.
yes, you are right, as for adding the function_0 test, I just thought that
if we boot up VM without function_0 in command line , whether we should
test the other not_function_0 vfio device with aer on the bus support
hot reset.
if we don't add the function 0 test, we probably can't boot up the VM.
because the affected function 0 is not added yet. I never thought that
this open a gap that the function 0 could be a non-vfio-pci device, thanks
for pointing it out. for hotplug case, we also potential hotplug a
not-vfio-pci device
and bypass our check. so I think the check still need to place in
device_realize like
previous version?
Thanks,
Chen
+ vfio_check_hot_bus_reset(vdev, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ }
+ }
+ }
+
+ return;
+}
+
static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
int pos, uint16_t size)
{
@@ -2060,7 +2243,6 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
vfio_intx_enable(vdev);
}
-#define HOST_CMP_FUNC_MASK (1 << 0)
static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name,
uint8_t mask)
{
@@ -2587,6 +2769,22 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice
*vdev)
vdev->req_enabled = false;
}
+static void vfio_pci_machine_done_notify(Notifier *notifier, void *unused)
+{
+ Error *local_err = NULL;
+
+ vfio_aer_check_host_bus_reset(&local_err);
+ if (local_err) {
+ fprintf(stderr, "%s\n", error_get_pretty(local_err));
+ error_free(local_err);
+ exit(1);
+ }
+}
+
+static Notifier machine_notifier = {
+ .notify = vfio_pci_machine_done_notify,
+};
+
static int vfio_initfn(PCIDevice *pdev)
{
VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
@@ -2932,6 +3130,11 @@ static const TypeInfo vfio_pci_dev_info = {
static void register_vfio_pci_dev_type(void)
{
type_register_static(&vfio_pci_dev_info);
+ /*
+ * Register notifier when machine init is done, since we need
+ * check the configration manner after all vfio device are inited.
+ */
Suggestion:
The AER configuration may depend on multiple devices, so we cannot
validate consistency after each device is initialized. We can only
depend on function initialization order (function 0 last) for hotplug
devices, therefore a machine-init-done notifier is used to validate
the configuration after all cold-plug devices are processed.
+ qemu_add_machine_init_done_notifier(&machine_notifier);
}
type_init(register_vfio_pci_dev_type)
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 7b3924e..db7c6d5 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -15,6 +15,7 @@
#include "qemu-common.h"
#include "exec/memory.h"
#include "hw/pci/pci.h"
+#include "hw/pci/pci_bus.h"
#include "hw/pci/pci_bridge.h"
#include "hw/vfio/vfio-common.h"
#include "qemu/event_notifier.h"
.