RFC. Tested with 3.14.51 and Xen 4.5.1. I'll make a proper patch against a
more current kernel if we decide this is heading in the right direction.


    xen/mcfg: Notify Xen of PCI MMCONFIG area before adding device

    On systems where the ACPI DSDT advertises a PCI MMCONFIG area but the
E820
    table does not reserve it, it's up to Dom0 to inform Xen of MMCONFIG
areas
    via PHYSDEVOP_pci_mmcfg_reserved.  This needs to happen before Xen
tries to
    access extended capabilities like SRIOV in pci_add_device(), which is
    triggered by PHYSDEVOP_pci_device_add et al.

    Since both MMCONFIG discovery and PCI bus enumeration occur in
acpi_init(),
    calling PHYSDEVOP_pci_mmcfg_reserved cannot be delegated to a separate
    initcall.  Instead, it can be done in xen_pci_notifier() immediately
before
    calling PHYSDEVOP_pci_device_add.

    Without this change, Xen 4.4 and 4.5 emit WARN messsages from
    msix_capability_init() when setting up Intel 82599 VFs, since vf_rlen
has
    not been initialized by pci_add_device().  And on Xen 4.5, Xen nukes the
    DomU due to "Potentially insecure use of MSI-X" when the VF driver
loads in
    the DomU.  Both problems are fixed by this change.

    Signed-off-by: Ed Swierk <eswi...@skyportsystems.com>

diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
index dd9c249..47f6b45 100644
--- a/drivers/xen/pci.c
+++ b/drivers/xen/pci.c
@@ -26,8 +26,57 @@
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 #include "../pci/pci.h"
+
 #ifdef CONFIG_PCI_MMCONFIG
 #include <asm/pci_x86.h>
+#include <linux/list.h>
+
+/* pci_mmcfg_list entries that have already been reported to xen */
+LIST_HEAD(xen_pci_mmcfg_list);
+
+static void xen_report_pci_mmcfg_region(u16 segment, u8 bus)
+{
+ struct pci_mmcfg_region *cfg, *new;
+ struct physdev_pci_mmcfg_reserved r;
+ int rc;
+
+ if ((pci_probe & PCI_PROBE_MMCONF) == 0)
+ return;
+
+ cfg = pci_mmconfig_lookup(segment, bus);
+ if (!cfg)
+ return;
+
+ list_for_each_entry_rcu(new, &xen_pci_mmcfg_list, list) {
+ if (new->segment == cfg->segment &&
+    new->start_bus == cfg->start_bus &&
+    new->end_bus == cfg->end_bus)
+ return;
+ }
+
+ r.address = cfg->address;
+ r.segment = cfg->segment;
+ r.start_bus = cfg->start_bus;
+ r.end_bus = cfg->end_bus;
+ r.flags = XEN_PCI_MMCFG_RESERVED;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r);
+ if (rc != 0 && rc != -ENOSYS) {
+ pr_warn("Failed to report MMCONFIG reservation state for %s"
+ " to hypervisor (%d)\n", cfg->name, rc);
+ }
+
+ new = kmemdup(cfg, sizeof(*new), GFP_KERNEL);
+ if (new) {
+ list_add_tail_rcu(&new->list, &xen_pci_mmcfg_list);
+ } else {
+ pr_warn("Failed to allocate xen_pci_mmcfg_list entry\n");
+ }
+}
+
+#else
+
+static void xen_report_pci_mmcfg_region(u16 segment, u8 bus) { }
+
 #endif

 static bool __read_mostly pci_seg_supported = true;
@@ -86,6 +135,7 @@ static int xen_add_device(struct device *dev)
  }
 #endif /* CONFIG_ACPI */

+ xen_report_pci_mmcfg_region(add.seg, add.bus);
  r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add);
  if (r != -ENOSYS)
  return r;
@@ -104,6 +154,7 @@ static int xen_add_device(struct device *dev)
  .physfn.devfn = physfn->devfn,
  };

+ xen_report_pci_mmcfg_region(0, manage_pci_ext.bus);
  r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
  &manage_pci_ext);
  }
@@ -115,6 +166,7 @@ static int xen_add_device(struct device *dev)
  .is_extfn = 1,
  };

+ xen_report_pci_mmcfg_region(0, manage_pci_ext.bus);
  r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
  &manage_pci_ext);
  } else {
@@ -123,6 +175,7 @@ static int xen_add_device(struct device *dev)
  .devfn = pci_dev->devfn,
  };

+ xen_report_pci_mmcfg_region(0, manage_pci.bus);
  r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add,
  &manage_pci);
  }
@@ -142,6 +195,7 @@ static int xen_remove_device(struct device *dev)
  .devfn = pci_dev->devfn
  };

+ xen_report_pci_mmcfg_region(device.seg, device.bus);
  r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_remove,
   &device);
  } else if (pci_domain_nr(pci_dev->bus))
@@ -152,6 +206,7 @@ static int xen_remove_device(struct device *dev)
  .devfn = pci_dev->devfn
  };

+ xen_report_pci_mmcfg_region(0, manage_pci.bus);
  r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
   &manage_pci);
  }
@@ -195,49 +250,3 @@ static int __init register_xen_pci_notifier(void)
 }

 arch_initcall(register_xen_pci_notifier);
-
-#ifdef CONFIG_PCI_MMCONFIG
-static int __init xen_mcfg_late(void)
-{
- struct pci_mmcfg_region *cfg;
- int rc;
-
- if (!xen_initial_domain())
- return 0;
-
- if ((pci_probe & PCI_PROBE_MMCONF) == 0)
- return 0;
-
- if (list_empty(&pci_mmcfg_list))
- return 0;
-
- /* Check whether they are in the right area. */
- list_for_each_entry(cfg, &pci_mmcfg_list, list) {
- struct physdev_pci_mmcfg_reserved r;
-
- r.address = cfg->address;
- r.segment = cfg->segment;
- r.start_bus = cfg->start_bus;
- r.end_bus = cfg->end_bus;
- r.flags = XEN_PCI_MMCFG_RESERVED;
-
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r);
- switch (rc) {
- case 0:
- case -ENOSYS:
- continue;
-
- default:
- pr_warn("Failed to report MMCONFIG reservation"
- " state for %s to hypervisor"
- " (%d)\n",
- cfg->name, rc);
- }
- }
- return 0;
-}
-/*
- * Needs to be done after acpi_init which are subsys_initcall.
- */
-subsys_initcall_sync(xen_mcfg_late);
-#endif


On Tue, Sep 22, 2015 at 8:09 AM, Jan Beulich <jbeul...@suse.com> wrote:

> >>> On 22.09.15 at 16:37, <konrad.w...@oracle.com> wrote:
> > On Tue, Sep 22, 2015 at 08:11:41AM -0600, Jan Beulich wrote:
> >> >>> On 22.09.15 at 16:03, <konrad.w...@oracle.com> wrote:
> >> > On Tue, Sep 22, 2015 at 07:52:19AM -0600, Jan Beulich wrote:
> >> >> >>> On 22.09.15 at 15:39, <konrad.w...@oracle.com> wrote:
> >> >> > On Tue, Sep 22, 2015 at 06:26:11AM -0700, Ed Swierk wrote:
> >> >> >> Any other ideas?
> >> >> >
> >> >> > I like it - as it will update it right away. However we would need
> some
> >> >> > extra smarts in Xen to reconfigure its view of the PCI device now
> that the
> >> >> > extended configuration space has become available.
> >> >>
> >> >> What parts are you thinking of that would need updating (and
> >> >> aren't getting updated already)?
> >> >
> >> > The VF data. As before this call Xen might have not been able to
> >> > get to the extended configuration.
> >>
> >> I still don't understand: Afaics pci_add_device() updates everything
> >> that may need updating already. And things appear to be working
> >> fine with our kernel (where the MMCFG notification comes right out
> >> of the x86 code earlier referred to), despite this meaning updates
> >> to the data collected during early boot. I guess you'll need to be
> >> more specific on what you see missing...
> >
> > This is all ancient recollection of what I had seen a year or so ago
> > so take it a with a cup of salt.
>
> Urgh - a whole cup...
>
> > I have one of those Intel machines where the MMCFG is not marked
> > in the E820 but is in the ACPI and I remember getting frequent
> > WARN_ON from Xen in the msi.c code when doing passthrough on the VF.
> > I don't have the logs but my vague recollection was that Xen could
> > not validate the VF's MSI-X because at the time it gathered the
> > VF PCI device information the extended configurations were not
> > available. This was prior to the XSA120 discovery so ancient
> > code.
>
> Well, VFs should not even show up prior to MMCFG getting
> announced.
>
> Jan
>
>
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

Reply via email to