RFC. Tested with 3.14.51 and Xen 4.5.1. I'll make a proper patch against a more current kernel if we decide this is heading in the right direction.
xen/mcfg: Notify Xen of PCI MMCONFIG area before adding device On systems where the ACPI DSDT advertises a PCI MMCONFIG area but the E820 table does not reserve it, it's up to Dom0 to inform Xen of MMCONFIG areas via PHYSDEVOP_pci_mmcfg_reserved. This needs to happen before Xen tries to access extended capabilities like SRIOV in pci_add_device(), which is triggered by PHYSDEVOP_pci_device_add et al. Since both MMCONFIG discovery and PCI bus enumeration occur in acpi_init(), calling PHYSDEVOP_pci_mmcfg_reserved cannot be delegated to a separate initcall. Instead, it can be done in xen_pci_notifier() immediately before calling PHYSDEVOP_pci_device_add. Without this change, Xen 4.4 and 4.5 emit WARN messsages from msix_capability_init() when setting up Intel 82599 VFs, since vf_rlen has not been initialized by pci_add_device(). And on Xen 4.5, Xen nukes the DomU due to "Potentially insecure use of MSI-X" when the VF driver loads in the DomU. Both problems are fixed by this change. Signed-off-by: Ed Swierk <eswi...@skyportsystems.com> diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index dd9c249..47f6b45 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -26,8 +26,57 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> #include "../pci/pci.h" + #ifdef CONFIG_PCI_MMCONFIG #include <asm/pci_x86.h> +#include <linux/list.h> + +/* pci_mmcfg_list entries that have already been reported to xen */ +LIST_HEAD(xen_pci_mmcfg_list); + +static void xen_report_pci_mmcfg_region(u16 segment, u8 bus) +{ + struct pci_mmcfg_region *cfg, *new; + struct physdev_pci_mmcfg_reserved r; + int rc; + + if ((pci_probe & PCI_PROBE_MMCONF) == 0) + return; + + cfg = pci_mmconfig_lookup(segment, bus); + if (!cfg) + return; + + list_for_each_entry_rcu(new, &xen_pci_mmcfg_list, list) { + if (new->segment == cfg->segment && + new->start_bus == cfg->start_bus && + new->end_bus == cfg->end_bus) + return; + } + + r.address = cfg->address; + r.segment = cfg->segment; + r.start_bus = cfg->start_bus; + r.end_bus = cfg->end_bus; + r.flags = XEN_PCI_MMCFG_RESERVED; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r); + if (rc != 0 && rc != -ENOSYS) { + pr_warn("Failed to report MMCONFIG reservation state for %s" + " to hypervisor (%d)\n", cfg->name, rc); + } + + new = kmemdup(cfg, sizeof(*new), GFP_KERNEL); + if (new) { + list_add_tail_rcu(&new->list, &xen_pci_mmcfg_list); + } else { + pr_warn("Failed to allocate xen_pci_mmcfg_list entry\n"); + } +} + +#else + +static void xen_report_pci_mmcfg_region(u16 segment, u8 bus) { } + #endif static bool __read_mostly pci_seg_supported = true; @@ -86,6 +135,7 @@ static int xen_add_device(struct device *dev) } #endif /* CONFIG_ACPI */ + xen_report_pci_mmcfg_region(add.seg, add.bus); r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add); if (r != -ENOSYS) return r; @@ -104,6 +154,7 @@ static int xen_add_device(struct device *dev) .physfn.devfn = physfn->devfn, }; + xen_report_pci_mmcfg_region(0, manage_pci_ext.bus); r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, &manage_pci_ext); } @@ -115,6 +166,7 @@ static int xen_add_device(struct device *dev) .is_extfn = 1, }; + xen_report_pci_mmcfg_region(0, manage_pci_ext.bus); r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, &manage_pci_ext); } else { @@ -123,6 +175,7 @@ static int xen_add_device(struct device *dev) .devfn = pci_dev->devfn, }; + xen_report_pci_mmcfg_region(0, manage_pci.bus); r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add, &manage_pci); } @@ -142,6 +195,7 @@ static int xen_remove_device(struct device *dev) .devfn = pci_dev->devfn }; + xen_report_pci_mmcfg_region(device.seg, device.bus); r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_remove, &device); } else if (pci_domain_nr(pci_dev->bus)) @@ -152,6 +206,7 @@ static int xen_remove_device(struct device *dev) .devfn = pci_dev->devfn }; + xen_report_pci_mmcfg_region(0, manage_pci.bus); r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, &manage_pci); } @@ -195,49 +250,3 @@ static int __init register_xen_pci_notifier(void) } arch_initcall(register_xen_pci_notifier); - -#ifdef CONFIG_PCI_MMCONFIG -static int __init xen_mcfg_late(void) -{ - struct pci_mmcfg_region *cfg; - int rc; - - if (!xen_initial_domain()) - return 0; - - if ((pci_probe & PCI_PROBE_MMCONF) == 0) - return 0; - - if (list_empty(&pci_mmcfg_list)) - return 0; - - /* Check whether they are in the right area. */ - list_for_each_entry(cfg, &pci_mmcfg_list, list) { - struct physdev_pci_mmcfg_reserved r; - - r.address = cfg->address; - r.segment = cfg->segment; - r.start_bus = cfg->start_bus; - r.end_bus = cfg->end_bus; - r.flags = XEN_PCI_MMCFG_RESERVED; - - rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r); - switch (rc) { - case 0: - case -ENOSYS: - continue; - - default: - pr_warn("Failed to report MMCONFIG reservation" - " state for %s to hypervisor" - " (%d)\n", - cfg->name, rc); - } - } - return 0; -} -/* - * Needs to be done after acpi_init which are subsys_initcall. - */ -subsys_initcall_sync(xen_mcfg_late); -#endif On Tue, Sep 22, 2015 at 8:09 AM, Jan Beulich <jbeul...@suse.com> wrote: > >>> On 22.09.15 at 16:37, <konrad.w...@oracle.com> wrote: > > On Tue, Sep 22, 2015 at 08:11:41AM -0600, Jan Beulich wrote: > >> >>> On 22.09.15 at 16:03, <konrad.w...@oracle.com> wrote: > >> > On Tue, Sep 22, 2015 at 07:52:19AM -0600, Jan Beulich wrote: > >> >> >>> On 22.09.15 at 15:39, <konrad.w...@oracle.com> wrote: > >> >> > On Tue, Sep 22, 2015 at 06:26:11AM -0700, Ed Swierk wrote: > >> >> >> Any other ideas? > >> >> > > >> >> > I like it - as it will update it right away. However we would need > some > >> >> > extra smarts in Xen to reconfigure its view of the PCI device now > that the > >> >> > extended configuration space has become available. > >> >> > >> >> What parts are you thinking of that would need updating (and > >> >> aren't getting updated already)? > >> > > >> > The VF data. As before this call Xen might have not been able to > >> > get to the extended configuration. > >> > >> I still don't understand: Afaics pci_add_device() updates everything > >> that may need updating already. And things appear to be working > >> fine with our kernel (where the MMCFG notification comes right out > >> of the x86 code earlier referred to), despite this meaning updates > >> to the data collected during early boot. I guess you'll need to be > >> more specific on what you see missing... > > > > This is all ancient recollection of what I had seen a year or so ago > > so take it a with a cup of salt. > > Urgh - a whole cup... > > > I have one of those Intel machines where the MMCFG is not marked > > in the E820 but is in the ACPI and I remember getting frequent > > WARN_ON from Xen in the msi.c code when doing passthrough on the VF. > > I don't have the logs but my vague recollection was that Xen could > > not validate the VF's MSI-X because at the time it gathered the > > VF PCI device information the extended configurations were not > > available. This was prior to the XSA120 discovery so ancient > > code. > > Well, VFs should not even show up prior to MMCFG getting > announced. > > Jan > >
_______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel