AFAICT there's not actually any good reason why guests can't use x2apic and have more than 255 CPUs today, even without exposing interrupt remapping to the guest.
The only issue is that guests can't direct external IOAPIC and MSI interrupts at the higher APIC IDs. So what? A guest might have a workload where it makes plenty of sense to use those extra CPUs and just refrain from targeting external interrupts at them. In fact, if you take a close look at the hyperv-iommu driver in the Linux guest kernel, you'll note that it doesn't actually do any remapping at all; all it does is return -EINVAL if asked to set affinity to a CPU which can't be targeted. For Linux at least, it should be fairly simple to have a per-IRQ-controller affinity limit, so it doesn't attempt to target CPUs it can't reach. But actually, it's really simple to extend the limit of reachable APICs even without the complexity of adding a full vIOMMU. There are 8 bits of extended destination ID in the IOAPIC RTE, which map to bits 11-4 of the MSI address. These were historically not used on bare metal, but IRQ remapping now uses the lowest bit (bit 4) to indicate a remappable format interrupt. A VMM can use the other 7 bits (bits 11-5) to allow guests to target 15 bits of APIC ID, which gives support for 32Ki vCPUs without needing to expose IRQ remapping to the guest. Here's a proof-of-concept hack, which I've tested with a Linux guest that knows where to put the additional 7 bits in the IOAPIC RTE and MSI message. At least IOAPIC and emulated AHCI (MSI) are working; I haven't tested assigned PCI devices yet.
diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c index 4eb2d77b87..b0f4b1a630 100644 --- a/hw/i386/kvm/apic.c +++ b/hw/i386/kvm/apic.c @@ -14,6 +14,7 @@ #include "qemu/module.h" #include "cpu.h" #include "hw/i386/apic_internal.h" +#include "hw/i386/apic-msidef.h" #include "hw/pci/msi.h" #include "sysemu/hw_accel.h" #include "sysemu/kvm.h" @@ -183,6 +184,13 @@ static void kvm_send_msi(MSIMessage *msg) { int ret; + /* + * The message has already passed through interrupt remapping if enabled, + * but the legacy extended destination ID in low bits still needs to be + * handled. + */ + msg->address = apic_convert_ext_dest_id(msg->address); + ret = kvm_irqchip_send_msi(kvm_state, *msg); if (ret < 0) { fprintf(stderr, "KVM: injection failed, MSI lost (%s)\n", diff --git a/hw/i386/pc.c b/hw/i386/pc.c index e87be5d29a..eb4901d6b7 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -807,7 +807,7 @@ void pc_machine_done(Notifier *notifier, void *data) fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus); } - if (x86ms->apic_id_limit > 255 && !xen_enabled()) { + if (0 && x86ms->apic_id_limit > 255 && !xen_enabled()) { IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); if (!iommu || !x86_iommu_ir_supported(X86_IOMMU_DEVICE(iommu)) || diff --git a/include/hw/i386/apic-msidef.h b/include/hw/i386/apic-msidef.h index 420b41167d..b3e0da64a5 100644 --- a/include/hw/i386/apic-msidef.h +++ b/include/hw/i386/apic-msidef.h @@ -28,4 +28,20 @@ #define MSI_ADDR_DEST_IDX_SHIFT 4 #define MSI_ADDR_DEST_ID_MASK 0x000ff000 +static inline uint64_t apic_convert_ext_dest_id(uint64_t address) +{ + uint64_t ext_id = address & (0xff << MSI_ADDR_DEST_IDX_SHIFT); + /* + * If the remappable format bit is set, or the upper bits are + * already set in address_hi, or the low extended bits aren't + * there anyway, do nothing. 
+ */ + if (!ext_id || (ext_id & (1 << MSI_ADDR_DEST_IDX_SHIFT)) || + (address >> 32)) + return address; + + address &= ~ext_id; + address |= ext_id << 35; + return address; +} #endif /* HW_APIC_MSIDEF_H */ diff --git a/target/i386/kvm.c b/target/i386/kvm.c index f6dae4cfb6..547a2faf72 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -4589,13 +4589,11 @@ int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, X86IOMMUState *iommu = x86_iommu_get_default(); if (iommu) { - int ret; - MSIMessage src, dst; X86IOMMUClass *class = X86_IOMMU_DEVICE_GET_CLASS(iommu); - if (!class->int_remap) { - return 0; - } + if (class->int_remap) { + int ret; + MSIMessage src, dst; src.address = route->u.msi.address_hi; src.address <<= VTD_MSI_ADDR_HI_SHIFT; @@ -4610,11 +4608,21 @@ int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, return 1; } + /* + * Handled untranslated compatibilty format interrupt with + * extended destination ID in the low bits 11-5. */ + dst.address = apic_convert_ext_dest_id(dst.address); + route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT; route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK; route->u.msi.data = dst.data; + return 0; + } } + address = apic_convert_ext_dest_id(address); + route->u.msi.address_hi = address >> VTD_MSI_ADDR_HI_SHIFT; + route->u.msi.address_lo = address & VTD_MSI_ADDR_LO_MASK; return 0; }
smime.p7s
Description: S/MIME cryptographic signature