From: Yu Zhang <[email protected]> Sent: Monday, May 11, 2026 9:24 AM
> 
> Add a para-virtualized IOMMU driver for Linux guests running on Hyper-V.
> This driver implements stage-1 IO translation within the guest OS.
> It integrates with the Linux IOMMU core, utilizing Hyper-V hypercalls
> for:
>  - Capability discovery
>  - Domain allocation, configuration, and deallocation
>  - Device attachment and detachment
>  - IOTLB invalidation
> 
> The driver constructs x86-compatible stage-1 IO page tables in the
> guest memory using consolidated IO page table helpers. This allows
> the guest to manage stage-1 translations independently of vendor-
> specific drivers (like Intel VT-d or AMD IOMMU).
> 
> Hyper-V consumes this stage-1 IO page table when a device domain is
> created and configured, and nests it with the host's stage-2 IO page
> tables, therefore eliminating the VM exits for guest IOMMU mapping
> operations. For unmapping operations, VM exits to perform the IOTLB
> flush are still unavoidable.
> 
> Hyper-V identifies each PCI pass-thru device by a logical device ID
> in its hypercall interface. The vPCI driver (pci-hyperv) registers the
> per-bus portion of this ID with the pvIOMMU driver during bus probe.
> The pvIOMMU driver stores this mapping and combines it with the function
> number of the endpoint PCI device to form the complete ID for hypercalls.

As you are probably aware, Mukesh's patch series to support PCI
pass-thru devices also needs to get the logical device ID. Maybe the
registration mechanism needs to move somewhere that can be shared
with his code.

> 
> Co-developed-by: Wei Liu <[email protected]>
> Signed-off-by: Wei Liu <[email protected]>
> Co-developed-by: Easwar Hariharan <[email protected]>
> Signed-off-by: Easwar Hariharan <[email protected]>
> Signed-off-by: Yu Zhang <[email protected]>
> ---
>  arch/x86/hyperv/hv_init.c           |   4 +
>  arch/x86/include/asm/mshyperv.h     |   4 +
>  drivers/iommu/hyperv/Kconfig        |  17 +
>  drivers/iommu/hyperv/Makefile       |   1 +
>  drivers/iommu/hyperv/iommu.c        | 705 ++++++++++++++++++++++++++++
>  drivers/iommu/hyperv/iommu.h        |  54 +++
>  drivers/pci/controller/pci-hyperv.c |  19 +-
>  include/asm-generic/mshyperv.h      |  12 +
>  8 files changed, 815 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/iommu/hyperv/iommu.c
>  create mode 100644 drivers/iommu/hyperv/iommu.h
> 
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index 323adc93f2dc..2c8ff8e06249 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -578,6 +578,10 @@ void __init hyperv_init(void)
>       old_setup_percpu_clockev = x86_init.timers.setup_percpu_clockev;
>       x86_init.timers.setup_percpu_clockev = hv_stimer_setup_percpu_clockev;
> 
> +#ifdef CONFIG_HYPERV_PVIOMMU
> +     x86_init.iommu.iommu_init = hv_iommu_init;
> +#endif
> +
>       hv_apic_init();
> 
>       x86_init.pci.arch_init = hv_pci_init;
> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
> index f64393e853ee..20d947c2c758 100644
> --- a/arch/x86/include/asm/mshyperv.h
> +++ b/arch/x86/include/asm/mshyperv.h
> @@ -313,6 +313,10 @@ static inline void mshv_vtl_return_hypercall(void) {}
>  static inline void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) 
> {}
>  #endif
> 
> +#ifdef CONFIG_HYPERV_PVIOMMU
> +int __init hv_iommu_init(void);
> +#endif
> +
>  #include <asm-generic/mshyperv.h>
> 
>  #endif
> diff --git a/drivers/iommu/hyperv/Kconfig b/drivers/iommu/hyperv/Kconfig
> index 30f40d867036..9e658d5c9a77 100644
> --- a/drivers/iommu/hyperv/Kconfig
> +++ b/drivers/iommu/hyperv/Kconfig
> @@ -8,3 +8,20 @@ config HYPERV_IOMMU
>       help
>         Stub IOMMU driver to handle IRQs to support Hyper-V Linux
>         guest and root partitions.
> +
> +if HYPERV_IOMMU
> +config HYPERV_PVIOMMU
> +     bool "Microsoft Hypervisor para-virtualized IOMMU support"
> +     depends on X86 && HYPERV

What is the intent w.r.t. 32-bit builds? Using X86 instead of X86_64
allows it. I did a 32-bit build and didn't get any build failures, which is
good. But I can't run it to see if the pvIOMMU actually works in a
32-bit build. I don't know how building X86_64 generic PT entries
would fare.

> +     select IOMMU_API
> +     select GENERIC_PT
> +     select IOMMU_PT
> +     select IOMMU_PT_X86_64
> +     select IOMMU_IOVA
> +     default HYPERV
> +     help
> +       Para-virtualized IOMMU driver for Linux guests running on
> +       Microsoft Hyper-V. Provides DMA remapping and IOTLB
> +       flush support to enable DMA isolation for devices
> +       assigned to the guest.
> +endif
> diff --git a/drivers/iommu/hyperv/Makefile b/drivers/iommu/hyperv/Makefile
> index 9f557bad94ff..8669741c0a51 100644
> --- a/drivers/iommu/hyperv/Makefile
> +++ b/drivers/iommu/hyperv/Makefile
> @@ -1,2 +1,3 @@
>  # SPDX-License-Identifier: GPL-2.0
>  obj-$(CONFIG_HYPERV_IOMMU) += irq_remapping.o
> +obj-$(CONFIG_HYPERV_PVIOMMU) += iommu.o
> diff --git a/drivers/iommu/hyperv/iommu.c b/drivers/iommu/hyperv/iommu.c
> new file mode 100644
> index 000000000000..e5fc625314b5
> --- /dev/null
> +++ b/drivers/iommu/hyperv/iommu.c
> @@ -0,0 +1,705 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Hyper-V IOMMU driver.
> + *
> + * Copyright (C) 2019, 2024-2026 Microsoft, Inc.
> + */
> +
> +#define pr_fmt(fmt) "Hyper-V pvIOMMU: " fmt
> +#define dev_fmt(fmt) pr_fmt(fmt)
> +
> +#include <linux/iommu.h>
> +#include <linux/pci.h>
> +#include <linux/dma-map-ops.h>
> +#include <linux/generic_pt/iommu.h>
> +#include <linux/pci-ats.h>
> +
> +#include <asm/iommu.h>
> +#include <asm/hypervisor.h>
> +#include <asm/mshyperv.h>
> +
> +#include "iommu.h"
> +#include "../iommu-pages.h"
> +
> +struct hv_iommu_dev *hv_iommu_device;
> +
> +/*
> + * Identity and blocking domains are static singletons: identity is a 1:1
> + * passthrough with no page table, blocking rejects all DMA. Neither holds
> + * per-IOMMU state, so one instance suffices even with multiple vIOMMUs.
> + */
> +static struct hv_iommu_domain hv_identity_domain;
> +static struct hv_iommu_domain hv_blocking_domain;
> +static const struct iommu_domain_ops hv_iommu_identity_domain_ops;
> +static const struct iommu_domain_ops hv_iommu_blocking_domain_ops;
> +static struct iommu_ops hv_iommu_ops;
> +static LIST_HEAD(hv_iommu_pci_bus_list);
> +static DEFINE_SPINLOCK(hv_iommu_pci_bus_lock);
> +
> +#define hv_iommu_present(iommu_cap) (iommu_cap & HV_IOMMU_CAP_PRESENT)
> +#define hv_iommu_s1_domain_supported(iommu_cap) (iommu_cap & HV_IOMMU_CAP_S1)
> +#define hv_iommu_5lvl_supported(iommu_cap) (iommu_cap & HV_IOMMU_CAP_S1_5LVL)
> +#define hv_iommu_ats_supported(iommu_cap) (iommu_cap & HV_IOMMU_CAP_ATS)
> +
> +int hv_iommu_register_pci_bus(int pci_domain_nr, u32 logical_dev_id_prefix)
> +{
> +     struct hv_pci_busdata *bus, *new;
> +     int ret = 0;
> +
> +     if (no_iommu || !iommu_detected)
> +             return 0;
> +
> +     new = kzalloc_obj(*new, GFP_KERNEL);
> +     if (!new)
> +             return -ENOMEM;
> +
> +     spin_lock(&hv_iommu_pci_bus_lock);
> +     list_for_each_entry(bus, &hv_iommu_pci_bus_list, list) {
> +             if (bus->pci_domain_nr != pci_domain_nr)
> +                     continue;
> +
> +             if (bus->logical_dev_id_prefix != logical_dev_id_prefix) {
> +                     pr_err("stale registration for PCI domain %d (old 
> prefix 0x%08x, new 0x%08x)\n",
> +                            pci_domain_nr, bus->logical_dev_id_prefix,
> +                            logical_dev_id_prefix);
> +                     ret = -EEXIST;
> +             }
> +
> +             goto out_free;
> +     }
> +
> +     new->pci_domain_nr = pci_domain_nr;
> +     new->logical_dev_id_prefix = logical_dev_id_prefix;
> +     list_add(&new->list, &hv_iommu_pci_bus_list);
> +     spin_unlock(&hv_iommu_pci_bus_lock);
> +     return 0;
> +
> +out_free:
> +     spin_unlock(&hv_iommu_pci_bus_lock);
> +     kfree(new);
> +     return ret;
> +}
> +EXPORT_SYMBOL_FOR_MODULES(hv_iommu_register_pci_bus, "pci-hyperv");
> +
> +void hv_iommu_unregister_pci_bus(int pci_domain_nr)
> +{
> +     struct hv_pci_busdata *bus, *tmp;
> +
> +     spin_lock(&hv_iommu_pci_bus_lock);
> +     list_for_each_entry_safe(bus, tmp, &hv_iommu_pci_bus_list, list) {
> +             if (bus->pci_domain_nr == pci_domain_nr) {
> +                     list_del(&bus->list);
> +                     kfree(bus);
> +                     break;
> +             }
> +     }
> +     spin_unlock(&hv_iommu_pci_bus_lock);
> +}
> +EXPORT_SYMBOL_FOR_MODULES(hv_iommu_unregister_pci_bus, "pci-hyperv");
> +
> +/*
> + * Look up the logical device ID for a vPCI device. Returns 0 on success
> + * with *logical_id filled in; -ENODEV if no entry registered for this
> + * device's vPCI bus.
> + */
> +static int hv_iommu_lookup_logical_dev_id(struct pci_dev *pdev, u64 
> *logical_id)
> +{
> +     struct hv_pci_busdata *bus;
> +     int domain = pci_domain_nr(pdev->bus);
> +     int ret = -ENODEV;
> +
> +     spin_lock(&hv_iommu_pci_bus_lock);
> +     list_for_each_entry(bus, &hv_iommu_pci_bus_list, list) {
> +             if (bus->pci_domain_nr == domain) {
> +                     *logical_id = (u64)bus->logical_dev_id_prefix |
> +                                   PCI_FUNC(pdev->devfn);
> +                     ret = 0;
> +                     break;
> +             }
> +     }
> +     spin_unlock(&hv_iommu_pci_bus_lock);
> +     return ret;
> +}
> +
> +static int hv_create_device_domain(struct hv_iommu_domain *hv_domain, u32 
> domain_stage)
> +{
> +     int ret;
> +     u64 status;
> +     unsigned long flags;
> +     struct hv_input_create_device_domain *input;
> +
> +     ret = ida_alloc_range(&hv_iommu_device->domain_ids,
> +                     hv_iommu_device->first_domain, 
> hv_iommu_device->last_domain,
> +                     GFP_KERNEL);
> +     if (ret < 0)
> +             return ret;
> +
> +     hv_domain->device_domain.partition_id = HV_PARTITION_ID_SELF;
> +     hv_domain->device_domain.domain_id.type = domain_stage;
> +     hv_domain->device_domain.domain_id.id = ret;
> +     hv_domain->hv_iommu = hv_iommu_device;
> +
> +     local_irq_save(flags);
> +
> +     input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> +     memset(input, 0, sizeof(*input));
> +     input->device_domain = hv_domain->device_domain;
> +     input->create_device_domain_flags.forward_progress_required = 1;
> +     input->create_device_domain_flags.inherit_owning_vtl = 0;
> +     status = hv_do_hypercall(HVCALL_CREATE_DEVICE_DOMAIN, input, NULL);
> +
> +     local_irq_restore(flags);
> +
> +     if (!hv_result_success(status)) {
> +             pr_err("HVCALL_CREATE_DEVICE_DOMAIN failed, status %lld\n", 
> status);
> +             ida_free(&hv_iommu_device->domain_ids, 
> hv_domain->device_domain.domain_id.id);
> +     }
> +
> +     return hv_result_to_errno(status);
> +}
> +
> +static void hv_delete_device_domain(struct hv_iommu_domain *hv_domain)
> +{
> +     u64 status;
> +     unsigned long flags;
> +     struct hv_input_delete_device_domain *input;
> +
> +     local_irq_save(flags);
> +
> +     input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> +     memset(input, 0, sizeof(*input));
> +     input->device_domain = hv_domain->device_domain;
> +     status = hv_do_hypercall(HVCALL_DELETE_DEVICE_DOMAIN, input, NULL);
> +
> +     local_irq_restore(flags);
> +
> +     if (!hv_result_success(status))
> +             pr_err("HVCALL_DELETE_DEVICE_DOMAIN failed, status %lld\n", 
> status);
> +
> +     ida_free(&hv_domain->hv_iommu->domain_ids, 
> hv_domain->device_domain.domain_id.id);
> +}
> +
> +static bool hv_iommu_capable(struct device *dev, enum iommu_cap cap)
> +{
> +     switch (cap) {
> +     case IOMMU_CAP_CACHE_COHERENCY:
> +             return true;
> +     case IOMMU_CAP_DEFERRED_FLUSH:
> +             return true;
> +     default:
> +             return false;
> +     }
> +}
> +
> +static void hv_flush_device_domain(struct hv_iommu_domain *hv_domain)
> +{
> +     u64 status;
> +     unsigned long flags;
> +     struct hv_input_flush_device_domain *input;
> +
> +     local_irq_save(flags);
> +
> +     input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> +     memset(input, 0, sizeof(*input));
> +     input->device_domain = hv_domain->device_domain;

The previous version of this patch had code to set several other fields in
the input. I wanted to confirm that not setting them in this version is
intentional. Were they not needed?

> +     status = hv_do_hypercall(HVCALL_FLUSH_DEVICE_DOMAIN, input, NULL);
> +
> +     local_irq_restore(flags);
> +
> +     if (!hv_result_success(status))
> +             pr_err("HVCALL_FLUSH_DEVICE_DOMAIN failed, status %lld\n", 
> status);
> +}
> +
> +static void hv_iommu_detach_dev(struct iommu_domain *domain, struct device 
> *dev)
> +{
> +     u64 status;
> +     unsigned long flags;
> +     struct hv_input_detach_device_domain *input;
> +     struct pci_dev *pdev;
> +     struct hv_iommu_domain *hv_domain = to_hv_iommu_domain(domain);
> +     struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev);
> +
> +     /* See the attach function, only PCI devices for now */
> +     if (!dev_is_pci(dev) || vdev->hv_domain != hv_domain)
> +             return;

Are these sanity checks necessary? The only caller is hv_iommu_attach_dev()
and it has already done the checks.

> +
> +     pdev = to_pci_dev(dev);
> +
> +     dev_dbg(dev, "detaching from domain %d\n", 
> hv_domain->device_domain.domain_id.id);
> +
> +     local_irq_save(flags);
> +
> +     input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> +     memset(input, 0, sizeof(*input));
> +     input->partition_id = HV_PARTITION_ID_SELF;
> +     if (hv_iommu_lookup_logical_dev_id(pdev, &input->device_id.as_uint64)) {

As Sashiko and Jacob Pan pointed out, doing the lookup while interrupts
are disabled is problematic. My suggestion would be to just do the lookup
into a local variable before disabling interrupts (rather than using a
raw spin lock as Jacob suggested).

The same situation occurs in hv_iommu_attach_dev() and
hv_iommu_get_logical_device_property().

> +             local_irq_restore(flags);
> +             dev_warn(&pdev->dev, "no IOMMU registration for vPCI bus on 
> detach\n");
> +             return;
> +     }
> +     status = hv_do_hypercall(HVCALL_DETACH_DEVICE_DOMAIN, input, NULL);
> +
> +     local_irq_restore(flags);
> +
> +     if (!hv_result_success(status))
> +             pr_err("HVCALL_DETACH_DEVICE_DOMAIN failed, status %lld\n", 
> status);
> +
> +     hv_flush_device_domain(hv_domain);
> +
> +     vdev->hv_domain = NULL;
> +}
> +
> +static int hv_iommu_attach_dev(struct iommu_domain *domain, struct device 
> *dev,
> +                            struct iommu_domain *old)
> +{
> +     u64 status;
> +     unsigned long flags;
> +     struct pci_dev *pdev;
> +     struct hv_input_attach_device_domain *input;
> +     struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev);
> +     struct hv_iommu_domain *hv_domain = to_hv_iommu_domain(domain);
> +     int ret;
> +
> +     /* Only allow PCI devices for now */
> +     if (!dev_is_pci(dev))
> +             return -EINVAL;
> +
> +     if (vdev->hv_domain == hv_domain)
> +             return 0;
> +
> +     if (vdev->hv_domain)
> +             hv_iommu_detach_dev(&vdev->hv_domain->domain, dev);
> +
> +     pdev = to_pci_dev(dev);
> +     dev_dbg(dev, "attaching to domain %d\n",
> +             hv_domain->device_domain.domain_id.id);
> +
> +     local_irq_save(flags);
> +
> +     input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> +     memset(input, 0, sizeof(*input));
> +     input->device_domain = hv_domain->device_domain;
> +     ret = hv_iommu_lookup_logical_dev_id(pdev, &input->device_id.as_uint64);
> +     if (ret) {
> +             local_irq_restore(flags);
> +             dev_err(&pdev->dev, "no IOMMU registration for vPCI bus\n");
> +             return ret;
> +     }
> +     status = hv_do_hypercall(HVCALL_ATTACH_DEVICE_DOMAIN, input, NULL);
> +
> +     local_irq_restore(flags);
> +
> +     if (!hv_result_success(status))
> +             pr_err("HVCALL_ATTACH_DEVICE_DOMAIN failed, status %lld\n", 
> status);
> +     else
> +             vdev->hv_domain = hv_domain;
> +
> +     return hv_result_to_errno(status);
> +}
> +
> +static int hv_iommu_get_logical_device_property(struct device *dev,
> +                                     u32 code,
> +                                     struct 
> hv_output_get_logical_device_property *property)
> +{
> +     u64 status, lid;
> +     unsigned long flags;
> +     int ret;
> +     struct hv_input_get_logical_device_property *input;
> +     struct hv_output_get_logical_device_property *output;
> +
> +     local_irq_save(flags);
> +
> +     input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> +     output = *this_cpu_ptr(hyperv_pcpu_input_arg) + sizeof(*input);

Nit: The other way to set output is:

        output = (void *)(input + 1);

(The cast is needed because input and output point to different struct
types.) I think this produces slightly better code because of not needing
to reference the per-cpu variable hyperv_pcpu_input_arg a 2nd time.


> +     memset(input, 0, sizeof(*input));
> +     input->partition_id = HV_PARTITION_ID_SELF;
> +     ret = hv_iommu_lookup_logical_dev_id(to_pci_dev(dev), &lid);
> +     if (ret) {
> +             local_irq_restore(flags);
> +             return ret;
> +     }
> +     input->logical_device_id = lid;
> +     input->code = code;
> +     status = hv_do_hypercall(HVCALL_GET_LOGICAL_DEVICE_PROPERTY, input, 
> output);
> +     *property = *output;
> +
> +     local_irq_restore(flags);
> +
> +     if (!hv_result_success(status))
> +             pr_err("HVCALL_GET_LOGICAL_DEVICE_PROPERTY failed, status 
> %lld\n", status);
> +
> +     return hv_result_to_errno(status);
> +}
> +
> +static struct iommu_device *hv_iommu_probe_device(struct device *dev)
> +{
> +     struct pci_dev *pdev;
> +     struct hv_iommu_endpoint *vdev;
> +     struct hv_output_get_logical_device_property device_iommu_property = 
> {0};
> +
> +     if (!dev_is_pci(dev))
> +             return ERR_PTR(-ENODEV);
> +
> +     pdev = to_pci_dev(dev);
> +
> +     if (hv_iommu_get_logical_device_property(dev,
> +                                              
> HV_LOGICAL_DEVICE_PROPERTY_PVIOMMU,
> +                                              &device_iommu_property) ||
> +         !(device_iommu_property.device_iommu & HV_DEVICE_IOMMU_ENABLED))
> +             return ERR_PTR(-ENODEV);
> +
> +     vdev = kzalloc_obj(*vdev, GFP_KERNEL);
> +     if (!vdev)
> +             return ERR_PTR(-ENOMEM);
> +
> +     vdev->dev = dev;
> +     vdev->hv_iommu = hv_iommu_device;
> +     dev_iommu_priv_set(dev, vdev);
> +
> +     if (hv_iommu_ats_supported(hv_iommu_device->cap) &&
> +         pci_ats_supported(pdev))
> +             pci_enable_ats(pdev, __ffs(hv_iommu_device->pgsize_bitmap));
> +
> +     return &vdev->hv_iommu->iommu;
> +}
> +
> +static void hv_iommu_release_device(struct device *dev)
> +{
> +     struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev);
> +     struct pci_dev *pdev = to_pci_dev(dev);
> +
> +     if (pdev->ats_enabled)
> +             pci_disable_ats(pdev);
> +
> +     dev_iommu_priv_set(dev, NULL);
> +     set_dma_ops(dev, NULL);

Previous versions of this function did hv_iommu_detach_dev(). With that call
removed from here, hv_iommu_detach_dev() is only called when attaching a
domain to a device that already has a domain attached. Is it the case that
Hyper-V doesn't require the detach as a cleanup step?

> +
> +     kfree(vdev);
> +}
> +
> +static struct iommu_group *hv_iommu_device_group(struct device *dev)
> +{
> +     if (dev_is_pci(dev))
> +             return pci_device_group(dev);
> +     else
> +             return generic_device_group(dev);
> +}
> +
> +static int hv_configure_device_domain(struct hv_iommu_domain *hv_domain, u32 
> domain_type)
> +{
> +     u64 status;
> +     unsigned long flags;
> +     struct pt_iommu_x86_64_hw_info pt_info;
> +     struct hv_input_configure_device_domain *input;
> +
> +     local_irq_save(flags);
> +
> +     input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> +     memset(input, 0, sizeof(*input));
> +     input->device_domain = hv_domain->device_domain;
> +     input->settings.flags.blocked = (domain_type == IOMMU_DOMAIN_BLOCKED);
> +     input->settings.flags.translation_enabled = (domain_type != 
> IOMMU_DOMAIN_IDENTITY);
> +
> +     if (domain_type & __IOMMU_DOMAIN_PAGING) {
> +             pt_iommu_x86_64_hw_info(&hv_domain->pt_iommu_x86_64, &pt_info);
> +             input->settings.page_table_root = pt_info.gcr3_pt;
> +             input->settings.flags.first_stage_paging_mode =
> +                     pt_info.levels == 5;
> +     }
> +     status = hv_do_hypercall(HVCALL_CONFIGURE_DEVICE_DOMAIN, input, NULL);
> +
> +     local_irq_restore(flags);
> +
> +     if (!hv_result_success(status))
> +             pr_err("HVCALL_CONFIGURE_DEVICE_DOMAIN failed, status %lld\n", 
> status);
> +
> +     return hv_result_to_errno(status);
> +}
> +
> +static int __init hv_initialize_static_domains(void)
> +{
> +     int ret;
> +     struct hv_iommu_domain *hv_domain;
> +
> +     /* Default stage-1 identity domain */
> +     hv_domain = &hv_identity_domain;
> +
> +     ret = hv_create_device_domain(hv_domain, HV_DEVICE_DOMAIN_TYPE_S1);
> +     if (ret)
> +             return ret;
> +
> +     ret = hv_configure_device_domain(hv_domain, IOMMU_DOMAIN_IDENTITY);
> +     if (ret)
> +             goto delete_identity_domain;
> +
> +     hv_domain->domain.type = IOMMU_DOMAIN_IDENTITY;
> +     hv_domain->domain.ops = &hv_iommu_identity_domain_ops;
> +     hv_domain->domain.owner = &hv_iommu_ops;
> +     hv_domain->domain.geometry = hv_iommu_device->geometry;
> +     hv_domain->domain.pgsize_bitmap = hv_iommu_device->pgsize_bitmap;
> +
> +     /* Default stage-1 blocked domain */
> +     hv_domain = &hv_blocking_domain;
> +
> +     ret = hv_create_device_domain(hv_domain, HV_DEVICE_DOMAIN_TYPE_S1);
> +     if (ret)
> +             goto delete_identity_domain;
> +
> +     ret = hv_configure_device_domain(hv_domain, IOMMU_DOMAIN_BLOCKED);
> +     if (ret)
> +             goto delete_blocked_domain;
> +
> +     hv_domain->domain.type = IOMMU_DOMAIN_BLOCKED;
> +     hv_domain->domain.ops = &hv_iommu_blocking_domain_ops;
> +     hv_domain->domain.owner = &hv_iommu_ops;
> +     hv_domain->domain.geometry = hv_iommu_device->geometry;
> +     hv_domain->domain.pgsize_bitmap = hv_iommu_device->pgsize_bitmap;
> +
> +     return 0;
> +
> +delete_blocked_domain:
> +     hv_delete_device_domain(&hv_blocking_domain);
> +delete_identity_domain:
> +     hv_delete_device_domain(&hv_identity_domain);
> +     return ret;
> +}
> +
> +#define INTERRUPT_RANGE_START        (0xfee00000)
> +#define INTERRUPT_RANGE_END  (0xfeefffff)
> +static void hv_iommu_get_resv_regions(struct device *dev,
> +             struct list_head *head)
> +{
> +     struct iommu_resv_region *region;
> +
> +     region = iommu_alloc_resv_region(INTERRUPT_RANGE_START,
> +                                   INTERRUPT_RANGE_END - 
> INTERRUPT_RANGE_START + 1,
> +                                   0, IOMMU_RESV_MSI, GFP_KERNEL);
> +     if (!region)
> +             return;
> +
> +     list_add_tail(&region->list, head);
> +}
> +
> +static void hv_iommu_flush_iotlb_all(struct iommu_domain *domain)
> +{
> +     hv_flush_device_domain(to_hv_iommu_domain(domain));
> +}
> +
> +static void hv_iommu_iotlb_sync(struct iommu_domain *domain,
> +                             struct iommu_iotlb_gather *iotlb_gather)
> +{
> +     hv_flush_device_domain(to_hv_iommu_domain(domain));
> +
> +     iommu_put_pages_list(&iotlb_gather->freelist);
> +}
> +
> +static void hv_iommu_paging_domain_free(struct iommu_domain *domain)
> +{
> +     struct hv_iommu_domain *hv_domain = to_hv_iommu_domain(domain);
> +
> +     /* Free all remaining mappings */
> +     pt_iommu_deinit(&hv_domain->pt_iommu);
> +
> +     hv_delete_device_domain(hv_domain);
> +
> +     kfree(hv_domain);
> +}
> +
> +static const struct iommu_domain_ops hv_iommu_identity_domain_ops = {
> +     .attach_dev     = hv_iommu_attach_dev,
> +};
> +
> +static const struct iommu_domain_ops hv_iommu_blocking_domain_ops = {
> +     .attach_dev     = hv_iommu_attach_dev,
> +};
> +
> +static const struct iommu_domain_ops hv_iommu_paging_domain_ops = {
> +     .attach_dev     = hv_iommu_attach_dev,
> +     IOMMU_PT_DOMAIN_OPS(x86_64),
> +     .flush_iotlb_all = hv_iommu_flush_iotlb_all,
> +     .iotlb_sync = hv_iommu_iotlb_sync,
> +     .free = hv_iommu_paging_domain_free,
> +};
> +
> +static struct iommu_domain *hv_iommu_domain_alloc_paging(struct device *dev)
> +{
> +     int ret;
> +     struct hv_iommu_domain *hv_domain;
> +     struct pt_iommu_x86_64_cfg cfg = {};
> +
> +     hv_domain = kzalloc_obj(*hv_domain, GFP_KERNEL);
> +     if (!hv_domain)
> +             return ERR_PTR(-ENOMEM);
> +
> +     ret = hv_create_device_domain(hv_domain, HV_DEVICE_DOMAIN_TYPE_S1);
> +     if (ret) {
> +             kfree(hv_domain);
> +             return ERR_PTR(ret);
> +     }
> +
> +     hv_domain->domain.geometry = hv_iommu_device->geometry;
> +     hv_domain->pt_iommu.nid = dev_to_node(dev);
> +
> +     cfg.common.hw_max_vasz_lg2 = hv_iommu_device->max_iova_width;
> +     cfg.common.hw_max_oasz_lg2 = 52;
> +     cfg.top_level = (hv_iommu_device->max_iova_width > 48) ? 4 : 3;
> +
> +     ret = pt_iommu_x86_64_init(&hv_domain->pt_iommu_x86_64, &cfg, 
> GFP_KERNEL);
> +     if (ret) {
> +             hv_delete_device_domain(hv_domain);
> +             kfree(hv_domain);
> +             return ERR_PTR(ret);
> +     }
> +
> +     /* Constrain to page sizes the hypervisor supports */
> +     hv_domain->domain.pgsize_bitmap &= hv_iommu_device->pgsize_bitmap;
> +
> +     hv_domain->domain.ops = &hv_iommu_paging_domain_ops;
> +
> +     ret = hv_configure_device_domain(hv_domain, __IOMMU_DOMAIN_PAGING);
> +     if (ret) {
> +             pt_iommu_deinit(&hv_domain->pt_iommu);
> +             hv_delete_device_domain(hv_domain);
> +             kfree(hv_domain);
> +             return ERR_PTR(ret);
> +     }
> +
> +     return &hv_domain->domain;

I think this function would be better if the error paths used "goto" with a
cascading set of error labels. That's the typical pattern, and it's what you
use in hv_iommu_init(), for example.

> +}
> +
> +static struct iommu_ops hv_iommu_ops = {
> +     .capable                  = hv_iommu_capable,
> +     .domain_alloc_paging      = hv_iommu_domain_alloc_paging,
> +     .probe_device             = hv_iommu_probe_device,
> +     .release_device           = hv_iommu_release_device,
> +     .device_group             = hv_iommu_device_group,
> +     .get_resv_regions         = hv_iommu_get_resv_regions,
> +     .owner                    = THIS_MODULE,
> +     .identity_domain          = &hv_identity_domain.domain,
> +     .blocked_domain           = &hv_blocking_domain.domain,
> +     .release_domain           = &hv_blocking_domain.domain,
> +};
> +
> +static int hv_iommu_detect(struct hv_output_get_iommu_capabilities *hv_iommu_cap)
> +{
> +     u64 status;
> +     unsigned long flags;
> +     struct hv_input_get_iommu_capabilities *input;
> +     struct hv_output_get_iommu_capabilities *output;
> +
> +     local_irq_save(flags);
> +
> +     input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> +     output = *this_cpu_ptr(hyperv_pcpu_input_arg) + sizeof(*input);

Potentially use "output = input + 1" here as well.

> +     memset(input, 0, sizeof(*input));
> +     input->partition_id = HV_PARTITION_ID_SELF;
> +     status = hv_do_hypercall(HVCALL_GET_IOMMU_CAPABILITIES, input, output);
> +     *hv_iommu_cap = *output;
> +
> +     local_irq_restore(flags);
> +
> +     if (!hv_result_success(status))
> +             pr_err("HVCALL_GET_IOMMU_CAPABILITIES failed, status %lld\n", status);
> +
> +     return hv_result_to_errno(status);
> +}
> +
> +static void __init hv_init_iommu_device(struct hv_iommu_dev *hv_iommu,
> +                     struct hv_output_get_iommu_capabilities *hv_iommu_cap)
> +{
> +     ida_init(&hv_iommu->domain_ids);
> +
> +     hv_iommu->cap = hv_iommu_cap->iommu_cap;
> +     hv_iommu->max_iova_width = hv_iommu_cap->max_iova_width;
> +     if (!hv_iommu_5lvl_supported(hv_iommu->cap) &&
> +         hv_iommu->max_iova_width > 48) {
> +             pr_info("5-level paging not supported, limiting iova width to 48.\n");
> +             hv_iommu->max_iova_width = 48;
> +     }
> +
> +     hv_iommu->geometry = (struct iommu_domain_geometry) {
> +             .aperture_start = 0,
> +             .aperture_end = (((u64)1) << hv_iommu->max_iova_width) - 1,
> +             .force_aperture = true,
> +     };
> +
> +     hv_iommu->first_domain = HV_DEVICE_DOMAIN_ID_DEFAULT + 1;
> +     hv_iommu->last_domain = HV_DEVICE_DOMAIN_ID_NULL - 1;
> +     /* Only x86 page sizes (4K/2M/1G) are supported */
> +     hv_iommu->pgsize_bitmap = hv_iommu_cap->pgsize_bitmap &
> +                               (SZ_4K | SZ_2M | SZ_1G);
> +     if (hv_iommu->pgsize_bitmap != hv_iommu_cap->pgsize_bitmap)
> +             pr_warn("unsupported page sizes masked: 0x%llx -> 0x%llx\n",
> +                     hv_iommu_cap->pgsize_bitmap, hv_iommu->pgsize_bitmap);
> +     if (!hv_iommu->pgsize_bitmap) {
> +             pr_warn("no supported page sizes, defaulting to 4K\n");
> +             hv_iommu->pgsize_bitmap = SZ_4K;
> +     }
> +     hv_iommu_device = hv_iommu;
> +}
> +
> +int __init hv_iommu_init(void)
> +{
> +     int ret = 0;
> +     struct hv_iommu_dev *hv_iommu = NULL;
> +     struct hv_output_get_iommu_capabilities hv_iommu_cap = {0};
> +
> +     if (no_iommu || iommu_detected)
> +             return -ENODEV;
> +
> +     if (!hv_is_hyperv_initialized())
> +             return -ENODEV;
> +
> +     ret = hv_iommu_detect(&hv_iommu_cap);
> +     if (ret) {
> +             pr_err("HVCALL_GET_IOMMU_CAPABILITIES failed: %d\n", ret);
> +             return -ENODEV;
> +     }
> +
> +     if (!hv_iommu_present(hv_iommu_cap.iommu_cap) ||
> +         !hv_iommu_s1_domain_supported(hv_iommu_cap.iommu_cap)) {
> +             pr_err("IOMMU capabilities not sufficient: cap=0x%llx\n",
> +                    hv_iommu_cap.iommu_cap);
> +             return -ENODEV;
> +     }
> +
> +     iommu_detected = 1;
> +     pci_request_acs();
> +
> +     hv_iommu = kzalloc_obj(*hv_iommu, GFP_KERNEL);
> +     if (!hv_iommu)
> +             return -ENOMEM;
> +
> +     hv_init_iommu_device(hv_iommu, &hv_iommu_cap);
> +
> +     ret = hv_initialize_static_domains();
> +     if (ret) {
> +             pr_err("static domains init failed: %d\n", ret);
> +             goto err_free;
> +     }
> +
> +     ret = iommu_device_sysfs_add(&hv_iommu->iommu, NULL, NULL, "%s", "hv-iommu");
> +     if (ret) {
> +             pr_err("iommu_device_sysfs_add failed: %d\n", ret);
> +             goto err_delete_static_domains;
> +     }
> +
> +     ret = iommu_device_register(&hv_iommu->iommu, &hv_iommu_ops, NULL);
> +     if (ret) {
> +             pr_err("iommu_device_register failed: %d\n", ret);
> +             goto err_sysfs_remove;
> +     }
> +
> +     pr_info("successfully initialized\n");
> +     return 0;
> +
> +err_sysfs_remove:
> +     iommu_device_sysfs_remove(&hv_iommu->iommu);
> +err_delete_static_domains:
> +     hv_delete_device_domain(&hv_blocking_domain);
> +     hv_delete_device_domain(&hv_identity_domain);
> +err_free:
> +     kfree(hv_iommu);
> +     return ret;
> +}
> diff --git a/drivers/iommu/hyperv/iommu.h b/drivers/iommu/hyperv/iommu.h
> new file mode 100644
> index 000000000000..43f20d371245
> --- /dev/null
> +++ b/drivers/iommu/hyperv/iommu.h
> @@ -0,0 +1,54 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * Hyper-V IOMMU driver.
> + *
> + * Copyright (C) 2024-2025, Microsoft, Inc.
> + *
> + */
> +
> +#ifndef _HYPERV_IOMMU_H
> +#define _HYPERV_IOMMU_H
> +
> +struct hv_iommu_dev {
> +     struct iommu_device iommu;
> +     struct ida domain_ids;
> +
> +     /* Device configuration */
> +     u8  max_iova_width;
> +     u8  max_pasid_width;
> +     u64 cap;
> +     u64 pgsize_bitmap;
> +
> +     struct iommu_domain_geometry geometry;
> +     u64 first_domain;
> +     u64 last_domain;
> +};
> +
> +struct hv_iommu_domain {
> +     union {
> +             struct iommu_domain    domain;
> +             struct pt_iommu        pt_iommu;
> +             struct pt_iommu_x86_64 pt_iommu_x86_64;
> +     };
> +     struct hv_iommu_dev *hv_iommu;
> +     struct hv_input_device_domain device_domain;
> +     u64             pgsize_bitmap;
> +};
> +
> +struct hv_pci_busdata {
> +     int               pci_domain_nr;
> +     u32               logical_dev_id_prefix;
> +     struct list_head  list;
> +};
> +
> +struct hv_iommu_endpoint {
> +     struct device *dev;
> +     struct hv_iommu_dev *hv_iommu;
> +     struct hv_iommu_domain *hv_domain;
> +};
> +
> +#define to_hv_iommu_domain(d) \
> +     container_of(d, struct hv_iommu_domain, domain)
> +
> +#endif /* _HYPERV_IOMMU_H */
> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> index cfc8fa403dad..a4af9c8c2220 100644
> --- a/drivers/pci/controller/pci-hyperv.c
> +++ b/drivers/pci/controller/pci-hyperv.c
> @@ -3715,6 +3715,7 @@ static int hv_pci_probe(struct hv_device *hdev,
>       struct hv_pcibus_device *hbus;
>       int ret, dom;
>       u16 dom_req;
> +     u32 prefix;
>       char *name;
> 
>       bridge = devm_pci_alloc_host_bridge(&hdev->device, 0);
> @@ -3857,13 +3858,25 @@ static int hv_pci_probe(struct hv_device *hdev,
> 
>       hbus->state = hv_pcibus_probed;
> 
> -     ret = create_root_hv_pci_bus(hbus);
> +     /* Notify pvIOMMU before any device on the bus is scanned. */
> +     prefix = (hdev->dev_instance.b[5] << 24) |
> +              (hdev->dev_instance.b[4] << 16) |
> +              (hdev->dev_instance.b[7] <<  8) |
> +              (hdev->dev_instance.b[6] & 0xf8);

This assembling of the logical device id prefix duplicates the
code in hv_irq_retarget_interrupt(). Could this code save the
prefix in struct hv_pcibus_device, and then have
hv_irq_retarget_interrupt() use it?  Then it would be clear
that HVCALL_RETARGET_INTERRUPT is using exactly the same
logical device id as the IOMMU hypercalls.

> +
> +     ret = hv_iommu_register_pci_bus(dom, prefix);
>       if (ret)
>               goto free_windows;
> 
> +     ret = create_root_hv_pci_bus(hbus);
> +     if (ret)
> +             goto unregister_pviommu;
> +
>       mutex_unlock(&hbus->state_lock);
>       return 0;
> 
> +unregister_pviommu:
> +     hv_iommu_unregister_pci_bus(dom);
>  free_windows:
>       hv_pci_free_bridge_windows(hbus);
>  exit_d0:
> @@ -3974,8 +3987,10 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs)
>  static void hv_pci_remove(struct hv_device *hdev)
>  {
>       struct hv_pcibus_device *hbus;
> +     int dom;
> 
>       hbus = hv_get_drvdata(hdev);
> +     dom = hbus->bridge->domain_nr;

Nit: Setting "dom" here feels a little weird because the value is only needed
inside the "if" block. The value must be read before the root bus is
removed, but even so, moving the assignment inside the "if" block would make
more sense to me.

>       if (hbus->state == hv_pcibus_installed) {
>               tasklet_disable(&hdev->channel->callback_event);
>               hbus->state = hv_pcibus_removing;
> @@ -3994,6 +4009,8 @@ static void hv_pci_remove(struct hv_device *hdev)
>               hv_pci_remove_slots(hbus);
>               pci_remove_root_bus(hbus->bridge->bus);
>               pci_unlock_rescan_remove();
> +
> +             hv_iommu_unregister_pci_bus(dom);
>       }
> 
>       hv_pci_bus_exit(hdev, false);
> diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
> index bf601d67cecb..b71345c74568 100644
> --- a/include/asm-generic/mshyperv.h
> +++ b/include/asm-generic/mshyperv.h
> @@ -73,6 +73,18 @@ extern enum hv_partition_type hv_curr_partition_type;
>  extern void * __percpu *hyperv_pcpu_input_arg;
>  extern void * __percpu *hyperv_pcpu_output_arg;
> 
> +#ifdef CONFIG_HYPERV_PVIOMMU
> +int  hv_iommu_register_pci_bus(int pci_domain_nr, u32 logical_dev_id_prefix);
> +void hv_iommu_unregister_pci_bus(int pci_domain_nr);
> +#else
> +static inline int hv_iommu_register_pci_bus(int pci_domain_nr,
> +                                         u32 logical_dev_id_prefix)
> +{
> +     return 0;
> +}
> +static inline void hv_iommu_unregister_pci_bus(int pci_domain_nr) { }
> +#endif
> +
>  u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr);
>  u64 hv_do_fast_hypercall8(u16 control, u64 input8);
>  u64 hv_do_fast_hypercall16(u16 control, u64 input1, u64 input2);
> --
> 2.52.0
> 


Reply via email to