From: Yu Zhang <[email protected]> Sent: Monday, May 11, 2026 9:24 AM
>
> Add page-selective IOTLB flush using HVCALL_FLUSH_DEVICE_DOMAIN_LIST.
> This hypercall accepts a list of (page_number, page_mask_shift) entries,
> enabling finer-grained IOTLB invalidation compared to the domain-wide
> HVCALL_FLUSH_DEVICE_DOMAIN used by hv_iommu_flush_iotlb_all().
>
> hv_iommu_fill_iova_list() decomposes a contiguous IOVA range into a
> minimal set of aligned power-of-two regions that fit in a single
> hypercall input page. When the range exceeds the page capacity, the
> code falls back to a full domain flush automatically.
>
> Signed-off-by: Yu Zhang <[email protected]>
> Signed-off-by: Easwar Hariharan <[email protected]>
> ---
> drivers/iommu/hyperv/iommu.c | 91 +++++++++++++++++++++++++++++++++++-
> include/hyperv/hvgdk_mini.h | 1 +
> include/hyperv/hvhdk_mini.h | 17 +++++++
> 3 files changed, 108 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/iommu/hyperv/iommu.c b/drivers/iommu/hyperv/iommu.c
> index e5fc625314b5..3bca362b7815 100644
> --- a/drivers/iommu/hyperv/iommu.c
> +++ b/drivers/iommu/hyperv/iommu.c
> @@ -486,10 +486,98 @@ static void hv_iommu_flush_iotlb_all(struct
> iommu_domain *domain)
> hv_flush_device_domain(to_hv_iommu_domain(domain));
> }
>
> +/* Max number of iova_list entries in a single hypercall input page. */
> +#define HV_IOMMU_MAX_FLUSH_VA_COUNT \
> + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_flush_device_domain_list))
> / \
> + sizeof(union hv_iommu_flush_va))
> +
> +/* Returned by hv_iommu_fill_iova_list() when the range exceeds the capacity
> */
> +#define HV_IOMMU_FLUSH_VA_OVERFLOW U16_MAX
> +
> +static inline u16 hv_iommu_fill_iova_list(union hv_iommu_flush_va *iova_list,
> + unsigned long start,
> + unsigned long end)
> +{
> + unsigned long start_pfn = start >> PAGE_SHIFT;
> + unsigned long end_pfn = PAGE_ALIGN(end) >> PAGE_SHIFT;
"end" is an inclusive end address, per the comment in struct iommu_iotlb_gather.
So a page-aligned value would typically have 0xFFF as the low-order 12 bits,
and PAGE_ALIGN() will do the right thing in that case. But I don't think the
value is *required* to be page aligned. If "end" had 0x000 as the low-order
12 bits, the calculation above would fail to include the page containing that
address. I think it needs to be PAGE_ALIGN(end + 1) in order to work
correctly for this corner case.
> + unsigned long nr_pages = end_pfn - start_pfn;
> + u16 count = 0;
> +
> + while (nr_pages > 0) {
> + unsigned long flush_pages;
> + int order;
> + unsigned long pfn_align;
> + unsigned long size_align;
> +
> + if (count >= HV_IOMMU_MAX_FLUSH_VA_COUNT) {
> + count = HV_IOMMU_FLUSH_VA_OVERFLOW;
> + break;
> + }
> +
> + if (start_pfn)
> + pfn_align = __ffs(start_pfn);
I don't understand why __ffs() is correct here. I would have expected
__fls(), so that it is consistent with the calculation of size_align. But
since there's no documentation, I can only surmise how the hypercall works,
so maybe my understanding of it is wrong. If __ffs() really is correct —
e.g., if the intent is to capture the alignment of start_pfn — a comment
explaining why would help. :-)
> + else
> + pfn_align = BITS_PER_LONG - 1;
> +
> + size_align = __fls(nr_pages);
> + order = min(pfn_align, size_align);
> + iova_list[count].page_mask_shift = order;
> + iova_list[count].page_number = start_pfn;
> +
> + flush_pages = 1UL << order;
> + start_pfn += flush_pages;
> + nr_pages -= flush_pages;
> + count++;
> + }
> +
> + return count;
> +}
> +
> +static void hv_flush_device_domain_list(struct hv_iommu_domain *hv_domain,
> + struct iommu_iotlb_gather *iotlb_gather)
> +{
> + u64 status;
> + u16 count;
> + unsigned long flags;
> + struct hv_input_flush_device_domain_list *input;
> +
> + local_irq_save(flags);
> +
> + input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> + memset(input, 0, sizeof(*input));
> +
> + input->device_domain = hv_domain->device_domain;
> + input->flags |= HV_FLUSH_DEVICE_DOMAIN_LIST_IOMMU_FORMAT;
I would suggest moving the memset() and the setting of the input fields down
into the "else" branch below, so that this path is parallel with the
flush-all case.
> + count = hv_iommu_fill_iova_list(input->iova_list,
> + iotlb_gather->start,
> + iotlb_gather->end);
> + if (count == HV_IOMMU_FLUSH_VA_OVERFLOW) {
> + /*
> + * Range exceeds hypercall page capacity. Fall back to a full
> + * domain flush.
> + */
> + struct hv_input_flush_device_domain *flush_all = (void *)input;
> +
> + memset(flush_all, 0, sizeof(*flush_all));
> + flush_all->device_domain = hv_domain->device_domain;
> + status = hv_do_hypercall(HVCALL_FLUSH_DEVICE_DOMAIN,
> + flush_all, NULL);
> + } else {
> + status = hv_do_rep_hypercall(
> + HVCALL_FLUSH_DEVICE_DOMAIN_LIST,
> + count, 0, input, NULL);
> + }
> +
> + local_irq_restore(flags);
> +
> + if (!hv_result_success(status))
> + pr_err("HVCALL_FLUSH_DEVICE_DOMAIN_LIST failed, status %lld\n",
> status);
As Sashiko pointed out, a failure here can lead to all kinds of trouble
because unflushed entries are left behind. Maybe a WARN() is more
appropriate? Also, perhaps a failure in the list flush should fall back to a
flush-all, with the WARN() issued only if the flush-all also fails.
> +}
> +
> static void hv_iommu_iotlb_sync(struct iommu_domain *domain,
> struct iommu_iotlb_gather *iotlb_gather)
> {
> - hv_flush_device_domain(to_hv_iommu_domain(domain));
> + hv_flush_device_domain_list(to_hv_iommu_domain(domain), iotlb_gather);
>
> iommu_put_pages_list(&iotlb_gather->freelist);
> }
> @@ -543,6 +631,7 @@ static struct iommu_domain
> *hv_iommu_domain_alloc_paging(struct device *dev)
>
> cfg.common.hw_max_vasz_lg2 = hv_iommu_device->max_iova_width;
> cfg.common.hw_max_oasz_lg2 = 52;
> + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE);
> cfg.top_level = (hv_iommu_device->max_iova_width > 48) ? 4 : 3;
>
> ret = pt_iommu_x86_64_init(&hv_domain->pt_iommu_x86_64, &cfg,
> GFP_KERNEL);
> diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
> index 5bdbb44da112..eaaf87171478 100644
> --- a/include/hyperv/hvgdk_mini.h
> +++ b/include/hyperv/hvgdk_mini.h
> @@ -496,6 +496,7 @@ union hv_vp_assist_msr_contents { /*
> HV_REGISTER_VP_ASSIST_PAGE */
> #define HVCALL_GET_GPA_PAGES_ACCESS_STATES 0x00c9
> #define HVCALL_CONFIGURE_DEVICE_DOMAIN 0x00ce
> #define HVCALL_FLUSH_DEVICE_DOMAIN 0x00d0
> +#define HVCALL_FLUSH_DEVICE_DOMAIN_LIST 0x00d1
> #define HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS 0x00d7
> #define HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS 0x00d8
> #define HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY 0x00db
> diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h
> index 493608e791b4..f51d5d9467f1 100644
> --- a/include/hyperv/hvhdk_mini.h
> +++ b/include/hyperv/hvhdk_mini.h
> @@ -671,4 +671,21 @@ struct hv_input_flush_device_domain {
> u32 reserved;
> } __packed;
>
> +union hv_iommu_flush_va {
> + u64 iova;
> + struct {
> + u64 page_mask_shift : 12;
> + u64 page_number : 52;
> + };
> +} __packed;
> +
> +
> +struct hv_input_flush_device_domain_list {
> + struct hv_input_device_domain device_domain;
> +#define HV_FLUSH_DEVICE_DOMAIN_LIST_IOMMU_FORMAT (1 << 0)
> + u32 flags;
> + u32 reserved;
> + union hv_iommu_flush_va iova_list[];
> +} __packed;
> +
> #endif /* _HV_HVHDK_MINI_H */
> --
> 2.52.0
>