Hi,

On 1/11/25 4:32 AM, Nicolin Chen wrote:
> For systems that require MSI pages to be mapped into the IOMMU translation
> the IOMMU driver provides an IOMMU_RESV_SW_MSI range, which is the default
> recommended IOVA window to place these mappings. However, there is nothing
> special about this address. And to support the RMR trick in VMM for nested
Well, at least it must not overlap the VMM's RAM, so it was not random either.
> translation, the VMM needs to know what sw_msi window the kernel is using.
> As there is no particular reason to force VMM to adopt the kernel default,
> provide a simple IOMMU_OPTION_SW_MSI_START/SIZE ioctl that the VMM can use
> to directly specify the sw_msi window that it wants to use, which replaces
> and disables the default IOMMU_RESV_SW_MSI from the driver to avoid having
> to build an API to discover the default IOMMU_RESV_SW_MSI.
IIUC, the MSI window will then differ between legacy VFIO assignment and
the iommufd backend.
MSI reserved regions are exposed in
/sys/kernel/iommu_groups/<n>/reserved_regions, e.g.:
0x0000000008000000 0x00000000080fffff msi

Is this configurability reflected there accordingly?

How do you make sure it does not collide with other resv regions? I
don't see any check here.

>
> Since iommufd now has its own sw_msi function, this is easy to implement.
>
> To keep things simple, the parameters are global to the entire iommufd FD,
> and will directly replace the IOMMU_RESV_SW_MSI values. The VMM must set
> the values before creating any hwpt's to have any effect.
>
> Suggested-by: Jason Gunthorpe <j...@nvidia.com>
> Signed-off-by: Nicolin Chen <nicol...@nvidia.com>
> ---
>  drivers/iommu/iommufd/iommufd_private.h |  4 +++
>  include/uapi/linux/iommufd.h            | 18 ++++++++++++-
>  drivers/iommu/iommufd/device.c          |  4 +++
>  drivers/iommu/iommufd/io_pagetable.c    |  4 ++-
>  drivers/iommu/iommufd/ioas.c            | 34 +++++++++++++++++++++++++
>  drivers/iommu/iommufd/main.c            |  6 +++++
>  6 files changed, 68 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/iommu/iommufd/iommufd_private.h 
> b/drivers/iommu/iommufd/iommufd_private.h
> index 3e83bbb5912c..9f071609f00b 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -45,6 +45,9 @@ struct iommufd_ctx {
>       struct mutex sw_msi_lock;
>       struct list_head sw_msi_list;
>       unsigned int sw_msi_id;
> +     /* User-programmed SW_MSI region, to override igroup->sw_msi_start */
> +     phys_addr_t sw_msi_start;
> +     size_t sw_msi_size;
>  
>       u8 account_mode;
>       /* Compatibility with VFIO no iommu */
> @@ -281,6 +284,7 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
>  int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
>  int iommufd_option_rlimit_mode(struct iommu_option *cmd,
>                              struct iommufd_ctx *ictx);
> +int iommufd_option_sw_msi(struct iommu_option *cmd, struct iommufd_ctx 
> *ictx);
>  
>  int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
>  int iommufd_check_iova_range(struct io_pagetable *iopt,
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index 34810f6ae2b5..c864a201e502 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -294,7 +294,9 @@ struct iommu_ioas_unmap {
>  
>  /**
>   * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
> - *                       ioctl(IOMMU_OPTION_HUGE_PAGES)
> + *                       ioctl(IOMMU_OPTION_HUGE_PAGES) and
> + *                       ioctl(IOMMU_OPTION_SW_MSI_START) and
> + *                       ioctl(IOMMU_OPTION_SW_MSI_SIZE)
>   * @IOMMU_OPTION_RLIMIT_MODE:
>   *    Change how RLIMIT_MEMLOCK accounting works. The caller must have 
> privilege
>   *    to invoke this. Value 0 (default) is user based accounting, 1 uses 
> process
> @@ -304,10 +306,24 @@ struct iommu_ioas_unmap {
>   *    iommu mappings. Value 0 disables combining, everything is mapped to
>   *    PAGE_SIZE. This can be useful for benchmarking.  This is a per-IOAS
>   *    option, the object_id must be the IOAS ID.
> + * @IOMMU_OPTION_SW_MSI_START:
> + *    Change the base address of the IOMMU mapping region for MSI 
> doorbell(s).
> + *    It must be set this before attaching a device to an IOAS/HWPT, 
> otherwise
> + *    this option will be not effective on that IOAS/HWPT. User can choose to
> + *    let kernel pick a base address, by simply ignoring this option or 
> setting
> + *    a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id must 
> be 0
I think we should also document that it cannot be placed at an arbitrary address.
> + * @IOMMU_OPTION_SW_MSI_SIZE:
> + *    Change the size of the IOMMU mapping region for MSI doorbell(s). It 
> must
> + *    be set this before attaching a device to an IOAS/HWPT, otherwise it 
> won't
> + *    be effective on that IOAS/HWPT. The value is in MB, and the minimum 
> value
> + *    is 1 MB. A value 0 (default) will invalidate the MSI doorbell base 
> address
> + *    value set to IOMMU_OPTION_SW_MSI_START. Global option, object_id must 
> be 0
>   */
>  enum iommufd_option {
>       IOMMU_OPTION_RLIMIT_MODE = 0,
>       IOMMU_OPTION_HUGE_PAGES = 1,
> +     IOMMU_OPTION_SW_MSI_START = 2,
> +     IOMMU_OPTION_SW_MSI_SIZE = 3,
>  };
>  
>  /**
> diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
> index f75b3c23cd41..093a3bd798db 100644
> --- a/drivers/iommu/iommufd/device.c
> +++ b/drivers/iommu/iommufd/device.c
> @@ -445,10 +445,14 @@ static int
>  iommufd_device_attach_reserved_iova(struct iommufd_device *idev,
>                                   struct iommufd_hwpt_paging *hwpt_paging)
>  {
> +     struct iommufd_ctx *ictx = idev->ictx;
>       int rc;
>  
>       lockdep_assert_held(&idev->igroup->lock);
>  
> +     /* Override it with a user-programmed SW_MSI region */
> +     if (ictx->sw_msi_size && ictx->sw_msi_start != PHYS_ADDR_MAX)
> +             idev->igroup->sw_msi_start = ictx->sw_msi_start;
>       rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
>                                                idev->dev,
>                                                &idev->igroup->sw_msi_start);
> diff --git a/drivers/iommu/iommufd/io_pagetable.c 
> b/drivers/iommu/iommufd/io_pagetable.c
> index 8a790e597e12..5d7f5ca1eecf 100644
> --- a/drivers/iommu/iommufd/io_pagetable.c
> +++ b/drivers/iommu/iommufd/io_pagetable.c
> @@ -1446,7 +1446,9 @@ int iopt_table_enforce_dev_resv_regions(struct 
> io_pagetable *iopt,
>               if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
>                       num_hw_msi++;
>               if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
> -                     *sw_msi_start = resv->start;
> +                     /* Bypass the driver-defined SW_MSI region, if preset */
> +                     if (*sw_msi_start == PHYS_ADDR_MAX)
> +                             *sw_msi_start = resv->start;
>                       num_sw_msi++;
>               }
>  
> diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
> index 1542c5fd10a8..3f4e25b660f9 100644
> --- a/drivers/iommu/iommufd/ioas.c
> +++ b/drivers/iommu/iommufd/ioas.c
> @@ -620,6 +620,40 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd,
>       return -EOPNOTSUPP;
>  }
>  
> +int iommufd_option_sw_msi(struct iommu_option *cmd, struct iommufd_ctx *ictx)
> +{
> +     if (cmd->object_id)
> +             return -EOPNOTSUPP;
> +
> +     if (cmd->op == IOMMU_OPTION_OP_GET) {
> +             switch (cmd->option_id) {
> +             case IOMMU_OPTION_SW_MSI_START:
> +                     cmd->val64 = (u64)ictx->sw_msi_start;
> +                     break;
> +             case IOMMU_OPTION_SW_MSI_SIZE:
> +                     cmd->val64 = (u64)ictx->sw_msi_size;
> +                     break;
> +             default:
> +                     return -EOPNOTSUPP;
> +             }
> +             return 0;
> +     }
> +     if (cmd->op == IOMMU_OPTION_OP_SET) {
> +             switch (cmd->option_id) {
> +             case IOMMU_OPTION_SW_MSI_START:
> +                     ictx->sw_msi_start = (phys_addr_t)cmd->val64;
> +                     break;
> +             case IOMMU_OPTION_SW_MSI_SIZE:
> +                     ictx->sw_msi_size = (size_t)cmd->val64;
> +                     break;
> +             default:
> +                     return -EOPNOTSUPP;
> +             }
> +             return 0;
> +     }
> +     return -EOPNOTSUPP;
> +}
> +
>  static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd,
>                                         struct iommufd_ioas *ioas)
>  {
> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index 7cc9497b7193..026297265c71 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -229,6 +229,8 @@ static int iommufd_fops_open(struct inode *inode, struct 
> file *filp)
>       init_waitqueue_head(&ictx->destroy_wait);
>       mutex_init(&ictx->sw_msi_lock);
>       INIT_LIST_HEAD(&ictx->sw_msi_list);
> +     ictx->sw_msi_start = PHYS_ADDR_MAX;
> +     ictx->sw_msi_size = 0;
>       filp->private_data = ictx;
>       return 0;
>  }
> @@ -287,6 +289,10 @@ static int iommufd_option(struct iommufd_ucmd *ucmd)
>       case IOMMU_OPTION_RLIMIT_MODE:
>               rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
>               break;
> +     case IOMMU_OPTION_SW_MSI_START:
> +     case IOMMU_OPTION_SW_MSI_SIZE:
> +             rc = iommufd_option_sw_msi(cmd, ucmd->ictx);
> +             break;
>       case IOMMU_OPTION_HUGE_PAGES:
>               rc = iommufd_ioas_option(ucmd);
>               break;
Eric


Reply via email to