tegra241-cmdqv: Add user-space use support

Pranjal Shrivastava Wed, 30 Apr 2025 15:07:23 -0700

On Fri, Apr 25, 2025 at 10:58:16PM -0700, Nicolin Chen wrote:
> The CMDQV HW supports a user-space use for virtualization cases. It allows
> the VM to issue guest-level TLBI or ATC_INV commands directly to the queue
> and executes them without a VMEXIT, as HW will replace the VMID field in a
> TLBI command and the SID field in an ATC_INV command with the preset VMID
> and SID.
> 
> This is built upon the vIOMMU infrastructure by allowing VMM to allocate a
> VINTF (as a vIOMMU object) and assign VCMDQs (vCMDQ objects) to the VINTF.
> 
> So firstly, replace the standard vSMMU model with the VINTF implementation
> but reuse the standard cache_invalidate op (for unsupported commands) and
> the standard alloc_domain_nested op (for standard nested STE).
> 
> Each VINTF has two 64KB MMIO pages (128B per logical vCMDQ):
>  - Page0 (directly accessed by guest) has all the control and status bits.
>  - Page1 (trapped by VMM) has guest-owned queue memory location/size info.
> 
> VMM should trap the emulated VINTF0's page1 of the guest VM for the guest-
> level VCMDQ location/size info and forward that to the kernel to translate
> to a physical memory location to program the VCMDQ HW during an allocation
> call. Then, it should mmap the assigned VINTF's page0 to the VINTF0 page0
> of the guest VM. This allows the guest OS to read and write the guest-own
> VINTF's page0 for direct control of the VCMDQ HW.
> 
> For ATC invalidation commands that hold an SID, it requires all devices to
> register their virtual SIDs to the SID_MATCH registers and their physical
> SIDs to the pairing SID_REPLACE registers, so that HW can use those as a
> lookup table to replace those virtual SIDs with the correct physical SIDs.
> Thus, implement the driver-allocated vDEVICE op with a tegra241_vintf_sid
> structure to allocate SID_REPLACE and to program the SIDs accordingly.
> 
> This enables the HW accelerated feature for NVIDIA Grace CPU. Compared to
> the standard SMMUv3 operating in the nested translation mode trapping CMDQ
> for TLBI and ATC_INV commands, this gives a huge performance improvement:
> 70% to 90% reductions of invalidation time were measured by various DMA
> unmap tests running in a guest OS.
>


The write-up is super helpful to understand how the HW works from a high
level. Thanks for explaining this well! :) 

I'm curious to know the DMA unmap tests that were run for perf?

> Signed-off-by: Nicolin Chen <nicol...@nvidia.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  15 +
>  include/uapi/linux/iommufd.h                  |  49 ++-
>  .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c     |   6 +-
>  .../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c    | 374 +++++++++++++++++-
>  4 files changed, 435 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h 
> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> index bab7a9ce1283..d3f18a286447 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> @@ -1000,6 +1000,14 @@ int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device 
> *smmu,
>                               struct arm_smmu_cmdq *cmdq, u64 *cmds, int n,
>                               bool sync);
>  
> +static inline phys_addr_t
> +arm_smmu_domain_ipa_to_pa(struct arm_smmu_domain *smmu_domain, u64 ipa)
> +{
> +     if (WARN_ON_ONCE(smmu_domain->stage != ARM_SMMU_DOMAIN_S2))
> +             return 0;
> +     return iommu_iova_to_phys(&smmu_domain->domain, ipa);
> +}
> +
>  #ifdef CONFIG_ARM_SMMU_V3_SVA
>  bool arm_smmu_sva_supported(struct arm_smmu_device *smmu);
>  bool arm_smmu_master_sva_supported(struct arm_smmu_master *master);
> @@ -1076,9 +1084,16 @@ int arm_smmu_attach_prepare_vmaster(struct 
> arm_smmu_attach_state *state,
>  void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state);
>  void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master);
>  int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt);
> +struct iommu_domain *
> +arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
> +                           const struct iommu_user_data *user_data);
> +int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu,
> +                            struct iommu_user_data_array *array);
>  #else
>  #define arm_smmu_hw_info NULL
>  #define arm_vsmmu_alloc NULL
> +#define arm_vsmmu_alloc_domain_nested NULL
> +#define arm_vsmmu_cache_invalidate NULL
>  
>  static inline int
>  arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index b2614f0f1547..d69e7c1d39ea 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -549,12 +549,25 @@ struct iommu_hw_info_vtd {
>       __aligned_u64 ecap_reg;
>  };
>  
> +/**
> + * enum iommu_hw_info_arm_smmuv3_flags - Flags for ARM SMMUv3 hw_info
> + * @IOMMU_HW_INFO_ARM_SMMUV3_HAS_TEGRA241_CMDQV: Tegra241 implementation with
> + *                                               CMDQV support; @impl is 
> valid
> + */
> +enum iommu_hw_info_arm_smmuv3_flags {
> +     IOMMU_HW_INFO_ARM_SMMUV3_HAS_TEGRA241_CMDQV = 1 << 0,
> +};
> +
>  /**
>   * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information
>   *                                   (IOMMU_HW_INFO_TYPE_ARM_SMMUV3)
>   *
> - * @flags: Must be set to 0
> - * @impl: Must be 0
> + * @flags: Combination of enum iommu_hw_info_arm_smmuv3_flags
> + * @impl: Implementation-defined bits when the following flags are set:
> + *        - IOMMU_HW_INFO_ARM_SMMUV3_HAS_TEGRA241_CMDQV
> + *          Bits[15:12] - Log2 of the total number of SID replacements
> + *          Bits[07:04] - Log2 of the total number of vCMDQs per vIOMMU
> + *          Bits[03:00] - Version number for the CMDQ-V HW

Nit: It seems that we deliberately chose not to reveal `NUM_VINTF_LOG2`
to the user-space. If so, maybe we shall mark those bitfields as unused
or reserved for clarity? Bits[11:08] - Reserved / Unused (even 31:16).

>   * @idr: Implemented features for ARM SMMU Non-secure programming interface
>   * @iidr: Information about the implementation and implementer of ARM SMMU,
>   *        and architecture version supported
> @@ -952,10 +965,28 @@ struct iommu_fault_alloc {
>   * enum iommu_viommu_type - Virtual IOMMU Type
>   * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use
>   * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type
> + * @IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension for 
> SMMUv3
>   */
>  enum iommu_viommu_type {
>       IOMMU_VIOMMU_TYPE_DEFAULT = 0,
>       IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1,
> +     IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV = 2,
> +};

This is a little confusing.. I understand that we need a new viommu type
to copy the new struct iommu_viommu_tegra241_cmdqv b/w the user & kernel

But, in a previous patch (Add vsmmu_alloc impl op), we add a check to
fallback to the standard type SMMUv3, if the impl_ops->vsmmu_alloc 
returns -EOPNOTSUPP:

        if (master->smmu->impl_ops && master->smmu->impl_ops->vsmmu_alloc)
                vsmmu = master->smmu->impl_ops->vsmmu_alloc(
                        master->smmu, s2_parent, ictx, viommu_type, user_data);
        if (PTR_ERR(vsmmu) == -EOPNOTSUPP) {
                if (viommu_type != IOMMU_VIOMMU_TYPE_ARM_SMMUV3)
                        return ERR_PTR(-EOPNOTSUPP);
                /* Fallback to standard SMMUv3 type if viommu_type matches */
                vsmmu = iommufd_viommu_alloc(ictx, struct arm_vsmmu, core,
                                             &arm_vsmmu_ops);

Now, if we'll ALWAYS try to allocate an impl-specified vsmmu first, even
when the viommu_type == IOMMU_VIOMMU_TYPE_ARM_SMMUV3, we are anyways
going to return back from the impl_ops->vsmmu_alloc with -EOPNOTSUPP.
Then we'll again check if the retval was -EOPNOTSUPP and re-check the
viommu_type requested.. which seems a little counter intuitive.

Instead, I'd suggest to simply call the impl_ops->vsmmu_alloc if the
viommu_type != IOMMU_VIOMMU_TYPE_ARM_SMMUV3, in case the type isn't
supported, the impl_op->vsmmu_alloc would return with error anyway.

Thus, the snippet from arm_vsmmu_alloc could look like:

        if (viommu_type != IOMMU_VIOMMU_TYPE_ARM_SMMUV3) {
                if (master->smmu->impl_ops && 
master->smmu->impl_ops->vsmmu_alloc)
                        vsmmu = master->smmu->impl_ops->vsmmu_alloc(
                                master->smmu, s2_parent, ictx, viommu_type, 
user_data);
                        if (IS_ERR(vsmmu))
                                return return ERR_CAST(vsmmu);
        } else {
                /* Fallback to standard SMMUv3 type */
                vsmmu = iommufd_viommu_alloc(ictx, struct arm_vsmmu, core,
                                             &arm_vsmmu_ops);
        }

> +
> +/**
> + * struct iommu_viommu_tegra241_cmdqv - NVIDIA Tegra241 CMDQV Virtual 
> Interface
> + *                                      (IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV)
> + * @out_vintf_page0_pgoff: Offset of the VINTF page0 for mmap syscall
> + * @out_vintf_page0_pgsz: Size of the VINTF page0 for mmap syscall
> + *
> + * Both @out_vintf_page0_pgoff and @out_vintf_page0_pgsz are given by the 
> kernel
> + * for user space to mmap the VINTF page0 from the host physical address 
> space
> + * to the guest physical address space so that a guest kernel can directly 
> R/W
> + * access to the VINTF page0 in order to control its virtual comamnd queues.
> + */
> +struct iommu_viommu_tegra241_cmdqv {
> +     __aligned_u64 out_vintf_page0_pgoff;
> +     __aligned_u64 out_vintf_page0_pgsz;
>  };
>  
>  /**
> @@ -1152,9 +1183,23 @@ struct iommu_veventq_alloc {
>  /**
>   * enum iommu_vcmdq_type - Virtual Command Queue Type
>   * @IOMMU_VCMDQ_TYPE_DEFAULT: Reserved for future use
> + * @IOMMU_VCMDQ_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension for 
> SMMUv3
>   */
>  enum iommu_vcmdq_type {
>       IOMMU_VCMDQ_TYPE_DEFAULT = 0,
> +     /*
> +      * TEGRA241_CMDQV requirements (otherwise it will fail)
> +      * - alloc starts from the lowest @index=0 in ascending order
> +      * - destroy starts from the last allocated @index in descending order
> +      * - @addr must be aligned to @length in bytes and be mmapped in IOAS
> +      * - @length must be a power of 2, with a minimum 32 bytes and a maximum
> +      *   1 ^ idr[1].CMDQS x 16 bytes (do GET_HW_INFO call to read idr[1] in
> +      *   struct iommu_hw_info_arm_smmuv3)
> +      * - suggest to back the queue memory with contiguous physical pages or
> +      *   a single huge page with alignment of the queue size, limit vSMMU's
> +      *   IDR1.CMDQS to the huge page size divided by 16 bytes
> +      */
> +     IOMMU_VCMDQ_TYPE_TEGRA241_CMDQV = 1,
>  };
>  
>  /**
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c 
> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
> index 63861c60b615..40246cd04656 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
> @@ -226,7 +226,7 @@ static int arm_smmu_validate_vste(struct 
> iommu_hwpt_arm_smmuv3 *arg,
>       return 0;
>  }
>  
> -static struct iommu_domain *
> +struct iommu_domain *
>  arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
>                             const struct iommu_user_data *user_data)
>  {
> @@ -337,8 +337,8 @@ static int arm_vsmmu_convert_user_cmd(struct arm_vsmmu 
> *vsmmu,
>       return 0;
>  }
>  
> -static int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu,
> -                                   struct iommu_user_data_array *array)
> +int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu,
> +                            struct iommu_user_data_array *array)
>  {
>       struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core);
>       struct arm_smmu_device *smmu = vsmmu->smmu;
> diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c 
> b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
> index 869c90b660c1..88e2b6506b3a 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
> @@ -8,7 +8,9 @@
>  #include <linux/dma-mapping.h>
>  #include <linux/interrupt.h>
>  #include <linux/iommu.h>
> +#include <linux/iommufd.h>
>  #include <linux/iopoll.h>
> +#include <uapi/linux/iommufd.h>
>  
>  #include <acpi/acpixf.h>
>  
> @@ -26,8 +28,10 @@
>  #define  CMDQV_EN                    BIT(0)
>  
>  #define TEGRA241_CMDQV_PARAM         0x0004
> +#define  CMDQV_NUM_SID_PER_VM_LOG2   GENMASK(15, 12)
>  #define  CMDQV_NUM_VINTF_LOG2                GENMASK(11, 8)
>  #define  CMDQV_NUM_VCMDQ_LOG2                GENMASK(7, 4)
> +#define  CMDQV_VER                   GENMASK(3, 0)
>  
>  #define TEGRA241_CMDQV_STATUS                0x0008
>  #define  CMDQV_ENABLED                       BIT(0)
> @@ -53,6 +57,9 @@
>  #define  VINTF_STATUS                        GENMASK(3, 1)
>  #define  VINTF_ENABLED                       BIT(0)
>  
> +#define TEGRA241_VINTF_SID_MATCH(s)  (0x0040 + 0x4*(s))
> +#define TEGRA241_VINTF_SID_REPLACE(s)        (0x0080 + 0x4*(s))
> +
>  #define TEGRA241_VINTF_LVCMDQ_ERR_MAP_64(m) \
>                                       (0x00C0 + 0x8*(m))
>  #define  LVCMDQ_ERR_MAP_NUM_64               2
> @@ -114,16 +121,20 @@ MODULE_PARM_DESC(bypass_vcmdq,
>  
>  /**
>   * struct tegra241_vcmdq - Virtual Command Queue
> + * @core: Embedded iommufd_vcmdq structure
>   * @idx: Global index in the CMDQV
>   * @lidx: Local index in the VINTF
>   * @enabled: Enable status
>   * @cmdqv: Parent CMDQV pointer
>   * @vintf: Parent VINTF pointer
> + * @prev: Previous LVCMDQ to depend on
>   * @cmdq: Command Queue struct
>   * @page0: MMIO Page0 base address
>   * @page1: MMIO Page1 base address
>   */
>  struct tegra241_vcmdq {
> +     struct iommufd_vcmdq core;
> +
>       u16 idx;
>       u16 lidx;
>  
> @@ -131,22 +142,29 @@ struct tegra241_vcmdq {
>  
>       struct tegra241_cmdqv *cmdqv;
>       struct tegra241_vintf *vintf;
> +     struct tegra241_vcmdq *prev;
>       struct arm_smmu_cmdq cmdq;
>  
>       void __iomem *page0;
>       void __iomem *page1;
>  };
> +#define core_to_vcmdq(v) container_of(v, struct tegra241_vcmdq, core)
>  
>  /**
>   * struct tegra241_vintf - Virtual Interface
> + * @vsmmu: Embedded arm_vsmmu structure
>   * @idx: Global index in the CMDQV
>   * @enabled: Enable status
>   * @hyp_own: Owned by hypervisor (in-kernel)
>   * @cmdqv: Parent CMDQV pointer
>   * @lvcmdqs: List of logical VCMDQ pointers
>   * @base: MMIO base address
> + * @immap_id: Allocated immap_id ID for mmap() call
> + * @sids: Stream ID replacement resources
>   */
>  struct tegra241_vintf {
> +     struct arm_vsmmu vsmmu;
> +
>       u16 idx;
>  
>       bool enabled;
> @@ -156,6 +174,24 @@ struct tegra241_vintf {
>       struct tegra241_vcmdq **lvcmdqs;
>  
>       void __iomem *base;
> +     unsigned long immap_id;
> +
> +     struct ida sids;
> +};
> +#define viommu_to_vintf(v) container_of(v, struct tegra241_vintf, vsmmu.core)
> +
> +/**
> + * struct tegra241_vintf_sid - Virtual Interface Stream ID Replacement
> + * @core: Embedded iommufd_vdevice structure, holding virtual Stream ID
> + * @vintf: Parent VINTF pointer
> + * @sid: Physical Stream ID
> + * @id: Slot index in the VINTF
> + */
> +struct tegra241_vintf_sid {
> +     struct iommufd_vdevice core;
> +     struct tegra241_vintf *vintf;
> +     u32 sid;
> +     u8 idx;
>  };
>  
>  /**
> @@ -163,10 +199,12 @@ struct tegra241_vintf {
>   * @smmu: SMMUv3 device
>   * @dev: CMDQV device
>   * @base: MMIO base address
> + * @base_phys: MMIO physical base address, for mmap
>   * @irq: IRQ number
>   * @num_vintfs: Total number of VINTFs
>   * @num_vcmdqs: Total number of VCMDQs
>   * @num_lvcmdqs_per_vintf: Number of logical VCMDQs per VINTF
> + * @num_sids_per_vintf: Total number of SID replacements per VINTF
>   * @vintf_ids: VINTF id allocator
>   * @vintfs: List of VINTFs
>   */
> @@ -175,12 +213,14 @@ struct tegra241_cmdqv {
>       struct device *dev;
>  
>       void __iomem *base;
> +     phys_addr_t base_phys;
>       int irq;
>  
>       /* CMDQV Hardware Params */
>       u16 num_vintfs;
>       u16 num_vcmdqs;
>       u16 num_lvcmdqs_per_vintf;
> +     u16 num_sids_per_vintf;
>  
>       struct ida vintf_ids;
>  
> @@ -380,6 +420,12 @@ static void tegra241_vcmdq_hw_deinit(struct 
> tegra241_vcmdq *vcmdq)
>       dev_dbg(vcmdq->cmdqv->dev, "%sdeinited\n", h);
>  }
>  
> +/* This function is for LVCMDQ, so @vcmdq must be mapped prior */
> +static void _tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq)
> +{
> +     writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE));
> +}
> +
>  /* This function is for LVCMDQ, so @vcmdq must be mapped prior */
>  static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq)
>  {
> @@ -390,7 +436,7 @@ static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq 
> *vcmdq)
>       tegra241_vcmdq_hw_deinit(vcmdq);
>  
>       /* Configure and enable VCMDQ */
> -     writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE));
> +     _tegra241_vcmdq_hw_init(vcmdq);
>  
>       ret = vcmdq_write_config(vcmdq, VCMDQ_EN);
>       if (ret) {
> @@ -420,6 +466,7 @@ static void tegra241_vcmdq_unmap_lvcmdq(struct 
> tegra241_vcmdq *vcmdq)
>  static void tegra241_vintf_hw_deinit(struct tegra241_vintf *vintf)
>  {
>       u16 lidx = vintf->cmdqv->num_lvcmdqs_per_vintf;
> +     int sidx;
>  
>       /* HW requires to unmap LVCMDQs in descending order */
>       while (lidx--) {
> @@ -429,6 +476,10 @@ static void tegra241_vintf_hw_deinit(struct 
> tegra241_vintf *vintf)
>               }
>       }
>       vintf_write_config(vintf, 0);
> +     for (sidx = 0; sidx < vintf->cmdqv->num_sids_per_vintf; sidx++) {
> +             writel_relaxed(0, REG_VINTF(vintf, SID_REPLACE(sidx)));
> +             writel_relaxed(0, REG_VINTF(vintf, SID_MATCH(sidx)));
> +     }
>  }
>  
>  /* Map a global VCMDQ to the pre-assigned LVCMDQ */
> @@ -457,7 +508,8 @@ static int tegra241_vintf_hw_init(struct tegra241_vintf 
> *vintf, bool hyp_own)
>        * whether enabling it here or not, as !HYP_OWN cmdq HWs only support a
>        * restricted set of supported commands.
>        */
> -     regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own);
> +     regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own) |
> +              FIELD_PREP(VINTF_VMID, vintf->vsmmu.vmid);
>       writel(regval, REG_VINTF(vintf, CONFIG));
>  
>       ret = vintf_write_config(vintf, regval | VINTF_EN);
> @@ -584,7 +636,9 @@ static void tegra241_vintf_free_lvcmdq(struct 
> tegra241_vintf *vintf, u16 lidx)
>  
>       dev_dbg(vintf->cmdqv->dev,
>               "%sdeallocated\n", lvcmdq_error_header(vcmdq, header, 64));
> -     kfree(vcmdq);
> +     /* Guest-owned VCMDQ is free-ed with vcmdq by iommufd core */
> +     if (vcmdq->vintf->hyp_own)
> +             kfree(vcmdq);
>  }
>  
>  static struct tegra241_vcmdq *
> @@ -623,6 +677,9 @@ tegra241_vintf_alloc_lvcmdq(struct tegra241_vintf *vintf, 
> u16 lidx)
>  
>  static void tegra241_cmdqv_deinit_vintf(struct tegra241_cmdqv *cmdqv, u16 
> idx)
>  {
> +     if (cmdqv->vintfs[idx]->immap_id)
> +             iommufd_ctx_free_mmap(cmdqv->vintfs[idx]->vsmmu.core.ictx,
> +                                   cmdqv->vintfs[idx]->immap_id);
>       kfree(cmdqv->vintfs[idx]->lvcmdqs);
>       ida_free(&cmdqv->vintf_ids, idx);
>       cmdqv->vintfs[idx] = NULL;
> @@ -671,7 +728,11 @@ static void tegra241_cmdqv_remove_vintf(struct 
> tegra241_cmdqv *cmdqv, u16 idx)
>  
>       dev_dbg(cmdqv->dev, "VINTF%u: deallocated\n", vintf->idx);
>       tegra241_cmdqv_deinit_vintf(cmdqv, idx);
> -     kfree(vintf);
> +     if (!vintf->hyp_own)
> +             ida_destroy(&vintf->sids);
> +     /* Guest-owned VINTF is free-ed with viommu by iommufd core */
> +     if (vintf->hyp_own)
> +             kfree(vintf);
>  }
>  
>  static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu)
> @@ -699,10 +760,32 @@ static void tegra241_cmdqv_remove(struct 
> arm_smmu_device *smmu)
>       put_device(cmdqv->dev); /* smmu->impl_dev */
>  }
>  
> +static struct arm_vsmmu *
> +tegra241_cmdqv_vsmmu_alloc(struct arm_smmu_device *smmu,
> +                        struct arm_smmu_domain *smmu_domain,
> +                        struct iommufd_ctx *ictx, unsigned int viommu_type,
> +                        const struct iommu_user_data *user_data);
> +
> +static u32 tegra241_cmdqv_hw_info(struct arm_smmu_device *smmu, u32 *impl)
> +{
> +     struct tegra241_cmdqv *cmdqv =
> +             container_of(smmu, struct tegra241_cmdqv, smmu);
> +     u32 regval = readl_relaxed(REG_CMDQV(cmdqv, PARAM));
> +
> +     *impl = FIELD_GET(CMDQV_VER, regval);
> +     *impl |= FIELD_PREP(CMDQV_NUM_VCMDQ_LOG2,
> +                         ilog2(cmdqv->num_lvcmdqs_per_vintf));
> +     *impl |= FIELD_PREP(CMDQV_NUM_SID_PER_VM_LOG2,
> +                        ilog2(cmdqv->num_sids_per_vintf));
> +     return IOMMU_HW_INFO_ARM_SMMUV3_HAS_TEGRA241_CMDQV;
> +}
> +
>  static struct arm_smmu_impl_ops tegra241_cmdqv_impl_ops = {
>       .get_secondary_cmdq = tegra241_cmdqv_get_cmdq,
>       .device_reset = tegra241_cmdqv_hw_reset,
>       .device_remove = tegra241_cmdqv_remove,
> +     .vsmmu_alloc = tegra241_cmdqv_vsmmu_alloc,
> +     .hw_info = tegra241_cmdqv_hw_info,
>  };
>  
>  /* Probe Functions */
> @@ -844,6 +927,7 @@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, 
> struct resource *res,
>       cmdqv->irq = irq;
>       cmdqv->base = base;
>       cmdqv->dev = smmu->impl_dev;
> +     cmdqv->base_phys = res->start;
>  
>       if (cmdqv->irq > 0) {
>               ret = request_threaded_irq(irq, NULL, tegra241_cmdqv_isr,
> @@ -860,6 +944,8 @@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, 
> struct resource *res,
>       cmdqv->num_vintfs = 1 << FIELD_GET(CMDQV_NUM_VINTF_LOG2, regval);
>       cmdqv->num_vcmdqs = 1 << FIELD_GET(CMDQV_NUM_VCMDQ_LOG2, regval);
>       cmdqv->num_lvcmdqs_per_vintf = cmdqv->num_vcmdqs / cmdqv->num_vintfs;
> +     cmdqv->num_sids_per_vintf =
> +             1 << FIELD_GET(CMDQV_NUM_SID_PER_VM_LOG2, regval);
>  
>       cmdqv->vintfs =
>               kcalloc(cmdqv->num_vintfs, sizeof(*cmdqv->vintfs), GFP_KERNEL);
> @@ -913,3 +999,283 @@ struct arm_smmu_device *tegra241_cmdqv_probe(struct 
> arm_smmu_device *smmu)
>       put_device(smmu->impl_dev);
>       return ERR_PTR(-ENODEV);
>  }
> +
> +/* User-space vIOMMU and vCMDQ Functions */
> +
> +static int tegra241_vcmdq_hw_init_user(struct tegra241_vcmdq *vcmdq)
> +{
> +     char header[64];
> +
> +     /* Configure the vcmdq only; User space does the enabling */
> +     _tegra241_vcmdq_hw_init(vcmdq);
> +
> +     dev_dbg(vcmdq->cmdqv->dev, "%sinited at host PA 0x%llx size 0x%lx\n",
> +             lvcmdq_error_header(vcmdq, header, 64),
> +             vcmdq->cmdq.q.q_base & VCMDQ_ADDR,
> +             1UL << (vcmdq->cmdq.q.q_base & VCMDQ_LOG2SIZE));
> +     return 0;
> +}
> +
> +static struct iommufd_vcmdq *
> +tegra241_vintf_alloc_lvcmdq_user(struct iommufd_viommu *viommu,
> +                              unsigned int type, u32 index, dma_addr_t addr,
> +                              size_t length)
> +{
> +     struct tegra241_vintf *vintf = viommu_to_vintf(viommu);
> +     struct tegra241_cmdqv *cmdqv = vintf->cmdqv;
> +     struct arm_smmu_device *smmu = &cmdqv->smmu;
> +     struct tegra241_vcmdq *vcmdq, *prev = NULL;
> +     u32 log2size, max_n_shift;
> +     phys_addr_t q_base;
> +     char header[64];
> +     int ret;
> +
> +     if (type != IOMMU_VCMDQ_TYPE_TEGRA241_CMDQV)
> +             return ERR_PTR(-EOPNOTSUPP);
> +     if (index >= cmdqv->num_lvcmdqs_per_vintf)
> +             return ERR_PTR(-EINVAL);
> +     if (vintf->lvcmdqs[index])
> +             return ERR_PTR(-EEXIST);
> +     /*
> +      * HW requires to map LVCMDQs in ascending order, so reject if the
> +      * previous lvcmdqs is not allocated yet.
> +      */
> +     if (index) {
> +             prev = vintf->lvcmdqs[index - 1];
> +             if (!prev)
> +                     return ERR_PTR(-EIO);
> +     }
> +     /*
> +      * @length must be a power of 2, in range of
> +      *   [ 32, 1 ^ (idr[1].CMDQS + CMDQ_ENT_SZ_SHIFT) ]
> +      */

Nit: 2 ^ (idr[1].CMDQS + CMDQ_ENT_SZ_SHIFT) to match the comment in uapi

> +     max_n_shift = FIELD_GET(IDR1_CMDQS,
> +                             readl_relaxed(smmu->base + ARM_SMMU_IDR1));
> +     if (!is_power_of_2(length) || length < 32 ||
> +         length > (1 << (max_n_shift + CMDQ_ENT_SZ_SHIFT)))
> +             return ERR_PTR(-EINVAL);
> +     log2size = ilog2(length) - CMDQ_ENT_SZ_SHIFT;
> +
> +     /* @addr must be aligned to @length and be mapped in s2_parent domain */
> +     if (addr & ~VCMDQ_ADDR || addr & (length - 1))
> +             return ERR_PTR(-EINVAL);
> +     q_base = arm_smmu_domain_ipa_to_pa(vintf->vsmmu.s2_parent, addr);
> +     if (!q_base)
> +             return ERR_PTR(-ENXIO);
> +
> +     vcmdq = iommufd_vcmdq_alloc(viommu, struct tegra241_vcmdq, core);
> +     if (!vcmdq)
> +             return ERR_PTR(-ENOMEM);
> +
> +     /*
> +      * HW requires to unmap LVCMDQs in descending order, so destroy() must
> +      * follow this rule. Set a dependency on its previous LVCMDQ so iommufd
> +      * core will help enforce it.
> +      */
> +     if (prev) {
> +             ret = iommufd_vcmdq_depend(vcmdq, prev, core);
> +             if (ret)
> +                     goto free_vcmdq;
> +     }
> +     vcmdq->prev = prev;
> +
> +     ret = tegra241_vintf_init_lvcmdq(vintf, index, vcmdq);
> +     if (ret)
> +             goto free_vcmdq;
> +
> +     dev_dbg(cmdqv->dev, "%sallocated\n",
> +             lvcmdq_error_header(vcmdq, header, 64));
> +
> +     tegra241_vcmdq_map_lvcmdq(vcmdq);
> +
> +     vcmdq->cmdq.q.q_base = q_base & VCMDQ_ADDR;
> +     vcmdq->cmdq.q.q_base |= log2size;
> +
> +     ret = tegra241_vcmdq_hw_init_user(vcmdq);
> +     if (ret)
> +             goto free_vcmdq;
> +     vintf->lvcmdqs[index] = vcmdq;
> +
> +     return &vcmdq->core;
> +free_vcmdq:
> +     iommufd_struct_destroy(viommu->ictx, vcmdq, core);
> +     return ERR_PTR(ret);

Are we missing an undepend here?

     if (vcmdq->prev)
        iommufd_vcmdq_undepend(vcmdq, vcmdq->prev, core);

> +}
> +
> +static void tegra241_vintf_destroy_lvcmdq_user(struct iommufd_vcmdq *core)
> +{
> +     struct tegra241_vcmdq *vcmdq = core_to_vcmdq(core);
> +
> +     tegra241_vcmdq_hw_deinit(vcmdq);
> +     tegra241_vcmdq_unmap_lvcmdq(vcmdq);
> +     tegra241_vintf_free_lvcmdq(vcmdq->vintf, vcmdq->lidx);
> +     if (vcmdq->prev)
> +             iommufd_vcmdq_undepend(vcmdq, vcmdq->prev, core);
> +
> +     /* IOMMUFD core frees the memory of vcmdq and vcmdq */
> +}
> +
> +static void tegra241_cmdqv_destroy_vintf_user(struct iommufd_viommu *viommu)
> +{
> +     struct tegra241_vintf *vintf = viommu_to_vintf(viommu);
> +
> +     tegra241_cmdqv_remove_vintf(vintf->cmdqv, vintf->idx);
> +
> +     /* IOMMUFD core frees the memory of vintf and viommu */
> +}
> +
> +static struct iommufd_vdevice *
> +tegra241_vintf_alloc_vdevice(struct iommufd_viommu *viommu, struct device 
> *dev,
> +                          u64 dev_id)
> +{
> +     struct tegra241_vintf *vintf = viommu_to_vintf(viommu);
> +     struct arm_smmu_master *master = dev_iommu_priv_get(dev);
> +     struct arm_smmu_stream *stream = &master->streams[0];
> +     struct tegra241_vintf_sid *vsid;
> +     int sidx;
> +
> +     if (dev_id > UINT_MAX)
> +             return ERR_PTR(-EINVAL);
> +
> +     vsid = iommufd_vdevice_alloc(viommu, struct tegra241_vintf_sid, core);
> +     if (!vsid)
> +             return ERR_PTR(-ENOMEM);
> +
> +     WARN_ON_ONCE(master->num_streams != 1);
> +
> +     /* Find an empty pair of SID_REPLACE and SID_MATCH */
> +     sidx = ida_alloc_max(&vintf->sids, vintf->cmdqv->num_sids_per_vintf - 1,
> +                          GFP_KERNEL);
> +     if (sidx < 0) {
> +             iommufd_struct_destroy(viommu->ictx, vsid, core);
> +             return ERR_PTR(sidx);
> +     }
> +
> +     writel_relaxed(stream->id, REG_VINTF(vintf, SID_REPLACE(sidx)));
> +     writel_relaxed(dev_id << 1 | 0x1, REG_VINTF(vintf, SID_MATCH(sidx)));
> +     dev_dbg(vintf->cmdqv->dev,
> +             "VINTF%u: allocated SID_REPLACE%d for pSID=%x, vSID=%x\n",
> +             vintf->idx, sidx, stream->id, (u32)dev_id);
> +
> +     vsid->idx = sidx;
> +     vsid->vintf = vintf;
> +     vsid->sid = stream->id;
> +
> +     return &vsid->core;
> +}
> +
> +static void tegra241_vintf_destroy_vdevice(struct iommufd_vdevice *vdev)
> +{
> +     struct tegra241_vintf_sid *vsid =
> +             container_of(vdev, struct tegra241_vintf_sid, core);
> +     struct tegra241_vintf *vintf = vsid->vintf;
> +
> +     writel_relaxed(0, REG_VINTF(vintf, SID_REPLACE(vsid->idx)));
> +     writel_relaxed(0, REG_VINTF(vintf, SID_MATCH(vsid->idx)));

Just a thought: Should these be writel to avoid races?
Although I believe all user-queues would be free-d by this point?

> +     ida_free(&vintf->sids, vsid->idx);
> +     dev_dbg(vintf->cmdqv->dev,
> +             "VINTF%u: deallocated SID_REPLACE%d for pSID=%x\n", vintf->idx,
> +             vsid->idx, vsid->sid);
> +
> +     /* IOMMUFD core frees the memory of vsid and vdev */
> +}
> +
> +static struct iommufd_viommu_ops tegra241_cmdqv_viommu_ops = {
> +     .destroy = tegra241_cmdqv_destroy_vintf_user,
> +     .alloc_domain_nested = arm_vsmmu_alloc_domain_nested,
> +     .cache_invalidate = arm_vsmmu_cache_invalidate,
> +     .vdevice_alloc = tegra241_vintf_alloc_vdevice,
> +     .vdevice_destroy = tegra241_vintf_destroy_vdevice,
> +     .vcmdq_alloc = tegra241_vintf_alloc_lvcmdq_user,
> +     .vcmdq_destroy = tegra241_vintf_destroy_lvcmdq_user,
> +};
> +
> +static struct arm_vsmmu *
> +tegra241_cmdqv_vsmmu_alloc(struct arm_smmu_device *smmu,
> +                        struct arm_smmu_domain *s2_parent,
> +                        struct iommufd_ctx *ictx, unsigned int viommu_type,
> +                        const struct iommu_user_data *user_data)
> +{
> +     struct tegra241_cmdqv *cmdqv =
> +             container_of(smmu, struct tegra241_cmdqv, smmu);
> +     struct iommu_viommu_tegra241_cmdqv data;
> +     struct tegra241_vintf *vintf;
> +     phys_addr_t page0_base;
> +     int ret;
> +
> +     if (viommu_type != IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV)
> +             return ERR_PTR(-EOPNOTSUPP);
> +     if (!user_data)
> +             return ERR_PTR(-EINVAL);
> +
> +     ret = iommu_copy_struct_from_user(&data, user_data,
> +                                       IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
> +                                       out_vintf_page0_pgsz);
> +     if (ret)
> +             return ERR_PTR(ret);
> +
> +     vintf = iommufd_viommu_alloc(ictx, struct tegra241_vintf, vsmmu.core,
> +                                  &tegra241_cmdqv_viommu_ops);
> +     if (!vintf)
> +             return ERR_PTR(-ENOMEM);
> +
> +     ret = tegra241_cmdqv_init_vintf(cmdqv, cmdqv->num_vintfs - 1, vintf);
> +     if (ret < 0) {
> +             dev_err(cmdqv->dev, "no more available vintf\n");
> +             goto free_vintf;
> +     }
> +
> +     vintf->vsmmu.smmu = smmu;
> +     vintf->vsmmu.s2_parent = s2_parent;
> +     /* FIXME Move VMID allocation from the S2 domain allocation to here */
> +     vintf->vsmmu.vmid = s2_parent->s2_cfg.vmid;
> +
> +     /*
> +      * Initialize the user-owned VINTF without a LVCMDQ, because it has to
> +      * wait for the allocation of a user-owned LVCMDQ, for security reason.
> +      * It is different than the kernel-owned VINTF0, which had pre-assigned
> +      * and pre-allocated global VCMDQs that would be mapped to the LVCMDQs
> +      * by the tegra241_vintf_hw_init() call.
> +      */
> +     ret = tegra241_vintf_hw_init(vintf, false);
> +     if (ret)
> +             goto deinit_vintf;
> +
> +     vintf->lvcmdqs = kcalloc(cmdqv->num_lvcmdqs_per_vintf,
> +                              sizeof(*vintf->lvcmdqs), GFP_KERNEL);
> +     if (!vintf->lvcmdqs) {
> +             ret = -ENOMEM;
> +             goto hw_deinit_vintf;
> +     }
> +
> +     page0_base = cmdqv->base_phys + TEGRA241_VINTFi_PAGE0(vintf->idx);
> +     ret = iommufd_ctx_alloc_mmap(ictx, page0_base, SZ_64K,
> +                                  &vintf->immap_id);
> +     if (ret)
> +             goto hw_deinit_vintf;
> +
> +     data.out_vintf_page0_pgsz = SZ_64K;
> +     data.out_vintf_page0_pgoff = vintf->immap_id;
> +     ret = iommu_copy_struct_to_user(user_data, &data,
> +                                     IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
> +                                     out_vintf_page0_pgsz);
> +     if (ret)
> +             goto free_mmap;
> +
> +     ida_init(&vintf->sids);
> +
> +     dev_dbg(cmdqv->dev, "VINTF%u: allocated with vmid (%d)\n", vintf->idx,
> +             vintf->vsmmu.vmid);
> +
> +     return &vintf->vsmmu;
> +
> +free_mmap:
> +     iommufd_ctx_free_mmap(ictx, vintf->immap_id);
> +hw_deinit_vintf:
> +     tegra241_vintf_hw_deinit(vintf);
> +deinit_vintf:
> +     tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx);
> +free_vintf:
> +     iommufd_struct_destroy(ictx, vintf, vsmmu.core);
> +     return ERR_PTR(ret);
> +}

Thanks,
Praan

> -- 
> 2.43.0
>

Re: [PATCH v2 21/22] iommu/tegra241-cmdqv: Add user-space use support

Reply via email to