On 3/19/2025 10:37 AM, jesse.zh...@amd.com wrote:
> From: "jesse.zh...@amd.com" <jesse.zh...@amd.com>
>
> This commit updates the VM flush implementation for the SDMA engine.
>
> - Added a new function `sdma_v4_4_2_get_invalidate_req` to construct the
> VM_INVALIDATE_ENG0_REQ
> register value for the specified VMID and flush type. This function ensures
> that all relevant
> page table cache levels (L1 PTEs, L2 PTEs, and L2 PDEs) are invalidated.
>
> - Modified the `sdma_v4_4_2_ring_emit_vm_flush` function to use the new
> `sdma_v4_4_2_get_invalidate_req`
> function. The updated function emits the necessary register writes and
> waits to perform a VM flush
> for the specified VMID. It updates the PTB address registers and issues a
> VM invalidation request
> using the specified VM invalidation engine.
>
> - Included the necessary header file `gc/gc_9_0_sh_mask.h` to provide access
> to the required register
> definitions.
>
> v2: vm flush by the vm invalidation packet (Lijo)
> v3: code style and define the macro for the vm invalidation packet (Christian)
> v4: Format definition sdma vm invalidate packet (Lijo)
>
> Suggested-by: Lijo Lazar <lijo.la...@amd.com>
> Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 77 +++++++++++++++----
> .../gpu/drm/amd/amdgpu/vega10_sdma_pkt_open.h | 54 +++++++++++++
> 2 files changed, 117 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index fd34dc138081..06ce0c98ef5d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -31,6 +31,7 @@
> #include "amdgpu_ucode.h"
> #include "amdgpu_trace.h"
> #include "amdgpu_reset.h"
> +#include "gc/gc_9_0_sh_mask.h"
>
> #include "sdma/sdma_4_4_2_offset.h"
> #include "sdma/sdma_4_4_2_sh_mask.h"
> @@ -1292,21 +1293,71 @@ static void
> sdma_v4_4_2_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> seq, 0xffffffff, 4);
> }
>
> -
> -/**
> - * sdma_v4_4_2_ring_emit_vm_flush - vm flush using sDMA
> +/*
> + * sdma_v4_4_2_get_invalidate_req - Construct the VM_INVALIDATE_ENG0_REQ
> register value
> + * @vmid: The VMID to invalidate
> + * @flush_type: The type of flush (0 = legacy, 1 = lightweight, 2 =
> heavyweight)
> *
> - * @ring: amdgpu_ring pointer
> - * @vmid: vmid number to use
> - * @pd_addr: address
> + * This function constructs the VM_INVALIDATE_ENG0_REQ register value for
> the specified VMID
> + * and flush type. It ensures that all relevant page table cache levels (L1
> PTEs, L2 PTEs, and
> + * L2 PDEs) are invalidated.
> + */
> +static uint32_t sdma_v4_4_2_get_invalidate_req(unsigned int vmid,
> + uint32_t flush_type)
> +{
> + u32 req = 0;
> +
> + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ,
> + PER_VMID_INVALIDATE_REQ, 1 << vmid);
> + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, FLUSH_TYPE,
> flush_type);
> + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PTES, 1);
> + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE0, 1);
> + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE1, 1);
> + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE2, 1);
> + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L1_PTES, 1);
> + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ,
> + CLEAR_PROTECTION_FAULT_STATUS_ADDR, 0);
> +
> + return req;
> +}
> +
> +/*
> + * sdma_v4_4_2_ring_emit_vm_flush - Emit VM flush commands for SDMA
> + * @ring: The SDMA ring
> + * @vmid: The VMID to flush
> + * @pd_addr: The page directory address
> *
> - * Update the page table base and flush the VM TLB
> - * using sDMA.
> + * This function emits the necessary register writes and waits to perform a
> VM flush for the
> + * specified VMID. It updates the PTB address registers and issues a VM
> invalidation request
> + * using the specified VM invalidation engine.
> */
> static void sdma_v4_4_2_ring_emit_vm_flush(struct amdgpu_ring *ring,
> - unsigned vmid, uint64_t pd_addr)
> + unsigned int vmid, uint64_t pd_addr)
> {
> - amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
> + struct amdgpu_device *adev = ring->adev;
> + uint32_t req = sdma_v4_4_2_get_invalidate_req(vmid, 0);
> + unsigned int eng = ring->vm_inv_eng;
> + struct amdgpu_vmhub *hub = &adev->vmhub[ring->vm_hub];
> +
> + amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 +
> + (hub->ctx_addr_distance * vmid),
> + lower_32_bits(pd_addr));
> +
> + amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 +
> + (hub->ctx_addr_distance * vmid),
> + upper_32_bits(pd_addr));
> + /*
> + * Construct and emit the VM invalidation packet
> + */
> + amdgpu_ring_write(ring,
> + SDMA_PKT_VM_INVALIDATE_HEADER_OP(SDMA_OP_VM_INVALIDATE) |
> + SDMA_PKT_VM_INVALIDATE_HEADER_SUB_OP(SDMA_SUBOP_VM_INVALIDATE) |
> + SDMA_PKT_VM_INVALIDATE_HEADER_XCC0_ENGINE_ID(0x1f) |
> + SDMA_PKT_VM_INVALIDATE_HEADER_XCC1_ENGINE_ID(0x1f |
> + SDMA_PKT_VM_INVALIDATE_HEADER_MMHUB_ENGINE_ID(eng)));
> + amdgpu_ring_write(ring, VM_INVALIDATE_REQ_INVALIDATE(req));
> + amdgpu_ring_write(ring, 0);
> + amdgpu_ring_write(ring,
> VM_INVALIDATE_ADDR_RANGE_INVALIDATE_ACK(BIT(vmid)));
> }
>
> static void sdma_v4_4_2_ring_emit_wreg(struct amdgpu_ring *ring,
> @@ -2115,8 +2166,7 @@ static const struct amdgpu_ring_funcs
> sdma_v4_4_2_ring_funcs = {
> 3 + /* hdp invalidate */
> 6 + /* sdma_v4_4_2_ring_emit_pipeline_sync */
> /* sdma_v4_4_2_ring_emit_vm_flush */
> - SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> + 4 + 2 * 3 +
> 10 + 10 + 10, /* sdma_v4_4_2_ring_emit_fence x3 for user fence,
> vm fence */
> .emit_ib_size = 7 + 6, /* sdma_v4_4_2_ring_emit_ib */
> .emit_ib = sdma_v4_4_2_ring_emit_ib,
> @@ -2148,8 +2198,7 @@ static const struct amdgpu_ring_funcs
> sdma_v4_4_2_page_ring_funcs = {
> 3 + /* hdp invalidate */
> 6 + /* sdma_v4_4_2_ring_emit_pipeline_sync */
> /* sdma_v4_4_2_ring_emit_vm_flush */
> - SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> + 4 + 2 * 3 +
> 10 + 10 + 10, /* sdma_v4_4_2_ring_emit_fence x3 for user fence,
> vm fence */
> .emit_ib_size = 7 + 6, /* sdma_v4_4_2_ring_emit_ib */
> .emit_ib = sdma_v4_4_2_ring_emit_ib,
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_sdma_pkt_open.h
> b/drivers/gpu/drm/amd/amdgpu/vega10_sdma_pkt_open.h
> index 8de4ccce5e38..2da2e2443c87 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega10_sdma_pkt_open.h
> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_sdma_pkt_open.h
> @@ -64,6 +64,9 @@
> #define HEADER_BARRIER 5
> #define SDMA_OP_AQL_COPY 0
> #define SDMA_OP_AQL_BARRIER_OR 0
> +/* vm invalidation is only available for GC9.4.3/GC9.4.4/GC9.5.0 */
> +#define SDMA_OP_VM_INVALIDATE 8
> +#define SDMA_SUBOP_VM_INVALIDATE 4
>
> /*define for op field*/
> #define SDMA_PKT_HEADER_op_offset 0
> @@ -3331,5 +3334,56 @@
> #define
> SDMA_AQL_PKT_BARRIER_OR_COMPLETION_SIGNAL_HI_completion_signal_63_32_shift 0
> #define
> SDMA_AQL_PKT_BARRIER_OR_COMPLETION_SIGNAL_HI_COMPLETION_SIGNAL_63_32(x) (((x)
> & SDMA_AQL_PKT_BARRIER_OR_COMPLETION_SIGNAL_HI_completion_signal_63_32_mask)
> << SDMA_AQL_PKT_BARRIER_OR_COMPLETION_SIGNAL_HI_completion_signal_63_32_shift)
>
> +/*
> +** Definitions for SDMA_VM_INVALIDATION packet
> +*/
>
> +/* Define for HEADER word (DW0) */
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_op_offset 0
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_op_mask 0x000000FF
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_op_shift 0
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_OP(x) (((x) &
> SDMA_PKT_VM_INVALIDATE_HEADER_op_mask) <<
> SDMA_PKT_VM_INVALIDATE_HEADER_op_shift)
> +
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_SUB_op_offset 8
By consistent format, I meant -
_offset = DWORD offset
_mask = Mask of the field
_shift = shift required for the field within that DWORD
Besides, all defines start with
SDMA_PKT_<packet name>_<dword name>_<field name>_offset
SDMA_PKT_<packet name>_<dword name>_<field name>_mask
SDMA_PKT_<packet name>_<dword name>_<field name>_shift
Or, better, approach the design/verif team to generate the header and copy it
directly.
Thanks,
Lijo
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_SUB_op_mask 0x000000FF
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_SUB_op_shift 8
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_SUB_OP(x) (((x) &
> SDMA_PKT_VM_INVALIDATE_HEADER_SUB_op_mask) <<
> SDMA_PKT_VM_INVALIDATE_HEADER_SUB_op_shift)
> +
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_xcc0_engine_id_offset 16
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_xcc0_engine_id_mask 0x0000001F
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_xcc0_engine_id_shift 16
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_XCC0_ENGINE_ID(x) (((x) &
> SDMA_PKT_VM_INVALIDATE_HEADER_xcc0_engine_id_mask) <<
> SDMA_PKT_VM_INVALIDATE_HEADER_xcc0_engine_id_shift)
> +
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_xcc1_engine_id_offset 21
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_xcc1_engine_id_mask 0x0000003E
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_xcc1_engine_id_shift 21
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_XCC1_ENGINE_ID(x) (((x) &
> SDMA_PKT_VM_INVALIDATE_HEADER_xcc1_engine_id_mask) <<
> SDMA_PKT_VM_INVALIDATE_HEADER_xcc1_engine_id_shift)
> +
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_mmhub_engine_id_offset 26
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_mmhub_engine_id_mask 0x0000007C
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_mmhub_engine_id_shift 26
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_MMHUB_ENGINE_ID(x) (((x) &
> SDMA_PKT_VM_INVALIDATE_HEADER_mmhub_engine_id_mask) <<
> SDMA_PKT_VM_INVALIDATE_HEADER_mmhub_engine_id_shift)
> +
> +/* Define for INVALIDATEREQ word (DW1) */
> +#define VM_INVALIDATE_req_invalidate_offset 0
> +#define VM_INVALIDATE_req_invalidate_mask 0xFFFFFFFF
> +#define VM_INVALIDATE_req_invalidate_shift 0
> +#define VM_INVALIDATE_REQ_INVALIDATE(x) (((x) &
> VM_INVALIDATE_req_invalidate_mask) << VM_INVALIDATE_req_invalidate_shift)
> +
> +/* Define for ADDRESSRANGELO word (DW2) */
> +#define VM_INVALIDATE_ADDR_RANGE_LO_addr_31_0_offset 0
> +#define VM_INVALIDATE_ADDR_RANGE_LO_addr_31_0_mask 0xFFFFFFFF
> +#define VM_INVALIDATE_ADDR_RANGE_LO_addr_31_0_shift 0
> +#define VM_INVALIDATE_ADDR_RANGE_LO_ADDR_31_0(x) (((x) &
> VM_INVALIDATE_ADDR_RANGE_LO_addr_31_0_mask) <<
> VM_INVALIDATE_ADDR_RANGE_LO_addr_31_0_shift)
> +
> +#define VM_INVALIDATE_ADDR_RANGE_HI_addr_64_32_offset 16
> +#define VM_INVALIDATE_ADDR_RANGE_HI_addr_64_32_mask 0x0000001F
> +#define VM_INVALIDATE_ADDR_RANGE_HI_addr_64_32_shift 16
> +#define VM_INVALIDATE_ADDR_RANGE_HI_ADDR_64_32(x) (((x) &
> VM_INVALIDATE_ADDR_RANGE_HI_addr_64_32_mask) <<
> VM_INVALIDATE_ADDR_RANGE_HI_addr_64_32_shift)
> +
> +/* Define for ADDRESSRANGEHI and INVALIDATEACK word (DW3) */
> +#define VM_INVALIDATE_ADDR_RANGE_invalidate_ack_offset 0
> +#define VM_INVALIDATE_ADDR_RANGE_invalidate_ack_mask 0x0000FFFF
> +#define VM_INVALIDATE_ADDR_RANGE_invalidate_ack_shift 0
> +#define VM_INVALIDATE_ADDR_RANGE_INVALIDATE_ACK(x) (((x) &
> VM_INVALIDATE_ADDR_RANGE_invalidate_ack_mask) <<
> VM_INVALIDATE_ADDR_RANGE_invalidate_ack_shift)
> #endif /* __SDMA_PKT_OPEN_H_ */