Re: [RFC] drm/amdgpu: flush gpu tlb using sdma ring

Christian König Fri, 09 Jun 2023 04:37:13 -0700

Am 09.06.23 um 12:02 schrieb Shashank Sharma:

This RFC patch moves the code to flush TLB using SDMA ring to
a different place like a separate function. The idea is that
KFD/KGD code should be able to flush GPU TLB using SDMA if they
want to.


Cc: Christian Koenig <christian.koe...@amd.com>
Signed-off-by: Shashank Sharma <shashank.sha...@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 41 ++++++++++++++++++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  1 +
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   | 36 +++++----------------
  3 files changed, 50 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 231ca06bc9c7..20a5dad32bfc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -30,6 +30,47 @@
  /* SDMA CSA reside in the 3rd page of CSA */
  #define AMDGPU_CSA_SDMA_OFFSET (4096 * 2)

+/**

+ * amdgpu_sdma_flush_tlb - flush gpu TLB using SDMA ring
+ *
+ * @adev: amdgpu device pointer
+ *
+ * return: returns dma fence which must be put by caller
+ *
+ * The SDMA on Navi has a bug which can theoretically result in memory
+ * corruption if an invalidation happens at the same time as an VA
+ * translation. Avoid this by doing the invalidation from the SDMA
+ * itself.
+ */
+struct dma_fence *amdgpu_sdma_flush_tlb(struct amdgpu_device *adev)

That function is actually not really SDMA related. It just happens thatthe SDMA is used for buffer moves and we needed that as workaround forHW problems.

Maybe add a function to amdgpu_ttm.c with the implementation to use anyring for TLB flush and a wrapper here to use the SDMA for TLB flush.

+{
+       int r;
+       struct dma_fence *fence;
+       struct amdgpu_job *job;
+       struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;


Variables like "r" last and longer lines first please.

Apart from those nit picks looks good to me,
Christian.

+
+       mutex_lock(&adev->mman.gtt_window_lock);
+       r = amdgpu_job_alloc_with_ib(ring->adev, &adev->mman.entity,
+                                    AMDGPU_FENCE_OWNER_UNDEFINED,
+                                    16 * 4, AMDGPU_IB_POOL_IMMEDIATE,
+                                    &job);
+       if (r)
+               goto error_alloc;
+
+       job->vm_pd_addr = amdgpu_gmc_pd_addr(adev->gart.bo);
+       job->vm_needs_flush = true;
+       job->ibs->ptr[job->ibs->length_dw++] = ring->funcs->nop;
+       amdgpu_ring_pad_ib(ring, &job->ibs[0]);
+       fence = amdgpu_job_submit(job);
+
+       mutex_unlock(&adev->mman.gtt_window_lock);
+       return fence;
+
+error_alloc:
+       mutex_unlock(&adev->mman.gtt_window_lock);
+       return NULL;
+}
+
  /*
   * GPU SDMA IP block helpers function.
   */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index fc8528812598..c895948f6e82 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -130,5 +130,6 @@ void amdgpu_sdma_destroy_inst_ctx(struct amdgpu_device 
*adev,
          bool duplicate);
  void amdgpu_sdma_unset_buffer_funcs_helper(struct amdgpu_device *adev);
  int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev);
+struct dma_fence *amdgpu_sdma_flush_tlb(struct amdgpu_device *adev);

#endif

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index ab2556ca984e..0bfaee00a838 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -52,6 +52,7 @@
  #include "athub_v2_1.h"

#include "amdgpu_reset.h"

+#include "amdgpu_sdma.h"

#if 0

  static const struct soc15_reg_golden golden_settings_navi10_hdp[] =
@@ -333,9 +334,6 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device 
*adev, uint32_t vmid,
  {
        struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
        struct dma_fence *fence;
-       struct amdgpu_job *job;
-
-       int r;

/* flush hdp cache */

        adev->hdp.funcs->flush_hdp(adev, NULL);
@@ -378,34 +376,16 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device 
*adev, uint32_t vmid,
                return;
        }

- /* The SDMA on Navi has a bug which can theoretically result in memory

-        * corruption if an invalidation happens at the same time as an VA
-        * translation. Avoid this by doing the invalidation from the SDMA
-        * itself.
-        */
-       r = amdgpu_job_alloc_with_ib(ring->adev, &adev->mman.entity,
-                                    AMDGPU_FENCE_OWNER_UNDEFINED,
-                                    16 * 4, AMDGPU_IB_POOL_IMMEDIATE,
-                                    &job);
-       if (r)
-               goto error_alloc;
-
-       job->vm_pd_addr = amdgpu_gmc_pd_addr(adev->gart.bo);
-       job->vm_needs_flush = true;
-       job->ibs->ptr[job->ibs->length_dw++] = ring->funcs->nop;
-       amdgpu_ring_pad_ib(ring, &job->ibs[0]);
-       fence = amdgpu_job_submit(job);
-
        mutex_unlock(&adev->mman.gtt_window_lock);

- dma_fence_wait(fence, false);

-       dma_fence_put(fence);
-
-       return;
+       fence = amdgpu_sdma_flush_tlb(adev);
+       if (fence) {
+               dma_fence_wait(fence, false);
+               dma_fence_put(fence);
+               return;
+       }

-error_alloc:

-       mutex_unlock(&adev->mman.gtt_window_lock);
-       DRM_ERROR("Error flushing GPU TLB using the SDMA (%d)!\n", r);
+       DRM_ERROR("Error flushing GPU TLB using the SDMA !\n");
  }

/**

Re: [RFC] drm/amdgpu: flush gpu tlb using sdma ring

Reply via email to