From: Dennis Li <dennis...@amd.com>

When the recovery thread has begun a GPU reset, no other thread should
access the hardware, otherwise the system randomly hangs.

v2 (chk): rewritten from scratch, use trylock and lockdep instead of
          hand-wiring the logic.

Signed-off-by: Dennis Li <dennis...@amd.com>
Signed-off-by: Christian König <christian.koe...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 74 +++++++++++++++++-----
 1 file changed, 57 insertions(+), 17 deletions(-)
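
For reference, the assertion pattern this patch introduces can be
illustrated in isolation. This is a minimal sketch, not the amdgpu code
itself; the semaphore name is a stand-in for adev->reset_sem:

    #include <linux/rwsem.h>
    #include <linux/lockdep.h>

    /* Stand-in for adev->reset_sem, write-held by the reset thread. */
    static DECLARE_RWSEM(example_reset_sem);

    static void assert_no_concurrent_reset(void)
    {
    #ifdef CONFIG_LOCKDEP
            /*
             * If the read side can be taken, no writer (i.e. no reset)
             * is running in parallel; drop the lock again immediately.
             */
            if (down_read_trylock(&example_reset_sem))
                    up_read(&example_reset_sem);
            else
                    /*
                     * Otherwise lockdep must prove this thread already
                     * holds the lock, either the read side or, as the
                     * reset thread itself, the write side.
                     */
                    lockdep_assert_held(&example_reset_sem);
    #endif
    }

Note that lockdep_assert_held() is satisfied by either the read or the
write side of an rw_semaphore, which is what allows the reset thread
itself (the write holder) to keep accessing registers during the reset.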

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e247c3a2ec08..c990af6a43ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -326,6 +326,34 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 /*
  * register access helper functions.
  */
+
+/* Check if hw access should be skipped because of hotplug or device error */
+static bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
+{
+       if (adev->in_pci_err_recovery)
+               return true;
+
+#ifdef CONFIG_LOCKDEP
+       /*
+        * This is a bit complicated to understand, so worth a comment. What we
+        * assert here is that the GPU reset is not running on another thread
+        * in parallel.
+        *
+        * For this we trylock the read side of the reset semaphore; if that
+        * succeeds we know that the reset is not running in parallel.
+        *
+        * If the trylock fails we assert that we are either already holding
+        * the read side of the lock or are the reset thread itself and hold
+        * the write side of the lock.
+        */
+       if (down_read_trylock(&adev->reset_sem))
+               up_read(&adev->reset_sem);
+       else
+               lockdep_assert_held(&adev->reset_sem);
+#endif
+
+       return false;
+}
+
 /**
  * amdgpu_device_rreg - read a memory mapped IO or indirect register
  *
@@ -340,7 +368,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 {
        uint32_t ret;
 
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return 0;
 
        if ((reg * 4) < adev->rmmio_size) {
@@ -377,7 +405,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
  */
 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return 0;
 
        if (offset < adev->rmmio_size)
@@ -402,7 +430,7 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
  */
 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return;
 
        if (offset < adev->rmmio_size)
@@ -425,7 +453,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
                        uint32_t reg, uint32_t v,
                        uint32_t acc_flags)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return;
 
        if ((reg * 4) < adev->rmmio_size) {
@@ -452,7 +480,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
                             uint32_t reg, uint32_t v)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return;
 
        if (amdgpu_sriov_fullaccess(adev) &&
@@ -475,7 +503,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
  */
 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return 0;
 
        if ((reg * 4) < adev->rio_mem_size)
@@ -497,7 +525,7 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
  */
 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return;
 
        if ((reg * 4) < adev->rio_mem_size)
@@ -519,7 +547,7 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
  */
 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return 0;
 
        if (index < adev->doorbell.num_doorbells) {
@@ -542,7 +570,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
  */
 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return;
 
        if (index < adev->doorbell.num_doorbells) {
@@ -563,7 +591,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
  */
 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return 0;
 
        if (index < adev->doorbell.num_doorbells) {
@@ -586,7 +614,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
  */
 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return;
 
        if (index < adev->doorbell.num_doorbells) {
@@ -610,10 +638,13 @@ u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
                                u32 pcie_index, u32 pcie_data,
                                u32 reg_addr)
 {
-       unsigned long flags;
-       u32 r;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;
+       unsigned long flags;
+       u32 r;
+
+       if (amdgpu_device_skip_hw_access(adev))
+               return 0;
 
        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -641,10 +672,13 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
                                  u32 pcie_index, u32 pcie_data,
                                  u32 reg_addr)
 {
-       unsigned long flags;
-       u64 r;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;
+       unsigned long flags;
+       u64 r;
+
+       if (amdgpu_device_skip_hw_access(adev))
+               return 0;
 
        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -677,9 +711,12 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
                                 u32 pcie_index, u32 pcie_data,
                                 u32 reg_addr, u32 reg_data)
 {
-       unsigned long flags;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;
+       unsigned long flags;
+
+       if (amdgpu_device_skip_hw_access(adev))
+               return;
 
        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -706,9 +743,12 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
                                   u32 pcie_index, u32 pcie_data,
                                   u32 reg_addr, u64 reg_data)
 {
-       unsigned long flags;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;
+       unsigned long flags;
+
+       if (amdgpu_device_skip_hw_access(adev))
+               return;
 
        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
-- 
2.25.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx