Am 07.02.2016 um 22:51 schrieb Matthew Dawson:
> When the radeon driver resets a gpu, it attempts to test whether all the
> rings can successfully handle an IB.  If these rings fail to respond, the
> process will wait forever.  Another gpu reset can't happen at this point,
> as the current reset holds a lock required to do so.  Instead, make all
> the IB tests run with a timeout, so the system can attempt to recover
> in this case.
>
> While this doesn't fix the underlying issue with card resets failing, it
> gives the system a higher chance of recovering.  These timeouts have been
> confirmed to help both a Tahiti and a Hawaii card recover after a gpu reset.
>
> This also adds a new function, radeon_fence_wait_timeout, that behaves like
> fence_wait_timeout.  It is used instead of fence_wait_timeout as it continues
> to work during a reset.  radeon_fence_wait is changed to be implemented
> using this function.
>
> V2:
>   - Changed the timeout to 1s, as the default 10s from radeon_wait_timeout was
> too long.  A timeout of 100ms was tested and found to be too short.
>   - Changed radeon_fence_wait_timeout to behave more like fence_wait_timeout.
>
> Signed-off-by: Matthew Dawson <matthew at mjdsystems.ca>

Reviewed-by: Christian König <christian.koenig at amd.com>

Regards,
Christian.

> ---
>   drivers/gpu/drm/radeon/cik.c          | 11 ++++++++--
>   drivers/gpu/drm/radeon/cik_sdma.c     |  9 ++++++--
>   drivers/gpu/drm/radeon/r100.c         | 10 +++++++--
>   drivers/gpu/drm/radeon/r600.c         | 10 +++++++--
>   drivers/gpu/drm/radeon/r600_dma.c     |  9 ++++++--
>   drivers/gpu/drm/radeon/radeon.h       |  2 ++
>   drivers/gpu/drm/radeon/radeon_fence.c | 40 ++++++++++++++++++++++++++++-------
>   drivers/gpu/drm/radeon/radeon_vce.c   | 11 +++++++---
>   drivers/gpu/drm/radeon/uvd_v1_0.c     | 10 +++++++--
>   9 files changed, 89 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
> index 4c30d8c..0600140 100644
> --- a/drivers/gpu/drm/radeon/cik.c
> +++ b/drivers/gpu/drm/radeon/cik.c
> @@ -4219,13 +4219,20 @@ int cik_ib_test(struct radeon_device *rdev, struct 
> radeon_ring *ring)
>               DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>               return r;
>       }
> -     r = radeon_fence_wait(ib.fence, false);
> -     if (r) {
> +     r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> +             RADEON_USEC_IB_TEST_TIMEOUT));
> +     if (r < 0) {
>               DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>               radeon_scratch_free(rdev, scratch);
>               radeon_ib_free(rdev, &ib);
>               return r;
> +     } else if (r == 0) {
> +             DRM_ERROR("radeon: fence wait timed out.\n");
> +             radeon_scratch_free(rdev, scratch);
> +             radeon_ib_free(rdev, &ib);
> +             return -ETIMEDOUT;
>       }
> +     r = 0;
>       for (i = 0; i < rdev->usec_timeout; i++) {
>               tmp = RREG32(scratch);
>               if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/cik_sdma.c 
> b/drivers/gpu/drm/radeon/cik_sdma.c
> index d16f2ee..9c351dc 100644
> --- a/drivers/gpu/drm/radeon/cik_sdma.c
> +++ b/drivers/gpu/drm/radeon/cik_sdma.c
> @@ -737,11 +737,16 @@ int cik_sdma_ib_test(struct radeon_device *rdev, struct 
> radeon_ring *ring)
>               DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>               return r;
>       }
> -     r = radeon_fence_wait(ib.fence, false);
> -     if (r) {
> +     r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> +             RADEON_USEC_IB_TEST_TIMEOUT));
> +     if (r < 0) {
>               DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>               return r;
> +     } else if (r == 0) {
> +             DRM_ERROR("radeon: fence wait timed out.\n");
> +             return -ETIMEDOUT;
>       }
> +     r = 0;
>       for (i = 0; i < rdev->usec_timeout; i++) {
>               tmp = le32_to_cpu(rdev->wb.wb[index/4]);
>               if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
> index 5eae0a8..6e478a2 100644
> --- a/drivers/gpu/drm/radeon/r100.c
> +++ b/drivers/gpu/drm/radeon/r100.c
> @@ -3732,11 +3732,17 @@ int r100_ib_test(struct radeon_device *rdev, struct 
> radeon_ring *ring)
>               DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>               goto free_ib;
>       }
> -     r = radeon_fence_wait(ib.fence, false);
> -     if (r) {
> +     r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> +             RADEON_USEC_IB_TEST_TIMEOUT));
> +     if (r < 0) {
>               DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>               goto free_ib;
> +     } else if (r == 0) {
> +             DRM_ERROR("radeon: fence wait timed out.\n");
> +             r = -ETIMEDOUT;
> +             goto free_ib;
>       }
> +     r = 0;
>       for (i = 0; i < rdev->usec_timeout; i++) {
>               tmp = RREG32(scratch);
>               if (tmp == 0xDEADBEEF) {
> diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
> index cc2fdf0..ed12104 100644
> --- a/drivers/gpu/drm/radeon/r600.c
> +++ b/drivers/gpu/drm/radeon/r600.c
> @@ -3381,11 +3381,17 @@ int r600_ib_test(struct radeon_device *rdev, struct 
> radeon_ring *ring)
>               DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>               goto free_ib;
>       }
> -     r = radeon_fence_wait(ib.fence, false);
> -     if (r) {
> +     r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> +             RADEON_USEC_IB_TEST_TIMEOUT));
> +     if (r < 0) {
>               DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>               goto free_ib;
> +     } else if (r == 0) {
> +             DRM_ERROR("radeon: fence wait timed out.\n");
> +             r = -ETIMEDOUT;
> +             goto free_ib;
>       }
> +     r = 0;
>       for (i = 0; i < rdev->usec_timeout; i++) {
>               tmp = RREG32(scratch);
>               if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/r600_dma.c 
> b/drivers/gpu/drm/radeon/r600_dma.c
> index d2dd29a..fb65e6f 100644
> --- a/drivers/gpu/drm/radeon/r600_dma.c
> +++ b/drivers/gpu/drm/radeon/r600_dma.c
> @@ -368,11 +368,16 @@ int r600_dma_ib_test(struct radeon_device *rdev, struct 
> radeon_ring *ring)
>               DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>               return r;
>       }
> -     r = radeon_fence_wait(ib.fence, false);
> -     if (r) {
> +     r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> +             RADEON_USEC_IB_TEST_TIMEOUT));
> +     if (r < 0) {
>               DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>               return r;
> +     } else if (r == 0) {
> +             DRM_ERROR("radeon: fence wait timed out.\n");
> +             return -ETIMEDOUT;
>       }
> +     r = 0;
>       for (i = 0; i < rdev->usec_timeout; i++) {
>               tmp = le32_to_cpu(rdev->wb.wb[index/4]);
>               if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 78a51b3..007be29 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -120,6 +120,7 @@ extern int radeon_mst;
>    */
>   #define RADEON_MAX_USEC_TIMEOUT                     100000  /* 100 ms */
>   #define RADEON_FENCE_JIFFIES_TIMEOUT                (HZ / 2)
> +#define RADEON_USEC_IB_TEST_TIMEOUT          1000000 /* 1s */
>   /* RADEON_IB_POOL_SIZE must be a power of 2 */
>   #define RADEON_IB_POOL_SIZE                 16
>   #define RADEON_DEBUGFS_MAX_COMPONENTS               32
> @@ -382,6 +383,7 @@ void radeon_fence_driver_force_completion(struct 
> radeon_device *rdev, int ring);
>   int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence 
> **fence, int ring);
>   void radeon_fence_process(struct radeon_device *rdev, int ring);
>   bool radeon_fence_signaled(struct radeon_fence *fence);
> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool 
> interruptible, long timeout);
>   int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
>   int radeon_fence_wait_next(struct radeon_device *rdev, int ring);
>   int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c 
> b/drivers/gpu/drm/radeon/radeon_fence.c
> index 05815c4..7ef075a 100644
> --- a/drivers/gpu/drm/radeon/radeon_fence.c
> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
> @@ -527,7 +527,7 @@ static long radeon_fence_wait_seq_timeout(struct 
> radeon_device *rdev,
>   }
>   
>   /**
> - * radeon_fence_wait - wait for a fence to signal
> + * radeon_fence_wait_timeout - wait for a fence to signal with timeout
>    *
>    * @fence: radeon fence object
>    * @intr: use interruptible sleep
> @@ -535,12 +535,15 @@ static long radeon_fence_wait_seq_timeout(struct 
> radeon_device *rdev,
>    * Wait for the requested fence to signal (all asics).
>    * @intr selects whether to use interruptable (true) or non-interruptable
>    * (false) sleep when waiting for the fence.
> - * Returns 0 if the fence has passed, error for all other cases.
> + * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite wait
> + * Returns remaining time if the sequence number has passed, 0 when
> + * the wait times out, or an error for all other cases.
>    */
> -int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr, long 
> timeout)
>   {
>       uint64_t seq[RADEON_NUM_RINGS] = {};
>       long r;
> +     int r_sig;
>   
>       /*
>        * This function should not be called on !radeon fences.
> @@ -552,15 +555,36 @@ int radeon_fence_wait(struct radeon_fence *fence, bool 
> intr)
>               return fence_wait(&fence->base, intr);
>   
>       seq[fence->ring] = fence->seq;
> -     r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, 
> MAX_SCHEDULE_TIMEOUT);
> -     if (r < 0) {
> +     r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, timeout);
> +     if (r <= 0) {
>               return r;
>       }
>   
> -     r = fence_signal(&fence->base);
> -     if (!r)
> +     r_sig = fence_signal(&fence->base);
> +     if (!r_sig)
>               FENCE_TRACE(&fence->base, "signaled from fence_wait\n");
> -     return 0;
> +     return r;
> +}
> +
> +/**
> + * radeon_fence_wait - wait for a fence to signal
> + *
> + * @fence: radeon fence object
> + * @intr: use interruptible sleep
> + *
> + * Wait for the requested fence to signal (all asics).
> + * @intr selects whether to use interruptable (true) or non-interruptable
> + * (false) sleep when waiting for the fence.
> + * Returns 0 if the fence has passed, error for all other cases.
> + */
> +int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> +{
> +     long r = radeon_fence_wait_timeout(fence, intr, MAX_SCHEDULE_TIMEOUT);
> +     if (r > 0) {
> +             return 0;
> +     } else {
> +             return r;
> +     }
>   }
>   
>   /**
> diff --git a/drivers/gpu/drm/radeon/radeon_vce.c 
> b/drivers/gpu/drm/radeon/radeon_vce.c
> index 7eb1ae7..566a1a0 100644
> --- a/drivers/gpu/drm/radeon/radeon_vce.c
> +++ b/drivers/gpu/drm/radeon/radeon_vce.c
> @@ -810,11 +810,16 @@ int radeon_vce_ib_test(struct radeon_device *rdev, 
> struct radeon_ring *ring)
>               goto error;
>       }
>   
> -     r = radeon_fence_wait(fence, false);
> -     if (r) {
> +     r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
> +             RADEON_USEC_IB_TEST_TIMEOUT));
> +     if (r < 0) {
>               DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> +     } else if (r == 0) {
> +             DRM_ERROR("radeon: fence wait timed out.\n");
> +             r = -ETIMEDOUT;
>       } else {
> -             DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
> +             DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
> +             r = 0;
>       }
>   error:
>       radeon_fence_unref(&fence);
> diff --git a/drivers/gpu/drm/radeon/uvd_v1_0.c 
> b/drivers/gpu/drm/radeon/uvd_v1_0.c
> index c6b1cbc..12ddcfa 100644
> --- a/drivers/gpu/drm/radeon/uvd_v1_0.c
> +++ b/drivers/gpu/drm/radeon/uvd_v1_0.c
> @@ -522,11 +522,17 @@ int uvd_v1_0_ib_test(struct radeon_device *rdev, struct 
> radeon_ring *ring)
>               goto error;
>       }
>   
> -     r = radeon_fence_wait(fence, false);
> -     if (r) {
> +     r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
> +             RADEON_USEC_IB_TEST_TIMEOUT));
> +     if (r < 0) {
>               DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>               goto error;
> +     } else if (r == 0) {
> +             DRM_ERROR("radeon: fence wait timed out.\n");
> +             r = -ETIMEDOUT;
> +             goto error;
>       }
> +     r = 0;
>       DRM_INFO("ib test on ring %d succeeded\n",  ring->idx);
>   error:
>       radeon_fence_unref(&fence);

Reply via email to