Am 07.02.2016 um 22:51 schrieb Matthew Dawson: > When the radeon driver resets a gpu, it attempts to test whether all the > rings can successfully handle an IB. If these rings fail to respond, the > process will wait forever. Another gpu reset can't happen at this point, > as the current reset holds a lock required to do so. Instead, make all > the IB tests run with a timeout, so the system can attempt to recover > in this case. > > While this doesn't fix the underlying issue with card resets failing, it > gives the system a higher chance of recovering. These timeouts have been > confirmed to help both a Tahiti and Hawaii card recover after a gpu reset. > > This also adds a new function, radeon_fence_wait_timeout, that behaves like > fence_wait_timeout. It is used instead of fence_wait_timeout as it continues > to work during a reset. radeon_fence_wait is changed to be implemented > using this function. > > V2: > - Changed the timeout to 1s, as the default 10s from radeon_wait_timeout was > too long. A timeout of 100ms was tested and found to be too short. > - Changed radeon_fence_wait_timeout to behave more like fence_wait_timeout. > > Signed-off-by: Matthew Dawson <matthew at mjdsystems.ca>
Reviewed-by: Christian König <christian.koenig at amd.com> Regards, Christian. > --- > drivers/gpu/drm/radeon/cik.c | 11 ++++++++-- > drivers/gpu/drm/radeon/cik_sdma.c | 9 ++++++-- > drivers/gpu/drm/radeon/r100.c | 10 +++++++-- > drivers/gpu/drm/radeon/r600.c | 10 +++++++-- > drivers/gpu/drm/radeon/r600_dma.c | 9 ++++++-- > drivers/gpu/drm/radeon/radeon.h | 2 ++ > drivers/gpu/drm/radeon/radeon_fence.c | 40 > ++++++++++++++++++++++++++++------- > drivers/gpu/drm/radeon/radeon_vce.c | 11 +++++++--- > drivers/gpu/drm/radeon/uvd_v1_0.c | 10 +++++++-- > 9 files changed, 89 insertions(+), 23 deletions(-) > > diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c > index 4c30d8c..0600140 100644 > --- a/drivers/gpu/drm/radeon/cik.c > +++ b/drivers/gpu/drm/radeon/cik.c > @@ -4219,13 +4219,20 @@ int cik_ib_test(struct radeon_device *rdev, struct > radeon_ring *ring) > DRM_ERROR("radeon: failed to schedule ib (%d).\n", r); > return r; > } > - r = radeon_fence_wait(ib.fence, false); > - if (r) { > + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies( > + RADEON_USEC_IB_TEST_TIMEOUT)); > + if (r < 0) { > DRM_ERROR("radeon: fence wait failed (%d).\n", r); > radeon_scratch_free(rdev, scratch); > radeon_ib_free(rdev, &ib); > return r; > + } else if (r == 0) { > + DRM_ERROR("radeon: fence wait timed out.\n"); > + radeon_scratch_free(rdev, scratch); > + radeon_ib_free(rdev, &ib); > + return -ETIMEDOUT; > } > + r = 0; > for (i = 0; i < rdev->usec_timeout; i++) { > tmp = RREG32(scratch); > if (tmp == 0xDEADBEEF) > diff --git a/drivers/gpu/drm/radeon/cik_sdma.c > b/drivers/gpu/drm/radeon/cik_sdma.c > index d16f2ee..9c351dc 100644 > --- a/drivers/gpu/drm/radeon/cik_sdma.c > +++ b/drivers/gpu/drm/radeon/cik_sdma.c > @@ -737,11 +737,16 @@ int cik_sdma_ib_test(struct radeon_device *rdev, struct > radeon_ring *ring) > DRM_ERROR("radeon: failed to schedule ib (%d).\n", r); > return r; > } > - r = radeon_fence_wait(ib.fence, false); > - if (r) { > + r = 
radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies( > + RADEON_USEC_IB_TEST_TIMEOUT)); > + if (r < 0) { > DRM_ERROR("radeon: fence wait failed (%d).\n", r); > return r; > + } else if (r == 0) { > + DRM_ERROR("radeon: fence wait timed out.\n"); > + return -ETIMEDOUT; > } > + r = 0; > for (i = 0; i < rdev->usec_timeout; i++) { > tmp = le32_to_cpu(rdev->wb.wb[index/4]); > if (tmp == 0xDEADBEEF) > diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c > index 5eae0a8..6e478a2 100644 > --- a/drivers/gpu/drm/radeon/r100.c > +++ b/drivers/gpu/drm/radeon/r100.c > @@ -3732,11 +3732,17 @@ int r100_ib_test(struct radeon_device *rdev, struct > radeon_ring *ring) > DRM_ERROR("radeon: failed to schedule ib (%d).\n", r); > goto free_ib; > } > - r = radeon_fence_wait(ib.fence, false); > - if (r) { > + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies( > + RADEON_USEC_IB_TEST_TIMEOUT)); > + if (r < 0) { > DRM_ERROR("radeon: fence wait failed (%d).\n", r); > goto free_ib; > + } else if (r == 0) { > + DRM_ERROR("radeon: fence wait timed out.\n"); > + r = -ETIMEDOUT; > + goto free_ib; > } > + r = 0; > for (i = 0; i < rdev->usec_timeout; i++) { > tmp = RREG32(scratch); > if (tmp == 0xDEADBEEF) { > diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c > index cc2fdf0..ed12104 100644 > --- a/drivers/gpu/drm/radeon/r600.c > +++ b/drivers/gpu/drm/radeon/r600.c > @@ -3381,11 +3381,17 @@ int r600_ib_test(struct radeon_device *rdev, struct > radeon_ring *ring) > DRM_ERROR("radeon: failed to schedule ib (%d).\n", r); > goto free_ib; > } > - r = radeon_fence_wait(ib.fence, false); > - if (r) { > + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies( > + RADEON_USEC_IB_TEST_TIMEOUT)); > + if (r < 0) { > DRM_ERROR("radeon: fence wait failed (%d).\n", r); > goto free_ib; > + } else if (r == 0) { > + DRM_ERROR("radeon: fence wait timed out.\n"); > + r = -ETIMEDOUT; > + goto free_ib; > } > + r = 0; > for (i = 0; i < 
rdev->usec_timeout; i++) { > tmp = RREG32(scratch); > if (tmp == 0xDEADBEEF) > diff --git a/drivers/gpu/drm/radeon/r600_dma.c > b/drivers/gpu/drm/radeon/r600_dma.c > index d2dd29a..fb65e6f 100644 > --- a/drivers/gpu/drm/radeon/r600_dma.c > +++ b/drivers/gpu/drm/radeon/r600_dma.c > @@ -368,11 +368,16 @@ int r600_dma_ib_test(struct radeon_device *rdev, struct > radeon_ring *ring) > DRM_ERROR("radeon: failed to schedule ib (%d).\n", r); > return r; > } > - r = radeon_fence_wait(ib.fence, false); > - if (r) { > + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies( > + RADEON_USEC_IB_TEST_TIMEOUT)); > + if (r < 0) { > DRM_ERROR("radeon: fence wait failed (%d).\n", r); > return r; > + } else if (r == 0) { > + DRM_ERROR("radeon: fence wait timed out.\n"); > + return -ETIMEDOUT; > } > + r = 0; > for (i = 0; i < rdev->usec_timeout; i++) { > tmp = le32_to_cpu(rdev->wb.wb[index/4]); > if (tmp == 0xDEADBEEF) > diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h > index 78a51b3..007be29 100644 > --- a/drivers/gpu/drm/radeon/radeon.h > +++ b/drivers/gpu/drm/radeon/radeon.h > @@ -120,6 +120,7 @@ extern int radeon_mst; > */ > #define RADEON_MAX_USEC_TIMEOUT 100000 /* 100 ms */ > #define RADEON_FENCE_JIFFIES_TIMEOUT (HZ / 2) > +#define RADEON_USEC_IB_TEST_TIMEOUT 1000000 /* 1s */ > /* RADEON_IB_POOL_SIZE must be a power of 2 */ > #define RADEON_IB_POOL_SIZE 16 > #define RADEON_DEBUGFS_MAX_COMPONENTS 32 > @@ -382,6 +383,7 @@ void radeon_fence_driver_force_completion(struct > radeon_device *rdev, int ring); > int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence > **fence, int ring); > void radeon_fence_process(struct radeon_device *rdev, int ring); > bool radeon_fence_signaled(struct radeon_fence *fence); > +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool > interruptible, long timeout); > int radeon_fence_wait(struct radeon_fence *fence, bool interruptible); > int radeon_fence_wait_next(struct radeon_device 
*rdev, int ring); > int radeon_fence_wait_empty(struct radeon_device *rdev, int ring); > diff --git a/drivers/gpu/drm/radeon/radeon_fence.c > b/drivers/gpu/drm/radeon/radeon_fence.c > index 05815c4..7ef075a 100644 > --- a/drivers/gpu/drm/radeon/radeon_fence.c > +++ b/drivers/gpu/drm/radeon/radeon_fence.c > @@ -527,7 +527,7 @@ static long radeon_fence_wait_seq_timeout(struct > radeon_device *rdev, > } > > /** > - * radeon_fence_wait - wait for a fence to signal > + * radeon_fence_wait_timeout - wait for a fence to signal with timeout > * > * @fence: radeon fence object > * @intr: use interruptible sleep > @@ -535,12 +535,15 @@ static long radeon_fence_wait_seq_timeout(struct > radeon_device *rdev, > * Wait for the requested fence to signal (all asics). > * @intr selects whether to use interruptable (true) or non-interruptable > * (false) sleep when waiting for the fence. > - * Returns 0 if the fence has passed, error for all other cases. > + * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite wait > + * Returns remaining time if the sequence number has passed, 0 when > + * the wait timeout, or an error for all other cases. > */ > -int radeon_fence_wait(struct radeon_fence *fence, bool intr) > +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr, long > timeout) > { > uint64_t seq[RADEON_NUM_RINGS] = {}; > long r; > + int r_sig; > > /* > * This function should not be called on !radeon fences. 
> @@ -552,15 +555,36 @@ int radeon_fence_wait(struct radeon_fence *fence, bool > intr) > return fence_wait(&fence->base, intr); > > seq[fence->ring] = fence->seq; > - r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, > MAX_SCHEDULE_TIMEOUT); > - if (r < 0) { > + r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, timeout); > + if (r <= 0) { > return r; > } > > - r = fence_signal(&fence->base); > - if (!r) > + r_sig = fence_signal(&fence->base); > + if (!r_sig) > FENCE_TRACE(&fence->base, "signaled from fence_wait\n"); > - return 0; > + return r; > +} > + > +/** > + * radeon_fence_wait - wait for a fence to signal > + * > + * @fence: radeon fence object > + * @intr: use interruptible sleep > + * > + * Wait for the requested fence to signal (all asics). > + * @intr selects whether to use interruptable (true) or non-interruptable > + * (false) sleep when waiting for the fence. > + * Returns 0 if the fence has passed, error for all other cases. > + */ > +int radeon_fence_wait(struct radeon_fence *fence, bool intr) > +{ > + long r = radeon_fence_wait_timeout(fence, intr, MAX_SCHEDULE_TIMEOUT); > + if (r > 0) { > + return 0; > + } else { > + return r; > + } > } > > /** > diff --git a/drivers/gpu/drm/radeon/radeon_vce.c > b/drivers/gpu/drm/radeon/radeon_vce.c > index 7eb1ae7..566a1a0 100644 > --- a/drivers/gpu/drm/radeon/radeon_vce.c > +++ b/drivers/gpu/drm/radeon/radeon_vce.c > @@ -810,11 +810,16 @@ int radeon_vce_ib_test(struct radeon_device *rdev, > struct radeon_ring *ring) > goto error; > } > > - r = radeon_fence_wait(fence, false); > - if (r) { > + r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies( > + RADEON_USEC_IB_TEST_TIMEOUT)); > + if (r < 0) { > DRM_ERROR("radeon: fence wait failed (%d).\n", r); > + } else if (r == 0) { > + DRM_ERROR("radeon: fence wait timed out.\n"); > + r = -ETIMEDOUT; > } else { > - DRM_INFO("ib test on ring %d succeeded\n", ring->idx); > + DRM_INFO("ib test on ring %d succeeded\n", ring->idx); > + r = 0; > } > 
error: > radeon_fence_unref(&fence); > diff --git a/drivers/gpu/drm/radeon/uvd_v1_0.c > b/drivers/gpu/drm/radeon/uvd_v1_0.c > index c6b1cbc..12ddcfa 100644 > --- a/drivers/gpu/drm/radeon/uvd_v1_0.c > +++ b/drivers/gpu/drm/radeon/uvd_v1_0.c > @@ -522,11 +522,17 @@ int uvd_v1_0_ib_test(struct radeon_device *rdev, struct > radeon_ring *ring) > goto error; > } > > - r = radeon_fence_wait(fence, false); > - if (r) { > + r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies( > + RADEON_USEC_IB_TEST_TIMEOUT)); > + if (r < 0) { > DRM_ERROR("radeon: fence wait failed (%d).\n", r); > goto error; > + } else if (r == 0) { > + DRM_ERROR("radeon: fence wait timed out.\n"); > + r = -ETIMEDOUT; > + goto error; > } > + r = 0; > DRM_INFO("ib test on ring %d succeeded\n", ring->idx); > error: > radeon_fence_unref(&fence);