Am 31.07.2014 um 17:33 schrieb Maarten Lankhorst: > Signed-off-by: Maarten Lankhorst <maarten.lankhorst at canonical.com> > --- > V1 had a nasty bug breaking gpu lockup recovery. The fix is not > allowing radeon_fence_driver_check_lockup to take exclusive_lock, > and kill it during lockup recovery instead.
That looks like the delayed work starts running as soon as we submit a fence, and not when it's needed for waiting. Since it's a backup for failing IRQs I would rather put it into radeon_irq_kms.c and start/stop it when the IRQs are started/stoped. Christian. > --- > drivers/gpu/drm/radeon/radeon.h | 3 + > drivers/gpu/drm/radeon/radeon_device.c | 5 + > drivers/gpu/drm/radeon/radeon_fence.c | 124 > ++++++++++++++++++-------------- > drivers/gpu/drm/radeon/radeon_ring.c | 1 > 4 files changed, 77 insertions(+), 56 deletions(-) > > diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h > index 60c47f829122..b01d88fc10cb 100644 > --- a/drivers/gpu/drm/radeon/radeon.h > +++ b/drivers/gpu/drm/radeon/radeon.h > @@ -347,6 +347,8 @@ struct radeon_fence_driver { > uint64_t sync_seq[RADEON_NUM_RINGS]; > atomic64_t last_seq; > bool initialized; > + struct delayed_work fence_check_work; > + struct radeon_device *rdev; > }; > > struct radeon_fence { > @@ -360,6 +362,7 @@ struct radeon_fence { > > int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring); > int radeon_fence_driver_init(struct radeon_device *rdev); > +void radeon_fence_cancel_delayed_check(struct radeon_device *rdev, int ring); > void radeon_fence_driver_fini(struct radeon_device *rdev); > void radeon_fence_driver_force_completion(struct radeon_device *rdev); > int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence > **fence, int ring); > diff --git a/drivers/gpu/drm/radeon/radeon_device.c > b/drivers/gpu/drm/radeon/radeon_device.c > index 697add2cd4e3..21efd32d07ee 100644 > --- a/drivers/gpu/drm/radeon/radeon_device.c > +++ b/drivers/gpu/drm/radeon/radeon_device.c > @@ -1637,6 +1637,11 @@ int radeon_gpu_reset(struct radeon_device *rdev) > radeon_save_bios_scratch_regs(rdev); > /* block TTM */ > resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev); > + > + /* kill all the hangcheck tests too */ > + for (i = 0; i < RADEON_NUM_RINGS; ++i) > + radeon_fence_cancel_delayed_check(rdev, i); > + > radeon_pm_suspend(rdev); > radeon_suspend(rdev); > > diff --git a/drivers/gpu/drm/radeon/radeon_fence.c > b/drivers/gpu/drm/radeon/radeon_fence.c > index 913787085dfa..c055acc3a271 100644 > --- a/drivers/gpu/drm/radeon/radeon_fence.c > +++ b/drivers/gpu/drm/radeon/radeon_fence.c > @@ -125,16 +125,7 @@ int radeon_fence_emit(struct radeon_device *rdev, > return 0; > } > > -/** > - * radeon_fence_process - process a fence > - * > - * @rdev: radeon_device pointer > - * @ring: ring index the fence is associated with > - * > - * Checks the current fence value and wakes the fence queue > - * if the sequence number has increased (all asics). > - */ > -void radeon_fence_process(struct radeon_device *rdev, int ring) > +static bool __radeon_fence_process(struct radeon_device *rdev, int ring) > { > uint64_t seq, last_seq, last_emitted; > unsigned count_loop = 0; > @@ -190,7 +181,51 @@ void radeon_fence_process(struct radeon_device *rdev, > int ring) > } > } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq); > > - if (wake) > + if (seq < last_emitted) > + mod_delayed_work(system_power_efficient_wq, > + &rdev->fence_drv[ring].fence_check_work, > + RADEON_FENCE_JIFFIES_TIMEOUT); > + > + return wake; > +} > + > +static void radeon_fence_driver_check_lockup(struct work_struct *work) > +{ > + struct radeon_fence_driver *fence_drv; > + struct radeon_device *rdev; > + unsigned long iring; > + > + fence_drv = container_of(work, struct radeon_fence_driver, > fence_check_work.work); > + rdev = fence_drv->rdev; > + iring = fence_drv - &rdev->fence_drv[0]; > + > + if (__radeon_fence_process(rdev, iring)) > + wake_up_all(&rdev->fence_queue); > + else if (radeon_ring_is_lockup(rdev, iring, &rdev->ring[iring])) { > + /* good news we believe it's a lockup */ > + dev_warn(rdev->dev, "GPU lockup (current fence id " > + "0x%016llx last fence id 0x%016llx on ring %ld)\n", > + (uint64_t)atomic64_read(&fence_drv->last_seq), > + fence_drv->sync_seq[iring], iring); > + > + /* remember that we need an reset */ > + rdev->needs_reset = true; > + wake_up_all(&rdev->fence_queue); > + } > +} > + > +/** > + * radeon_fence_process - process a fence > + * > + * @rdev: radeon_device pointer > + * @ring: ring index the fence is associated with > + * > + * Checks the current fence value and wakes the fence queue > + * if the sequence number has increased (all asics). > + */ > +void radeon_fence_process(struct radeon_device *rdev, int ring) > +{ > + if (__radeon_fence_process(rdev, ring)) > wake_up_all(&rdev->fence_queue); > } > > @@ -302,9 +337,10 @@ static int radeon_fence_wait_seq(struct radeon_device > *rdev, u64 *target_seq, > { > uint64_t last_seq[RADEON_NUM_RINGS]; > bool signaled; > - int i, r; > + int i; > > while (!radeon_fence_any_seq_signaled(rdev, target_seq)) { > + long r; > > /* Save current sequence values, used to check for GPU lockups > */ > for (i = 0; i < RADEON_NUM_RINGS; ++i) { > @@ -319,11 +355,11 @@ static int radeon_fence_wait_seq(struct radeon_device > *rdev, u64 *target_seq, > if (intr) { > r = wait_event_interruptible_timeout(rdev->fence_queue, > ( > (signaled = radeon_fence_any_seq_signaled(rdev, > target_seq)) > - || rdev->needs_reset), > RADEON_FENCE_JIFFIES_TIMEOUT); > + || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT); > } else { > r = wait_event_timeout(rdev->fence_queue, ( > (signaled = radeon_fence_any_seq_signaled(rdev, > target_seq)) > - || rdev->needs_reset), > RADEON_FENCE_JIFFIES_TIMEOUT); > + || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT); > } > > for (i = 0; i < RADEON_NUM_RINGS; ++i) { > @@ -334,50 +370,11 @@ static int radeon_fence_wait_seq(struct radeon_device > *rdev, u64 *target_seq, > trace_radeon_fence_wait_end(rdev->ddev, i, > target_seq[i]); > } > > - if (unlikely(r < 0)) > + if (r < 0) > return r; > > - if (unlikely(!signaled)) { > - if (rdev->needs_reset) > - return -EDEADLK; > - > - /* we were interrupted for some reason and fence > - * isn't signaled yet, resume waiting */ > - if (r) > - continue; > - > - for (i = 0; i < RADEON_NUM_RINGS; ++i) { > - if (!target_seq[i]) > - continue; > - > - if (last_seq[i] != > atomic64_read(&rdev->fence_drv[i].last_seq)) > - break; > - } > - > - if (i != RADEON_NUM_RINGS) > - continue; > - > - for (i = 0; i < RADEON_NUM_RINGS; ++i) { > - if (!target_seq[i]) > - continue; > - > - if (radeon_ring_is_lockup(rdev, i, > &rdev->ring[i])) > - break; > - } > - > - if (i < RADEON_NUM_RINGS) { > - /* good news we believe it's a lockup */ > - dev_warn(rdev->dev, "GPU lockup (waiting for " > - "0x%016llx last fence id 0x%016llx on" > - " ring %d)\n", > - target_seq[i], last_seq[i], i); > - > - /* remember that we need an reset */ > - rdev->needs_reset = true; > - wake_up_all(&rdev->fence_queue); > - return -EDEADLK; > - } > - } > + if (rdev->needs_reset) > + return -EDEADLK; > } > return 0; > } > @@ -711,6 +708,8 @@ static void radeon_fence_driver_init_ring(struct > radeon_device *rdev, int ring) > rdev->fence_drv[ring].sync_seq[i] = 0; > atomic64_set(&rdev->fence_drv[ring].last_seq, 0); > rdev->fence_drv[ring].initialized = false; > + INIT_DELAYED_WORK(&rdev->fence_drv[ring].fence_check_work, > radeon_fence_driver_check_lockup); > + rdev->fence_drv[ring].rdev = rdev; > } > > /** > @@ -740,6 +739,17 @@ int radeon_fence_driver_init(struct radeon_device *rdev) > } > > /** > + * radeon_fence_cancel_delayed_check - cancel all delayed checks on a ring > during lockup > + * > + * This prevents the lockup check from being done while suspend is running > + * during a recovery from a lockup. > + */ > +void radeon_fence_cancel_delayed_check(struct radeon_device *rdev, int ring) > +{ > + cancel_delayed_work_sync(&rdev->fence_drv[ring].fence_check_work); > +} > + > +/** > * radeon_fence_driver_fini - tear down the fence driver > * for all possible rings. > * > @@ -755,6 +765,8 @@ void radeon_fence_driver_fini(struct radeon_device *rdev) > for (ring = 0; ring < RADEON_NUM_RINGS; ring++) { > if (!rdev->fence_drv[ring].initialized) > continue; > + > + > cancel_delayed_work_sync(&rdev->fence_drv[ring].fence_check_work); > r = radeon_fence_wait_empty(rdev, ring); > if (r) { > /* no need to trigger GPU reset as we are unloading */ > diff --git a/drivers/gpu/drm/radeon/radeon_ring.c > b/drivers/gpu/drm/radeon/radeon_ring.c > index f8050f5429e2..594d871a6fce 100644 > --- a/drivers/gpu/drm/radeon/radeon_ring.c > +++ b/drivers/gpu/drm/radeon/radeon_ring.c > @@ -261,6 +261,7 @@ int radeon_ib_ring_tests(struct radeon_device *rdev) > > r = radeon_ib_test(rdev, i, ring); > if (r) { > + radeon_fence_cancel_delayed_check(rdev, i); > ring->ready = false; > rdev->needs_reset = false; > >