Am 18.08.2014 um 16:45 schrieb Maarten Lankhorst: > Signed-off-by: Maarten Lankhorst <maarten.lankhorst at canonical.com> > --- > V1 had a nasty bug breaking gpu lockup recovery. The fix is not > allowing radeon_fence_driver_check_lockup to take exclusive_lock, > and kill it during lockup recovery instead. > V2 used delayed work that ran during lockup recovery, but required > read lock. I've fixed this by downgrading the write, and retrying if > recovery fails. > --- > drivers/gpu/drm/radeon/radeon.h | 2 + > drivers/gpu/drm/radeon/radeon_fence.c | 115 > +++++++++++++++++----------------- > 2 files changed, 61 insertions(+), 56 deletions(-) > > diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h > index 1d806983ec7b..29504efe8971 100644 > --- a/drivers/gpu/drm/radeon/radeon.h > +++ b/drivers/gpu/drm/radeon/radeon.h > @@ -355,6 +355,8 @@ struct radeon_fence_driver { > uint64_t sync_seq[RADEON_NUM_RINGS]; > atomic64_t last_seq; > bool initialized; > + struct delayed_work fence_check_work; > + struct radeon_device *rdev;
Put the reference to the device as the first field in the structure. > }; > > struct radeon_fence { > diff --git a/drivers/gpu/drm/radeon/radeon_fence.c > b/drivers/gpu/drm/radeon/radeon_fence.c > index 913787085dfa..94eca53d99f8 100644 > --- a/drivers/gpu/drm/radeon/radeon_fence.c > +++ b/drivers/gpu/drm/radeon/radeon_fence.c > @@ -125,16 +125,7 @@ int radeon_fence_emit(struct radeon_device *rdev, > return 0; > } > > -/** > - * radeon_fence_process - process a fence > - * > - * @rdev: radeon_device pointer > - * @ring: ring index the fence is associated with > - * > - * Checks the current fence value and wakes the fence queue > - * if the sequence number has increased (all asics). > - */ > -void radeon_fence_process(struct radeon_device *rdev, int ring) > +static bool __radeon_fence_process(struct radeon_device *rdev, int ring) Don't use "__" for internal radeon function names and especially don't remove the function documentation. > { > uint64_t seq, last_seq, last_emitted; > unsigned count_loop = 0; > @@ -190,7 +181,53 @@ void radeon_fence_process(struct radeon_device *rdev, > int ring) > } > } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq); > > - if (wake) > + if (seq < last_emitted && !rdev->in_reset) > + mod_delayed_work(system_power_efficient_wq, > + &rdev->fence_drv[ring].fence_check_work, > + RADEON_FENCE_JIFFIES_TIMEOUT); Am I wrong or do you queue the work only after radeon_fence_process is called for the first time? Might be a good idea to have an explicit queue_delayed_work in radeon_fence_emit as well. > + > + return wake; > +} > + > +static void radeon_fence_driver_check_lockup(struct work_struct *work) > +{ > + struct radeon_fence_driver *fence_drv; > + struct radeon_device *rdev; > + unsigned long iring; > + > + fence_drv = container_of(work, struct radeon_fence_driver, > fence_check_work.work); > + rdev = fence_drv->rdev; > + iring = fence_drv - &rdev->fence_drv[0]; > + > + down_read(&rdev->exclusive_lock); > + if (__radeon_fence_process(rdev, iring)) > + wake_up_all(&rdev->fence_queue); > + else if (radeon_ring_is_lockup(rdev, iring, &rdev->ring[iring])) { > + /* good news we believe it's a lockup */ > + dev_warn(rdev->dev, "GPU lockup (current fence id " > + "0x%016llx last fence id 0x%016llx on ring %ld)\n", > + (uint64_t)atomic64_read(&fence_drv->last_seq), > + fence_drv->sync_seq[iring], iring); > + > + /* remember that we need an reset */ > + rdev->needs_reset = true; > + wake_up_all(&rdev->fence_queue); > + } > + up_read(&rdev->exclusive_lock); > +} > + > +/** > + * radeon_fence_process - process a fence > + * > + * @rdev: radeon_device pointer > + * @ring: ring index the fence is associated with > + * > + * Checks the current fence value and wakes the fence queue > + * if the sequence number has increased (all asics). > + */ > +void radeon_fence_process(struct radeon_device *rdev, int ring) > +{ > + if (__radeon_fence_process(rdev, ring)) > wake_up_all(&rdev->fence_queue); > } > > @@ -302,9 +339,10 @@ static int radeon_fence_wait_seq(struct radeon_device > *rdev, u64 *target_seq, > { > uint64_t last_seq[RADEON_NUM_RINGS]; > bool signaled; > - int i, r; > + int i; > > while (!radeon_fence_any_seq_signaled(rdev, target_seq)) { > + long r; > > /* Save current sequence values, used to check for GPU lockups > */ > for (i = 0; i < RADEON_NUM_RINGS; ++i) { > @@ -319,11 +357,11 @@ static int radeon_fence_wait_seq(struct radeon_device > *rdev, u64 *target_seq, > if (intr) { > r = wait_event_interruptible_timeout(rdev->fence_queue, > ( > (signaled = radeon_fence_any_seq_signaled(rdev, > target_seq)) > - || rdev->needs_reset), > RADEON_FENCE_JIFFIES_TIMEOUT); > + || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT); > } else { > r = wait_event_timeout(rdev->fence_queue, ( > (signaled = radeon_fence_any_seq_signaled(rdev, > target_seq)) > - || rdev->needs_reset), > RADEON_FENCE_JIFFIES_TIMEOUT); > + || rdev->needs_reset), MAX_SCHEDULE_TIMEOUT); > } > > for (i = 0; i < RADEON_NUM_RINGS; ++i) { > @@ -334,50 +372,11 @@ static int radeon_fence_wait_seq(struct radeon_device > *rdev, u64 *target_seq, > trace_radeon_fence_wait_end(rdev->ddev, i, > target_seq[i]); > } > > - if (unlikely(r < 0)) > + if (r < 0) > return r; > > - if (unlikely(!signaled)) { > - if (rdev->needs_reset) > - return -EDEADLK; > - > - /* we were interrupted for some reason and fence > - * isn't signaled yet, resume waiting */ > - if (r) > - continue; > - > - for (i = 0; i < RADEON_NUM_RINGS; ++i) { > - if (!target_seq[i]) > - continue; > - > - if (last_seq[i] != > atomic64_read(&rdev->fence_drv[i].last_seq)) > - break; > - } > - > - if (i != RADEON_NUM_RINGS) > - continue; > - > - for (i = 0; i < RADEON_NUM_RINGS; ++i) { > - if (!target_seq[i]) > - continue; > - > - if (radeon_ring_is_lockup(rdev, i, > &rdev->ring[i])) > - break; > - } > - > - if (i < RADEON_NUM_RINGS) { > - /* good news we believe it's a lockup */ > - dev_warn(rdev->dev, "GPU lockup (waiting for " > - "0x%016llx last fence id 0x%016llx on" > - " ring %d)\n", > - target_seq[i], last_seq[i], i); > - > - /* remember that we need an reset */ > - rdev->needs_reset = true; > - wake_up_all(&rdev->fence_queue); > - return -EDEADLK; > - } > - } > + if (rdev->needs_reset) > + return -EDEADLK; > } > return 0; > } > @@ -711,6 +710,9 @@ static void radeon_fence_driver_init_ring(struct > radeon_device *rdev, int ring) > rdev->fence_drv[ring].sync_seq[i] = 0; > atomic64_set(&rdev->fence_drv[ring].last_seq, 0); > rdev->fence_drv[ring].initialized = false; > + INIT_DELAYED_WORK(&rdev->fence_drv[ring].fence_check_work, > + radeon_fence_driver_check_lockup); > + rdev->fence_drv[ring].rdev = rdev; > } > > /** > @@ -760,6 +762,7 @@ void radeon_fence_driver_fini(struct radeon_device *rdev) > /* no need to trigger GPU reset as we are unloading */ > radeon_fence_driver_force_completion(rdev); > } > + > cancel_delayed_work_sync(&rdev->fence_drv[ring].fence_check_work); > wake_up_all(&rdev->fence_queue); > radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg); > rdev->fence_drv[ring].initialized = false;