Am 2021-11-24 um 5:58 p.m. schrieb Philip Yang:
> IH ring1 is used to process GPU retry fault, overflow is enabled to
> drain retry fault because we want receive other interrupts while
> handling retry fault to recover range. There is no overflow flag set
> when wptr pass rptr. Use timestamp of rptr and wptr to handle overflow
> and drain retry fault.
>
> Add amdgpu_ih_function interface decode_iv_ts for different chips to get
> timestamp from IV entry with different iv size and timestamp offset.
> amdgpu_ih_decode_iv_ts_helper is used for vega10, vega20, navi10.
>
> Drain retry fault is done if processed_timestamp is equal to or larger
> than checkpoint timestamp. Page fault handler skips retry fault entry if
> entry timestamp goes backward.
>
> Signed-off-by: Philip Yang <philip.y...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 58 +++++++++++++-------------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h | 16 ++++++-
>  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  5 +++
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  5 +++
>  drivers/gpu/drm/amd/amdgpu/navi10_ih.c |  1 +
>  drivers/gpu/drm/amd/amdgpu/vega10_ih.c |  1 +
>  drivers/gpu/drm/amd/amdgpu/vega20_ih.c |  1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c   |  2 +-
>  8 files changed, 56 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> index 0c7963dfacad..3e043acaab82 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
> @@ -164,52 +164,32 @@ void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, 
> const uint32_t *iv,
>       }
>  }
>  
> -/* Waiter helper that checks current rptr matches or passes checkpoint wptr 
> */
> -static bool amdgpu_ih_has_checkpoint_processed(struct amdgpu_device *adev,
> -                                     struct amdgpu_ih_ring *ih,
> -                                     uint32_t checkpoint_wptr,
> -                                     uint32_t *prev_rptr)
> -{
> -     uint32_t cur_rptr = ih->rptr | (*prev_rptr & ~ih->ptr_mask);
> -
> -     /* rptr has wrapped. */
> -     if (cur_rptr < *prev_rptr)
> -             cur_rptr += ih->ptr_mask + 1;
> -     *prev_rptr = cur_rptr;
> -
> -     /* check ring is empty to workaround missing wptr overflow flag */
> -     return cur_rptr >= checkpoint_wptr ||
> -            (cur_rptr & ih->ptr_mask) == amdgpu_ih_get_wptr(adev, ih);
> -}
> -
>  /**
> - * amdgpu_ih_wait_on_checkpoint_process - wait to process IVs up to 
> checkpoint
> + * amdgpu_ih_wait_on_checkpoint_process_ts - wait to process IVs up to 
> checkpoint
>   *
>   * @adev: amdgpu_device pointer
>   * @ih: ih ring to process
>   *
>   * Used to ensure ring has processed IVs up to the checkpoint write pointer.
>   */
> -int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device *adev,
> +int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev,
>                                       struct amdgpu_ih_ring *ih)
>  {
> -     uint32_t checkpoint_wptr, rptr;
> +     uint32_t checkpoint_wptr;
> +     uint64_t checkpoint_ts;
> +     long timeout = HZ;
>  
>       if (!ih->enabled || adev->shutdown)
>               return -ENODEV;
>  
>       checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
> -     /* Order wptr with rptr. */
> +     /* Order wptr with ring data. */
>       rmb();
> -     rptr = READ_ONCE(ih->rptr);
> -
> -     /* wptr has wrapped. */
> -     if (rptr > checkpoint_wptr)
> -             checkpoint_wptr += ih->ptr_mask + 1;
> +     checkpoint_ts = amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1);
>  
> -     return wait_event_interruptible(ih->wait_process,
> -                             amdgpu_ih_has_checkpoint_processed(adev, ih,
> -                                             checkpoint_wptr, &rptr));
> +     return wait_event_interruptible_timeout(ih->wait_process,
> +                 !amdgpu_ih_ts_after(ih->processed_timestamp, checkpoint_ts),
> +                 timeout);
>  }
>  
>  /**
> @@ -298,4 +278,22 @@ void amdgpu_ih_decode_iv_helper(struct amdgpu_device 
> *adev,
>  
>       /* wptr/rptr are in bytes! */
>       ih->rptr += 32;
> +     if (ih == &adev->irq.ih1 &&
> +         amdgpu_ih_ts_after(ih->processed_timestamp, entry->timestamp))
> +             ih->processed_timestamp = entry->timestamp;

I'd call this "latest_decoded_timestamp". At this point it hasn't been
processed yet.

Also, I think it would be safe and cheap enough to do this on all IH
rings, in case someone finds it useful for something else, e.g. using
amdgpu_ih_wait_on_checkpoint_process_ts on IH ring 0.


> +}
> +
> +uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr,
> +                                    signed int offset)
> +{
> +     uint32_t iv_size = 32;
> +     uint32_t dw1, dw2;
> +     uint32_t index;
> +
> +     rptr += iv_size * offset;
> +     index = (rptr & ih->ptr_mask) >> 2;
> +
> +     dw1 = le32_to_cpu(ih->ring[index + 1]);
> +     dw2 = le32_to_cpu(ih->ring[index + 2]);
> +     return dw1 | ((u64)(dw2 & 0xffff) << 32);
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
> index 0649b59830a5..dd1c2eded6b9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
> @@ -68,20 +68,30 @@ struct amdgpu_ih_ring {
>  
>       /* For waiting on IH processing at checkpoint. */
>       wait_queue_head_t wait_process;
> +     uint64_t                processed_timestamp;
>  };
>  
> +/* return true if time stamp t2 is after t1 with 48bit wrap around */
> +#define amdgpu_ih_ts_after(t1, t2) \
> +             (((int64_t)((t2) << 16) - (int64_t)((t1) << 16)) > 0LL)
> +
>  /* provided by the ih block */
>  struct amdgpu_ih_funcs {
>       /* ring read/write ptr handling, called from interrupt context */
>       u32 (*get_wptr)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
>       void (*decode_iv)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
>                         struct amdgpu_iv_entry *entry);
> +     uint64_t (*decode_iv_ts)(struct amdgpu_ih_ring *ih, u32 rptr,
> +                              signed int offset);
>       void (*set_rptr)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
>  };
>  
>  #define amdgpu_ih_get_wptr(adev, ih) (adev)->irq.ih_funcs->get_wptr((adev), 
> (ih))
>  #define amdgpu_ih_decode_iv(adev, iv) \
>       (adev)->irq.ih_funcs->decode_iv((adev), (ih), (iv))
> +#define amdgpu_ih_decode_iv_ts(adev, ih, rptr, offset) \
> +     (WARN_ON_ONCE(!(adev)->irq.ih_funcs->decode_iv_ts) ? 0 : \
> +     (adev)->irq.ih_funcs->decode_iv_ts((ih), (rptr), (offset)))
>  #define amdgpu_ih_set_rptr(adev, ih) (adev)->irq.ih_funcs->set_rptr((adev), 
> (ih))
>  
>  int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring 
> *ih,
> @@ -89,10 +99,12 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, 
> struct amdgpu_ih_ring *ih,
>  void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct amdgpu_ih_ring 
> *ih);
>  void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv,
>                         unsigned int num_dw);
> -int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device *adev,
> -                                     struct amdgpu_ih_ring *ih);
> +int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev,
> +                                         struct amdgpu_ih_ring *ih);
>  int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
>  void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev,
>                               struct amdgpu_ih_ring *ih,
>                               struct amdgpu_iv_entry *entry);
> +uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr,
> +                                    signed int offset);
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 3ec5ff5a6dbe..b129898db433 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -119,6 +119,11 @@ static int gmc_v10_0_process_interrupt(struct 
> amdgpu_device *adev,
>                       return 1;
>               }
>  
> +             /* Stale retry fault if timestamp goes backward */
> +             if (entry->ih == &adev->irq.ih1 &&
> +                 amdgpu_ih_ts_after(entry->timestamp, 
> entry->ih->processed_timestamp))
> +                     return 1;
> +

This check should go before amdgpu_gmc_filter_faults. Otherwise
amdgpu_gmc_filter_faults may later drop a real fault because it added a
stale fault in its hash table.


>               /* Try to handle the recoverable page faults by filling page
>                * tables
>                */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index cb82404df534..c0d9b254bbb5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -535,6 +535,11 @@ static int gmc_v9_0_process_interrupt(struct 
> amdgpu_device *adev,
>                       return 1;
>               }
>  
> +             /* Stale retry fault if timestamp goes backward */
> +             if (entry->ih == &adev->irq.ih1 &&
> +                 amdgpu_ih_ts_after(entry->timestamp, 
> entry->ih->processed_timestamp))
> +                     return 1;
> +

Same as above.

Regards,
  Felix


>               /* Try to handle the recoverable page faults by filling page
>                * tables
>                */
> diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c 
> b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> index 38241cf0e1f1..8ce5b8ca1fd7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
> @@ -716,6 +716,7 @@ static const struct amd_ip_funcs navi10_ih_ip_funcs = {
>  static const struct amdgpu_ih_funcs navi10_ih_funcs = {
>       .get_wptr = navi10_ih_get_wptr,
>       .decode_iv = amdgpu_ih_decode_iv_helper,
> +     .decode_iv_ts = amdgpu_ih_decode_iv_ts_helper,
>       .set_rptr = navi10_ih_set_rptr
>  };
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c 
> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> index a9ca6988009e..3070466f54e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> @@ -640,6 +640,7 @@ const struct amd_ip_funcs vega10_ih_ip_funcs = {
>  static const struct amdgpu_ih_funcs vega10_ih_funcs = {
>       .get_wptr = vega10_ih_get_wptr,
>       .decode_iv = amdgpu_ih_decode_iv_helper,
> +     .decode_iv_ts = amdgpu_ih_decode_iv_ts_helper,
>       .set_rptr = vega10_ih_set_rptr
>  };
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c 
> b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> index f51dfc38ac65..3b4eb8285943 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
> @@ -688,6 +688,7 @@ const struct amd_ip_funcs vega20_ih_ip_funcs = {
>  static const struct amdgpu_ih_funcs vega20_ih_funcs = {
>       .get_wptr = vega20_ih_get_wptr,
>       .decode_iv = amdgpu_ih_decode_iv_helper,
> +     .decode_iv_ts = amdgpu_ih_decode_iv_ts_helper,
>       .set_rptr = vega20_ih_set_rptr
>  };
>  
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 10868d5b549f..663489ae56d7 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1974,7 +1974,7 @@ static void svm_range_drain_retry_fault(struct 
> svm_range_list *svms)
>  
>               pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
>  
> -             amdgpu_ih_wait_on_checkpoint_process(pdd->dev->adev,
> +             amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
>                                                    &pdd->dev->adev->irq.ih1);
>               pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
>       }

Reply via email to