On 1/13/2025 7:12 AM, Jiang Liu wrote:
> Add helper functions to track status for ras manager and ip blocks.
> 
> Signed-off-by: Jiang Liu <ge...@linux.alibaba.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h     | 38 +++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 10 +++++++
>  3 files changed, 85 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 5e55a44f9eef..f0f773659faf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -377,12 +377,28 @@ int amdgpu_ip_block_resume(struct amdgpu_ip_block 
> *ip_block);
>  
>  #define AMDGPU_MAX_IP_NUM 16
>  
> +enum amdgpu_marker {
> +     // Markers for IRQs, used for both ip blocks and ras blocks.
> +     AMDGPU_MARKER_IRQ0 = 32,
> +     AMDGPU_MARKER_IRQ1,
> +     AMDGPU_MARKER_IRQ2,
> +     AMDGPU_MARKER_IRQ3,
> +     AMDGPU_MARKER_IRQ4,
> +     AMDGPU_MARKER_IRQ5,
> +     AMDGPU_MARKER_IRQ6,
> +     AMDGPU_MARKER_IRQ7,
> +     AMDGPU_MARKER_IRQ_MAX = 63,
> +};
> +
> +#define AMDGPU_MARKER_IRQ(idx)               (AMDGPU_MARKER_IRQ0 + (idx))
> +
>  struct amdgpu_ip_block_status {
>       bool valid;
>       bool sw;
>       bool hw;
>       bool late_initialized;
>       bool hang;
> +     uint64_t markers;
>  };
>  

Maintaining these fine-grained levels at the IP layer doesn't look like a
proper solution. Either the IP or RAS block has the required IRQs
enabled, or it has them disabled. Unwinding them needs to be tracked at
the IRQ object layer, not here.

Thanks,
Lijo

>  struct amdgpu_ip_block_version {
> @@ -410,6 +426,28 @@ amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
>  int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
>                              const struct amdgpu_ip_block_version 
> *ip_block_version);
>  
> +static inline void amdgpu_ip_block_set_marker(struct amdgpu_ip_block 
> *ip_block,
> +                                           enum amdgpu_marker marker)
> +{
> +     WARN_ON(marker > 63);
> +     WARN_ON(ip_block->status.markers & (0x1ull << marker));
> +     ip_block->status.markers |= 0x1ull << (int)marker;
> +}
> +
> +static inline bool amdgpu_ip_block_test_and_clear_marker(struct 
> amdgpu_ip_block *ip_block,
> +                                                      enum amdgpu_marker 
> marker)
> +{
> +     bool set = false;
> +     uint64_t value = 0x1ull << (int)marker;
> +
> +     if ((ip_block->status.markers & value) != 0) {
> +             ip_block->status.markers &= ~value;
> +             set = true;
> +     }
> +
> +     return set;
> +}
> +
>  /*
>   * BIOS.
>   */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index f0924aa3f4e4..5e19d820ab34 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -5207,3 +5207,40 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
>  
>       return con->is_rma;
>  }
> +
> +bool amdgpu_ras_test_marker(struct amdgpu_device *adev,
> +                         struct ras_common_if *head, int marker)
> +{
> +     struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
> +
> +     if (obj && obj->markers & (0x1ull << marker))
> +             return true;
> +
> +     return false;
> +}
> +
> +void amdgpu_ras_set_marker(struct amdgpu_device *adev,
> +                        struct ras_common_if *head, int marker)
> +{
> +     struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
> +
> +     WARN_ON(marker > 63);
> +     WARN_ON(obj->markers & (0x1ull << marker));
> +     if (obj)
> +             obj->markers |= 0x1ull << marker;
> +}
> +
> +bool amdgpu_ras_test_and_clear_marker(struct amdgpu_device *adev,
> +                                   struct ras_common_if *head, int marker)
> +{
> +     bool set = false;
> +     uint64_t value = 0x1ull << marker;
> +     struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
> +
> +     if (obj && (obj->markers & value) != 0) {
> +             obj->markers &= ~value;
> +             set = true;
> +     }
> +
> +     return set;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 82db986c36a0..35881087b17b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -634,6 +634,8 @@ struct ras_manager {
>       struct ras_common_if head;
>       /* reference count */
>       int use;
> +     /* Flags for status tracking */
> +     uint64_t markers;
>       /* ras block link */
>       struct list_head node;
>       /* the device */
> @@ -977,4 +979,12 @@ void amdgpu_ras_event_log_print(struct amdgpu_device 
> *adev, u64 event_id,
>                               const char *fmt, ...);
>  
>  bool amdgpu_ras_is_rma(struct amdgpu_device *adev);
> +
> +bool amdgpu_ras_test_marker(struct amdgpu_device *adev,
> +                         struct ras_common_if *head, int marker);
> +void amdgpu_ras_set_marker(struct amdgpu_device *adev,
> +                        struct ras_common_if *head, int marker);
> +bool amdgpu_ras_test_and_clear_marker(struct amdgpu_device *adev,
> +                                   struct ras_common_if *head,
> +                                   int marker);
>  #endif

Reply via email to