[AMD Official Use Only - General]

Reviewed-by: Stanley.Yang <stanley.y...@amd.com>

Regards,
Stanley
> -----Original Message-----
> From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of
> Hawking Zhang
> Sent: Friday, October 14, 2022 2:19 PM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <tao.zh...@amd.com>;
> Yang, Stanley <stanley.y...@amd.com>
> Cc: Russell, Kent <kent.russ...@amd.com>; Zhang, Hawking
> <hawking.zh...@amd.com>
> Subject: [PATCH] drm/amdgpu: move convert_error_address out of umc_ras
> 
> The RAS error address translation algorithm is common across dGPU and A + A
> platforms as long as the SOC integrates the same generation of UMC IP.
> 
> UMC RAS is managed by x86 MCA on A + A platforms, so umc_ras in the GPU
> driver is not initialized at all there. In that case, any umc_ras callback
> implemented for the dGPU config shouldn't be invoked from an A + A specific
> callback.
> 
> The change moves convert_error_address out of the dGPU umc_ras structure
> and shares it between the A + A and dGPU configs.
> 
> Signed-off-by: Hawking Zhang <hawking.zh...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 15 +++++++++++----
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  3 ---
>  drivers/gpu/drm/amd/amdgpu/umc_v6_7.c   |  7 +++----
>  drivers/gpu/drm/amd/amdgpu/umc_v6_7.h   |  4 +++-
>  4 files changed, 17 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 75f1402101f4..ff92ea99d513 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -36,6 +36,7 @@
>  #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
>  #include "atom.h"
>  #include "amdgpu_reset.h"
> +#include "umc_v6_7.h"
> 
>  #ifdef CONFIG_X86_MCE_AMD
>  #include <asm/mce.h>
> @@ -2885,10 +2886,16 @@ static int amdgpu_bad_page_notifier(struct
> notifier_block *nb,
>       /*
>        * Translate UMC channel address to Physical address
>        */
> -     if (adev->umc.ras &&
> -         adev->umc.ras->convert_ras_error_address)
> -             adev->umc.ras->convert_ras_error_address(adev,
> -                     &err_data, m->addr, ch_inst, umc_inst);
> +     switch (adev->ip_versions[UMC_HWIP][0]) {
> +     case IP_VERSION(6, 7, 0):
> +             umc_v6_7_convert_error_address(adev,
> +                             &err_data, m->addr, ch_inst, umc_inst);
> +             break;
> +     default:
> +             dev_warn(adev->dev,
> +                      "UMC address to Physical address translation is not
> supported\n");
> +             return NOTIFY_DONE;
> +     }
> 
>       if (amdgpu_bad_page_threshold != 0) {
>               amdgpu_ras_add_bad_pages(adev, err_data.err_addr, diff --
> git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> index e46439274f3a..3629d8f292ef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> @@ -51,9 +51,6 @@ struct amdgpu_umc_ras {
>       struct amdgpu_ras_block_object ras_block;
>       void (*err_cnt_init)(struct amdgpu_device *adev);
>       bool (*query_ras_poison_mode)(struct amdgpu_device *adev);
> -     void (*convert_ras_error_address)(struct amdgpu_device *adev,
> -                             struct ras_err_data *err_data, uint64_t
> err_addr,
> -                             uint32_t ch_inst, uint32_t umc_inst);
>       void (*ecc_info_query_ras_error_count)(struct amdgpu_device
> *adev,
>                                     void *ras_error_status);
>       void (*ecc_info_query_ras_error_address)(struct amdgpu_device
> *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> index 5d5d031c9e7d..72fd963f178b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> @@ -187,9 +187,9 @@ static void
> umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
>       }
>  }
> 
> -static void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
> -                                     struct ras_err_data *err_data,
> uint64_t err_addr,
> -                                     uint32_t ch_inst, uint32_t umc_inst)
> +void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
> +                                 struct ras_err_data *err_data, uint64_t
> err_addr,
> +                                 uint32_t ch_inst, uint32_t umc_inst)
>  {
>       uint32_t channel_index;
>       uint64_t soc_pa, retired_page, column; @@ -553,5 +553,4 @@ struct
> amdgpu_umc_ras umc_v6_7_ras = {
>       .query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
>       .ecc_info_query_ras_error_count =
> umc_v6_7_ecc_info_query_ras_error_count,
>       .ecc_info_query_ras_error_address =
> umc_v6_7_ecc_info_query_ras_error_address,
> -     .convert_ras_error_address = umc_v6_7_convert_error_address,
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
> b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
> index fe41ed2f5945..105245d5b6e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
> @@ -71,5 +71,7 @@ extern const uint32_t
> 
>       umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NU
> M][UMC_V6_7_CHANNEL_INSTANCE_NUM];
>  extern const uint32_t
> 
>       umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM]
> [UMC_V6_7_CHANNEL_INSTANCE_NUM];
> -
> +void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
> +                                    struct ras_err_data *err_data, uint64_t 
> err_addr,
> +                                    uint32_t ch_inst, uint32_t
> +umc_inst);
>  #endif
> --
> 2.17.1

Reply via email to