[AMD Official Use Only]

Dear Paul and Tao,

Comments are added inline.

Regards,
Zafar

>-----Original Message-----
>From: Paul Menzel <pmen...@molgen.mpg.de>
>Sent: Monday, March 28, 2022 1:22 PM
>To: Zhou1, Tao <tao.zh...@amd.com>; Ziya, Mohammad zafar
><mohammadzafar.z...@amd.com>
>Cc: amd-gfx@lists.freedesktop.org; Lazar, Lijo <lijo.la...@amd.com>; Zhang,
>Hawking <hawking.zh...@amd.com>
>Subject: Re: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
>
>Dear Mohammad, dear Tao,
>
>
>Tao, it’s hard to find your reply in the quote, as the message is not quoted
>correctly (> prepended). Is it possible to use a different email client?
>
>
>Am 28.03.22 um 09:43 schrieb Zhou1, Tao:
>> -----Original Message-----
>> From: Ziya, Mohammad zafar <mohammadzafar.z...@amd.com>
>> Sent: Monday, March 28, 2022 2:25 PM
>> To: amd-gfx@lists.freedesktop.org
>> Cc: Lazar, Lijo <lijo.la...@amd.com>; Zhou1, Tao <tao.zh...@amd.com>;
>> Zhang, Hawking <hawking.zh...@amd.com>; Ziya, Mohammad zafar
>> <mohammadzafar.z...@amd.com>
>> Subject: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
>>
>> RAS error query support addition for VCN 2.6
>>
>> V2: removed unused option and corrected comment format Moved the
>> register definition under header file
>
>Please wrap lines after 75 characters. (`scripts/checkpatch.pl` should have
>warned you about that).
>
>> V3: poison query status check added.
>> Removed error query interface
>>
>> V4: MMSCH poison check option removed, return true/false refactored.
>>
>> Signed-off-by: Mohammad Zafar Ziya <mohammadzafar.z...@amd.com>
>> Reviewed-by: Hawking Zhang <hawking.zh...@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
>>   drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   | 71
>+++++++++++++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h   |  6 +++
>>   3 files changed, 78 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> index 1e1a3b736859..606df8869b89 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> @@ -508,6 +508,7 @@ struct amdgpu_ras_block_hw_ops {
>>      void (*query_ras_error_address)(struct amdgpu_device *adev, void
>*ras_error_status);
>>      void (*reset_ras_error_count)(struct amdgpu_device *adev);
>>      void (*reset_ras_error_status)(struct amdgpu_device *adev);
>> +    bool (*query_poison_status)(struct amdgpu_device *adev);
>>   };
>>
>>   /* work flow
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
>> b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
>> index 1869bae4104b..3988fc647741 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
>> @@ -31,6 +31,7 @@
>>   #include "soc15d.h"
>>   #include "vcn_v2_0.h"
>>   #include "mmsch_v1_0.h"
>> +#include "vcn_v2_5.h"
>>
>>   #include "vcn/vcn_2_5_offset.h"
>>   #include "vcn/vcn_2_5_sh_mask.h"
>> @@ -59,6 +60,7 @@ static int vcn_v2_5_set_powergating_state(void
>*handle,  static int vcn_v2_5_pause_dpg_mode(struct amdgpu_device
>*adev,
>>                              int inst_idx, struct dpg_pause_state
>*new_state);  static int
>> vcn_v2_5_sriov_start(struct amdgpu_device *adev);
>> +static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev);
>>
>>   static int amdgpu_ih_clientid_vcns[] = {
>>      SOC15_IH_CLIENTID_VCN,
>> @@ -100,6 +102,7 @@ static int vcn_v2_5_early_init(void *handle)
>>      vcn_v2_5_set_dec_ring_funcs(adev);
>>      vcn_v2_5_set_enc_ring_funcs(adev);
>>      vcn_v2_5_set_irq_funcs(adev);
>> +    vcn_v2_5_set_ras_funcs(adev);
>>
>>      return 0;
>>   }
>> @@ -1930,3 +1933,71 @@ const struct amdgpu_ip_block_version
>vcn_v2_6_ip_block =
>>              .rev = 0,
>>              .funcs = &vcn_v2_6_ip_funcs,
>>   };
>> +
>> +static uint32_t vcn_v2_6_query_poison_by_instance(struct
>amdgpu_device *adev,
>> +                    uint32_t instance, uint32_t sub_block) {
>> +    uint32_t poison_stat = 0, reg_value = 0;
>> +
>> +    switch (sub_block) {
>> +    case AMDGPU_VCN_V2_6_VCPU_VCODEC:
>> +            reg_value = RREG32_SOC15(VCN, instance,
>mmUVD_RAS_VCPU_VCODEC_STATUS);
>> +            poison_stat = REG_GET_FIELD(reg_value,
>UVD_RAS_VCPU_VCODEC_STATUS, POISONED_PF);
>> +            break;
>> +    default:
>> +            break;
>> +    };
>> +
>> +    if (poison_stat)
>> +            dev_info(adev->dev, "Poison detected in VCN%d,
>sub_block%d\n",
>> +                    instance, sub_block);
>
>What should a user do with that information? Faulty hardware, …?

[Mohammad]: This message will help to identify the faulty hardware, the 
hardware ID will also log along with poison, help to identify among multiple 
hardware installed on the system.

>
>> +
>> +    return poison_stat;
>> +}
>> +
>> +static bool vcn_v2_6_query_poison_status(struct amdgpu_device *adev) {
>> +    uint32_t inst, sub;
>> +    uint32_t poison_stat = 0;
>> +
>> +    for (inst = 0; inst < adev->vcn.num_vcn_inst; inst++)
>> +            for (sub = 0; sub < AMDGPU_VCN_V2_6_MAX_SUB_BLOCK;
>sub++)
>> +                    poison_stat +=
>> +                    vcn_v2_6_query_poison_by_instance(adev, inst,
>sub);
>> +
>> +    return poison_stat ? true : false;
>>
>> [Tao] just want to confirm the logic, if the POISONED_PF of one instance is 1
>and another is 0, the poison_stat is true, correct?
>> Can we add a print message for this case? Same question to JPEG.

[Mohammad]: Message will be printed on function block ahead of the function.

>
>Is the `dev_info` message in `vcn_v2_6_query_poison_by_instance()` doing
>what you want?
>
>Also, instead of `poison_stat ? true : false;` a common pattern is
>`!!poison_stat` I believe.
>
>
>Kind regards,
>
>Paul

[Mohammad]: Noted the change. Will make to return !!poison_stat ? true : false;

>
>
>> +}
>> +
>> +const struct amdgpu_ras_block_hw_ops vcn_v2_6_ras_hw_ops = {
>> +    .query_poison_status = vcn_v2_6_query_poison_status, };
>> +
>> +static struct amdgpu_vcn_ras vcn_v2_6_ras = {
>> +    .ras_block = {
>> +            .hw_ops = &vcn_v2_6_ras_hw_ops,
>> +    },
>> +};
>> +
>> +static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev) {
>> +    switch (adev->ip_versions[VCN_HWIP][0]) {
>> +    case IP_VERSION(2, 6, 0):
>> +            adev->vcn.ras = &vcn_v2_6_ras;
>> +            break;
>> +    default:
>> +            break;
>> +    }
>> +
>> +    if (adev->vcn.ras) {
>> +            amdgpu_ras_register_ras_block(adev, &adev->vcn.ras-
>>ras_block);
>> +
>> +            strcpy(adev->vcn.ras->ras_block.ras_comm.name, "vcn");
>> +            adev->vcn.ras->ras_block.ras_comm.block =
>AMDGPU_RAS_BLOCK__VCN;
>> +            adev->vcn.ras->ras_block.ras_comm.type =
>AMDGPU_RAS_ERROR__POISON;
>> +            adev->vcn.ras_if = &adev->vcn.ras->ras_block.ras_comm;
>> +
>> +            /* If don't define special ras_late_init function, use default
>ras_late_init */
>> +            if (!adev->vcn.ras->ras_block.ras_late_init)
>> +                    adev->vcn.ras->ras_block.ras_late_init =
>amdgpu_ras_block_late_init;
>> +    }
>> +}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
>b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
>> index e72f799ed0fd..1c19af74e4fd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
>> @@ -24,6 +24,12 @@
>>   #ifndef __VCN_V2_5_H__
>>   #define __VCN_V2_5_H__
>>
>> +enum amdgpu_vcn_v2_6_sub_block {
>> +    AMDGPU_VCN_V2_6_VCPU_VCODEC = 0,
>> +
>> +    AMDGPU_VCN_V2_6_MAX_SUB_BLOCK,
>> +};
>> +
>>   extern const struct amdgpu_ip_block_version vcn_v2_5_ip_block;  extern
>const struct amdgpu_ip_block_version vcn_v2_6_ip_block;
>>
>> --
>> 2.25.1

Reply via email to