On Wed, Mar 11, 2026 at 03:59:16PM +0530, Riana Tauro wrote:
> Add support for clear-error-counter command in XE DRM RAS.
> This resets the counter value.
>
> Usage:
>
> $ sudo ynl --family drm_ras --do clear-error-counter --json \
> '{"node-id":1, "error-id":1}'
> None
>
> Signed-off-by: Riana Tauro <[email protected]>
> ---
> drivers/gpu/drm/xe/xe_drm_ras.c | 35 +++++++++++++++++++++++++++++++--
> 1 file changed, 33 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c
> index e07dc23a155e..c21c8b428de6 100644
> --- a/drivers/gpu/drm/xe/xe_drm_ras.c
> +++ b/drivers/gpu/drm/xe/xe_drm_ras.c
> @@ -27,6 +27,16 @@ static int hw_query_error_counter(struct
> xe_drm_ras_counter *info,
> return 0;
> }
>
> +static int hw_clear_error_counter(struct xe_drm_ras_counter *info, u32 error_id)
> +{
> + if (!info || !info[error_id].name)
> + return -ENOENT;
> +
> + atomic_set(&info[error_id].counter, 0);
> +
> + return 0;
> +}
> +
> static int query_uncorrectable_error_counter(struct drm_ras_node *ep, u32
> error_id,
> const char **name, u32 *val)
> {
> @@ -37,6 +47,15 @@ static int query_uncorrectable_error_counter(struct
> drm_ras_node *ep, u32 error_
> return hw_query_error_counter(info, error_id, name, val);
> }
>
> +static int clear_uncorrectable_error_counter(struct drm_ras_node *node, u32 error_id)
> +{
> + struct xe_device *xe = node->priv;
> + struct xe_drm_ras *ras = &xe->ras;
> + struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE];
> +
> + return hw_clear_error_counter(info, error_id);
> +}
> +
> static int query_correctable_error_counter(struct drm_ras_node *ep, u32
> error_id,
> const char **name, u32 *val)
> {
> @@ -47,6 +66,15 @@ static int query_correctable_error_counter(struct
> drm_ras_node *ep, u32 error_id
> return hw_query_error_counter(info, error_id, name, val);
> }
>
> +static int clear_correctable_error_counter(struct drm_ras_node *node, u32 error_id)
> +{
> + struct xe_device *xe = node->priv;
> + struct xe_drm_ras *ras = &xe->ras;
> + struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE];
> +
> + return hw_clear_error_counter(info, error_id);
> +}

This would've been much simpler if we had per-node info, but for now,

Reviewed-by: Raag Jadav <[email protected]>

> static struct xe_drm_ras_counter *allocate_and_copy_counters(struct
> xe_device *xe)
> {
> struct xe_drm_ras_counter *counter;
> @@ -92,10 +120,13 @@ static int assign_node_params(struct xe_device *xe,
> struct drm_ras_node *node,
> if (IS_ERR(ras->info[severity]))
> return PTR_ERR(ras->info[severity]);
>
> - if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
> + if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE) {
> node->query_error_counter = query_correctable_error_counter;
> - else
> + node->clear_error_counter = clear_correctable_error_counter;
> + } else {
> node->query_error_counter = query_uncorrectable_error_counter;
> + node->clear_error_counter = clear_uncorrectable_error_counter;
> + }
>
> return 0;
> }
> --
> 2.47.1
>