[Public]

Hi Jesse,

> -----Original Message-----
> From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Huang,
> Tim
> Sent: Tuesday, October 29, 2024 12:21 PM
> To: Zhang, Jesse(Jie) <jesse.zh...@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <alexander.deuc...@amd.com>; Koenig, Christian
> <christian.koe...@amd.com>; Zhang, Jesse(Jie) <jesse.zh...@amd.com>;
> Zhang, Jesse(Jie) <jesse.zh...@amd.com>
> Subject: RE: [PATCH V3 1/5] drm/amdgpu: Add sysfs interface for gc reset
> mask
>
> [Public]
>
> [Public]
>
> Hi Jesse,
>
> > -----Original Message-----
> > From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of
> > jesse.zh...@amd.com
> > Sent: Thursday, October 24, 2024 3:39 PM
> > To: amd-gfx@lists.freedesktop.org
> > Cc: Deucher, Alexander <alexander.deuc...@amd.com>; Koenig, Christian
> > <christian.koe...@amd.com>; Zhang, Jesse(Jie) <jesse.zh...@amd.com>;
> > Zhang, Jesse(Jie) <jesse.zh...@amd.com>
> > Subject: [PATCH V3 1/5] drm/amdgpu: Add sysfs interface for gc reset
> > mask
> >
> > Add two sysfs interfaces for gfx and compute:
> > gfx_reset_mask
> > compute_reset_mask
> >
> > These interfaces are read-only and show the resets supported by the IP.
> > For example, full adapter reset (mode1/mode2/BACO/etc), soft reset,
> > queue reset, and pipe reset.
> >
> > V2: the sysfs node returns a text string instead of some flags
> > (Christian)
> > v3: add a generic helper which takes the ring as parameter
> >     and print the strings in the order they are applied (Christian)
> >
> >     check amdgpu_gpu_recovery  before creating sysfs file itself,
> >     and initialize supported_reset_types in IP version files (Lijo)
> >
> > Signed-off-by: Jesse Zhang <jesse.zh...@amd.com> Suggested-by:Alex
> > Deucher <alexander.deuc...@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  8 +++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 ++++++++++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 66
> > ++++++++++++++++++++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    |  4 ++
> >  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c     |  6 ++
> >  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c     | 14 +++++
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c    | 12 ++++
> >  7 files changed, 147 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index 48c9b9b06905..aea1031d7b84 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -300,6 +300,12 @@ extern int amdgpu_wbrf;
> >  #define AMDGPU_RESET_VCE                     (1 << 13)
> >  #define AMDGPU_RESET_VCE1                    (1 << 14)
> >
> > +/* reset mask */
> > +#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset,
> > +mode1/mode2/BACO/etc. */ #define AMDGPU_RESET_TYPE_SOFT_RESET
> (1
> > << 1)
> > +/* IP level soft reset */ #define AMDGPU_RESET_TYPE_PER_QUEUE (1 <<
> > +2)
> > +/* per queue */ #define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per
> > +pipe */
> > +
> >  /* max cursor sizes (in pixels) */
> >  #define CIK_CURSOR_WIDTH 128
> >  #define CIK_CURSOR_HEIGHT 128
> > @@ -1466,6 +1472,8 @@ struct dma_fence
> *amdgpu_device_get_gang(struct
> > amdgpu_device *adev);  struct dma_fence
> > *amdgpu_device_switch_gang(struct amdgpu_device *adev,
> >                                           struct dma_fence *gang);
> > bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
> > +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
> > +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
> >
> >  /* atpx handler */
> >  #if defined(CONFIG_VGA_SWITCHEROO)
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index ef715b2bbcdb..cd1e3f018893 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -6684,3 +6684,40 @@ uint32_t amdgpu_device_wait_on_rreg(struct
> > amdgpu_device *adev,
> >       }
> >       return ret;
> >  }
> > +
> > +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) {
> > +     ssize_t size = 0;
> > +
> > +     if (!ring)
> > +             return size;
> > +
> > +     if (amdgpu_device_should_recover_gpu(ring->adev))
> > +             size |= AMDGPU_RESET_TYPE_FULL;
> > +
> > +     if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
> > +         !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
> > +             size |= AMDGPU_RESET_TYPE_SOFT_RESET;
> > +
> > +     return size;
> > +}
> > +
> > +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) {
> > +     ssize_t size = 0;
> > +
> > +     if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
> > +             size += sysfs_emit_at(buf, size, "soft ");
> > +
> > +     if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
> > +             size += sysfs_emit_at(buf, size, "queue ");
> > +
> > +     if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
> > +             size += sysfs_emit_at(buf, size, "pipe ");
> > +
> > +     if (supported_reset & AMDGPU_RESET_TYPE_FULL)
> > +             size += sysfs_emit_at(buf, size, "full ");
> > +
> > +     size += sysfs_emit_at(buf, size, "\n");
> > +     return size;
> > +}
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > index e96984c53e72..6de1f3bf6863 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > @@ -1588,6 +1588,32 @@ static ssize_t
> > amdgpu_gfx_set_enforce_isolation(struct device *dev,
> >       return count;
> >  }
> >
> > +static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
> > +                                             struct
> device_attribute *attr,
> > +                                             char *buf) {
> > +     struct drm_device *ddev = dev_get_drvdata(dev);
> > +     struct amdgpu_device *adev = drm_to_adev(ddev);
> > +
> > +     if (!adev)
> > +             return -ENODEV;
> > +
> > +     return amdgpu_show_reset_mask(buf,
> > + adev->gfx.gfx_supported_reset); }
> > +
> > +static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
> > +                                             struct
> device_attribute *attr,
> > +                                             char *buf) {
> > +     struct drm_device *ddev = dev_get_drvdata(dev);
> > +     struct amdgpu_device *adev = drm_to_adev(ddev);
> > +
> > +     if (!adev)
> > +             return -ENODEV;
> > +
> > +     return amdgpu_show_reset_mask(buf,
> > adev->gfx.compute_supported_reset);
> > +}
> > +
> >  static DEVICE_ATTR(run_cleaner_shader, 0200,
> >                  NULL, amdgpu_gfx_set_run_cleaner_shader);
> >
> > @@ -1602,6 +1628,12 @@ static DEVICE_ATTR(current_compute_partition,
> > 0644,  static DEVICE_ATTR(available_compute_partition, 0444,
> >                  amdgpu_gfx_get_available_compute_partition, NULL);
> >
> > +static DEVICE_ATTR(gfx_reset_mask, 0444,
> > +                amdgpu_gfx_get_gfx_reset_mask, NULL);
> > +
> > +static DEVICE_ATTR(compute_reset_mask, 0444,
> > +                amdgpu_gfx_get_compute_reset_mask, NULL);
> > +
> >  int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev)  {
> >       struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; @@ -1702,6
> > +1734,40 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device
> > *adev,
> >                           cleaner_shader_size);  }
> >
> > +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev) {
> > +     int r = 0;
> > +
> > +     if (!amdgpu_gpu_recovery)
> > +             return r;
> > +
> > +     if (adev->gfx.num_gfx_rings) {
> > +             r = device_create_file(adev->dev,
> &dev_attr_gfx_reset_mask);
> > +             if (r)
> > +                     return r;
> > +     }
> > +
> > +     if (adev->gfx.num_compute_rings) {
> > +             r = device_create_file(adev->dev,
> &dev_attr_compute_reset_mask);
> > +             if (r)
> > +                     return r;
> > +     }
> > +
> > +     return r;
> > +}
> > +
> > +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev) {
> > +     if (!amdgpu_gpu_recovery)
> > +             return;
> > +
> > +     if (adev->gfx.num_gfx_rings)
> > +             device_remove_file(adev->dev, &dev_attr_gfx_reset_mask);
> > +
> > +     if (adev->gfx.num_compute_rings)
> > +             device_remove_file(adev->dev,
> > + &dev_attr_compute_reset_mask); }
> > +
> >  /**
> >   * amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD
> > (Graphics Driver)
> >   * @adev: amdgpu_device pointer
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > index f710178a21bc..fb0e1adf6766 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > @@ -424,6 +424,8 @@ struct amdgpu_gfx {
> >       /* reset mask */
> >       uint32_t                        grbm_soft_reset;
> >       uint32_t                        srbm_soft_reset;
> > +     uint32_t                        gfx_supported_reset;
> > +     uint32_t                        compute_supported_reset;
> >
> >       /* gfx off */
> >       bool                            gfx_off_state;      /* true:
> > enabled, false: disabled */
> > @@ -582,6 +584,8 @@ void amdgpu_gfx_sysfs_isolation_shader_fini(struct
> > amdgpu_device *adev);  void
> > amdgpu_gfx_enforce_isolation_handler(struct
> > work_struct *work);  void
> > amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring);
> > void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring
> > *ring);
> > +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev);
> > +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev);
> >
> >  static inline const char *amdgpu_gfx_compute_mode_desc(int mode)  {
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > index 9da95b25e158..446e37768397 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>
> The reset_mask_init and reset_mask_fini calls may be missing here for gfx_v10.
>
>
> Best Regards
> Tim
>
> > @@ -4806,6 +4806,9 @@ static int gfx_v10_0_sw_init(struct
> > amdgpu_ip_block *ip_block)
> >                       }
> >               }
> >       }
> > +     /* TODO: Check the version that supports fully queue reset */
> > +     adev->gfx.gfx_supported_reset |=
> > +             amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> >
> >       ring_id = 0;
> >       /* set up the compute queues - allocate horizontally across
> > pipes */ @@
> > -4825,6 +4828,9 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block
> > *ip_block)
> >                       }
> >               }
> >       }
> > +     /* TODO: Check the version that supports fully queue reset */
> > +     adev->gfx.compute_supported_reset |=
> > +
> > + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> >
> >       r = amdgpu_gfx_kiq_init(adev, GFX10_MEC_HPD_SIZE, 0);
> >       if (r) {
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > index 5aff8f72de9c..3b23402dfb47 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > @@ -1560,6 +1560,11 @@ static int gfx_v11_0_sw_init(struct
> > amdgpu_ip_block *ip_block)
> >               adev->userq_funcs[AMDGPU_HW_IP_GFX] =
> > &userq_mes_v11_0_funcs;
> >               adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] =
> > &userq_mes_v11_0_funcs;  #endif
> > +             if ((adev->gfx.me_fw_version >= 2280) &&
> > +                 (adev->gfx.mec_fw_version >= 2410)) {
> > +                     adev->gfx.compute_supported_reset =
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > +                     adev->gfx.gfx_supported_reset =
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > +             }
> >               break;
> >       case IP_VERSION(11, 0, 1):
> >       case IP_VERSION(11, 0, 4):
> > @@ -1663,6 +1668,8 @@ static int gfx_v11_0_sw_init(struct
> > amdgpu_ip_block *ip_block)
> >                       }
> >               }
> >       }
> > +     adev->gfx.gfx_supported_reset |=
> > +             amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> >
> >       ring_id = 0;
> >       /* set up the compute queues - allocate horizontally across
> > pipes */ @@
> > -1682,6 +1689,8 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block
> > *ip_block)
> >                       }
> >               }
> >       }
> > +     adev->gfx.compute_supported_reset |=
> > +
> > + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> >
> >       if (!adev->enable_mes_kiq) {
> >               r = amdgpu_gfx_kiq_init(adev, GFX11_MEC_HPD_SIZE, 0);
> @@
> > -1721,6 +1730,10 @@ static int gfx_v11_0_sw_init(struct
> > amdgpu_ip_block
> > *ip_block)
> >       if (r)
> >               return r;
> >
> > +     r = amdgpu_gfx_sysfs_reset_mask_init (adev);
> > +     if (r)
> > +             return r;
> > +
> >       return 0;
> >  }
> >
> > @@ -1783,6 +1796,7 @@ static int gfx_v11_0_sw_fini(struct
> > amdgpu_ip_block *ip_block)
> >       gfx_v11_0_free_microcode(adev);
> >
> >       amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> > +     amdgpu_gfx_sysfs_reset_mask_fini(adev);
> >
> >       kfree(adev->gfx.ip_dump_core);
> >       kfree(adev->gfx.ip_dump_compute_queues);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > index 016290f00592..b9d5a79ba85c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > @@ -1067,6 +1067,11 @@ static int gfx_v9_4_3_sw_init(struct
> > amdgpu_ip_block *ip_block)
> >                               dev_err(adev->dev, "Failed to initialize
> cleaner shader\n");
> >                       }
> >               }
> > +
> > +             if (adev->gfx.mec_fw_version >= 155) {
> > +                     adev->gfx.compute_supported_reset =
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > +                     adev->gfx.compute_supported_reset |=
> > AMDGPU_RESET_TYPE_PER_PIPE;
> > +             }
> >               break;
> >       default:
> >               adev->gfx.enable_cleaner_shader = false; @@ -1157,6
> > +1162,9 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block
> *ip_block)
> >                       return r;
> >       }
> >
> > +     adev->gfx.compute_supported_reset |=
> > +
> > + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);

Careful handling may be required when initializing
`adev->gfx.gfx_supported_reset` and `adev->gfx.compute_supported_reset`. For
instance, in `gfx_v9`, `adev->gfx.gfx_supported_reset` is never initialized, yet
the sysfs file is still expected to be created by
`amdgpu_gfx_sysfs_reset_mask_init`. Additionally,
`adev->gfx.compute_supported_reset` may be bitwise ORed with an uninitialized
value when `adev->gfx.mec_fw_version` < 155.

Best Regards
Tim Huang

> > +
> >       r = gfx_v9_4_3_gpu_early_init(adev);
> >       if (r)
> >               return r;
> > @@ -1175,6 +1183,9 @@ static int gfx_v9_4_3_sw_init(struct
> > amdgpu_ip_block *ip_block)
> >       if (r)
> >               return r;
> >
> > +     r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> > +     if (r)
> > +             return r;
> >       return 0;
> >  }
> >
> > @@ -1200,6 +1211,7 @@ static int gfx_v9_4_3_sw_fini(struct
> > amdgpu_ip_block *ip_block)
> >       gfx_v9_4_3_free_microcode(adev);
> >       amdgpu_gfx_sysfs_fini(adev);
> >       amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> > +     amdgpu_gfx_sysfs_reset_mask_fini(adev);
> >
> >       kfree(adev->gfx.ip_dump_core);
> >       kfree(adev->gfx.ip_dump_compute_queues);
> > --
> > 2.25.1

Reply via email to