> -----Original Message-----
> From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of
> Andrey Grodzovsky
> Sent: August 29, 2019, 4:00
> To: amd-gfx@lists.freedesktop.org
> Cc: alexdeuc...@gmail.com; ckoenig.leichtzumer...@gmail.com;
> Grodzovsky, Andrey <andrey.grodzov...@amd.com>; Zhang, Hawking
> <hawking.zh...@amd.com>
> Subject: [PATCH 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.
> 
> Problem:
> Under certain conditions, when some IP bocks take a RAS error, we can get
[Tao] typo: "dmr/amdgpu" -> "drm/amdgpu", "IP bocks" -> "IP blocks"

> into a situation where a GPU reset is not possible due to RAS issues in
> SMU/PSP.
> 
> Temporary fix until a proper solution in PSP/SMU is ready:
> When an uncorrectable error happens, the DF will unconditionally broadcast
> error event packets to all its clients/slaves upon receiving the fatal
> error event and freeze all its outbound queues, and the err_event_athub
> interrupt will be triggered. In this case we use that interrupt to issue a
> GPU reset. The GPU reset code is modified for this case to avoid a HW
> reset: it only stops the schedulers, detaches all in-progress and
> not-yet-scheduled job fences, sets an error code on them and signals them.
> Also reject any new incoming job submissions from user space.
> All this is done to notify applications of the problem.
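[Not part of the patch: a condensed, standalone C sketch of the flow described
above, using invented types and names (struct fence, struct job, stop_all_jobs)
purely for illustration; the real logic is amdgpu_stop_all_jobs_on_sched() and
the -EHWPOISON checks added in the diff below.]

#include <errno.h>
#include <stdbool.h>

struct fence { int error; bool signaled; };
struct job   { struct fence finished; struct job *next; };
struct sched { struct job *pending; bool stopped; };

/* On a fatal RAS (err_event_athub) event: skip the HW reset, stop the
 * scheduler, then poison and signal every fence so user space is unblocked
 * and sees -EHWPOISON instead of hanging on a dead GPU. */
static void stop_all_jobs(struct sched *s)
{
	s->stopped = true;                       /* drm_sched_stop() analogue */
	for (struct job *j = s->pending; j; j = j->next) {
		j->finished.error = -EHWPOISON;  /* dma_fence_set_error()     */
		j->finished.signaled = true;     /* dma_fence_signal()        */
	}
	/* new submissions are then rejected in the CS ioctl with -EHWPOISON */
}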
> 
> Signed-off-by: Andrey Grodzovsky <andrey.grodzov...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  4 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 98 ++++++++++++++++++++++--------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  5 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |  6 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 30 +++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    | 12 +++-
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c      | 10 +--
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      | 24 ++++----
>  drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c     |  5 ++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c     | 32 +++++-----
>  10 files changed, 164 insertions(+), 62 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 9da681e..300adb8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -38,6 +38,7 @@
>  #include "amdgpu_gmc.h"
>  #include "amdgpu_gem.h"
>  #include "amdgpu_display.h"
> +#include "amdgpu_ras.h"
> 
>  #if defined(HAVE_DRM_FREE_LARGE)
>  #define kvfree drm_free_large
> @@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>       bool reserved_buffers = false;
>       int i, r;
> 
> +     if (amdgpu_ras_intr_triggered())
> +             return -EHWPOISON;
> +
>       if (!adev->accel_working)
>               return -EBUSY;
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 07a4ba0..3ecee10 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3734,10 +3734,10 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>       return true;
>  }
> 
> -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> +static void amdgpu_device_unlock_adev(struct amdgpu_device *adev, bool skip_kfd)
>  {
>       /*unlock kfd: SRIOV would do it separately */
> -     if (!amdgpu_sriov_vf(adev))
> +     if (!amdgpu_sriov_vf(adev) && !skip_kfd)
>                  amdgpu_amdkfd_post_reset(adev);
>       amdgpu_vf_error_trans_all(adev);
>       adev->mp1_state = PP_MP1_STATE_NONE;
> @@ -3746,6 +3746,44 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>  }
> 
> 
> +#define to_drm_sched_job(sched_job)          \
> +             container_of((sched_job), struct drm_sched_job, queue_node)
> +
> +static void amdgpu_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched) {
> +     struct drm_sched_job *s_job;
> +     struct drm_sched_entity *s_entity = NULL;
> +     int i;
> +
> +     /* Signal all jobs not yet scheduled */
> +     for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
> +             struct drm_sched_rq *rq = &sched->sched_rq[i];
> +
> +             if (!rq)
> +                     continue;
> +
> +             spin_lock(&rq->lock);
> +             list_for_each_entry(s_entity, &rq->entities, list) {
> +                     while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
> +                             struct drm_sched_fence *s_fence = s_job->s_fence;
> +
> +                             dma_fence_signal(&s_fence->scheduled);
> +                             dma_fence_set_error(&s_fence->finished, -EHWPOISON);
> +                             dma_fence_signal(&s_fence->finished);
> +                     }
> +             }
> +             spin_unlock(&rq->lock);
> +     }
> +
> +     /* Signal all jobs already scheduled to HW */
> +     list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
> +             struct drm_sched_fence *s_fence = s_job->s_fence;
> +
> +             dma_fence_set_error(&s_fence->finished, -EHWPOISON);
> +             dma_fence_signal(&s_fence->finished);
> +     }
> +}
> +
>  /**
>   * amdgpu_device_gpu_recover - reset the asic and recover scheduler
>   *
> @@ -3765,11 +3803,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>       struct amdgpu_hive_info *hive = NULL;
>       struct amdgpu_device *tmp_adev = NULL;
>       int i, r = 0;
> +     bool in_ras_intr = amdgpu_ras_intr_triggered();
> 
>       need_full_reset = job_signaled = false;
>       INIT_LIST_HEAD(&device_list);
> 
> -     dev_info(adev->dev, "GPU reset begin!\n");
> +     dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset");
> 
>       cancel_delayed_work_sync(&adev->delayed_init_work);
> 
> @@ -3799,7 +3838,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>       /* Build list of devices to reset */
>       if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>               if (!hive) {
> -                     amdgpu_device_unlock_adev(adev);
> +                     amdgpu_device_unlock_adev(adev, false);
>                       return -ENODEV;
>               }
> 
> @@ -3824,7 +3863,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>       /* block all schedulers and reset given job's ring */
>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>               /* disable ras on ALL IPs */
> -             if (amdgpu_device_ip_need_full_reset(tmp_adev))
> +             if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev))
>                       amdgpu_ras_suspend(tmp_adev);
> 
>               for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> @@ -3834,10 +3873,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>                               continue;
> 
>                       drm_sched_stop(&ring->sched, job ? &job->base : NULL);
> +
> +                     if (in_ras_intr)
> +                             amdgpu_stop_all_jobs_on_sched(&ring->sched);
>               }
>       }
> 
> 
> +     if (in_ras_intr)
> +             goto skip_hw_reset;
> +
>       /*
>        * Must check guilty signal here since after this point all old
>        * HW fences are force signaled.
> @@ -3902,34 +3947,37 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> 
>       /* Post ASIC reset for all devs .*/
>       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> -             for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> -                     struct amdgpu_ring *ring = tmp_adev->rings[i];
> 
> -                     if (!ring || !ring->sched.thread)
> -                             continue;
> +             if (!in_ras_intr) {
> +                     for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +                             struct amdgpu_ring *ring = tmp_adev->rings[i];
> 
> -                     /* No point to resubmit jobs if we didn't HW reset*/
> -                     if (!tmp_adev->asic_reset_res && !job_signaled)
> -                             drm_sched_resubmit_jobs(&ring->sched);
> +                             if (!ring || !ring->sched.thread)
> +                                     continue;
> 
> -                     drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
> -             }
> +                             /* No point to resubmit jobs if we didn't HW reset*/
> +                             if (!tmp_adev->asic_reset_res && !job_signaled)
> +                                     drm_sched_resubmit_jobs(&ring->sched);
> 
> -             if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
> -                     drm_helper_resume_force_mode(tmp_adev->ddev);
> -             }
> +                             drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
> +                     }
> 
> -             tmp_adev->asic_reset_res = 0;
> +                     if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
> +                             drm_helper_resume_force_mode(tmp_adev->ddev);
> +                     }
> 
> -             if (r) {
> -                     /* bad news, how to tell it to userspace ? */
> -                     dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
> -                     amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
> -             } else {
> -                     dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter));
> +                     tmp_adev->asic_reset_res = 0;
> +
> +                     if (r) {
> +                             /* bad news, how to tell it to userspace ? */
> +                             dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
> +                             amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
> +                     } else {
> +                             dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter));
> +                     }
>               }
> 
> -             amdgpu_device_unlock_adev(tmp_adev);
> +             amdgpu_device_unlock_adev(tmp_adev, in_ras_intr);
>       }
> 
>       if (hive)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 151d7f2..757fd6d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -40,6 +40,8 @@
> 
>  #include "amdgpu_amdkfd.h"
> 
> +#include "amdgpu_ras.h"
> +
>  /*
>   * KMS wrapper.
>   * - 3.0.0 - initial driver
> @@ -1179,6 +1181,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
>       struct drm_device *dev = pci_get_drvdata(pdev);
>       struct amdgpu_device *adev = dev->dev_private;
> 
> +     if (amdgpu_ras_intr_triggered())
> +             return;
> +
>       /* if we are running in a VM, make sure the device
>        * torn down properly on reboot/shutdown.
>        * unfortunately we can't detect certain
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index da2143d..ced766c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -1046,6 +1046,12 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
>       /* Ensure IB tests are run on ring */
>       flush_delayed_work(&adev->delayed_init_work);
> 
> +
> +     if (amdgpu_ras_intr_triggered()) {
> +             DRM_ERROR("RAS Intr triggered, device disabled!!");
> +             return -EHWPOISON;
> +     }
> +
>       file_priv->driver_priv = NULL;
> 
>       r = pm_runtime_get_sync(dev->dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2d5897a..086e6df 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -24,6 +24,8 @@
>  #include <linux/debugfs.h>
>  #include <linux/list.h>
>  #include <linux/module.h>
> +#include <linux/reboot.h>
> +#include <linux/syscalls.h>
>  #include "amdgpu.h"
>  #include "amdgpu_ras.h"
>  #include "amdgpu_atomfirmware.h"
> @@ -64,6 +66,9 @@ const char *ras_block_string[] = {
>  /* inject address is 52 bits */
>  #define      RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)
> 
> +
> +atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
> +
>  static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
>               uint64_t offset, uint64_t size,
>               struct amdgpu_bo **bo_ptr);
> @@ -80,7 +85,7 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
>       ssize_t s;
>       char val[128];
> 
> -     if (amdgpu_ras_error_query(obj->adev, &info))
> +     if (amdgpu_ras_error_query(obj->adev, &info, false))
>               return -EINVAL;
> 
>       s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
> @@ -188,6 +193,10 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
> 
>       return 0;
>  }
> +
> +static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
> +             struct ras_common_if *head);
> +
>  /**
>   * DOC: AMDGPU RAS debugfs control interface
>   *
> @@ -304,7 +313,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
>               .head = obj->head,
>       };
> 
> -     if (amdgpu_ras_error_query(obj->adev, &info))
> +     if (amdgpu_ras_error_query(obj->adev, &info, false))
>               return -EINVAL;
> 
>       return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
> @@ -591,7 +600,7 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
> 
>  /* query/inject/cure begin */
>  int amdgpu_ras_error_query(struct amdgpu_device *adev,
> -             struct ras_query_if *info)
> +             struct ras_query_if *info, bool print)
>  {
>       struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
>       struct ras_err_data err_data = {0, 0, 0, NULL};
> @@ -627,12 +636,14 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
>       info->ue_count = obj->err_data.ue_count;
>       info->ce_count = obj->err_data.ce_count;
> 
> -     if (err_data.ce_count)
> +     if (err_data.ce_count || print) {
>               dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
>                        obj->err_data.ce_count, ras_block_str(info->head.block));
[Tao] Could you explain why print is needed even when ce/ue_count == 0? Also, I
think this change could be split out into a separate patch.
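[Not part of the patch: a tiny standalone C illustration of the behavior in
question, with made-up names; with the new 'print' flag both messages are
emitted even when the counts are zero.]

#include <stdbool.h>
#include <stdio.h>

static void report_ras_counts(unsigned long ce, unsigned long ue, bool print)
{
	/* mirrors the patch: log when errors were counted OR when print is set */
	if (ce || print)
		printf("%lu correctable errors detected\n", ce);
	if (ue || print)
		printf("%lu uncorrectable errors detected\n", ue);
}

int main(void)
{
	report_ras_counts(0, 0, true);	/* prints two "0 ... detected" lines */
	return 0;
}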

> -     if (err_data.ue_count)
> +     }
> +     if (err_data.ue_count || print) {
>               dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
>                        obj->err_data.ue_count, ras_block_str(info->head.block));
> +     }
> 
>       return 0;
>  }
> @@ -702,7 +713,7 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>                       .head = obj->head,
>               };
> 
> -             if (amdgpu_ras_error_query(adev, &info))
> +             if (amdgpu_ras_error_query(adev, &info, true))
>                       return -EINVAL;
> 
>               data.ce_count += info.ce_count;
> @@ -1718,3 +1729,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
> 
>       return 0;
>  }
> +
> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) {
> +     if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
> +             DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");
> +     }
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 5a0df73..c0e22af 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -587,7 +587,7 @@ void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
>               struct ras_common_if *head);
> 
>  int amdgpu_ras_error_query(struct amdgpu_device *adev,
> -             struct ras_query_if *info);
> +             struct ras_query_if *info, bool print);
> 
>  int amdgpu_ras_error_inject(struct amdgpu_device *adev,
>               struct ras_inject_if *info);
> @@ -600,4 +600,14 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
> 
>  int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
>               struct ras_dispatch_if *info);
> +
> +extern atomic_t amdgpu_ras_in_intr;
> +
> +static inline bool amdgpu_ras_intr_triggered(void) {
> +     return !!atomic_read(&amdgpu_ras_in_intr);
> +}
> +
> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
> +
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index b2c86a0..e7a83f6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -5669,10 +5669,12 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
>               struct amdgpu_iv_entry *entry)
>  {
>       /* TODO ue will trigger an interrupt. */
> -     kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> -     if (adev->gfx.funcs->query_ras_error_count)
> -             adev->gfx.funcs->query_ras_error_count(adev, err_data);
> -     amdgpu_ras_reset_gpu(adev, 0);
> +     if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
[Tao] Have you encountered any error without the check? ras_data_cb would not 
be registered if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))

> +             kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> +             if (adev->gfx.funcs->query_ras_error_count)
> +                     adev->gfx.funcs->query_ras_error_count(adev, err_data);
> +             amdgpu_ras_reset_gpu(adev, 0);
> +     }
>       return AMDGPU_RAS_SUCCESS;
>  }
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 43b4fbc..87a66c2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -243,18 +243,20 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
>               struct ras_err_data *err_data,
>               struct amdgpu_iv_entry *entry)
>  {
> -     kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> -     if (adev->umc.funcs->query_ras_error_count)
> -             adev->umc.funcs->query_ras_error_count(adev, err_data);
> -     /* umc query_ras_error_address is also responsible for clearing
> -      * error status
> -      */
> -     if (adev->umc.funcs->query_ras_error_address)
> -             adev->umc.funcs->query_ras_error_address(adev, err_data);
> +     if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
[Tao] AMDGPU_RAS_BLOCK__UMC

> +             kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> +             if (adev->umc.funcs->query_ras_error_count)
> +                     adev->umc.funcs->query_ras_error_count(adev, err_data);
> +             /* umc query_ras_error_address is also responsible for clearing
> +              * error status
> +              */
> +             if (adev->umc.funcs->query_ras_error_address)
> +                     adev->umc.funcs->query_ras_error_address(adev, err_data);
> 
> -     /* only uncorrectable error needs gpu reset */
> -     if (err_data->ue_count)
> -             amdgpu_ras_reset_gpu(adev, 0);
> +             /* only uncorrectable error needs gpu reset */
> +             if (err_data->ue_count)
> +                     amdgpu_ras_reset_gpu(adev, 0);
> +     }
> 
>       return AMDGPU_RAS_SUCCESS;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> index 367f9d6..545990c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> @@ -30,6 +30,7 @@
>  #include "nbio/nbio_7_4_0_smn.h"
>  #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
>  #include <uapi/linux/kfd_ioctl.h>
> +#include "amdgpu_ras.h"
> 
>  #define smnNBIF_MGCG_CTRL_LCLK       0x1013a21c
> 
> @@ -329,6 +330,8 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
>                                               BIF_DOORBELL_INT_CNTL,
>                                               RAS_CNTLR_INTERRUPT_CLEAR, 1);
>               WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
> +
> +             amdgpu_ras_global_ras_isr(adev);
>       }
>  }
> 
> @@ -344,6 +347,8 @@ static void nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d
>                                               BIF_DOORBELL_INT_CNTL,
>                                               RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
>               WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
> +
> +             amdgpu_ras_global_ras_isr(adev);
>       }
>  }
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index 956432f..438e504 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -1972,24 +1972,26 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
>       uint32_t err_source;
>       int instance;
> 
> -     instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
> -     if (instance < 0)
> -             return 0;
> +     if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
[Tao] AMDGPU_RAS_BLOCK__SDMA

> +             instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
> +             if (instance < 0)
> +                     return 0;
> 
> -     switch (entry->src_id) {
> -     case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
> -             err_source = 0;
> -             break;
> -     case SDMA0_4_0__SRCID__SDMA_ECC:
> -             err_source = 1;
> -             break;
> -     default:
> -             return 0;
> -     }
> +             switch (entry->src_id) {
> +             case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
> +                     err_source = 0;
> +                     break;
> +             case SDMA0_4_0__SRCID__SDMA_ECC:
> +                     err_source = 1;
> +                     break;
> +             default:
> +                     return 0;
> +             }
> 
> -     kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> +             kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> 
> -     amdgpu_ras_reset_gpu(adev, 0);
> +             amdgpu_ras_reset_gpu(adev, 0);
> +     }
> 
>       return AMDGPU_RAS_SUCCESS;
>  }
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx