[AMD Official Use Only - Internal Distribution Only]


-----Original Message-----
From: Lin, Amber <amber....@amd.com> 
Sent: Thursday, July 23, 2020 5:27 PM
To: Joshi, Mukul <mukul.jo...@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix <felix.kuehl...@amd.com>
Subject: Re: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event



On 2020-07-22 12:08 p.m., Mukul Joshi wrote:
> Add support for reporting thermal throttling events through SMI.
> Also, add a counter to count the number of throttling interrupts 
> observed and report the count in the SMI event message.
>
> Signed-off-by: Mukul Joshi <mukul.jo...@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  4 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  7 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   | 68 ++++++++++++++-----
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h   |  2 +
>   drivers/gpu/drm/amd/powerplay/amdgpu_smu.c    |  1 +
>   drivers/gpu/drm/amd/powerplay/arcturus_ppt.c  |  1 +
>   .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h    |  1 +
>   drivers/gpu/drm/amd/powerplay/smu_v11_0.c     |  5 ++
>   include/uapi/linux/kfd_ioctl.h                |  3 +-
>   10 files changed, 75 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 1b865fed74ca..19e4658756d8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -755,4 +755,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void 
> *ih_ring_entry)
>   void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
>   {
>   }
> +
> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t 
> +throttle_bitmask) { }
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 3f2b695cf19e..e8b0258aae24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -269,5 +269,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm);
>   int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
>                                              struct dma_fence *fence);
>   void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t 
> +throttle_bitmask);
>   
>   #endif /* AMDGPU_AMDKFD_H_INCLUDED */ diff --git 
> a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 4bfedaab183f..d5e790f046b4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -29,6 +29,7 @@
>   #include "cwsr_trap_handler.h"
>   #include "kfd_iommu.h"
>   #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>   
>   #define MQD_SIZE_ALIGNED 768
>   
> @@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)
>       WARN_ONCE(count < 0, "Compute profile ref. count error");
>   }
>   
> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t 
> +throttle_bitmask) {
> +     if (kfd)
> +             kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask); 
> }
> +
>   #if defined(CONFIG_DEBUG_FS)
>   
>   /* This function will send a package to HIQ to hang the HWS diff 
> --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index 7b348bf9df21..00c90b47155b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -24,6 +24,7 @@
>   #include <linux/wait.h>
>   #include <linux/anon_inodes.h>
>   #include <uapi/linux/kfd_ioctl.h>
> +#include "amdgpu.h"
>   #include "amdgpu_vm.h"
>   #include "kfd_priv.h"
>   #include "kfd_smi_events.h"
> @@ -148,6 +149,55 @@ static int kfd_smi_ev_release(struct inode *inode, 
> struct file *filep)
>       return 0;
>   }
>   
> +static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long 
> smi_event,
> +                           char *event_msg, int len)
> +{
> +     struct kfd_smi_client *client;
> +
> +     rcu_read_lock();
> +
> +     list_for_each_entry_rcu(client, &dev->smi_clients, list) {
> +             if (!(READ_ONCE(client->events) & smi_event))
> +                     continue;
> +             spin_lock(&client->lock);
> +             if (kfifo_avail(&client->fifo) >= len) {
> +                     kfifo_in(&client->fifo, event_msg, len);
> +                     wake_up_all(&client->wait_queue);
> +             } else {
> +                     pr_debug("smi_event(EventID: %llu): no space left\n",
> +                                     smi_event);
> +             }
> +             spin_unlock(&client->lock);
> +     }
> +
> +     rcu_read_unlock();
> +}
> +
> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
> +                                          uint32_t throttle_bitmask)
> +{
> +     struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
> +     /*
> +      * ThermalThrottle msg = gpu_id(4):throttle_bitmask(4):
gpu_id is not needed. The user calls ioctl with GPU specified and KFD returns 
an anonymous fd. Read from this anon_fd already identify the GPU.

I agree with you. But I would prefer to keep gpu_id in the SMI event message. 
The benefit is it explicitly identifies the GPU sending the message. This way 
user-space doesn't need to maintain an internal mapping of anon_fd to gpu_id.

Regards,
Mukul

> +      *                       thermal_interrupt_count(8):
> +      * 16 bytes event + 1 byte space + 4 bytes gpu_id + 1 byte : +
> +      * 4 byte throttle_bitmask + 1 byte : +
> +      * 8 byte thermal_interupt_counter + 1 byte \n = 36
> +      */
> +     char fifo_in[36];
> +     int len;
> +
> +     if (list_empty(&dev->smi_clients))
> +             return;
> +
> +     len = snprintf(fifo_in, 36, "%x %x:%x:%llx\n",
> +                    KFD_SMI_EVENT_THERMAL_THROTTLE,
> +                    dev->id, throttle_bitmask,
> +                    atomic64_read(&adev->smu.throttle_int_counter));
> +
> +     add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, 
> +len); }
> +
>   void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>   {
>       struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; @@ 
> -156,7 +206,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, 
> uint16_t pasid)
>       /* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
>        */
>       char fifo_in[43];
> -     struct kfd_smi_client *client;
>       int len;
>   
>       if (list_empty(&dev->smi_clients))
> @@ -171,22 +220,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, 
> uint16_t pasid)
>       len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
>               task_info.pid, task_info.task_name);
>   
> -     rcu_read_lock();
> -
> -     list_for_each_entry_rcu(client, &dev->smi_clients, list) {
> -             if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
> -                     continue;
> -             spin_lock(&client->lock);
> -             if (kfifo_avail(&client->fifo) >= len) {
> -                     kfifo_in(&client->fifo, fifo_in, len);
> -                     wake_up_all(&client->wait_queue);
> -             }
> -             else
> -                     pr_debug("smi_event(vmfault): no space left\n");
> -             spin_unlock(&client->lock);
> -     }
> -
> -     rcu_read_unlock();
> +     add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
>   }
>   
>   int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) diff --git 
> a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> index a9cb218fef96..15537b2cccb5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -25,5 +25,7 @@
>   
>   int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
>   void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t 
> pasid);
> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
> +                                          uint32_t throttle_bitmask);
>   
>   #endif
> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
> b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> index b197dcaed064..52b52cbb90e2 100644
> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> @@ -639,6 +639,7 @@ static int smu_sw_init(void *handle)
>       mutex_init(&smu->message_lock);
>   
>       INIT_WORK(&smu->throttling_logging_work, 
> smu_throttling_logging_work_fn);
> +     atomic64_set(&smu->throttle_int_counter, 0);
>       smu->watermarks_bitmap = 0;
>       smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
>       smu->default_power_profile_mode = 
> PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
> diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c 
> b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> index 9b68760dd35b..eb3a57593f69 100644
> --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> @@ -2251,6 +2251,7 @@ static void 
> arcturus_log_thermal_throttling_event(struct smu_context *smu)
>   
>       dev_warn(adev->dev, "WARN: GPU thermal throttling temperature reached, 
> expect performance decrease. %s.\n",
>                       log_buf);
> +     kgd2kfd_smi_event_throttle(smu->adev->kfd.dev, throttler_status);
>   }
>   
>   static const struct pptable_funcs arcturus_ppt_funcs = { diff --git 
> a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h 
> b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> index 896b443f1ce8..18ba421c59bd 100644
> --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> @@ -446,6 +446,7 @@ struct smu_context
>       bool dc_controlled_by_gpio;
>   
>       struct work_struct throttling_logging_work;
> +     atomic64_t throttle_int_counter;
>   };
>   
>   struct i2c_adapter;
> diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c 
> b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> index fd82402065e6..a9453ec01619 100644
> --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> @@ -1311,6 +1311,11 @@ static int smu_v11_0_irq_process(struct amdgpu_device 
> *adev,
>                               smu_v11_0_ack_ac_dc_interrupt(&adev->smu);
>                               break;
>                       case 0x7:
> +                             /*
> +                              * Increment the throttle interrupt counter
> +                              */
> +                             atomic64_inc(&smu->throttle_int_counter);
> +
>                               if 
> (!atomic_read(&adev->throttling_logging_enabled))
>                                       return 0;
>   
> diff --git a/include/uapi/linux/kfd_ioctl.h 
> b/include/uapi/linux/kfd_ioctl.h index f738c3b53f4e..df6c7a43aadc 
> 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -450,7 +450,8 @@ struct kfd_ioctl_import_dmabuf_args {
>    * KFD SMI(System Management Interface) events
>    */
>   /* Event type (defined by bitmask) */
> -#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
> +#define KFD_SMI_EVENT_VMFAULT                        0x0000000000000001
> +#define KFD_SMI_EVENT_THERMAL_THROTTLE               0x0000000000000002
>   
>   struct kfd_ioctl_smi_events_args {
>       __u32 gpuid;    /* to KFD */
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to