Am 2020-08-20 um 11:53 p.m. schrieb Huang Rui:
> On Fri, Aug 21, 2020 at 10:41:17AM +0800, Kuehling, Felix wrote:
>> Am 2020-08-20 um 4:40 a.m. schrieb Huang Rui:
>>> We still have a few iommu issues which need to address, so force raven
>>> as "dgpu" path for the moment.
>>>
>>> This is to add the fallback path to bypass IOMMU if IOMMU v2 is disabled
>>> or ACPI CRAT table not correct.
>>>
>>> v2: Use ignore_crat parameter to decide whether it will go with IOMMUv2.
>>> v3: Align with existed thunk, don't change the way of raven, only renoir
>>>     will use "dgpu" path by default.
>>> v4: don't update global ignore_crat in the driver, and revise fallback
>>>     function if CRAT is broken.
>>>
>>> Signed-off-by: Huang Rui <ray.hu...@amd.com>
>>> ---
>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |  5 ++++-
>>>  drivers/gpu/drm/amd/amdkfd/kfd_crat.c     | 23 +++++++++++++++++++++--
>>>  drivers/gpu/drm/amd/amdkfd/kfd_device.c   |  5 ++++-
>>>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h     | 10 ++++++++--
>>>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 20 ++++++++++++++++++++
>>>  5 files changed, 57 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index a9a4319c24ae..189f9d7e190d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -684,11 +684,14 @@ MODULE_PARM_DESC(debug_largebar,
>>>   * Ignore CRAT table during KFD initialization. By default, KFD uses the 
>>> ACPI CRAT
>>>   * table to get information about AMD APUs. This option can serve as a 
>>> workaround on
>>>   * systems with a broken CRAT table.
>>> + *
>>> + * Default is auto (according to asic type, iommu_v2, and crat table, to 
>>> decide
>>> + * whehter use CRAT)
>>>   */
>>>  int ignore_crat;
>>>  module_param(ignore_crat, int, 0444);
>>>  MODULE_PARM_DESC(ignore_crat,
>>> -   "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 
>>> = ignore CRAT)");
>>> +   "Ignore CRAT table during KFD initialization (0 = auto (default), 1 = 
>>> ignore CRAT)");
>>>  
>>>  /**
>>>   * DOC: halt_if_hws_hang (int)
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>>> index 59557e3e206a..a17cfc290072 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>>> @@ -22,6 +22,7 @@
>>>  
>>>  #include <linux/pci.h>
>>>  #include <linux/acpi.h>
>>> +#include <asm/processor.h>
>>>  #include "kfd_crat.h"
>>>  #include "kfd_priv.h"
>>>  #include "kfd_topology.h"
>>> @@ -740,6 +741,25 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev 
>>> *kdev,
>>>     return 0;
>>>  }
>>>  
>>> +
>>> +#ifdef CONFIG_ACPI
>>> +
>>> +bool kfd_ignore_crat(void)
>>> +{
>>> +   bool ret;
>>> +
>>> +   if (ignore_crat)
>>> +           return true;
>>> +
>>> +#ifndef KFD_SUPPORT_IOMMU_V2
>>> +   ret = true;
>>> +#else
>>> +   ret = false;
>>> +#endif
>>> +
>>> +   return ret;
>>> +}
>>> +
>>>  /*
>>>   * kfd_create_crat_image_acpi - Allocates memory for CRAT image and
>>>   * copies CRAT from ACPI (if available).
>>> @@ -751,7 +771,6 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
>>>   *
>>>   * Return 0 if successful else return error code
>>>   */
>>> -#ifdef CONFIG_ACPI
>>>  int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
>>>  {
>>>     struct acpi_table_header *crat_table;
>>> @@ -775,7 +794,7 @@ int kfd_create_crat_image_acpi(void **crat_image, 
>>> size_t *size)
>>>             return -EINVAL;
>>>     }
>>>  
>>> -   if (ignore_crat) {
>>> +   if (kfd_ignore_crat()) {
>>>             pr_info("CRAT table disabled by module option\n");
>>>             return -ENODATA;
>>>     }
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> index 2c030c2b5b8d..fdf64d361be3 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> @@ -112,6 +112,7 @@ static const struct kfd_device_info carrizo_device_info 
>>> = {
>>>     .num_xgmi_sdma_engines = 0,
>>>     .num_sdma_queues_per_engine = 2,
>>>  };
>>> +#endif
>>>  
>>>  static const struct kfd_device_info raven_device_info = {
>>>     .asic_family = CHIP_RAVEN,
>>> @@ -130,7 +131,6 @@ static const struct kfd_device_info raven_device_info = 
>>> {
>>>     .num_xgmi_sdma_engines = 0,
>>>     .num_sdma_queues_per_engine = 2,
>>>  };
>>> -#endif
>>>  
>>>  static const struct kfd_device_info hawaii_device_info = {
>>>     .asic_family = CHIP_HAWAII,
>>> @@ -688,6 +688,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>>>             goto gws_error;
>>>     }
>>>  
>>> +   /* If CRAT is broken, won't set iommu enabled */
>>> +   kfd_double_confirm_iommu_support(kfd);
>>> +
>>>     if (kfd_iommu_device_init(kfd)) {
>>>             dev_err(kfd_device, "Error initializing iommuv2\n");
>>>             goto device_iommu_error;
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 82f955750e75..5b70fbe429f1 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -308,12 +308,14 @@ struct kfd_dev {
>>>  
>>>     /* xGMI */
>>>     uint64_t hive_id;
>>> -    
>>>     /* UUID */
>>>     uint64_t unique_id;
>>>  
>>>     bool pci_atomic_requested;
>>>  
>>> +   /* Use IOMMU v2 flag */
>>> +   bool use_iommu_v2;
>>> +
>>>     /* SRAM ECC flag */
>>>     atomic_t sram_ecc_flag;
>>>  
>>> @@ -1009,6 +1011,7 @@ struct kfd_dev *kfd_device_by_pci_dev(const struct 
>>> pci_dev *pdev);
>>>  struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd);
>>>  int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev);
>>>  int kfd_numa_node_to_apic_id(int numa_node_id);
>>> +void kfd_double_confirm_iommu_support(struct kfd_dev *gpu);
>>>  
>>>  /* Interrupts */
>>>  int kfd_interrupt_init(struct kfd_dev *dev);
>>> @@ -1232,9 +1235,12 @@ static inline int 
>>> kfd_devcgroup_check_permission(struct kfd_dev *kfd)
>>>  #endif
>>>  }
>>>  
>>> +bool kfd_ignore_crat(void);
>>> +
>>>  static inline bool kfd_device_use_iommu_v2(const struct kfd_dev *dev)
>>>  {
>>> -   return dev && dev->device_info->needs_iommu_device;
>>> +   return !kfd_ignore_crat() && dev && dev->use_iommu_v2 &&
>>> +           dev->device_info->needs_iommu_device;
>> I think this could now be simplified:
>>
>>     return dev && dev->use_iommu_v2; 
>>
>> So maybe you don't need this function any more.
> In Renoir, if ACPI CRAT from SBIOS is good, we may still use
> dev->device_info->needs_iommu_device to confirm whether we should go dGPU.

That should be reflected correctly in the dev->use_iommu_v2 flag.
dev->use_iommu_v2 should not be set on any GPU that doesn't have
dev->device_info->needs_iommu_device. See below.


>
>>
>>>  }
>>>  
>>>  /* Debugfs */
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
>>> index 4b29815e9205..8907b5317103 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
>>> @@ -1579,6 +1579,26 @@ int kfd_numa_node_to_apic_id(int numa_node_id)
>>>     return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id));
>>>  }
>>>  
>>> +void kfd_double_confirm_iommu_support(struct kfd_dev *gpu)
>>> +{
>>> +   struct kfd_topology_device *dev;
>>> +
>>> +   unsigned temp = 0;
>>> +
>>> +   down_read(&topology_lock);
>>> +
>>> +   /* The cpu_cores_count and simd_count aren't zero at the same time in
>>> +    * APU node.
>>> +    */
>>> +   list_for_each_entry(dev, &topology_device_list, list)
>>> +           temp |= dev->node_props.cpu_cores_count *
>>> +                   dev->node_props.simd_count;
>> You shouldn't look at all GPUs, only at the GPU currently being
>> initialized. Otherwise all your dGPUs in an APU system will also have
>> use_iommu_v2 == true, which would be confusing.
>>
>> I'd do this in kfd_assign_gpu, because at that point you have access to
>> the kfd_topology_device and the kfd_dev at the same time without having
>> to add another loop.
>>
> Actually, I follow your comment to do it like this, however, we have to set
> the use_iommu_v2 before kfd_iommu_device_init(). kfd_assign_gpu in 
> kfd_topology_add_device()
> is a little late.

I see, I missed that. But we still should make sure we set
gpu->use_iommu_v2 only on devices that have
gpu->device_info->needs_iommu_device, and only if this GPU can be
assigned to an APU node in the topology. So then
kfd_double_confirm_iommu_support would look like this:

+       if (!gpu->device_info->needs_iommu_device)
+               return;
+       down_read(&topology_lock);
+       /* Only use IOMMUv2 if there is an APU topology node with no GPU 
assigned yet.
+        * This GPU will be assigned to it.
+        */
+       list_for_each_entry(dev, &topology_device_list, list)
+               if (dev->node_props.cpu_cores_count &&
+                   dev->node_props.simd_count &&
+                   !dev->gpu) {
+                       gpu->use_iommu_v2 = true;
+                       break;
+               }
+       up_read(&topology_lock);

Regards,
  Felix


>
> Thanks,
> Ray
>
>>             ...
>>      list_for_each_entry(dev, &topology_device_list, list) {
>>                     /* Discrete GPUs need their own topology device list
>>                      * entries. Don't assign them to CPU/APU nodes.
>>                      */
>>                     if (!gpu->device_info->needs_iommu_device &&
>>                         dev->node_props.cpu_cores_count)
>>                             continue;
>>
>>                     if (!dev->gpu && (dev->node_props.simd_count > 0)) {
>>     +                       if (dev->node_props.cpu_cores_count)
>>     +                               dev->use_iommu_v2 = true;
>>                             ...
>>
>> Regards,
>>   Felix
>>
>>
>>> +
>>> +   up_read(&topology_lock);
>>> +
>>> +   gpu->use_iommu_v2 = temp ? true : false;
>>> +}
>>> +
>>>  #if defined(CONFIG_DEBUG_FS)
>>>  
>>>  int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data)
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to