On 12/8/25 16:52, Philip Yang wrote:
> On 2025-12-08 03:40, Christian König wrote:
>> On 12/5/25 22:49, Philip Yang wrote:
>>> An MQD BO in VRAM is accessed via the FB aperture with mtype UC
>>> (uncached). Map it to GART with mtype RW (cached) instead, to reduce
>>> queue switch latency.
>>>
>>> Add helpers amdgpu_ttm_alloc/free_gart_entries.
>>> Add helper amdgpu_ttm_gart_bind_gfx9_mqd_vram to bind VRAM pages
>>> to the GART mapping.
>>>
>>> Add a GART drm_mm_node to the kfd mem obj so the GART entries can
>>> be freed after the MQD is freed.
>>>
>>> Signed-off-by: Philip Yang <[email protected]>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c       | 103 ++++++++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h       |   8 ++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  |   1 +
>>>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |   9 ++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   1 +
>>>   5 files changed, 122 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> index 4f8bc7f35cdc..fc6f4daa9b87 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> @@ -880,6 +880,42 @@ static void amdgpu_ttm_gart_bind_gfx9_mqd(struct amdgpu_device *adev,
>>>       }
>>>   }
>>> +static void amdgpu_ttm_gart_bind_gfx9_mqd_vram(struct amdgpu_device *adev,
>>> +                struct ttm_buffer_object *tbo,
>>> +                struct drm_mm_node *mm_node,
>>> +                uint64_t flags)
>>> +{
>>> +    uint64_t total_pages;
>>> +    int num_xcc = max(1U, adev->gfx.num_xcc_per_xcp);
>>> +    uint64_t page_idx, pages_per_xcc;
>>> +    uint64_t ctrl_flags = flags;
>>> +    int i;
>>> +
>>> +    total_pages = tbo->resource->size >> PAGE_SHIFT;
>>> +
>>> +    amdgpu_gmc_get_vm_pte(adev, NULL, NULL, AMDGPU_VM_MTYPE_NC, &ctrl_flags);
>>> +
>>> +    if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 3))
>>> +        amdgpu_gmc_get_vm_pte(adev, NULL, NULL, AMDGPU_VM_MTYPE_RW, &flags);
>>> +
>>> +    pages_per_xcc = total_pages;
>>> +    do_div(pages_per_xcc, num_xcc);
>>> +
>>> +    for (i = 0, page_idx = 0; i < num_xcc; i++, page_idx += pages_per_xcc) {
>>> +        u64 pa = (tbo->resource->start + page_idx) << PAGE_SHIFT;
>>> +        u64 start_page = mm_node->start + page_idx;
>> Don't use resource->start and mm_node->start directly. Use the resource
>> iterators for that.
> The VRAM resource is allocated with AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS in
> the previous patch, so it is a single block, and the GART entries
> allocated from drm_mm_insert_node_in_range are always a single block as
> well. The MQD size is 32 pages for MI300 and 6 pages for VG10, so the
> contiguous allocation is fine unless VRAM is fragmented too much.
> Alternatively, I can remove the AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS flag in
> this patch and then use the resource iterators to update the GART
> mapping, e.g. along the lines of the sketch below.
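>
> Even for the contiguous case, the block start can be read through a
> cursor instead of resource->start -- a rough, untested sketch, assuming
> the amdgpu_res_cursor helpers from amdgpu_res_cursor.h:
>
>     struct amdgpu_res_cursor cursor;
>     u64 pa;
>
>     /* walk the BO's VRAM resource starting at offset 0 */
>     amdgpu_res_first(tbo->resource, 0, tbo->resource->size, &cursor);
>     /* cursor.start is the byte offset of the first block in VRAM */
>     pa = cursor.start + adev->vm_manager.vram_base_offset;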

It doesn't matter whether the block is contiguous or not; the point is that
you should not touch resource->start nor mm_node->start directly.

That is a deprecated field from TTM and internals of the VRAM manager
backend; neither should be touched here.
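
Untested, but the binding loop could walk the resource with the cursor
helpers instead of indexing from resource->start -- a minimal sketch,
assuming the amdgpu_gart_map_vram_range() helper from earlier in this
series, with gart_start as a hypothetical GART page offset handed back by
the allocator (the per-XCC first-page ctrl_flags handling is omitted for
brevity):

    struct amdgpu_res_cursor cursor;
    u64 dst = gart_start; /* hypothetical: first GART page of the mapping */

    amdgpu_res_first(tbo->resource, 0, tbo->resource->size, &cursor);
    while (cursor.remaining) {
        u64 pa = cursor.start + adev->vm_manager.vram_base_offset;
        u64 npages = cursor.size >> PAGE_SHIFT;

        /* map this (possibly partial) VRAM block into GART */
        amdgpu_gart_map_vram_range(adev, pa, dst, npages, flags, NULL);
        dst += npages;
        amdgpu_res_next(&cursor, cursor.size);
    }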

Regards,
Christian.

>>
>>> +
>>> +        pa += adev->vm_manager.vram_base_offset;
>>> +        amdgpu_gart_map_vram_range(adev, pa, start_page, 1,
>>> +                       flags, NULL);
>>> +
>>> +        amdgpu_gart_map_vram_range(adev, pa + PAGE_SIZE,
>>> +                       start_page + 1,
>>> +                       pages_per_xcc - 1,
>>> +                       ctrl_flags, NULL);
>>> +    }
>>> +}
>>> +
>>>   static void amdgpu_ttm_gart_bind(struct amdgpu_device *adev,
>>>                    struct ttm_buffer_object *tbo,
>>>                    uint64_t flags)
>>> @@ -1017,6 +1053,73 @@ int amdgpu_ttm_alloc_gart(struct ttm_buffer_object *bo)
>>>       return 0;
>>>   }
>>>   +int amdgpu_ttm_alloc_gart_entries(struct amdgpu_device *adev,
>>> +                  struct drm_mm_node *mm_node,
>>> +                  u64 num_pages)
>>> +{
>>> +    struct ttm_resource_manager *man;
>>> +    struct amdgpu_gtt_mgr *mgr;
>>> +    int r;
>>> +
>>> +    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>>> +    mgr = container_of(man, struct amdgpu_gtt_mgr, manager);
>>> +
>>> +    spin_lock(&mgr->lock);
>>> +    r = drm_mm_insert_node_in_range(&mgr->mm, mm_node, num_pages,
>>> +                    0, 0, 0,
>>> +                    adev->gmc.gart_size >> PAGE_SHIFT,
>>> +                    DRM_MM_INSERT_BEST);
>> That belongs in amdgpu_gtt_mgr.c and clearly not here!
> Yes, I will move the helper function to amdgpu_gtt_mgr.c
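>
> Something along these lines in amdgpu_gtt_mgr.c, keeping the same
> drm_mm allocation (the name amdgpu_gtt_mgr_alloc_gart_entries is just a
> placeholder):
>
>     int amdgpu_gtt_mgr_alloc_gart_entries(struct amdgpu_device *adev,
>                                           struct drm_mm_node *node,
>                                           u64 num_pages)
>     {
>         struct amdgpu_gtt_mgr *mgr = &adev->mman.gtt_mgr;
>         int r;
>
>         spin_lock(&mgr->lock);
>         /* place the node anywhere in the [0, gart_size) page range */
>         r = drm_mm_insert_node_in_range(&mgr->mm, node, num_pages,
>                                         0, 0, 0,
>                                         adev->gmc.gart_size >> PAGE_SHIFT,
>                                         DRM_MM_INSERT_BEST);
>         spin_unlock(&mgr->lock);
>         return r;
>     }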
> 
> Regards,
> Philip
>>
>> Regards,
>> Christian.
>>
>>> +    spin_unlock(&mgr->lock);
>>> +    return r;
>>> +}
>>> +
>>> +void amdgpu_ttm_free_gart_entries(struct amdgpu_device *adev,
>>> +                  struct drm_mm_node *mm_node)
>>> +{
>>> +    struct ttm_resource_manager *man;
>>> +    struct amdgpu_gtt_mgr *mgr;
>>> +
>>> +    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>>> +    mgr = container_of(man, struct amdgpu_gtt_mgr, manager);
>>> +
>>> +    spin_lock(&mgr->lock);
>>> +    if (drm_mm_node_allocated(mm_node))
>>> +        drm_mm_remove_node(mm_node);
>>> +    spin_unlock(&mgr->lock);
>>> +}
>>> +
>>> +/*
>>> + * amdgpu_ttm_alloc_gart_vram_bo - Bind VRAM pages to GART mapping
>>> + *
>>> + * Calls amdgpu_ttm_alloc_gart_entries to allocate GART entries dynamically.
>>> + */
>>> +int amdgpu_ttm_alloc_gart_vram_bo(struct amdgpu_bo *abo,
>>> +                  struct drm_mm_node *mm_node,
>>> +                  u64 *gpu_addr)
>>> +{
>>> +    struct ttm_buffer_object *bo = &abo->tbo;
>>> +    struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
>>> +    uint64_t flags;
>>> +    int r;
>>> +
>>> +    /* Only for valid VRAM bo resource */
>>> +    if (bo->resource->start == AMDGPU_BO_INVALID_OFFSET)
>>> +        return 0;
>>> +
>>> +    r = amdgpu_ttm_alloc_gart_entries(adev, mm_node,
>>> +                      amdgpu_bo_ngpu_pages(abo));
>>> +    if (r)
>>> +        return r;
>>> +
>>> +    /* compute PTE flags for this buffer object */
>>> +    flags = amdgpu_ttm_tt_pte_flags(adev, NULL, bo->resource);
>>> +    amdgpu_ttm_gart_bind_gfx9_mqd_vram(adev, bo, mm_node, flags);
>>> +    amdgpu_gart_invalidate_tlb(adev);
>>> +
>>> +    *gpu_addr = mm_node->start << PAGE_SHIFT;
>>> +    return 0;
>>> +}
>>> +
>>>   /*
>>>    * amdgpu_ttm_recover_gart - Rebind GTT pages
>>>    *
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> index 72488124aa59..cb6123358843 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> @@ -185,6 +185,14 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_buffer_entity *entity,
>>>                  u64 k_job_id);
>>>     int amdgpu_ttm_alloc_gart(struct ttm_buffer_object *bo);
>>> +int amdgpu_ttm_alloc_gart_vram_bo(struct amdgpu_bo *abo,
>>> +                  struct drm_mm_node *mm_node,
>>> +                  u64 *gpu_addr);
>>> +int amdgpu_ttm_alloc_gart_entries(struct amdgpu_device *adev,
>>> +                  struct drm_mm_node *mm_node,
>>> +                  u64 num_pages);
>>> +void amdgpu_ttm_free_gart_entries(struct amdgpu_device *adev,
>>> +                  struct drm_mm_node *mm_node);
>>>   void amdgpu_ttm_recover_gart(struct ttm_buffer_object *tbo);
>>>   uint64_t amdgpu_ttm_domain_start(struct amdgpu_device *adev, uint32_t type);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
>>> index f78b249e1a41..00e1e5b30a3a 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
>>> @@ -225,6 +225,7 @@ void kfd_free_mqd_cp(struct mqd_manager *mm, void *mqd,
>>>             struct kfd_mem_obj *mqd_mem_obj)
>>>   {
>>>       if (mqd_mem_obj->mem) {
>>> +        amdgpu_ttm_free_gart_entries(mm->dev->adev, &mqd_mem_obj->mm_node);
>>>           amdgpu_amdkfd_free_kernel_mem(mm->dev->adev, &mqd_mem_obj->mem);
>>>           kfree(mqd_mem_obj);
>>>       } else {
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
>>> index 14123e1a9716..5828220056bd 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
>>> @@ -148,6 +148,15 @@ static struct kfd_mem_obj *allocate_mqd(struct kfd_node *node,
>>>               kfree(mqd_mem_obj);
>>>               return NULL;
>>>           }
>>> +
>>> +        retval = amdgpu_ttm_alloc_gart_vram_bo(mqd_mem_obj->mem,
>>> +                               &mqd_mem_obj->mm_node,
>>> +                               &(mqd_mem_obj->gpu_addr));
>>> +        if (retval) {
>>> +            amdgpu_amdkfd_free_kernel_mem(node->adev, &(mqd_mem_obj->mem));
>>> +            kfree(mqd_mem_obj);
>>> +            return NULL;
>>> +        }
>>>       } else {
>>>           retval = kfd_gtt_sa_allocate(node, sizeof(struct v9_mqd),
>>>                   &mqd_mem_obj);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 29419b3249cf..fdde907836fb 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -252,6 +252,7 @@ struct kfd_mem_obj {
>>>       uint64_t gpu_addr;
>>>       uint32_t *cpu_ptr;
>>>       void *mem;
>>> +    struct drm_mm_node mm_node;
>>>   };
>>>     struct kfd_vmid_info {
> 
