To reduce queue switch latency, allocate GART entries and map the MQD
with MTYPE_RW and the control stack with MTYPE_NC through GART. Before
invoking get_wave_state, the debugger unmaps the runlist; at that point
the CP writes back the TC, so the debugger reads the updated state.
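The per-XCC layout this implies can be sketched as follows (illustrative
userspace snippet, not part of this patch; mtype_for_page() is a made-up
helper): page 0 of each XCC slice holds the MQD and is mapped MTYPE_RW,
while the remaining pages hold the control stack and are mapped MTYPE_NC.

#include <stdio.h>
#include <stdint.h>

enum mtype { MTYPE_RW, MTYPE_NC };

/* made-up helper: which MTYPE a given page of the MQD BO gets */
static enum mtype mtype_for_page(uint64_t page, uint64_t pages_per_xcc)
{
	/* the first page of each XCC slice holds the MQD itself */
	return (page % pages_per_xcc) == 0 ? MTYPE_RW : MTYPE_NC;
}

int main(void)
{
	uint64_t total_pages = 8, num_xcc = 2;
	uint64_t pages_per_xcc = total_pages / num_xcc;
	uint64_t p;

	for (p = 0; p < total_pages; p++)
		printf("page %llu -> %s\n", (unsigned long long)p,
		       mtype_for_page(p, pages_per_xcc) == MTYPE_RW ?
		       "MTYPE_RW (MQD)" : "MTYPE_NC (ctrl stack)");
	return 0;
}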
Add a GART mm_node to the kfd mem obj so the GART entries can be freed
when the MQD mem obj is freed. Use a resource cursor to walk the VRAM
resource, which may span multiple blocks, and use cursor_gart to walk
the GART entries.

Signed-off-by: Philip Yang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c      | 74 +++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h      |  6 +-
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c |  2 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c  | 12 +++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  1 +
 5 files changed, 93 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index f27ffe64aafa..92a21af789ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -875,6 +875,80 @@ static void amdgpu_ttm_gart_bind_gfx9_mqd(struct amdgpu_device *adev,
 	}
 }
 
+/*
+ * Same function and MQD description as amdgpu_ttm_gart_bind_gfx9_mqd, except
+ * this is for an MQD on a VRAM BO and uses dynamically allocated GART entries.
+ */
+int amdgpu_ttm_gart_bind_gfx9_mqd_vram(struct amdgpu_device *adev,
+				       struct amdgpu_bo *abo,
+				       struct drm_mm_node *mm_node,
+				       u64 *gpu_addr)
+{
+	struct ttm_buffer_object *bo = &abo->tbo;
+	int num_xcc = max(1U, adev->gfx.num_xcc_per_xcp);
+	u64 page_idx, pages_per_xcc;
+	struct amdgpu_res_cursor cursor_gart;
+	struct amdgpu_res_cursor cursor;
+	u64 ctrl_flags;
+	u64 total_pages;
+	u64 flags;
+	int i, r;
+
+	r = amdgpu_gtt_mgr_alloc_entries(&adev->mman.gtt_mgr, mm_node,
+					 amdgpu_bo_ngpu_pages(abo), 0);
+	if (r)
+		return r;
+
+	/* compute PTE flags for this buffer object */
+	flags = amdgpu_ttm_tt_pte_flags(adev, NULL, bo->resource);
+	ctrl_flags = flags;
+	amdgpu_gmc_get_vm_pte(adev, NULL, NULL, AMDGPU_VM_MTYPE_RW, &flags);
+	amdgpu_gmc_get_vm_pte(adev, NULL, NULL, AMDGPU_VM_MTYPE_NC, &ctrl_flags);
+
+	total_pages = bo->base.size >> PAGE_SHIFT;
+	pages_per_xcc = total_pages;
+	do_div(pages_per_xcc, num_xcc);
+
+	amdgpu_res_first(NULL, mm_node->start, total_pages, &cursor_gart);
+	amdgpu_res_first(bo->resource, 0, bo->resource->size, &cursor);
+
+	for (i = 0, page_idx = 0; i < num_xcc; i++, page_idx += pages_per_xcc) {
+		u64 start_page;
+		u64 npages, n;
+		u64 pa;
+
+		/* MQD page: use flags MTYPE_RW */
+		start_page = cursor_gart.start;
+		pa = cursor.start + adev->vm_manager.vram_base_offset;
+		n = 1;
+		amdgpu_gart_map_vram_range(adev, pa, start_page, n,
+					   flags, NULL);
+
+		/* Ctrl stack pages: modify the memory type to NC */
+		npages = pages_per_xcc - 1;
+		while (npages) {
+			amdgpu_res_next(&cursor_gart, n);
+			amdgpu_res_next(&cursor, n * PAGE_SIZE);
+
+			start_page = cursor_gart.start;
+			pa = cursor.start + adev->vm_manager.vram_base_offset;
+			n = min3(cursor.size / PAGE_SIZE, cursor_gart.size, npages);
+
+			amdgpu_gart_map_vram_range(adev, pa, start_page, n,
+						   ctrl_flags, NULL);
+
+			npages -= n;
+		}
+
+		amdgpu_res_next(&cursor_gart, n);
+		amdgpu_res_next(&cursor, n * PAGE_SIZE);
+	}
+
+	amdgpu_gart_invalidate_tlb(adev);
+	*gpu_addr = mm_node->start << PAGE_SHIFT;
+	return 0;
+}
+
 static void amdgpu_ttm_gart_bind(struct amdgpu_device *adev,
 				 struct ttm_buffer_object *tbo,
 				 uint64_t flags)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index 143201ecea3f..3751f010f14a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -140,7 +140,6 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev);
 
 bool amdgpu_gtt_mgr_has_gart_addr(struct ttm_resource *mem);
 void amdgpu_gtt_mgr_recover(struct amdgpu_gtt_mgr *mgr);
-
 int amdgpu_gtt_mgr_alloc_entries(struct amdgpu_gtt_mgr *mgr,
 				 struct drm_mm_node *mm_node,
 				 u64 num_pages,
@@ -189,8 +188,11 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_buffer_entity *entity,
 			struct dma_resv *resv,
 			struct dma_fence **f,
 			u64 k_job_id);
-
 int amdgpu_ttm_alloc_gart(struct ttm_buffer_object *bo);
+int amdgpu_ttm_gart_bind_gfx9_mqd_vram(struct amdgpu_device *adev,
+				       struct amdgpu_bo *abo,
+				       struct drm_mm_node *mm_node,
+				       u64 *gpu_addr);
 void amdgpu_ttm_recover_gart(struct ttm_buffer_object *tbo);
 uint64_t amdgpu_ttm_domain_start(struct amdgpu_device *adev, uint32_t type);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index f78b249e1a41..edb72f4ef82d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -225,6 +225,8 @@ void kfd_free_mqd_cp(struct mqd_manager *mm, void *mqd,
 		     struct kfd_mem_obj *mqd_mem_obj)
 {
 	if (mqd_mem_obj->mem) {
+		amdgpu_gtt_mgr_free_entries(&mm->dev->adev->mman.gtt_mgr,
+					    &mqd_mem_obj->mm_node);
 		amdgpu_amdkfd_free_kernel_mem(mm->dev->adev, &mqd_mem_obj->mem);
 		kfree(mqd_mem_obj);
 	} else {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index d867dccae675..1893dabb823a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -160,6 +160,18 @@ static struct kfd_mem_obj *allocate_mqd(struct kfd_node *node,
 			kfree(mqd_mem_obj);
 			return NULL;
 		}
+
+		if (mqd_on_vram(node->adev)) {
+			retval = amdgpu_ttm_gart_bind_gfx9_mqd_vram(node->adev,
+					mqd_mem_obj->mem,
+					&(mqd_mem_obj->mm_node),
+					&(mqd_mem_obj->gpu_addr));
+			if (retval) {
+				amdgpu_amdkfd_free_kernel_mem(node->adev, &(mqd_mem_obj->mem));
+				kfree(mqd_mem_obj);
+				return NULL;
+			}
+		}
 	} else {
 		retval = kfd_gtt_sa_allocate(node, sizeof(struct v9_mqd),
 					     &mqd_mem_obj);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 042b8e747b5b..7c95cc9e2f0d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -253,6 +253,7 @@ struct kfd_mem_obj {
 	uint64_t gpu_addr;
 	uint32_t *cpu_ptr;
 	void *mem;
+	struct drm_mm_node mm_node;
 };
 
 struct kfd_vmid_info {
-- 
2.50.1
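For reference, a minimal userspace sketch of the chunked two-cursor walk
the new function performs (mocks only: struct chunk and map_range() stand
in for amdgpu_res_cursor and amdgpu_gart_map_vram_range(); 4K pages
assumed). The VRAM backing may span several blocks while the freshly
allocated GART range is contiguous, so each mapping call covers the
smaller of the pages left in the current VRAM block and the pages still
to map.

#include <stdio.h>
#include <stdint.h>

struct chunk { uint64_t pa; uint64_t npages; };	/* one VRAM block */

/* stand-in for amdgpu_gart_map_vram_range() */
static void map_range(uint64_t pa, uint64_t gart_page, uint64_t n,
		      const char *mtype)
{
	printf("map pa=0x%llx -> gart page %llu, %llu page(s), %s\n",
	       (unsigned long long)pa, (unsigned long long)gart_page,
	       (unsigned long long)n, mtype);
}

int main(void)
{
	/* one XCC slice of 8 pages backed by two VRAM blocks of 3 + 5 */
	struct chunk vram[] = { { 0x100000, 3 }, { 0x300000, 5 } };
	uint64_t gart_page = 4096;	/* first GART entry from the mgr */
	uint64_t npages = 8, i = 0, off = 0;

	/* MQD page: MTYPE_RW */
	map_range(vram[0].pa, gart_page++, 1, "MTYPE_RW");
	off = 1;
	npages--;

	/* control stack pages: MTYPE_NC, chunked per VRAM block */
	while (npages) {
		uint64_t left = vram[i].npages - off;
		uint64_t n = left < npages ? left : npages;

		map_range(vram[i].pa + (off << 12), gart_page, n,
			  "MTYPE_NC");
		gart_page += n;
		off += n;
		npages -= n;
		if (off == vram[i].npages) {	/* next VRAM block */
			i++;
			off = 0;
		}
	}
	return 0;
}

Running it prints one RW mapping for the MQD page followed by NC mappings
of 2 and 5 pages, showing how the walk crosses a VRAM block boundary.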
