Applied. Thanks! Alex
On Tue, Mar 24, 2026 at 10:58 PM Kuehling, Felix <[email protected]> wrote: > > > On 2026-03-23 00:28, Donet Tom wrote: > > For gfxV9, due to a hardware bug ("based on the comments in the code > > here [1]"), the control stack of a user-mode compute queue must be > > allocated immediately after the page boundary of its regular MQD buffer. > > To handle this, we allocate an enlarged MQD buffer where the first page > > is used as the MQD and the remaining pages store the control stack. > > Although these regions share the same BO, they require different memory > > types: the MQD must be UC (uncached), while the control stack must be > > NC (non-coherent), matching the behavior when the control stack is > > allocated in user space. > > > > This logic works correctly on systems where the CPU page size matches > > the GPU page size (4K). However, the current implementation aligns both > > the MQD and the control stack to the CPU PAGE_SIZE. On systems with a > > larger CPU page size, the entire first CPU page is marked UC—even though > > that page may contain multiple GPU pages. The GPU treats the second 4K > > GPU page inside that CPU page as part of the control stack, but it is > > incorrectly mapped as UC. > > > > This patch fixes the issue by aligning both the MQD and control stack > > sizes to the GPU page size (4K). The first 4K page is correctly marked > > as UC for the MQD, and the remaining GPU pages are marked NC for the > > control stack. This ensures proper memory type assignment on systems > > with larger CPU page sizes. 
> > > > [1]: > > https://elixir.bootlin.com/linux/v6.18/source/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c#L118 > > > > Signed-off-by: Donet Tom <[email protected]> > > Acked-by: Felix Kuehling <[email protected]> > > > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c | 44 +++++++++++++++++++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h | 2 + > > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 16 ++----- > > .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 23 ++++++---- > > 4 files changed, 64 insertions(+), 21 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c > > index ec911dce345f..4d884180cf61 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c > > @@ -403,6 +403,50 @@ void amdgpu_gart_map_vram_range(struct amdgpu_device > > *adev, uint64_t pa, > > drm_dev_exit(idx); > > } > > > > +/** > > + * amdgpu_gart_map_gfx9_mqd - map mqd and ctrl_stack dma_addresses into > > GART entries > > + * > > + * @adev: amdgpu_device pointer > > + * @offset: offset into the GPU's gart aperture > > + * @pages: number of pages to bind > > + * @dma_addr: DMA addresses of pages > > + * @flags: page table entry flags > > + * > > + * Map the MQD and control stack addresses into GART entries with the > > correct > > + * memory types on gfxv9. The MQD occupies the first 4KB and is followed by > > + * the control stack. The MQD uses UC (uncached) memory, while the control > > stack > > + * uses NC (non-coherent) memory. 
> > + */ > > +void amdgpu_gart_map_gfx9_mqd(struct amdgpu_device *adev, uint64_t offset, > > + int pages, dma_addr_t *dma_addr, uint64_t flags) > > +{ > > + uint64_t page_base; > > + unsigned int i, j, t; > > + int idx; > > + uint64_t ctrl_flags = AMDGPU_PTE_MTYPE_VG10(flags, AMDGPU_MTYPE_NC); > > + void *dst; > > + > > + if (!adev->gart.ptr) > > + return; > > + > > + if (!drm_dev_enter(adev_to_drm(adev), &idx)) > > + return; > > + > > + t = offset / AMDGPU_GPU_PAGE_SIZE; > > + dst = adev->gart.ptr; > > + for (i = 0; i < pages; i++) { > > + page_base = dma_addr[i]; > > + for (j = 0; j < AMDGPU_GPU_PAGES_IN_CPU_PAGE; j++, t++) { > > + if ((i == 0) && (j == 0)) > > + amdgpu_gmc_set_pte_pde(adev, dst, t, > > page_base, flags); > > + else > > + amdgpu_gmc_set_pte_pde(adev, dst, t, > > page_base, ctrl_flags); > > + page_base += AMDGPU_GPU_PAGE_SIZE; > > + } > > + } > > + drm_dev_exit(idx); > > +} > > + > > /** > > * amdgpu_gart_bind - bind pages into the gart page table > > * > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h > > index d3118275ddae..6ebd2da32ea6 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h > > @@ -62,6 +62,8 @@ void amdgpu_gart_unbind(struct amdgpu_device *adev, > > uint64_t offset, > > void amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset, > > int pages, dma_addr_t *dma_addr, uint64_t flags, > > void *dst); > > +void amdgpu_gart_map_gfx9_mqd(struct amdgpu_device *adev, uint64_t offset, > > + int pages, dma_addr_t *dma_addr, uint64_t flags); > > void amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t offset, > > int pages, dma_addr_t *dma_addr, uint64_t flags); > > void amdgpu_gart_map_vram_range(struct amdgpu_device *adev, uint64_t pa, > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > > index 67983955a124..e086eb1d2b24 100644 > > --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > > @@ -855,25 +855,15 @@ static void amdgpu_ttm_gart_bind_gfx9_mqd(struct > > amdgpu_device *adev, > > int num_xcc = max(1U, adev->gfx.num_xcc_per_xcp); > > uint64_t page_idx, pages_per_xcc; > > int i; > > - uint64_t ctrl_flags = AMDGPU_PTE_MTYPE_VG10(flags, AMDGPU_MTYPE_NC); > > > > pages_per_xcc = total_pages; > > do_div(pages_per_xcc, num_xcc); > > > > for (i = 0, page_idx = 0; i < num_xcc; i++, page_idx += > > pages_per_xcc) { > > - /* MQD page: use default flags */ > > - amdgpu_gart_bind(adev, > > + amdgpu_gart_map_gfx9_mqd(adev, > > gtt->offset + (page_idx << PAGE_SHIFT), > > - 1, &gtt->ttm.dma_address[page_idx], flags); > > - /* > > - * Ctrl pages - modify the memory type to NC (ctrl_flags) from > > - * the second page of the BO onward. > > - */ > > - amdgpu_gart_bind(adev, > > - gtt->offset + ((page_idx + 1) << PAGE_SHIFT), > > - pages_per_xcc - 1, > > - &gtt->ttm.dma_address[page_idx + 1], > > - ctrl_flags); > > + pages_per_xcc, > > &gtt->ttm.dma_address[page_idx], > > + flags); > > } > > } > > > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > index dcf4bbfa641b..ff0e483514da 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > @@ -42,9 +42,16 @@ static uint64_t mqd_stride_v9(struct mqd_manager *mm, > > struct queue_properties *q) > > { > > if (mm->dev->kfd->cwsr_enabled && > > - q->type == KFD_QUEUE_TYPE_COMPUTE) > > - return ALIGN(q->ctl_stack_size, PAGE_SIZE) + > > - ALIGN(sizeof(struct v9_mqd), PAGE_SIZE); > > + q->type == KFD_QUEUE_TYPE_COMPUTE) { > > + > > + /* On gfxv9, the MQD resides in the first 4K page, > > + * followed by the control stack. Align both to > > + * AMDGPU_GPU_PAGE_SIZE to maintain the required 4K boundary. 
> > + */ > > + > > + return ALIGN(ALIGN(q->ctl_stack_size, AMDGPU_GPU_PAGE_SIZE) + > > + ALIGN(sizeof(struct v9_mqd), AMDGPU_GPU_PAGE_SIZE), > > PAGE_SIZE); > > + } > > > > return mm->mqd_size; > > } > > @@ -148,8 +155,8 @@ static struct kfd_mem_obj *allocate_mqd(struct > > mqd_manager *mm, > > if (!mqd_mem_obj) > > return NULL; > > retval = amdgpu_amdkfd_alloc_kernel_mem(node->adev, > > - (ALIGN(q->ctl_stack_size, PAGE_SIZE) + > > - ALIGN(sizeof(struct v9_mqd), PAGE_SIZE)) * > > + (ALIGN(ALIGN(q->ctl_stack_size, AMDGPU_GPU_PAGE_SIZE) > > + > > + ALIGN(sizeof(struct v9_mqd), AMDGPU_GPU_PAGE_SIZE), > > PAGE_SIZE)) * > > NUM_XCC(node->xcc_mask), > > mqd_on_vram(node->adev) ? AMDGPU_GEM_DOMAIN_VRAM : > > AMDGPU_GEM_DOMAIN_GTT, > > @@ -357,7 +364,7 @@ static int get_wave_state(struct mqd_manager *mm, void > > *mqd, > > struct kfd_context_save_area_header header; > > > > /* Control stack is located one page after MQD. */ > > - void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); > > + void *mqd_ctl_stack = (void *)((uintptr_t)mqd + AMDGPU_GPU_PAGE_SIZE); > > > > m = get_mqd(mqd); > > > > @@ -394,7 +401,7 @@ static void checkpoint_mqd(struct mqd_manager *mm, void > > *mqd, void *mqd_dst, voi > > { > > struct v9_mqd *m; > > /* Control stack is located one page after MQD. */ > > - void *ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); > > + void *ctl_stack = (void *)((uintptr_t)mqd + AMDGPU_GPU_PAGE_SIZE); > > > > m = get_mqd(mqd); > > > > @@ -440,7 +447,7 @@ static void restore_mqd(struct mqd_manager *mm, void > > **mqd, > > *gart_addr = addr; > > > > /* Control stack is located one page after MQD. */ > > - ctl_stack = (void *)((uintptr_t)*mqd + PAGE_SIZE); > > + ctl_stack = (void *)((uintptr_t)*mqd + AMDGPU_GPU_PAGE_SIZE); > > memcpy(ctl_stack, ctl_stack_src, ctl_stack_size); > > > > m->cp_hqd_pq_doorbell_control =
