Hi Mario,

first of all, please explicitly loop me in on TTM changes as the maintainer. I 
don't see everything that flies by on dri-devel.

Then, changing the 50% limit is an absolute NO-GO. It's completely irrelevant 
that AI wants to use more; HPC use cases have complained about that for 
decades, but we simply can't do that reliably.

Regards,
Christian.

On 3/20/26 15:34, Mario Limonciello wrote:
> I think there is actually a very easy way to trigger it and it's not obvious 
> that a user messed it up.
> 
> Assume you're on a 128GB system with VRAM set to 512MB.
> 1) Set TTM page limit corresponding to 96GB
> 2) Use uma_carveout sysfs or BIOS to set VRAM to 96GB
> 3) Reboot system
> 4) Now VRAM is 96GB, but the page limit was a module parameter and will be 
> wrong.
> 
> I actually /think/ that the RFC [1] I proposed a few weeks ago could be a 
> good way to prevent this.  By using EFI variable instead, TTM could sanity 
> check anything it reads at startup and save sane values to EFI for the next 
> reboot (if they're insane).
> 
> https://lore.kernel.org/dri-devel/[email protected]/
>  [1]
> 
> On 3/20/2026 9:28 AM, Zhang, Yifan wrote:
>> [AMD Official Use Only - AMD Internal Distribution Only]
>>
>> Yes, I agree. I’ve just been notified that this memory configuration is a 
>> mistake rather than a valid use case. So the fix is low priority for now.
>>
>> -----Original Message-----
>> From: Limonciello, Mario <[email protected]>
>> Sent: Friday, March 20, 2026 11:14 AM
>> To: Zhang, Yifan <[email protected]>; [email protected]
>> Cc: Deucher, Alexander <[email protected]>; Koenig, Christian 
>> <[email protected]>; Limonciello, Mario <[email protected]>; 
>> Yuan, Perry <[email protected]>
>> Subject: Re: [PATCH v2] drm/amdkfd: check system memory when set 
>> apu_prefer_gtt
>>
>>
>>
>> On 3/19/2026 2:32 AM, Yifan Zhang wrote:
>>> The current apu_prefer_gtt setting only checks gtt_size, which could be
>>> set by the user to a value larger than system memory (via the TTM module
>>> parameter pages_limit). E.g. carveout VRAM 32GB, gtt_size 50GB (via
>>> the TTM module parameter pages_limit), system memory 31GB. In that case,
>>> apu_prefer_gtt will be set incorrectly. Take system memory into
>>> account when setting apu_prefer_gtt.
>>>
>>
>> Wouldn't it be cleaner to do this in TTM?  I.e., check whether the 
>> user-supplied pages_limit value is invalid and then show something like:
>>
>> if (user > possible) {
>>       pr_warn("Requested invalid %d pages, limiting to %d pages", user, 
>> possible);
>>       user = possible;
>> }
>>
>> Then we can always trust what we get from TTM.
>>
>>> Signed-off-by: Yifan Zhang <[email protected]>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c       | 2 --
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h       | 4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 6 ++++--
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c          | 7 ++++++-
>>>    4 files changed, 12 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>> index 3bfd79c89df3..a6ee9d9bfafb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>> @@ -170,8 +170,6 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device 
>>> *adev)
>>>        int i;
>>>        int last_valid_bit;
>>>
>>> -     amdgpu_amdkfd_gpuvm_init_mem_limits();
>>> -
>>>        if (adev->kfd.dev) {
>>>                struct kgd2kfd_shared_resources gpu_resources = {
>>>                        .compute_vmid_bitmap =
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>> index cdbab7f8cee8..13cada7da4a9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>> @@ -369,7 +369,7 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct
>>> amdgpu_device *adev, int xcp_id);
>>>
>>>
>>>    #if IS_ENABLED(CONFIG_HSA_AMD)
>>> -void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
>>> +uint64_t amdgpu_amdkfd_gpuvm_init_mem_limits(void);
>>>    void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
>>>                                struct amdgpu_vm *vm);
>>>
>>> @@ -382,7 +382,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo);
>>>    void amdgpu_amdkfd_reserve_system_mem(uint64_t size);
>>>    #else
>>>    static inline
>>> -void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
>>> +uint64_t amdgpu_amdkfd_gpuvm_init_mem_limits(void)
>>>    {
>>>    }
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> index 8a869fe41acd..4fba7d2f34a9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> @@ -109,13 +109,13 @@ static bool reuse_dmamap(struct amdgpu_device *adev, 
>>> struct amdgpu_device *bo_ad
>>>     *  System (TTM + userptr) memory - 15/16th System RAM
>>>     *  TTM memory - 3/8th System RAM
>>>     */
>>> -void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
>>> +uint64_t amdgpu_amdkfd_gpuvm_init_mem_limits(void)
>>>    {
>>>        struct sysinfo si;
>>>        uint64_t mem;
>>>
>>>        if (kfd_mem_limit.max_system_mem_limit)
>>> -             return;
>>> +             return kfd_mem_limit.max_system_mem_limit;
>>>
>>>        si_meminfo(&si);
>>>        mem = si.totalram - si.totalhigh;
>>> @@ -132,6 +132,8 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
>>>        pr_debug("Kernel memory limit %lluM, TTM limit %lluM\n",
>>>                (kfd_mem_limit.max_system_mem_limit >> 20),
>>>                (kfd_mem_limit.max_ttm_mem_limit >> 20));
>>> +
>>> +     return kfd_mem_limit.max_system_mem_limit;
>>>    }
>>>
>>>    void amdgpu_amdkfd_reserve_system_mem(uint64_t size) diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> index 714fd8d12ca5..df98ece071e1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> @@ -2071,6 +2071,7 @@ static void amdgpu_ttm_buffer_entity_fini(struct 
>>> amdgpu_gtt_mgr *mgr,
>>>    int amdgpu_ttm_init(struct amdgpu_device *adev)
>>>    {
>>>        uint64_t gtt_size;
>>> +     uint64_t max_system_mem_limit;
>>>        int r;
>>>
>>>        dma_set_max_seg_size(adev->dev, UINT_MAX); @@ -2210,8 +2211,12 @@
>>> int amdgpu_ttm_init(struct amdgpu_device *adev)
>>>        dev_info(adev->dev, " %uM of GTT memory ready.\n",
>>>                 (unsigned int)(gtt_size / (1024 * 1024)));
>>>
>>> +
>>> +     max_system_mem_limit = amdgpu_amdkfd_gpuvm_init_mem_limits();
>>> +
>>>        if (adev->flags & AMD_IS_APU) {
>>> -             if (adev->gmc.real_vram_size < gtt_size)
>>> +             if (adev->gmc.real_vram_size < gtt_size &&
>>> +                     adev->gmc.real_vram_size < max_system_mem_limit)
>>>                        adev->apu_prefer_gtt = true;
>>>        }
>>>
>>
> 

Reply via email to