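The default ttm_tt_pages_limit is 1/2 of system memory, so on APUs
KFD memory reservations are prone to failing with out-of-memory errors
under such a configuration. Allow oversubscription on APUs: when a
memory limit is exceeded, print a rate-limited warning instead of
returning -ENOMEM.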

Signed-off-by: Lang Yu <lang...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c       |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h       |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12 +++++++++---
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 3295838e9a1d..c01c6f3ab562 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -167,7 +167,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
        int i;
        int last_valid_bit;
 
-       amdgpu_amdkfd_gpuvm_init_mem_limits();
+       amdgpu_amdkfd_gpuvm_init_mem_limits(adev);
 
        if (adev->kfd.dev) {
                struct kgd2kfd_shared_resources gpu_resources = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 1de021ebdd46..13284dbd8c58 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -363,7 +363,7 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device 
*adev, int xcp_id);
 
 
 #if IS_ENABLED(CONFIG_HSA_AMD)
-void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
+void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev);
 void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
                                struct amdgpu_vm *vm);
 
@@ -376,7 +376,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo);
 void amdgpu_amdkfd_reserve_system_mem(uint64_t size);
 #else
 static inline
-void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
+void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev)
 {
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 7eb5afcc4895..a3e623a320b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -60,6 +60,7 @@ static struct {
        int64_t system_mem_used;
        int64_t ttm_mem_used;
        spinlock_t mem_limit_lock;
+       bool alow_oversubscribe;
 } kfd_mem_limit;
 
 static const char * const domain_bit_to_string[] = {
@@ -110,7 +111,7 @@ static bool reuse_dmamap(struct amdgpu_device *adev, struct 
amdgpu_device *bo_ad
  *  System (TTM + userptr) memory - 15/16th System RAM
  *  TTM memory - 3/8th System RAM
  */
-void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
+void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev)
 {
        struct sysinfo si;
        uint64_t mem;
@@ -130,6 +131,7 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
                kfd_mem_limit.max_system_mem_limit -= AMDGPU_RESERVE_MEM_LIMIT;
 
        kfd_mem_limit.max_ttm_mem_limit = ttm_tt_pages_limit() << PAGE_SHIFT;
+       kfd_mem_limit.alow_oversubscribe = !!(adev->flags & AMD_IS_APU);
        pr_debug("Kernel memory limit %lluM, TTM limit %lluM\n",
                (kfd_mem_limit.max_system_mem_limit >> 20),
                (kfd_mem_limit.max_ttm_mem_limit >> 20));
@@ -221,8 +223,12 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
             kfd_mem_limit.max_ttm_mem_limit) ||
            (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed >
             vram_size - reserved_for_pt - 
atomic64_read(&adev->vram_pin_size))) {
-               ret = -ENOMEM;
-               goto release;
+               if (kfd_mem_limit.alow_oversubscribe) {
+                       pr_warn_ratelimited("Memory is getting 
oversubscried.\n");
+               } else {
+                       ret = -ENOMEM;
+                       goto release;
+               }
        }
 
        /* Update memory accounting by decreasing available system
-- 
2.25.1

Reply via email to