Mixing GPU and CPU atomics does not work unless a strict migration
policy is enforced: memory accessed by GPU atomics must be in device
memory. Enforce a must-be-in-VRAM policy for atomic faults with a retry
loop of 2 attempts; if the retry loop fails, abort the fault.

v2:
 - Only retry migration on atomics
 - Drop the always_migrate_to_vram modparam

Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimi...@intel.com>
Signed-off-by: Matthew Brost <matthew.br...@intel.com>
---
 drivers/gpu/drm/xe/xe_module.c |  3 --
 drivers/gpu/drm/xe/xe_module.h |  1 -
 drivers/gpu/drm/xe/xe_svm.c    | 57 ++++++++++++++++++++++++++--------
 drivers/gpu/drm/xe/xe_svm.h    |  5 ---
 4 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
index 05c7d0ae6d83..1c4dfafbcd0b 100644
--- a/drivers/gpu/drm/xe/xe_module.c
+++ b/drivers/gpu/drm/xe/xe_module.c
@@ -33,9 +33,6 @@ struct xe_modparam xe_modparam = {
 module_param_named(svm_notifier_size, xe_modparam.svm_notifier_size, uint, 
0600);
 MODULE_PARM_DESC(svm_notifier_size, "Set the svm notifier size(in MiB), must 
be power of 2");
 
-module_param_named(always_migrate_to_vram, xe_modparam.always_migrate_to_vram, 
bool, 0444);
-MODULE_PARM_DESC(always_migrate_to_vram, "Always migrate to VRAM on GPU 
fault");
-
 module_param_named_unsafe(force_execlist, xe_modparam.force_execlist, bool, 
0444);
 MODULE_PARM_DESC(force_execlist, "Force Execlist submission");
 
diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
index 84339e509c80..5a3bfea8b7b4 100644
--- a/drivers/gpu/drm/xe/xe_module.h
+++ b/drivers/gpu/drm/xe/xe_module.h
@@ -12,7 +12,6 @@
 struct xe_modparam {
        bool force_execlist;
        bool probe_display;
-       bool always_migrate_to_vram;
        u32 force_vram_bar_size;
        int guc_log_level;
        char *guc_firmware_path;
diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
index 56b18a293bbc..1cc41ce7b684 100644
--- a/drivers/gpu/drm/xe/xe_svm.c
+++ b/drivers/gpu/drm/xe/xe_svm.c
@@ -726,6 +726,35 @@ static int xe_svm_alloc_vram(struct xe_vm *vm, struct 
xe_tile *tile,
 }
 #endif
 
+static bool supports_4K_migration(struct xe_device *xe)
+{
+       if (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
+               return false;
+
+       return true;
+}
+
+static bool xe_svm_range_needs_migrate_to_vram(struct xe_svm_range *range,
+                                              struct xe_vma *vma)
+{
+       struct xe_vm *vm = range_to_vm(&range->base);
+       u64 range_size = xe_svm_range_size(range);
+
+       if (!range->base.flags.migrate_devmem)
+               return false;
+
+       if (xe_svm_range_in_vram(range)) {
+               drm_dbg(&vm->xe->drm, "Range is already in VRAM\n");
+               return false;
+       }
+
+       if (range_size <= SZ_64K && !supports_4K_migration(vm->xe)) {
+               drm_dbg(&vm->xe->drm, "Platform doesn't support SZ_4K range 
migration\n");
+               return false;
+       }
+
+       return true;
+}
 
 /**
  * xe_svm_handle_pagefault() - SVM handle page fault
@@ -750,12 +779,14 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct 
xe_vma *vma,
                        IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
                .check_pages_threshold = IS_DGFX(vm->xe) &&
                        IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? SZ_64K : 0,
+               .vram_only = atomic,
        };
        struct xe_svm_range *range;
        struct drm_gpusvm_range *r;
        struct drm_exec exec;
        struct dma_fence *fence;
        struct xe_tile *tile = gt_to_tile(gt);
+       int migrate_try_count = atomic ? 3 : 1;
        ktime_t end = 0;
        int err;
 
@@ -782,18 +813,21 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct 
xe_vma *vma,
 
        range_debug(range, "PAGE FAULT");
 
-       /* XXX: Add migration policy, for now migrate range once */
-       if (!range->skip_migrate && range->base.flags.migrate_devmem &&
-           xe_svm_range_size(range) >= SZ_64K) {
-               range->skip_migrate = true;
-
+       if (--migrate_try_count >= 0 &&
+           xe_svm_range_needs_migrate_to_vram(range, vma)) {
                err = xe_svm_alloc_vram(vm, tile, range, &ctx);
                if (err) {
-                       drm_dbg(&vm->xe->drm,
-                               "VRAM allocation failed, falling back to "
-                               "retrying fault, asid=%u, errno=%pe\n",
-                               vm->usm.asid, ERR_PTR(err));
-                       goto retry;
+                       if (migrate_try_count || !ctx.vram_only) {
+                               drm_dbg(&vm->xe->drm,
+                                       "VRAM allocation failed, falling back 
to retrying fault, asid=%u, errno=%pe\n",
+                                       vm->usm.asid, ERR_PTR(err));
+                               goto retry;
+                       } else {
+                               drm_err(&vm->xe->drm,
+                                       "VRAM allocation failed, retry count 
exceeded, asid=%u, errno=%pe\n",
+                                       vm->usm.asid, ERR_PTR(err));
+                               return err;
+                       }
                }
        }
 
@@ -843,9 +877,6 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma 
*vma,
        }
        drm_exec_fini(&exec);
 
-       if (xe_modparam.always_migrate_to_vram)
-               range->skip_migrate = false;
-
        dma_fence_wait(fence, false);
        dma_fence_put(fence);
 
diff --git a/drivers/gpu/drm/xe/xe_svm.h b/drivers/gpu/drm/xe/xe_svm.h
index 3d441eb1f7ea..0e1f376a7471 100644
--- a/drivers/gpu/drm/xe/xe_svm.h
+++ b/drivers/gpu/drm/xe/xe_svm.h
@@ -39,11 +39,6 @@ struct xe_svm_range {
         * range. Protected by GPU SVM notifier lock.
         */
        u8 tile_invalidated;
-       /**
-        * @skip_migrate: Skip migration to VRAM, protected by GPU fault handler
-        * locking.
-        */
-       u8 skip_migrate :1;
 };
 
 /**
-- 
2.34.1

Reply via email to