On Fri, 2025-04-25 at 00:39 -0700, Matthew Brost wrote:
> On Fri, Apr 25, 2025 at 09:18:19AM +0200, Thomas Hellström wrote:
> > On Thu, 2025-04-24 at 11:03 -0700, Matthew Brost wrote:
> > > On Thu, Apr 24, 2025 at 04:39:21PM +0200, Thomas Hellström wrote:
> > > > On Tue, 2025-04-22 at 10:04 -0700, Matthew Brost wrote:
> > > > > Mixing GPU and CPU atomics does not work unless a strict migration
> > > > > policy is enforced: GPU atomics must be in device memory. Enforce a
> > > > > must-be-in-VRAM policy with a retry loop of 2 attempts; if the retry
> > > > > loop fails, abort the fault.
> > > > > 
> > > > > v2:
> > > > >  - Only retry migration on atomics
> > > > >  - Drop always migrate modparam
> > > > > v3:
> > > > >  - Only set vram_only on DGFX (Himal)
> > > > >  - Bail on get_pages failure if vram_only and retry count exceeded
> > > > >    (Himal)
> > > > >  - s/vram_only/devmem_only
> > > > >  - Update xe_svm_range_is_valid to accept devmem_only argument
> > > > > v4:
> > > > >  - Fix logic bug on get_pages failure
> > > > > 
> > > > > Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimi...@intel.com>
> > > > > Signed-off-by: Matthew Brost <matthew.br...@intel.com>
> > > > > ---
> > > > >  drivers/gpu/drm/xe/xe_module.c |  3 --
> > > > >  drivers/gpu/drm/xe/xe_module.h |  1 -
> > > > >  drivers/gpu/drm/xe/xe_svm.c    | 89 +++++++++++++++++++++++++---------
> > > > >  drivers/gpu/drm/xe/xe_svm.h    |  5 --
> > > > >  4 files changed, 65 insertions(+), 33 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
> > > > > index 05c7d0ae6d83..1c4dfafbcd0b 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_module.c
> > > > > +++ b/drivers/gpu/drm/xe/xe_module.c
> > > > > @@ -33,9 +33,6 @@ struct xe_modparam xe_modparam = {
> > > > >  module_param_named(svm_notifier_size, xe_modparam.svm_notifier_size, uint, 0600);
> > > > >  MODULE_PARM_DESC(svm_notifier_size, "Set the svm notifier size(in MiB), must be power of 2");
> > > > > 
> > > > > -module_param_named(always_migrate_to_vram, xe_modparam.always_migrate_to_vram, bool, 0444);
> > > > > -MODULE_PARM_DESC(always_migrate_to_vram, "Always migrate to VRAM on GPU fault");
> > > > > -
> > > > >  module_param_named_unsafe(force_execlist, xe_modparam.force_execlist, bool, 0444);
> > > > >  MODULE_PARM_DESC(force_execlist, "Force Execlist submission");
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
> > > > > index 84339e509c80..5a3bfea8b7b4 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_module.h
> > > > > +++ b/drivers/gpu/drm/xe/xe_module.h
> > > > > @@ -12,7 +12,6 @@
> > > > >  struct xe_modparam {
> > > > >  	bool force_execlist;
> > > > >  	bool probe_display;
> > > > > -	bool always_migrate_to_vram;
> > > > >  	u32 force_vram_bar_size;
> > > > >  	int guc_log_level;
> > > > >  	char *guc_firmware_path;
> > > > > diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
> > > > > index 890f6b2f40e9..f749ae367a8f 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_svm.c
> > > > > +++ b/drivers/gpu/drm/xe/xe_svm.c
> > > > > @@ -650,9 +650,11 @@ void xe_svm_fini(struct xe_vm *vm)
> > > > >  }
> > > > > 
> > > > >  static bool xe_svm_range_is_valid(struct xe_svm_range *range,
> > > > > -				  struct xe_tile *tile)
> > > > > +				  struct xe_tile *tile,
> > > > > +				  bool devmem_only)
> > > > >  {
> > > > > -	return (range->tile_present & ~range->tile_invalidated) & BIT(tile->id);
> > > > > +	return ((range->tile_present & ~range->tile_invalidated) & BIT(tile->id))
> > > > > +		&& (!devmem_only || range->base.flags.migrate_devmem);
> > > > >  }
> > > > 
> > > > So let's say devmem_only is true here, and
> > > > range->base.flags.migrate_devmem is false. Wouldn't that mean the
> > > > range is unusable and needs to be freed and re-allocated?
> > > 
> > > This is a typo; it should be s/migrate_devmem/has_devmem_pages.
> > > 
> > > This translates to:
> > > 
> > > Either devmem_only is not required or we have devmem pages with a
> > > valid mapping.
> > > 
> > > If migrate_devmem is false and devmem_only is true, that is actually a
> > > fatal error; we should have a check for that and kill the fault. An
> > > example of this would be a shared mapping, which cannot be migrated to
> > > devmem.
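Something along the lines of the below, early in xe_svm_handle_pagefault(),
is what I'd picture. Just a sketch to illustrate the idea - the error code
and the message wording are placeholders, not a suggestion for the final
form:

	/*
	 * Sketch only: if the fault requires devmem but the range can never
	 * be migrated there (for example a shared mapping), retrying is
	 * pointless, so fail the fault instead of looping.
	 */
	if (ctx.devmem_only && !range->base.flags.migrate_devmem) {
		drm_dbg(&vm->xe->drm,
			"Atomic access requires VRAM but range can't migrate, asid=%u\n",
			vm->usm.asid);
		return -EACCES;
	}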
> > > > Also, another thing going back to older code: it seems like
> > > > range->tile_invalidated is protected by the notifier lock, so
> > > > shouldn't we assert that to be held in the function? It seems not to
> > > > be held further below:
> > > 
> > > Yeah, technically to get a stable value we'd need the notifier lock, but
> > > this is an opportunistic check - at worst, if we read a valid range we
> > > skip the page faults and will immediately get another page fault. So we
> > > could take the notifier lock here, but I don't think this is strictly
> > > required. Let me know what you think here.
> > 
> > The problem with this is that the code gets harder to maintain and
> > understand.
> 
> Agree.
> 
> > A new reader would probably first react over the lockless read, and then
> > why there are no memory barriers, and then what happens if the page-fault
> > was marked as resolved without actually resolving it.
> > 
> > So IMO if we do opportunistic tests to opt out of locking (which is
> > discouraged in the drm locking guidelines,
> > https://blog.ffwll.ch/2022/08/locking-hierarchy.html)
> > we should definitely add separate functions for that with extensive
> > docs and READ_ONCE() annotation.
> 
> A lock here doesn't actually gain us anything, though, as the state can
> immediately change after lock release, triggering another fault. If you
> agree, I'll go with READ_ONCE() and add comments in the code indicating
> it's an opportunistic check.

Yeah. Ideally I think a well documented xe_svm_range_check_valid() (or
perhaps a better name), and an xe_svm_range_is_valid() with an assert, in
the spirit of mmu_interval_check_retry() (lockless, documented) and
mmu_interval_read_retry() (requires lock).
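Roughly like the below - completely untested, and the exact lock expression
and the has_devmem_pages flag are from memory, so treat those as
placeholders rather than the final naming:

/*
 * Lockless, opportunistic variant: may race with invalidation, so a "true"
 * result only means it is worth skipping the fault. A stale read is
 * harmless since it just leads to another page fault.
 */
static bool xe_svm_range_check_valid(struct xe_svm_range *range,
				     struct xe_tile *tile,
				     bool devmem_only)
{
	u8 tile_present = READ_ONCE(range->tile_present);
	u8 tile_invalidated = READ_ONCE(range->tile_invalidated);

	return ((tile_present & ~tile_invalidated) & BIT(tile->id)) &&
		(!devmem_only || range->base.flags.has_devmem_pages);
}

/* Stable variant, in the spirit of mmu_interval_read_retry(). */
static bool xe_svm_range_is_valid(struct xe_svm_range *range,
				  struct xe_tile *tile,
				  bool devmem_only)
{
	struct xe_vm *vm = range_to_vm(&range->base);

	lockdep_assert_held(&vm->svm.gpusvm.notifier_lock);

	return ((range->tile_present & ~range->tile_invalidated) &
		BIT(tile->id)) &&
		(!devmem_only || range->base.flags.has_devmem_pages);
}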
> > But also, think about whether this is really worth sacrificing
> > readability instead of actually relying on alloc_vram() and get_pages()
> > exiting early if everything looks ok?
> 
> alloc_vram() as-is is very expensive; get_pages() less so, but it still
> costs CPU cycles.
> 
> The idea here is to short-circuit "page fault storms," where many EUs
> access the same page simultaneously. If I recall correctly, this was a
> significant issue on PVC - so much so that we are considering firmware and
> hardware changes going forward. We should try to mitigate these
> conditions in the page fault handler, if possible.

Yes, I think that's a perfectly legit case. Perhaps in the future we could
even short-circuit pf storms in the G2H handler?

/Thomas

> 
> Matt
> 
> > > > > 
> > > > >  #if IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR)
> > > > > @@ -726,6 +728,35 @@ static int xe_svm_alloc_vram(struct xe_vm *vm, struct xe_tile *tile,
> > > > >  }
> > > > >  #endif
> > > > > 
> > > > > +static bool supports_4K_migration(struct xe_device *xe)
> > > > > +{
> > > > > +	if (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
> > > > > +		return false;
> > > > > +
> > > > > +	return true;
> > > > > +}
> > > > 
> > > > Do we have any hardware that supports pagefaults but not 4K VRAM
> > > > pages?
> > > 
> > > PVC
> > 
> > OK, I was under the impression that PVC actually supported 4K pages.
> > But perhaps there was a bug encountered while implementing that.
> > 
> > > > > +
> > > > > +static bool xe_svm_range_needs_migrate_to_vram(struct xe_svm_range *range,
> > > > > +					       struct xe_vma *vma)
> > > > > +{
> > > > > +	struct xe_vm *vm = range_to_vm(&range->base);
> > > > > +	u64 range_size = xe_svm_range_size(range);
> > > > > +
> > > > > +	if (!range->base.flags.migrate_devmem)
> > > > > +		return false;
> > > > > +
> > > > > +	if (xe_svm_range_in_vram(range)) {
> > > > > +		drm_dbg(&vm->xe->drm, "Range is already in VRAM\n");
> > > > > +		return false;
> > > > > +	}
> > > > > +
> > > > > +	if (range_size <= SZ_64K && !supports_4K_migration(vm->xe)) {
> > > > > +		drm_dbg(&vm->xe->drm, "Platform doesn't support SZ_4K range migration\n");
> > > > > +		return false;
> > > > > +	}
> > > > > +
> > > > > +	return true;
> > > > > +}
> > > > > 
> > > > >  /**
> > > > >   * xe_svm_handle_pagefault() - SVM handle page fault
> > > > > @@ -750,12 +781,15 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
> > > > >  			IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
> > > > >  		.check_pages_threshold = IS_DGFX(vm->xe) &&
> > > > >  			IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? SZ_64K : 0,
> > > > > +		.devmem_only = atomic && IS_DGFX(vm->xe) &&
> > > > > +			IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
> > > > >  	};
> > > > >  	struct xe_svm_range *range;
> > > > >  	struct drm_gpusvm_range *r;
> > > > >  	struct drm_exec exec;
> > > > >  	struct dma_fence *fence;
> > > > >  	struct xe_tile *tile = gt_to_tile(gt);
> > > > > +	int migrate_try_count = ctx.devmem_only ? 3 : 1;
> > > > >  	ktime_t end = 0;
> > > > >  	int err;
> > > > > 
> > > > > @@ -777,23 +811,26 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
> > > > >  		return PTR_ERR(r);
> > > > > 
> > > > >  	range = to_xe_range(r);
> > > > > -	if (xe_svm_range_is_valid(range, tile))
> > > > > +	if (xe_svm_range_is_valid(range, tile, ctx.devmem_only))
> > > > 
> > > > Requires notifier lock. Also see comment on re-allocating the range
> > > > above.
> > > 
> > > Same as above.
> > > 
> > > > >  		return 0;
> > > > > 
> > > > >  	range_debug(range, "PAGE FAULT");
> > > > > 
> > > > > -	/* XXX: Add migration policy, for now migrate range once */
> > > > > -	if (!range->skip_migrate && range->base.flags.migrate_devmem &&
> > > > > -	    xe_svm_range_size(range) >= SZ_64K) {
> > > > > -		range->skip_migrate = true;
> > > > > -
> > > > > +	if (--migrate_try_count >= 0 &&
> > > > > +	    xe_svm_range_needs_migrate_to_vram(range, vma)
> > > > 
> > > > Requires notifier lock.
> > > 
> > > Same as above.
> > > 
> > > > Should we have some sort of timeout instead of a try-count? Perhaps as
> > > > a last resort fall back to a 4K range?
> > > 
> > > I did have code like that at one point to reduce the range size, but it
> > > is a bit complicated as we'd have to remove the range... I'd rather
> > > stick with the retry loop for now and, if this becomes problematic,
> > > circle back to reducing the size of the fault page on each retry loop.
> > 
> > OK, makes sense.
> > 
> > /Thomas
> > 
> > > 
> > > Matt
> > > 
> > > > /Thomas
> > > > 
> > > > > ) {
> > > > >  		err = xe_svm_alloc_vram(vm, tile, range, &ctx);
> > > > >  		if (err) {
> > > > > -			drm_dbg(&vm->xe->drm,
> > > > > -				"VRAM allocation failed, falling back to "
> > > > > -				"retrying fault, asid=%u, errno=%pe\n",
> > > > > -				vm->usm.asid, ERR_PTR(err));
> > > > > -			goto retry;
> > > > > +			if (migrate_try_count || !ctx.devmem_only) {
> > > > > +				drm_dbg(&vm->xe->drm,
> > > > > +					"VRAM allocation failed, falling back to retrying fault, asid=%u, errno=%pe\n",
> > > > > +					vm->usm.asid, ERR_PTR(err));
> > > > > +				goto retry;
> > > > > +			} else {
> > > > > +				drm_err(&vm->xe->drm,
> > > > > +					"VRAM allocation failed, retry count exceeded, asid=%u, errno=%pe\n",
> > > > > +					vm->usm.asid, ERR_PTR(err));
> > > > > +				return err;
> > > > > +			}
> > > > >  		}
> > > > >  	}
> > > > > 
> > > > > @@ -801,15 +838,22 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
> > > > >  	err = drm_gpusvm_range_get_pages(&vm->svm.gpusvm, r, &ctx);
> > > > >  	/* Corner where CPU mappings have changed */
> > > > >  	if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) {
> > > > > -		if (err == -EOPNOTSUPP) {
> > > > > -			range_debug(range, "PAGE FAULT - EVICT PAGES");
> > > > > -			drm_gpusvm_range_evict(&vm->svm.gpusvm, &range->base);
> > > > > +		if (migrate_try_count > 0 || !ctx.devmem_only) {
> > > > > +			if (err == -EOPNOTSUPP) {
> > > > > +				range_debug(range, "PAGE FAULT - EVICT PAGES");
> > > > > +				drm_gpusvm_range_evict(&vm->svm.gpusvm,
> > > > > +						       &range->base);
> > > > > +			}
> > > > > +			drm_dbg(&vm->xe->drm,
> > > > > +				"Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n",
> > > > > +				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
> > > > > +			range_debug(range, "PAGE FAULT - RETRY PAGES");
> > > > > +			goto retry;
> > > > > +		} else {
> > > > > +			drm_err(&vm->xe->drm,
> > > > > +				"Get pages failed, retry count exceeded, asid=%u, gpusvm=%p, errno=%pe\n",
> > > > > +				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
> > > > >  		}
> > > > > -		drm_dbg(&vm->xe->drm,
> > > > > -			"Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n",
> > > > > -			vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
> > > > > -		range_debug(range, "PAGE FAULT - RETRY PAGES");
> > > > > -		goto retry;
> > > > >  	}
> > > > >  	if (err) {
> > > > >  		range_debug(range, "PAGE FAULT - FAIL PAGE COLLECT");
> > > > > @@ -843,9 +887,6 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
> > > > >  	}
> > > > >  	drm_exec_fini(&exec);
> > > > > 
> > > > > -	if (xe_modparam.always_migrate_to_vram)
> > > > > -		range->skip_migrate = false;
> > > > > -
> > > > >  	dma_fence_wait(fence, false);
> > > > >  	dma_fence_put(fence);
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/xe/xe_svm.h b/drivers/gpu/drm/xe/xe_svm.h
> > > > > index 3d441eb1f7ea..0e1f376a7471 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_svm.h
> > > > > +++ b/drivers/gpu/drm/xe/xe_svm.h
> > > > > @@ -39,11 +39,6 @@ struct xe_svm_range {
> > > > >  	 * range. Protected by GPU SVM notifier lock.
> > > > >  	 */
> > > > >  	u8 tile_invalidated;
> > > > > -	/**
> > > > > -	 * @skip_migrate: Skip migration to VRAM, protected by GPU fault handler
> > > > > -	 * locking.
> > > > > -	 */
> > > > > -	u8 skip_migrate :1;
> > > > >  };
> > > > > 
> > > > >  /**
> > > > > 