On Fri, 2025-04-25 at 00:39 -0700, Matthew Brost wrote:
> On Fri, Apr 25, 2025 at 09:18:19AM +0200, Thomas Hellström wrote:
> > On Thu, 2025-04-24 at 11:03 -0700, Matthew Brost wrote:
> > > On Thu, Apr 24, 2025 at 04:39:21PM +0200, Thomas Hellström wrote:
> > > > On Tue, 2025-04-22 at 10:04 -0700, Matthew Brost wrote:
> > > > > Mixing GPU and CPU atomics does not work unless a strict migration
> > > > > policy is enforced: GPU atomics must be in device memory. Enforce a
> > > > > must-be-in-VRAM policy with a retry loop of 2 attempts; if the retry
> > > > > loop fails, abort the fault.
> > > > > 
> > > > > v2:
> > > > >  - Only retry migration on atomics
> > > > >  - Drop always migrate modparam
> > > > > v3:
> > > > >  - Only set vram_only on DGFX (Himal)
> > > > >  - Bail on get_pages failure if vram_only and retry count exceeded
> > > > >    (Himal)
> > > > >  - s/vram_only/devmem_only
> > > > >  - Update xe_svm_range_is_valid to accept devmem_only argument
> > > > > v4:
> > > > >  - Fix logic bug on get_pages failure
> > > > > 
> > > > > Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimi...@intel.com>
> > > > > Signed-off-by: Matthew Brost <matthew.br...@intel.com>
> > > > > ---
> > > > >  drivers/gpu/drm/xe/xe_module.c |  3 --
> > > > >  drivers/gpu/drm/xe/xe_module.h |  1 -
> > > > >  drivers/gpu/drm/xe/xe_svm.c    | 89 +++++++++++++++++++++++++---------
> > > > >  drivers/gpu/drm/xe/xe_svm.h    |  5 --
> > > > >  4 files changed, 65 insertions(+), 33 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
> > > > > index 05c7d0ae6d83..1c4dfafbcd0b 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_module.c
> > > > > +++ b/drivers/gpu/drm/xe/xe_module.c
> > > > > @@ -33,9 +33,6 @@ struct xe_modparam xe_modparam = {
> > > > >  module_param_named(svm_notifier_size, xe_modparam.svm_notifier_size, uint, 0600);
> > > > >  MODULE_PARM_DESC(svm_notifier_size, "Set the svm notifier size(in MiB), must be power of 2");
> > > > > 
> > > > > -module_param_named(always_migrate_to_vram, xe_modparam.always_migrate_to_vram, bool, 0444);
> > > > > -MODULE_PARM_DESC(always_migrate_to_vram, "Always migrate to VRAM on GPU fault");
> > > > > -
> > > > >  module_param_named_unsafe(force_execlist, xe_modparam.force_execlist, bool, 0444);
> > > > >  MODULE_PARM_DESC(force_execlist, "Force Execlist submission");
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
> > > > > index 84339e509c80..5a3bfea8b7b4 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_module.h
> > > > > +++ b/drivers/gpu/drm/xe/xe_module.h
> > > > > @@ -12,7 +12,6 @@
> > > > >  struct xe_modparam {
> > > > >  	bool force_execlist;
> > > > >  	bool probe_display;
> > > > > -	bool always_migrate_to_vram;
> > > > >  	u32 force_vram_bar_size;
> > > > >  	int guc_log_level;
> > > > >  	char *guc_firmware_path;
> > > > > diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
> > > > > index 890f6b2f40e9..f749ae367a8f 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_svm.c
> > > > > +++ b/drivers/gpu/drm/xe/xe_svm.c
> > > > > @@ -650,9 +650,11 @@ void xe_svm_fini(struct xe_vm *vm)
> > > > >  }
> > > > > 
> > > > >  static bool xe_svm_range_is_valid(struct xe_svm_range *range,
> > > > > -				  struct xe_tile *tile)
> > > > > +				  struct xe_tile *tile,
> > > > > +				  bool devmem_only)
> > > > >  {
> > > > > -	return (range->tile_present & ~range->tile_invalidated) & BIT(tile->id);
> > > > > +	return ((range->tile_present & ~range->tile_invalidated) & BIT(tile->id))
> > > > > +		&& (!devmem_only || range->base.flags.migrate_devmem);
> > > > >  }
> > > > 
> > > > So let's say devmem_only is true here, and
> > > > range->base.flags.migrate_devmem is false. Wouldn't that mean the
> > > > range is unusable and needs to be freed and re-allocated?
> > > 
> > > This is a typo; it should be s/migrate_devmem/has_devmem_pages.
> > > 
> > > This translates to:
> > > 
> > > Either devmem_only is not required or we have devmem pages with a
> > > valid mapping.
> > > 
> > > If migrate_devmem is false and devmem_only is true, that is actually a
> > > fatal error; we should have a check for that and kill the fault. An
> > > example of this would be a shared mapping, which cannot be migrated to
> > > devmem.
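Something along the lines of the below, early in xe_svm_handle_pagefault(),
is what I'd picture. Just a sketch to illustrate the idea - the error code
and the message wording are placeholders, not a suggestion for the final
form:

	/*
	 * Sketch only: if the fault requires devmem but the range can never
	 * be migrated there (for example a shared mapping), retrying is
	 * pointless, so fail the fault instead of looping.
	 */
	if (ctx.devmem_only && !range->base.flags.migrate_devmem) {
		drm_dbg(&vm->xe->drm,
			"Atomic access requires VRAM but range can't migrate, asid=%u\n",
			vm->usm.asid);
		return -EACCES;
	}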
> > > > Also, another thing going back to older code: it seems like
> > > > range->tile_invalidated is protected by the notifier lock, so
> > > > shouldn't we assert that to be held in the function? It seems not to
> > > > be held further below:
> > > 
> > > Yeah, technically to get a stable value we'd need the notifier lock, but
> > > this is an opportunistic check - at worst, if we read a valid range we
> > > skip the page faults and will immediately get another page fault. So we
> > > could take the notifier lock here, but I don't think this is strictly
> > > required. Let me know what you think here.
> > 
> > The problem with this is that the code gets harder to maintain and
> > understand.
> 
> Agree.
> 
> > A new reader would probably first react over the lockless read, and then
> > why there are no memory barriers, and then what happens if the page-fault
> > was marked as resolved without actually resolving it.
> > 
> > So IMO if we do opportunistic tests to opt out of locking (which is
> > discouraged in the drm locking guidelines,
> > https://blog.ffwll.ch/2022/08/locking-hierarchy.html)
> > we should definitely add separate functions for that with extensive
> > docs and READ_ONCE() annotation.
> 
> A lock here doesn't actually gain us anything, though, as the state can
> immediately change after lock release, triggering another fault. If you
> agree, I'll go with READ_ONCE() and add comments in the code indicating
> it's an opportunistic check.

Yeah. Ideally I think a well documented xe_svm_range_check_valid() (or
perhaps a better name), and an xe_svm_range_is_valid() with an assert, in
the spirit of mmu_interval_check_retry() (lockless, documented) and
mmu_interval_read_retry() (requires lock).
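Roughly like the below - completely untested, and the exact lock expression
and the has_devmem_pages flag are from memory, so treat those as
placeholders rather than the final naming:

/*
 * Lockless, opportunistic variant: may race with invalidation, so a "true"
 * result only means it is worth skipping the fault. A stale read is
 * harmless since it just leads to another page fault.
 */
static bool xe_svm_range_check_valid(struct xe_svm_range *range,
				     struct xe_tile *tile,
				     bool devmem_only)
{
	u8 tile_present = READ_ONCE(range->tile_present);
	u8 tile_invalidated = READ_ONCE(range->tile_invalidated);

	return ((tile_present & ~tile_invalidated) & BIT(tile->id)) &&
		(!devmem_only || range->base.flags.has_devmem_pages);
}

/* Stable variant, in the spirit of mmu_interval_read_retry(). */
static bool xe_svm_range_is_valid(struct xe_svm_range *range,
				  struct xe_tile *tile,
				  bool devmem_only)
{
	struct xe_vm *vm = range_to_vm(&range->base);

	lockdep_assert_held(&vm->svm.gpusvm.notifier_lock);

	return ((range->tile_present & ~range->tile_invalidated) &
		BIT(tile->id)) &&
		(!devmem_only || range->base.flags.has_devmem_pages);
}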
> > But also, think about whether this is really worth sacrificing
> > readability instead of actually relying on alloc_vram() and get_pages()
> > exiting early if everything looks ok?
> 
> alloc_vram() as-is is very expensive; get_pages() less so, but it still
> costs CPU cycles.
> 
> The idea here is to short-circuit "page fault storms," where many EUs
> access the same page simultaneously. If I recall correctly, this was a
> significant issue on PVC - so much so that we are considering firmware and
> hardware changes going forward. We should try to mitigate these
> conditions in the page fault handler, if possible.

Yes, I think that's a perfectly legit case. Perhaps in the future we could
even short-circuit pf storms in the G2H handler?

/Thomas

> 
> Matt
> 
> > > > > 
> > > > >  #if IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR)
> > > > > @@ -726,6 +728,35 @@ static int xe_svm_alloc_vram(struct xe_vm *vm, struct xe_tile *tile,
> > > > >  }
> > > > >  #endif
> > > > > 
> > > > > +static bool supports_4K_migration(struct xe_device *xe)
> > > > > +{
> > > > > +	if (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
> > > > > +		return false;
> > > > > +
> > > > > +	return true;
> > > > > +}
> > > > 
> > > > Do we have any hardware that supports pagefaults but not 4K VRAM
> > > > pages?
> > > 
> > > PVC
> > 
> > OK, I was under the impression that PVC actually supported 4K pages.
> > But perhaps there was a bug encountered while implementing that.
> > 
> > > > > +
> > > > > +static bool xe_svm_range_needs_migrate_to_vram(struct xe_svm_range *range,
> > > > > +					       struct xe_vma *vma)
> > > > > +{
> > > > > +	struct xe_vm *vm = range_to_vm(&range->base);
> > > > > +	u64 range_size = xe_svm_range_size(range);
> > > > > +
> > > > > +	if (!range->base.flags.migrate_devmem)
> > > > > +		return false;
> > > > > +
> > > > > +	if (xe_svm_range_in_vram(range)) {
> > > > > +		drm_dbg(&vm->xe->drm, "Range is already in VRAM\n");
> > > > > +		return false;
> > > > > +	}
> > > > > +
> > > > > +	if (range_size <= SZ_64K && !supports_4K_migration(vm->xe)) {
> > > > > +		drm_dbg(&vm->xe->drm, "Platform doesn't support SZ_4K range migration\n");
> > > > > +		return false;
> > > > > +	}
> > > > > +
> > > > > +	return true;
> > > > > +}
> > > > > 
> > > > >  /**
> > > > >   * xe_svm_handle_pagefault() - SVM handle page fault
> > > > > @@ -750,12 +781,15 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
> > > > >  			IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
> > > > >  		.check_pages_threshold = IS_DGFX(vm->xe) &&
> > > > >  			IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? SZ_64K : 0,
> > > > > +		.devmem_only = atomic && IS_DGFX(vm->xe) &&
> > > > > +			IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
> > > > >  	};
> > > > >  	struct xe_svm_range *range;
> > > > >  	struct drm_gpusvm_range *r;
> > > > >  	struct drm_exec exec;
> > > > >  	struct dma_fence *fence;
> > > > >  	struct xe_tile *tile = gt_to_tile(gt);
> > > > > +	int migrate_try_count = ctx.devmem_only ? 3 : 1;
> > > > >  	ktime_t end = 0;
> > > > >  	int err;
> > > > > 
> > > > > @@ -777,23 +811,26 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
> > > > >  		return PTR_ERR(r);
> > > > > 
> > > > >  	range = to_xe_range(r);
> > > > > -	if (xe_svm_range_is_valid(range, tile))
> > > > > +	if (xe_svm_range_is_valid(range, tile, ctx.devmem_only))
> > > > 
> > > > Requires notifier lock. Also see comment on re-allocating the range
> > > > above.
> > > 
> > > Same as above.
> > > 
> > > > >  		return 0;
> > > > > 
> > > > >  	range_debug(range, "PAGE FAULT");
> > > > > 
> > > > > -	/* XXX: Add migration policy, for now migrate range once */
> > > > > -	if (!range->skip_migrate && range->base.flags.migrate_devmem &&
> > > > > -	    xe_svm_range_size(range) >= SZ_64K) {
> > > > > -		range->skip_migrate = true;
> > > > > -
> > > > > +	if (--migrate_try_count >= 0 &&
> > > > > +	    xe_svm_range_needs_migrate_to_vram(range, vma)
> > > > 
> > > > Requires notifier lock.
> > > 
> > > Same as above.
> > > 
> > > > Should we have some sort of timeout instead of a try-count? Perhaps as
> > > > a last resort fall back to a 4K range?
> > > 
> > > I did have code like that at one point to reduce the range size, but it
> > > is a bit complicated as we'd have to remove the range... I'd rather
> > > stick with the retry loop for now and, if this becomes problematic,
> > > circle back to reducing the size of the fault page on each retry loop.
> > 
> > OK, makes sense.
> > 
> > /Thomas
> > 
> > > 
> > > Matt
> > > 
> > > > /Thomas
> > > > 
> > > > > ) {
> > > > >  		err = xe_svm_alloc_vram(vm, tile, range, &ctx);
> > > > >  		if (err) {
> > > > > -			drm_dbg(&vm->xe->drm,
> > > > > -				"VRAM allocation failed, falling back to "
> > > > > -				"retrying fault, asid=%u, errno=%pe\n",
> > > > > -				vm->usm.asid, ERR_PTR(err));
> > > > > -			goto retry;
> > > > > +			if (migrate_try_count || !ctx.devmem_only) {
> > > > > +				drm_dbg(&vm->xe->drm,
> > > > > +					"VRAM allocation failed, falling back to retrying fault, asid=%u, errno=%pe\n",
> > > > > +					vm->usm.asid, ERR_PTR(err));
> > > > > +				goto retry;
> > > > > +			} else {
> > > > > +				drm_err(&vm->xe->drm,
> > > > > +					"VRAM allocation failed, retry count exceeded, asid=%u, errno=%pe\n",
> > > > > +					vm->usm.asid, ERR_PTR(err));
> > > > > +				return err;
> > > > > +			}
> > > > >  		}
> > > > >  	}
> > > > > 
> > > > > @@ -801,15 +838,22 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
> > > > >  	err = drm_gpusvm_range_get_pages(&vm->svm.gpusvm, r, &ctx);
> > > > >  	/* Corner where CPU mappings have changed */
> > > > >  	if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) {
> > > > > -		if (err == -EOPNOTSUPP) {
> > > > > -			range_debug(range, "PAGE FAULT - EVICT PAGES");
> > > > > -			drm_gpusvm_range_evict(&vm->svm.gpusvm, &range->base);
> > > > > +		if (migrate_try_count > 0 || !ctx.devmem_only) {
> > > > > +			if (err == -EOPNOTSUPP) {
> > > > > +				range_debug(range, "PAGE FAULT - EVICT PAGES");
> > > > > +				drm_gpusvm_range_evict(&vm->svm.gpusvm,
> > > > > +						       &range->base);
> > > > > +			}
> > > > > +			drm_dbg(&vm->xe->drm,
> > > > > +				"Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n",
> > > > > +				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
> > > > > +			range_debug(range, "PAGE FAULT - RETRY PAGES");
> > > > > +			goto retry;
> > > > > +		} else {
> > > > > +			drm_err(&vm->xe->drm,
> > > > > +				"Get pages failed, retry count exceeded, asid=%u, gpusvm=%p, errno=%pe\n",
> > > > > +				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
> > > > >  		}
> > > > > -		drm_dbg(&vm->xe->drm,
> > > > > -			"Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n",
> > > > > -			vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
> > > > > -		range_debug(range, "PAGE FAULT - RETRY PAGES");
> > > > > -		goto retry;
> > > > >  	}
> > > > >  	if (err) {
> > > > >  		range_debug(range, "PAGE FAULT - FAIL PAGE COLLECT");
> > > > > @@ -843,9 +887,6 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
> > > > >  	}
> > > > >  	drm_exec_fini(&exec);
> > > > > 
> > > > > -	if (xe_modparam.always_migrate_to_vram)
> > > > > -		range->skip_migrate = false;
> > > > > -
> > > > >  	dma_fence_wait(fence, false);
> > > > >  	dma_fence_put(fence);
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/xe/xe_svm.h b/drivers/gpu/drm/xe/xe_svm.h
> > > > > index 3d441eb1f7ea..0e1f376a7471 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_svm.h
> > > > > +++ b/drivers/gpu/drm/xe/xe_svm.h
> > > > > @@ -39,11 +39,6 @@ struct xe_svm_range {
> > > > >  	 * range. Protected by GPU SVM notifier lock.
> > > > >  	 */
> > > > >  	u8 tile_invalidated;
> > > > > -	/**
> > > > > -	 * @skip_migrate: Skip migration to VRAM, protected by GPU fault handler
> > > > > -	 * locking.
> > > > > -	 */
> > > > > -	u8 skip_migrate :1;
> > > > >  };
> > > > > 
> > > > >  /**
> > > > > 