There are several race conditions with XNACK enabled. For now just some
FIXME comments with ideas how to fix it.

Change-Id: If0abab6dcb8f4e95c9d8820f6c569263eda29a89
Signed-off-by: Felix Kuehling <felix.kuehl...@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  5 +++++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 21 ++++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 5c8b32873086..101d1f71db84 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -539,6 +539,11 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
        src = (uint64_t *)(scratch + npages);
        dst = scratch;
 
+       /* FIXME: Is it legal to hold on to this page array? We don't have
+        * proper references to the pages and we may not have an MMU notifier
+        * set up for the range at this point that could invalidate it (if
+        * it's a child range).
+        */
        prange->pages_addr = kvmalloc_array(npages, sizeof(*prange->pages_addr),
                                            GFP_KERNEL | __GFP_ZERO);
        if (!prange->pages_addr) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index fbcb1491e987..c48fe2f276b9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1727,7 +1727,7 @@ svm_range_handle_list_op(struct svm_range_list *svms, 
struct svm_range *prange)
                pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n",
                         svms, prange, prange->start, prange->last);
                svm_range_update_notifier_and_interval_tree(mm, prange);
-
+               /* FIXME: need to validate somewhere */
                r = svm_range_map_to_gpus(prange, true);
                if (r)
                        pr_debug("failed %d map 0x%p [0x%lx 0x%lx]\n",
@@ -1744,6 +1744,7 @@ svm_range_handle_list_op(struct svm_range_list *svms, 
struct svm_range *prange)
                         prange, prange->start, prange->last);
                svm_range_add_to_svms(prange);
                svm_range_add_notifier_locked(mm, prange);
+               /* FIXME: need to validate somewhere */
                r = svm_range_map_to_gpus(prange, true);
                if (r)
                        pr_debug("failed %d map 0x%p [0x%lx 0x%lx]\n",
@@ -2068,6 +2069,14 @@ svm_range_best_restore_location(struct svm_range *prange,
        return -1;
 }
 
+/* FIXME: This function can race with MMU notifiers. MMU notifiers can
+ * invalidate the page addresses concurrently, so we may end up mapping
+ * invalid addresses here. We cannot hold the prange->lock (held in MMU
+ * notifier) while updating page tables because of lock dependencies,
+ * as SDMA page table updates need reservation locks. Only unmapping
+ * works without reservations. May need to hold the mmap_write_lock to
+ * prevent concurrent MMU notifiers.
+ */
 int
 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
                        uint64_t addr)
@@ -2592,6 +2601,16 @@ svm_range_set_attr(struct kfd_process *p, uint64_t 
start, uint64_t size,
                        continue;
                }
 
+               /* FIXME: With xnack on, this can race with MMU notifiers.
+                * They may invalidate page addresses before we map them.
+                * Then we end up mapping invalid addresses in the GPU page
+                * table. May need to find a way to still hold the mmap write
+                * for map_to_gpus but drop it for validate to allow
+                * concurrent evictions. This will lead to some retry logic
+                * and the need to protect the update list differently.
+                * Maybe factor migration and validation into a common helper
+                * function shared with the GPU page fault handler.
+                */
                r = svm_range_validate(mm, prange);
                if (r) {
                        pr_debug("failed %d to validate svm range\n", r);
-- 
2.31.0

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Reply via email to