Re: [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job

2021-08-05 Thread Andrey Grodzovsky



On 2021-08-05 4:31 a.m., Jingwen Chen wrote:

From: Jack Zhang 

Why: Previously the hw fence was allocated separately from the job.
This caused historical lifetime issues and corner cases.
The ideal situation is to use the fence to manage both the job's and the
fence's lifetime, and to simplify the design of the GPU scheduler.

How:
We propose to embed hw_fence into amdgpu_job.
1. We cover the normal job submission path with this method.
2. For ib_test and submissions without a parent job, keep the
legacy way of creating a hw fence separately.
v2:
use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
embedded in a job.

Signed-off-by: Jingwen Chen 
Signed-off-by: Jack Zhang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 63 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 35 
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  4 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h|  5 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |  2 +-
  8 files changed, 84 insertions(+), 30 deletions(-)
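For orientation, the layout this patch produces is roughly the following
(a sketch only; other fields are elided, and the real change is in
amdgpu_job.h below):

struct amdgpu_job {
	struct drm_sched_job	base;
	/* ... other fields unchanged ... */
	struct dma_fence	hw_fence;	/* replaces the old struct dma_fence *fence */
	struct amdgpu_ring	*ring;		/* ring the embedded fence signals on */
};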

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7b46ba551cb2..3003ee1c9487 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum 
kgd_engine_type engine,
ret = dma_fence_wait(f, false);
  
  err_ib_sched:

-   dma_fence_put(f);
amdgpu_job_free(job);
  err:
return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 536005bff24a..277128846dd1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1414,7 +1414,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct 
amdgpu_ring *ring)
continue;
}
job = to_amdgpu_job(s_job);
-   if (preempted && job->fence == fence)
+   if (preempted && (&job->hw_fence) == fence)
/* mark the job as preempted */
job->preemption_status |= AMDGPU_IB_PREEMPTED;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 7495911516c2..5e29d797a265 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -129,30 +129,46 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
   *
   * @ring: ring the fence is associated with
   * @f: resulting fence object
+ * @job: job the fence is embedded in
   * @flags: flags to pass into the subordinate .emit_fence() call
   *
   * Emits a fence command on the requested ring (all asics).
   * Returns 0 on success, -ENOMEM on failure.
   */
-int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
+int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct 
amdgpu_job *job,
  unsigned flags)
  {
struct amdgpu_device *adev = ring->adev;
-   struct amdgpu_fence *fence;
+   struct dma_fence *fence;
+   struct amdgpu_fence *am_fence;
struct dma_fence __rcu **ptr;
uint32_t seq;
int r;
  
-	fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);

-   if (fence == NULL)
-   return -ENOMEM;
+   if (job == NULL) {
+   /* create a separate hw fence */
+   am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
+   if (am_fence == NULL)
+   return -ENOMEM;
+   fence = &am_fence->base;
+   am_fence->ring = ring;
+   } else {
+   /* use the job-embedded fence */
+   fence = &job->hw_fence;
+   job->ring = ring;



If you made hw_fence of type amdgpu_fence,
you could probably avoid the special job->ring = ring assignment;
see the sketch below and the related comment at the bottom.
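A minimal sketch of that suggestion (assumed field layout, for
illustration only):

struct amdgpu_job {
	struct drm_sched_job	base;
	/* ... */
	struct amdgpu_fence	hw_fence;	/* wrapper type instead of bare dma_fence */
};

/* amdgpu_fence_emit() could then treat both paths uniformly and would
 * need no job->ring special case, e.g.:
 *
 *	am_fence = job ? &job->hw_fence
 *		       : kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
 *	am_fence->ring = ring;
 *	fence = &am_fence->base;
 */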



+   }
  
  	seq = ++ring->fence_drv.sync_seq;

-   fence->ring = ring;
-   dma_fence_init(&fence->base, &amdgpu_fence_ops,
+   dma_fence_init(fence, &amdgpu_fence_ops,
   &ring->fence_drv.lock,
   adev->fence_context + ring->idx,
   seq);
+
+   if (job != NULL) {
+   /* mark that this fence has a parent job */
+   set_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &fence->flags);
+   }
+
amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
   seq, flags | AMDGPU_FENCE_FLAG_INT);
pm_runtime_get_noresume(adev_to_drm(adev)->dev);
@@ -175,9 +191,9 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct 
dma_fence **f,
/* This function can't be called concurrently anyway, otherwise
 * emitting the fence would 

Re: [PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embeded hw_fence

2021-08-09 Thread Andrey Grodzovsky



On 2021-08-05 4:31 a.m., Jingwen Chen wrote:

[Why]
After embedding the hw_fence into amdgpu_job, we need to add TDR support
for this feature.

[How]
1. Add a resubmit_flag for resubmitted jobs.
2. Clear job fences from RCU and force-complete vm flush fences in
pre_asic_reset.
3. Skip dma_fence_get for resubmitted jobs and add a dma_fence_put
for guilty jobs.
v2:
use a job_run_counter in amdgpu_job to replace the resubmit_flag in
drm_sched_job. When job_run_counter >= 1, this job is a resubmitted
job.

Signed-off-by: Jack Zhang 
Signed-off-by: Jingwen Chen 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 13 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  5 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h|  3 +++
  4 files changed, 27 insertions(+), 6 deletions(-)
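The new field, roughly (a sketch; the actual addition is in amdgpu_job.h
below, other fields elided):

struct amdgpu_job {
	/* ... fields from patch 1, including the embedded hw_fence ... */
	struct dma_fence	hw_fence;
	/* how many times this job has been pushed to the HW ring;
	 * a value >= 1 at run time marks a resubmit after a reset */
	uint32_t		job_run_counter;
};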

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 9e53ff851496..ade2fa07a50a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
  int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 struct amdgpu_reset_context *reset_context)
  {
-   int i, r = 0;
+   int i, j, r = 0;
struct amdgpu_job *job = NULL;
bool need_full_reset =
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
@@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device 
*adev,
if (!ring || !ring->sched.thread)
continue;
  
+		/* clear job fences from fence drv to avoid force_completion
+		 * leaving NULL and vm flush fences in fence drv */
+   for (j = 0; j <= ring->fence_drv.num_fences_mask; j ++) {
+   struct dma_fence *old,**ptr;
+   ptr = &ring->fence_drv.fences[j];
+   old = rcu_dereference_protected(*ptr, 1);
+   if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, 
&old->flags)) {
+   RCU_INIT_POINTER(*ptr, NULL);
+   }
+   }
/* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
amdgpu_fence_driver_force_completion(ring);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 5e29d797a265..c9752cf794fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -159,10 +159,15 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct 
dma_fence **f, struct amd
}
  
  	seq = ++ring->fence_drv.sync_seq;

-   dma_fence_init(fence, &amdgpu_fence_ops,
-  &ring->fence_drv.lock,
-  adev->fence_context + ring->idx,
-  seq);
+   if (job != NULL && job->job_run_counter) {
+   /* reinit seq for resubmitted jobs */
+   fence->seqno = seq;
+   } else {
+   dma_fence_init(fence, &amdgpu_fence_ops,
+   &ring->fence_drv.lock,
+   adev->fence_context + ring->idx,
+   seq);
+   }



I think this should actually be in the first patch (and the counter too);
without it the first patch is buggy.
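For context on why only ->seqno is updated for a resubmit: dma_fence_init()
resets state that must survive across job runs. Its body is roughly the
following (paraphrased from the kernel; sanity checks and tracing omitted):

void dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops,
		    spinlock_t *lock, u64 context, u64 seqno)
{
	kref_init(&fence->refcount);	/* would wipe references still held on the job */
	fence->ops = ops;
	INIT_LIST_HEAD(&fence->cb_list);
	fence->lock = lock;
	fence->context = context;
	fence->seqno = seqno;
	fence->flags = 0UL;		/* would clear AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT */
	fence->error = 0;
}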


  
  	if (job != NULL) {

/* mark that this fence has a parent job */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 65a395060de2..19b13a65c73b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -254,6 +254,7 @@ static struct dma_fence *amdgpu_job_run(struct 
drm_sched_job *sched_job)
dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if 
VRAM lost */
  
  	if (finished->error < 0) {

+   dma_fence_put(&job->hw_fence);



I would put this check below, together with the job_run_counter check.



DRM_INFO("Skip scheduling IBs!\n");
} else {
r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
@@ -262,7 +263,9 @@ static struct dma_fence *amdgpu_job_run(struct 
drm_sched_job *sched_job)
DRM_ERROR("Error scheduling IBs (%d)\n", r);
}
  
-	dma_fence_get(fence);

+   if (!job->job_run_counter)
+   dma_fence_get(fence);
+   job->job_run_counter ++;
amdgpu_job_free_resources(job);



Here you modify code you already changed in patch 1. It looks to me
like these two patches should be squashed into one, since the changes are
directly dependent and the split is hard to follow.

Andrey


  
  	fence = r ? ERR_PTR(r) : fence;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 92324c978534..1fa667f2

Re: [PATCH v4] drm/amd/amdgpu embed hw_fence into amdgpu_job

2021-08-10 Thread Andrey Grodzovsky

Reviewed-by: Andrey Grodzovsky 

Andrey

On 2021-08-09 11:22 p.m., Jingwen Chen wrote:

From: Jack Zhang 

Why: Previously the hw fence was allocated separately from the job.
This caused historical lifetime issues and corner cases.
The ideal situation is to use the fence to manage both the job's and the
fence's lifetime, and to simplify the design of the GPU scheduler.

How:
We propose to embed hw_fence into amdgpu_job.
1. We cover the normal job submission path with this method.
2. For ib_test and submissions without a parent job, keep the
legacy way of creating a hw fence separately.
v2:
use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
embedded in a job.
v3:
remove redundant variable ring in amdgpu_job
v4:
add TDR sequence support for this feature. Add a job_run_counter to
indicate whether this job is a resubmitted job.

Signed-off-by: Jingwen Chen 
Signed-off-by: Jack Zhang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  | 12 +++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 73 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 39 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  6 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h|  5 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |  2 +-
  9 files changed, 108 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7b46ba551cb2..3003ee1c9487 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum 
kgd_engine_type engine,
ret = dma_fence_wait(f, false);
  
  err_ib_sched:

-   dma_fence_put(f);
amdgpu_job_free(job);
  err:
return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 536005bff24a..277128846dd1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1414,7 +1414,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct 
amdgpu_ring *ring)
continue;
}
job = to_amdgpu_job(s_job);
-   if (preempted && job->fence == fence)
+   if (preempted && (&job->hw_fence) == fence)
/* mark the job as preempted */
job->preemption_status |= AMDGPU_IB_PREEMPTED;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 9e53ff851496..ade2fa07a50a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
  int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 struct amdgpu_reset_context *reset_context)
  {
-   int i, r = 0;
+   int i, j, r = 0;
struct amdgpu_job *job = NULL;
bool need_full_reset =
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
@@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device 
*adev,
if (!ring || !ring->sched.thread)
continue;
  
+		/* clear job fences from fence drv to avoid force_completion
+		 * leaving NULL and vm flush fences in fence drv */
+   for (j = 0; j <= ring->fence_drv.num_fences_mask; j ++) {
+   struct dma_fence *old,**ptr;
+   ptr = &ring->fence_drv.fences[j];
+   old = rcu_dereference_protected(*ptr, 1);
+   if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, 
&old->flags)) {
+   RCU_INIT_POINTER(*ptr, NULL);
+   }
+   }
/* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
amdgpu_fence_driver_force_completion(ring);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 7495911516c2..a8302e324110 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -129,30 +129,50 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
   *
   * @ring: ring the fence is associated with
   * @f: resulting fence object
+ * @job: job the fence is embedded in
   * @flags: flags to pass into the subordinate .emit_fence() call
   *
   * Emits a fence command on the requested ring (all asics).
   * Returns 0 on success, -ENOMEM on failure.
   */
-int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
+int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_

Re: [PATCH] Revert "drm/scheduler: Avoid accessing freed bad job."

2021-08-17 Thread Andrey Grodzovsky

On 2021-08-17 12:28 a.m., Jingwen Chen wrote:

[Why]
For a bailing job, this commit deletes it from the pending list, so the
bailing job never gets a chance to be resubmitted, even in advance
TDR mode.

[How]
After embedding the hw_fence into amdgpu_job, the race condition this
commit tried to work around is completely solved, so revert it.
This reverts commit 135517d3565b48f4def3b1b82008bc17eb5d1c90.



Can you please elaborate on how this solves the race?
As far as I can see, with this patch reverted, in drm_sched_job_timedout
you get a pointer to the next job to process in the timeout handler;
immediately after, that job actually finishes and its fence signals. This
in turn triggers drm_sched_get_cleanup_job, which fetches the job and
returns it to drm_sched_main, which in turn calls the free_job
callback ->...-> amdgpu_fence_free, freeing the job from the HW dma_fence
release callback. After that you proceed with a freed job in the timeout
handler.

If you could take the HW fence reference in drm_sched_job_timedout before
you start processing, then yes, I think it would work; roughly as in the
sketch below.
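A minimal sketch of that idea (this is essentially what v2 of the revert,
further down in this thread, ends up doing):

	job = list_first_entry_or_null(&sched->pending_list,
				       struct drm_sched_job, list);
	if (job) {
		/* pin the HW fence so a concurrent free_job() cannot release
		 * the job (and the fence embedded in it) while the timeout
		 * handler is still using it */
		struct dma_fence *fence = dma_fence_get(job->s_fence->parent);

		status = job->sched->ops->timedout_job(job);
		dma_fence_put(fence);
	}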

Andrey




Signed-off-by: Jingwen Chen 
---
  drivers/gpu/drm/scheduler/sched_main.c | 27 --
  1 file changed, 27 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index a2a953693b45..31d1176d939f 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -317,21 +317,10 @@ static void drm_sched_job_timedout(struct work_struct 
*work)
enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL;
  
  	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);

-
-   /* Protects against concurrent deletion in drm_sched_get_cleanup_job */
-   spin_lock(&sched->job_list_lock);
job = list_first_entry_or_null(&sched->pending_list,
   struct drm_sched_job, list);
  
  	if (job) {

-   /*
-* Remove the bad job so it cannot be freed by concurrent
-* drm_sched_cleanup_jobs. It will be reinserted back after 
sched->thread
-* is parked at which point it's safe.
-*/
-   list_del_init(&job->list);
-   spin_unlock(&sched->job_list_lock);
-
status = job->sched->ops->timedout_job(job);
  
  		/*

@@ -342,8 +331,6 @@ static void drm_sched_job_timedout(struct work_struct *work)
job->sched->ops->free_job(job);
sched->free_guilty = false;
}
-   } else {
-   spin_unlock(&sched->job_list_lock);
}
  
  	if (status != DRM_GPU_SCHED_STAT_ENODEV) {

@@ -392,20 +379,6 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, 
struct drm_sched_job *bad)
  
  	kthread_park(sched->thread);
  
-	/*

-* Reinsert back the bad job here - now it's safe as
-* drm_sched_get_cleanup_job cannot race against us and release the
-* bad job at this point - we parked (waited for) any in progress
-* (earlier) cleanups and drm_sched_get_cleanup_job will not be called
-* now until the scheduler thread is unparked.
-*/
-   if (bad && bad->sched == sched)
-   /*
-* Add at the head of the queue to reflect it was the earliest
-* job extracted.
-*/
-   list_add(&bad->list, &sched->pending_list);
-
/*
 * Iterate the job list from later to  earlier one and either deactive
 * their HW callbacks or remove them from pending list if they already


Re: [PATCH] drm/amd/amdgpu:flush ttm delayed work before cancel_sync

2021-08-17 Thread Andrey Grodzovsky

Looks reasonable to me.

Reviewed-by: Andrey Grodzovsky 

Andrey

On 2021-08-17 5:50 a.m., YuBiao Wang wrote:

[Why]
In some cases when we unload the driver, a warning call trace shows up
in vram_mgr_fini claiming that the LRU is not empty, caused by TTM BOs
still sitting on the delayed-delete queue.

[How]
We should flush the delayed work to make sure the delayed deletion is done.

Signed-off-by: YuBiao Wang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4d266c40382c..0b5764aa98a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3824,8 +3824,10 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
  {
dev_info(adev->dev, "amdgpu: finishing device.\n");
flush_delayed_work(&adev->delayed_init_work);
-   if (adev->mman.initialized)
+   if (adev->mman.initialized) {
+   flush_delayed_work(&adev->mman.bdev.wq);
ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+   }
adev->shutdown = true;
  
  	/* make sure IB test finished before entering exclusive mode


Re: [PATCH] drm/amdgpu: avoid over-handle of fence driver fini in s3 test (v2)

2021-08-17 Thread Andrey Grodzovsky



On 2021-08-02 1:16 a.m., Guchun Chen wrote:

In amdgpu_fence_driver_hw_fini, there is no need to call drm_sched_fini
to stop the scheduler in the S3 test; otherwise, fence-related failures
will show up after resume. To fix this, and for a cleaner teardown, move
drm_sched_fini from fence_hw_fini to fence_sw_fini, as it is part of
driver shutdown and should never be called in hw_fini.

v2: rename amdgpu_fence_driver_init to amdgpu_fence_driver_sw_init,
to keep sw_init and sw_fini paired.

Fixes: cd87a6dcf6af ("drm/amdgpu: adjust fence driver enable sequence")
Suggested-by: Christian König 
Signed-off-by: Guchun Chen 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 ++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 12 +++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  4 ++--
  3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b1d2dc39e8be..9e53ff851496 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3646,9 +3646,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
  
  fence_driver_init:

/* Fence driver */
-   r = amdgpu_fence_driver_init(adev);
+   r = amdgpu_fence_driver_sw_init(adev);
if (r) {
-   dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
+   dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 
0);
goto failed;
}
@@ -3988,7 +3988,6 @@ int amdgpu_device_resume(struct drm_device *dev, bool 
fbcon)
}
amdgpu_fence_driver_hw_init(adev);
  
-

r = amdgpu_device_ip_late_init(adev);
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 49c5c7331c53..7495911516c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -498,7 +498,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
  }
  
  /**

- * amdgpu_fence_driver_init - init the fence driver
+ * amdgpu_fence_driver_sw_init - init the fence driver
   * for all possible rings.
   *
   * @adev: amdgpu device pointer
@@ -509,13 +509,13 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring 
*ring,
   * amdgpu_fence_driver_start_ring().
   * Returns 0 for success.
   */
-int amdgpu_fence_driver_init(struct amdgpu_device *adev)
+int amdgpu_fence_driver_sw_init(struct amdgpu_device *adev)
  {
return 0;
  }
  
  /**

- * amdgpu_fence_driver_fini - tear down the fence driver
+ * amdgpu_fence_driver_hw_fini - tear down the fence driver
   * for all possible rings.
   *
   * @adev: amdgpu device pointer
@@ -531,8 +531,7 @@ void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
  
  		if (!ring || !ring->fence_drv.initialized)

continue;
-   if (!ring->no_scheduler)
-   drm_sched_fini(&ring->sched);
+
/* You can't wait for HW to signal if it's gone */
if (!drm_dev_is_unplugged(&adev->ddev))
r = amdgpu_fence_wait_empty(ring);



Sorry for the late notice, I missed this patch. By moving drm_sched_fini
past amdgpu_fence_wait_empty a race is created: even after you have waited
for all fences on the ring to signal, the SW scheduler will keep submitting
new jobs to the ring, so the ring won't stay empty.

For hot device removal we also want to prevent any access to the HW past
PCI removal, in order not to do any MMIO accesses inside the physical MMIO
range that no longer belongs to this device after its removal by the PCI
core. Stopping all the schedulers prevents any MMIO accesses done during
job submission, and that is why drm_sched_fini was done as part of
amdgpu_fence_driver_hw_fini and not amdgpu_fence_driver_sw_fini.
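For reference, this was the shape of the hw_fini loop before this patch,
with the ordering that was intentional (a sketch assembled from the hunks
above, not a verbatim quote):

void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->fence_drv.initialized)
			continue;
		/* tear the scheduler down first, so nothing can refill the
		 * ring after the wait below has drained it */
		if (!ring->no_scheduler)
			drm_sched_fini(&ring->sched);
		/* You can't wait for HW to signal if it's gone */
		if (!drm_dev_is_unplugged(&adev->ddev))
			r = amdgpu_fence_wait_empty(ring);
	}
}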

Andrey


@@ -560,6 +559,9 @@ void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev)
if (!ring || !ring->fence_drv.initialized)
continue;
  
+		if (!ring->no_scheduler)

+   drm_sched_fini(&ring->sched);
+
for (j = 0; j <= ring->fence_drv.num_fences_mask; ++j)
dma_fence_put(ring->fence_drv.fences[j]);
kfree(ring->fence_drv.fences);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 27adffa7658d..9c11ced4312c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -106,7 +106,6 @@ struct amdgpu_fence_driver {
struct dma_fence**fences;
  };
  
-int amdgpu_fence_driver_init(struct amdgpu_device *adev);

  void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
  
  int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,

@@ -115,9 +114,10 @@ int amdgpu_fence_driver_init_ring

Re: [PATCH v2] Revert "drm/scheduler: Avoid accessing freed bad job."

2021-08-18 Thread Andrey Grodzovsky

On 2021-08-18 10:02 a.m., Alex Deucher wrote:


+ dri-devel

Since scheduler is a shared component, please add dri-devel on all
scheduler patches.

On Wed, Aug 18, 2021 at 7:21 AM Jingwen Chen  wrote:

[Why]
For a bailing job, this commit deletes it from the pending list, so the
bailing job never gets a chance to be resubmitted, even in advance
TDR mode.

[How]
After embedding the hw_fence into amdgpu_job, the race condition this
commit tried to work around is completely solved, so revert it.
This reverts commit 135517d3565b48f4def3b1b82008bc17eb5d1c90.
v2:
add dma_fence_get/put() around timedout_job to avoid a concurrent delete
while processing timedout_job

Signed-off-by: Jingwen Chen 
---
  drivers/gpu/drm/scheduler/sched_main.c | 23 +--
  1 file changed, 5 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index a2a953693b45..f9b9b3aefc4a 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -314,6 +314,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
  {
 struct drm_gpu_scheduler *sched;
 struct drm_sched_job *job;
+   struct dma_fence *fence;
 enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL;

 sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
@@ -325,11 +326,10 @@ static void drm_sched_job_timedout(struct work_struct 
*work)

 if (job) {
 /*
-* Remove the bad job so it cannot be freed by concurrent
-* drm_sched_cleanup_jobs. It will be reinserted back after 
sched->thread
-* is parked at which point it's safe.
+* Get job->s_fence->parent here to avoid concurrent delete 
during
+* processing timedout_job
  */
-   list_del_init(&job->list);
+   fence = dma_fence_get(job->s_fence->parent);



While this is true for amdgpu, it has no meaning for other drivers for
whom we haven't done the refactoring of embedding the HW fence (parent)
into the job structure. In fact, thinking about it, unless you do the HW
fence embedding for all the drivers using the scheduler, you cannot revert
this patch or you will just break them.

Andrey



 spin_unlock(&sched->job_list_lock);

 status = job->sched->ops->timedout_job(job);
@@ -342,6 +342,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
 job->sched->ops->free_job(job);
 sched->free_guilty = false;
 }
+   dma_fence_put(fence);
 } else {
 spin_unlock(&sched->job_list_lock);
 }
@@ -392,20 +393,6 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, 
struct drm_sched_job *bad)

 kthread_park(sched->thread);

-   /*
-* Reinsert back the bad job here - now it's safe as
-* drm_sched_get_cleanup_job cannot race against us and release the
-* bad job at this point - we parked (waited for) any in progress
-* (earlier) cleanups and drm_sched_get_cleanup_job will not be called
-* now until the scheduler thread is unparked.
-*/
-   if (bad && bad->sched == sched)
-   /*
-* Add at the head of the queue to reflect it was the earliest
-* job extracted.
-*/
-   list_add(&bad->list, &sched->pending_list);
-
 /*
  * Iterate the job list from later to  earlier one and either deactive
  * their HW callbacks or remove them from pending list if they already
--
2.25.1



Re: [PATCH v2] Revert "drm/scheduler: Avoid accessing freed bad job."

2021-08-18 Thread Andrey Grodzovsky



On 2021-08-18 10:32 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:26:25AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:02 a.m., Alex Deucher wrote:


+ dri-devel

Since scheduler is a shared component, please add dri-devel on all
scheduler patches.

On Wed, Aug 18, 2021 at 7:21 AM Jingwen Chen  wrote:

[Why]
For a bailing job, this commit deletes it from the pending list, so the
bailing job never gets a chance to be resubmitted, even in advance
TDR mode.

[How]
After embedding the hw_fence into amdgpu_job, the race condition this
commit tried to work around is completely solved, so revert it.
This reverts commit 135517d3565b48f4def3b1b82008bc17eb5d1c90.
v2:
add dma_fence_get/put() around timedout_job to avoid a concurrent delete
while processing timedout_job

Signed-off-by: Jingwen Chen 
---
   drivers/gpu/drm/scheduler/sched_main.c | 23 +--
   1 file changed, 5 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index a2a953693b45..f9b9b3aefc4a 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -314,6 +314,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
   {
  struct drm_gpu_scheduler *sched;
  struct drm_sched_job *job;
+   struct dma_fence *fence;
  enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL;

  sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
@@ -325,11 +326,10 @@ static void drm_sched_job_timedout(struct work_struct 
*work)

  if (job) {
  /*
-* Remove the bad job so it cannot be freed by concurrent
-* drm_sched_cleanup_jobs. It will be reinserted back after 
sched->thread
-* is parked at which point it's safe.
+* Get job->s_fence->parent here to avoid concurrent delete 
during
+* processing timedout_job
   */
-   list_del_init(&job->list);
+   fence = dma_fence_get(job->s_fence->parent);


While this is true for amdgpu, it has no meaning for other drivers for
whom we haven't done the refactoring of embedding the HW fence (parent)
into the job structure. In fact, thinking about it, unless you do the HW
fence embedding for all the drivers using the scheduler, you cannot revert
this patch or you will just break them.

btw, why did you do that embedding? I do still have my patches with
dma_fence annotations floating around, but my idea at least was to fix
that issue with a mempool, not with embedding. What was the motivation
for embedding the HW fence?
-Daniel
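For reference, the mempool alternative mentioned here would look roughly
like the following (a sketch only; the pool name and size are hypothetical):

#include <linux/mempool.h>

static mempool_t *amdgpu_fence_pool;	/* hypothetical */

/* pre-reserve elements so fence allocation at submit time can always
 * make forward progress under memory pressure */
static int amdgpu_fence_pool_init(void)
{
	amdgpu_fence_pool = mempool_create_slab_pool(64, amdgpu_fence_slab);
	return amdgpu_fence_pool ? 0 : -ENOMEM;
}

static struct amdgpu_fence *amdgpu_fence_pool_alloc(void)
{
	/* tries a non-reclaiming slab allocation first and only then
	 * falls back to the pre-reserved pool */
	return mempool_alloc(amdgpu_fence_pool, GFP_KERNEL);
}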



The motivation was twofold. First, to avoid memory allocation during job
submission (the HW fence allocation), because as Christian explained this
leads to a deadlock with mm code during evictions under memory pressure
(Christian can clarify if I garbled this explanation). Second, exactly to
revert this patch: while it solved the issue described in the patch, it
created another one with drivers that bailed out early during TDR handling
for various reasons, where the job would just leak because it had already
been removed from the pending list.


Andrey






Andrey



  spin_unlock(&sched->job_list_lock);

  status = job->sched->ops->timedout_job(job);
@@ -342,6 +342,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
  job->sched->ops->free_job(job);
  sched->free_guilty = false;
  }
+   dma_fence_put(fence);
  } else {
  spin_unlock(&sched->job_list_lock);
  }
@@ -392,20 +393,6 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, 
struct drm_sched_job *bad)

  kthread_park(sched->thread);

-   /*
-* Reinsert back the bad job here - now it's safe as
-* drm_sched_get_cleanup_job cannot race against us and release the
-* bad job at this point - we parked (waited for) any in progress
-* (earlier) cleanups and drm_sched_get_cleanup_job will not be called
-* now until the scheduler thread is unparked.
-*/
-   if (bad && bad->sched == sched)
-   /*
-* Add at the head of the queue to reflect it was the earliest
-* job extracted.
-*/
-   list_add(&bad->list, &sched->pending_list);
-
  /*
   * Iterate the job list from later to  earlier one and either deactive
   * their HW callbacks or remove them from pending list if they already
--
2.25.1



Re: [PATCH v2] Revert "drm/scheduler: Avoid accessing freed bad job."

2021-08-18 Thread Andrey Grodzovsky



On 2021-08-18 10:42 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:36:32AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:32 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:26:25AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:02 a.m., Alex Deucher wrote:


+ dri-devel

Since scheduler is a shared component, please add dri-devel on all
scheduler patches.

On Wed, Aug 18, 2021 at 7:21 AM Jingwen Chen  wrote:

[Why]
For a bailing job, this commit deletes it from the pending list, so the
bailing job never gets a chance to be resubmitted, even in advance
TDR mode.

[How]
After embedding the hw_fence into amdgpu_job, the race condition this
commit tried to work around is completely solved, so revert it.
This reverts commit 135517d3565b48f4def3b1b82008bc17eb5d1c90.
v2:
add dma_fence_get/put() around timedout_job to avoid a concurrent delete
while processing timedout_job

Signed-off-by: Jingwen Chen 
---
drivers/gpu/drm/scheduler/sched_main.c | 23 +--
1 file changed, 5 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index a2a953693b45..f9b9b3aefc4a 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -314,6 +314,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
{
   struct drm_gpu_scheduler *sched;
   struct drm_sched_job *job;
+   struct dma_fence *fence;
   enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL;

   sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
@@ -325,11 +326,10 @@ static void drm_sched_job_timedout(struct work_struct 
*work)

   if (job) {
   /*
-* Remove the bad job so it cannot be freed by concurrent
-* drm_sched_cleanup_jobs. It will be reinserted back after 
sched->thread
-* is parked at which point it's safe.
+* Get job->s_fence->parent here to avoid concurrent delete 
during
+* processing timedout_job
*/
-   list_del_init(&job->list);
+   fence = dma_fence_get(job->s_fence->parent);

While this is true for amdgpu, it has no meaning for other drivers for whom
we haven't
done the refactoring of embedding HW fence (parent) into the job structure.
In fact thinking
about it, unless you do the HW fence embedding for all the drivers using the
scheduler you cannot
revert this patch or you will just break them.

btw, why did you do that embedding? I do still have my patches with
dma_fence annotations floating around, but my idea at least was to fix
that issue with a mempool, not with embedding. What was the motivation
for embedding the HW fence?
-Daniel


The motivation was twofold. First, to avoid memory allocation during job
submission (the HW fence allocation), because as Christian explained this
leads to a deadlock with mm code during evictions under memory pressure
(Christian can clarify if I garbled

Yeah that's the exact same thing I've chased with my dma_fence
annotations, but thus far there has been zero to no interest in getting it sorted. I
think it'd be good to have some cross-driver agreement on how this should
be solved before someone just charges ahead ...


this explanation). Second, exactly to revert this patch: while it solved
the issue described in the patch, it created another one with drivers that
bailed out early during TDR handling for various reasons, where the job
would just leak because it had already been removed from the pending list.

Can't we reinsert it before we restart the scheduler thread? It might need
a separate list for that due to the lockless queue tricks. Or am I
thinking about the wrong kind of "we lost the job"?
-Daniel



If you look at the original patch, it would reinsert the job even earlier,
right after stopping the SW scheduler thread, and even then it was too late
for some drivers, as they would decide to return from their TDR handler
even before that. It is solvable, but in an ugly way as far as I can see:
you need to require each driver, in its own code, to put the job back on
the list if it bails out before reaching the place where the scheduler
framework does it. That seems like spaghetti code to me.

Andrey





Andrey





Andrey



   spin_unlock(&sched->job_list_lock);

   status = job->sched->ops->timedout_job(job);
@@ -342,6 +342,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
   job->sched->ops->free_job(job);
   sched->free_guilty = false;
   }
+   dma_fence_put(fence);
   } else {
   spin_unlock(&sched->job_list_lock);
   }
@@ -392,20 +393,6 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, 
struct drm_sche

Re: [PATCH v2] Revert "drm/scheduler: Avoid accessing freed bad job."

2021-08-19 Thread Andrey Grodzovsky



On 2021-08-19 5:30 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:51:00AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:42 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:36:32AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:32 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:26:25AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:02 a.m., Alex Deucher wrote:


+ dri-devel

Since scheduler is a shared component, please add dri-devel on all
scheduler patches.

On Wed, Aug 18, 2021 at 7:21 AM Jingwen Chen  wrote:

[Why]
For a bailing job, this commit deletes it from the pending list, so the
bailing job never gets a chance to be resubmitted, even in advance
TDR mode.

[How]
After embedding the hw_fence into amdgpu_job, the race condition this
commit tried to work around is completely solved, so revert it.
This reverts commit 135517d3565b48f4def3b1b82008bc17eb5d1c90.
v2:
add dma_fence_get/put() around timedout_job to avoid a concurrent delete
while processing timedout_job

Signed-off-by: Jingwen Chen 
---
 drivers/gpu/drm/scheduler/sched_main.c | 23 +--
 1 file changed, 5 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index a2a953693b45..f9b9b3aefc4a 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -314,6 +314,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
 {
struct drm_gpu_scheduler *sched;
struct drm_sched_job *job;
+   struct dma_fence *fence;
enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL;

sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
@@ -325,11 +326,10 @@ static void drm_sched_job_timedout(struct work_struct 
*work)

if (job) {
/*
-* Remove the bad job so it cannot be freed by concurrent
-* drm_sched_cleanup_jobs. It will be reinserted back after 
sched->thread
-* is parked at which point it's safe.
+* Get job->s_fence->parent here to avoid concurrent delete 
during
+* processing timedout_job
 */
-   list_del_init(&job->list);
+   fence = dma_fence_get(job->s_fence->parent);

While this is true for amdgpu, it has no meaning for other drivers for whom
we haven't
done the refactoring of embedding HW fence (parent) into the job structure.
In fact thinking
about it, unless you do the HW fence embedding for all the drivers using the
scheduler you cannot
revert this patch or you will just break them.

btw, why did you do that embedding? I do still have my patches with
dma_fence annotations floating around, but my idea at least was to fix
that issue with a mempool, not with embedding. What was the motivation
for embedding the HW fence?
-Daniel

The motivation was twofold. First, to avoid memory allocation during job
submission (the HW fence allocation), because as Christian explained this
leads to a deadlock with mm code during evictions under memory pressure
(Christian can clarify if I garbled

Yeah that's the exact same thing I've chased with my dma_fence
annotations, but thus far there has been zero to no interest in getting it sorted. I
think it'd be good to have some cross-driver agreement on how this should
be solved before someone just charges ahead ...


this explanation). Second, exactly to revert this patch: while it solved
the issue described in the patch, it created another one with drivers that
bailed out early during TDR handling for various reasons, where the job
would just leak because it had already been removed from the pending list.

Can't we reinsert it before we restart the scheduler thread? It might need
a separate list for that due to the lockless queue tricks. Or am I
thinking about the wrong kind of "we lost the job"?
-Daniel


If you look at the original patch, it would reinsert the job even earlier,
right after stopping the SW scheduler thread, and even then it was too late
for some drivers, as they would decide to return from their TDR handler
even before that. It is solvable, but in an ugly way as far as I can see:
you need to require each driver, in its own code, to put the job back on
the list if it bails out before reaching the place where the scheduler
framework does it. That seems like spaghetti code to me.

Hm yeah I didn't realize this all happens before we stop the scheduler
thread.

Why can't we stop the scheduler thread first, so that there's guaranteed
no race? I've recently had a lot of discussions with panfrost folks about
their reset that spans across engines, and without stopping the scheduler
thread first before you touch anything it's just plain impossible.



Talked with Christian on that, for each TDR we actually stop all the
schedulers for all the rings and n

Re: [PATCH v2] Revert "drm/scheduler: Avoid accessing freed bad job."

2021-08-20 Thread Andrey Grodzovsky
&bad->list, &sched->pending_list);
-
-   /*
   * Iterate the job list from later to  earlier one and either deactive
   * their HW callbacks or remove them from pending list if they already
   * signaled.


Thanks

--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Daniel Vetter 
Sent: Thursday, August 19, 2021 5:31 PM
To: Grodzovsky, Andrey 
Cc: Daniel Vetter ; Alex Deucher ; Chen, JingWen 
; Maling list - DRI developers ; amd-gfx list 
; Liu, Monk ; Koenig, Christian 

Subject: Re: [PATCH v2] Revert "drm/scheduler: Avoid accessing freed bad job."

On Wed, Aug 18, 2021 at 10:51:00AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:42 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:36:32AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:32 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:26:25AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:02 a.m., Alex Deucher wrote:


+ dri-devel

Since scheduler is a shared component, please add dri-devel
on all scheduler patches.

On Wed, Aug 18, 2021 at 7:21 AM Jingwen Chen  wrote:

[Why]
For a bailing job, this commit deletes it from the pending
list, so the bailing job never gets a chance to be
resubmitted, even in advance TDR mode.

[How]
After embedding the hw_fence into amdgpu_job, the race
condition this commit tried to work around is
completely solved, so revert it.
This reverts commit 135517d3565b48f4def3b1b82008bc17eb5d1c90.
v2:
add dma_fence_get/put() around timedout_job to avoid a
concurrent delete while processing timedout_job

Signed-off-by: Jingwen Chen 
---
  drivers/gpu/drm/scheduler/sched_main.c | 23 +--
  1 file changed, 5 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c
b/drivers/gpu/drm/scheduler/sched_main.c
index a2a953693b45..f9b9b3aefc4a 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -314,6 +314,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
  {
 struct drm_gpu_scheduler *sched;
 struct drm_sched_job *job;
+   struct dma_fence *fence;
 enum drm_gpu_sched_stat status =
DRM_GPU_SCHED_STAT_NOMINAL;

 sched = container_of(work, struct
drm_gpu_scheduler, work_tdr.work); @@ -325,11 +326,10 @@
static void drm_sched_job_timedout(struct work_struct
*work)

 if (job) {
 /*
-* Remove the bad job so it cannot be freed by concurrent
-* drm_sched_cleanup_jobs. It will be reinserted back after 
sched->thread
-* is parked at which point it's safe.
+* Get job->s_fence->parent here to avoid concurrent delete 
during
+* processing timedout_job
  */
-   list_del_init(&job->list);
+   fence =
+ dma_fence_get(job->s_fence->parent);

While this is true for amdgpu, it has no meaning for other
drivers for whom we haven't done the refactoring of embedding
HW fence (parent) into the job structure.
In fact thinking
about it, unless you do the HW fence embedding for all the
drivers using the scheduler you cannot revert this patch or
you will just break them.

btw, why did you do that embedding? I do still have my patches
with dma_fence annotations floating around, but my idea at least
was to fix that issue with a mempool, not with embedding. What
was the motivation for embedding the HW fence?
-Daniel

The motivation was twofold. First, to avoid memory allocation during
job submission (the HW fence allocation), because as Christian explained
this leads to a deadlock with mm code during evictions under memory
pressure (Christian can clarify if I garbled

Yeah that's the exact same thing I've chased with my dma_fence
annotations, but thus far there has been zero to no interest in getting it
sorted. I think it'd be good to have some cross-driver agreement on
how this should be solved before someone just charges ahead ...


this explanation). Second, exactly to revert this patch: while it
solved the issue described in the patch, it created another one with
drivers that bailed out early during TDR handling for various reasons,
where the job would just leak because it had already been removed from
the pending list.

Can't we reinsert it before we restart the scheduler thread? It
might need a separate list for that due to the lockless queue
tricks. Or am I thinking about the wrong kind of "we lost the job"?
-Daniel

If you look at the original patch, it would reinsert the job even earlier,
right after stopping the SW scheduler thread, and even then it was too
late for some drivers, as they would decide to return from their
TDR handler even before that. It is solvable, but in an ugly way as far
as I can see: you need to require each drive

Re: [PATCH v2] Revert "drm/scheduler: Avoid accessing freed bad job."

2021-08-20 Thread Andrey Grodzovsky
ed.


Thanks

--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Daniel Vetter 
Sent: Thursday, August 19, 2021 5:31 PM
To: Grodzovsky, Andrey 
Cc: Daniel Vetter ; Alex Deucher ; Chen, JingWen 
; Maling list - DRI developers ; amd-gfx list 
; Liu, Monk ; Koenig, Christian 

Subject: Re: [PATCH v2] Revert "drm/scheduler: Avoid accessing freed bad job."

On Wed, Aug 18, 2021 at 10:51:00AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:42 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:36:32AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:32 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:26:25AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:02 a.m., Alex Deucher wrote:


+ dri-devel

Since scheduler is a shared component, please add dri-devel
on all scheduler patches.

On Wed, Aug 18, 2021 at 7:21 AM Jingwen Chen  wrote:

[Why]
For a bailing job, this commit deletes it from the pending
list, so the bailing job never gets a chance to be
resubmitted, even in advance TDR mode.

[How]
After embedding the hw_fence into amdgpu_job, the race
condition this commit tried to work around is
completely solved, so revert it.
This reverts commit 135517d3565b48f4def3b1b82008bc17eb5d1c90.
v2:
add dma_fence_get/put() around timedout_job to avoid a
concurrent delete while processing timedout_job

Signed-off-by: Jingwen Chen 
---
 drivers/gpu/drm/scheduler/sched_main.c | 23 +--
 1 file changed, 5 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c
b/drivers/gpu/drm/scheduler/sched_main.c
index a2a953693b45..f9b9b3aefc4a 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -314,6 +314,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
 {
struct drm_gpu_scheduler *sched;
struct drm_sched_job *job;
+   struct dma_fence *fence;
enum drm_gpu_sched_stat status =
DRM_GPU_SCHED_STAT_NOMINAL;

sched = container_of(work, struct
drm_gpu_scheduler, work_tdr.work); @@ -325,11 +326,10 @@
static void drm_sched_job_timedout(struct work_struct
*work)

if (job) {
/*
-* Remove the bad job so it cannot be freed by concurrent
-* drm_sched_cleanup_jobs. It will be reinserted back after 
sched->thread
-* is parked at which point it's safe.
+* Get job->s_fence->parent here to avoid concurrent delete 
during
+* processing timedout_job
 */
-   list_del_init(&job->list);
+   fence =
+ dma_fence_get(job->s_fence->parent);

While this is true for amdgpu, it has no meaning for other
drivers for whom we haven't done the refactoring of embedding
HW fence (parent) into the job structure.
In fact thinking
about it, unless you do the HW fence embedding for all the
drivers using the scheduler you cannot revert this patch or
you will just break them.

btw, why did you do that embedding? I do still have my patches
with dma_fence annotations floating around, but my idea at least
was to fix that issue with a mempool, not with embedding. What
was the motivation for embedding the HW fence?
-Daniel

The motivation was twofold. First, to avoid memory allocation during
job submission (the HW fence allocation), because as Christian explained
this leads to a deadlock with mm code during evictions under memory
pressure (Christian can clarify if I garbled

Yeah that's the exact same thing I've chased with my dma_fence
annotations, but thus far there has been zero to no interest in getting it
sorted. I think it'd be good to have some cross-driver agreement on
how this should be solved before someone just charges ahead ...


this explanation). Second, exactly to revert this patch: while it
solved the issue described in the patch, it created another one with
drivers that bailed out early during TDR handling for various reasons,
where the job would just leak because it had already been removed from
the pending list.

Can't we reinsert it before we restart the scheduler thread? It
might need a separate list for that due to the lockless queue
tricks. Or am I thinking about the wrong kind of "we lost the job"?
-Daniel


If you look at the original patch, it would reinsert the job even earlier,
right after stopping the SW scheduler thread, and even then it was too
late for some drivers, as they would decide to return from their
TDR handler even before that. It is solvable, but in an ugly way as far
as I can see: you need to require each driver, in its own code, to put the
job back on the list if it bails out before reaching the place where the
scheduler framework does it. That seems like spaghetti code to me.

Hm yeah I didn't realize this all happens before we stop the scheduler thread.

Why can

Re: [PATCH] drm/amdgpu: avoid over-handle of fence driver fini in s3 test (v2)

2021-08-23 Thread Andrey Grodzovsky



On 2021-08-23 2:50 a.m., Christian König wrote:

Good morning, guys,

Andrey has a rather valid concern here, but I think we need to 
approach this from a more high level view.


When hw_fini is called we should make sure that the scheduler can't 
submit any more work to the hardware, because the hw is finalized and 
not expected to respond any more.


As far as I can see the cleanest approach would be to stop the 
scheduler in hw_fini and fully clean it up in sw_fini. That would also 
fit quite nicely with how GPU reset is supposed to work I think.


Problem is that this is currently done outside of the fence code for 
the at least the reset case, so before we restructure that we need to 
stick with what we have.


Andrey do you think it would be any problem if we stop the scheduler 
manually in the hot plug case as well?



As long as it's 'parked' inside hw_fini, meaning the thread submitting
to the HW is done, I think it should cover hot unplug as well; something
like the sketch below.
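A minimal sketch of that split (assumed shape only; the loop mirrors the
driver's existing fini code, and the exact placement is what is being
discussed here):

void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
{
	int i;

	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->fence_drv.initialized)
			continue;
		/* stop feeding the HW: park the scheduler thread */
		if (!ring->no_scheduler)
			kthread_park(ring->sched.thread);
	}
}

void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev)
{
	int i;

	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->fence_drv.initialized)
			continue;
		/* full teardown only once nothing can submit any more */
		if (!ring->no_scheduler)
			drm_sched_fini(&ring->sched);
	}
}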


Andrey




Thanks,
Christian.

Am 23.08.21 um 08:36 schrieb Chen, Guchun:

[Public]

Hi Andrey,

Thanks for your notice. The reason for moving drm_sched_fini to
sw_fini is that it is SW behavior and part of SW shutdown, so hw_fini
should not touch it. But if the race is there, i.e. the scheduler on the
ring keeps submitting jobs and leaves the ring non-empty, we may still
need to call drm_sched_fini first, in hw_fini, to stop job submission.


@Koenig, Christian what's your opinion?

Regards,
Guchun

-Original Message-
From: Alex Deucher 
Sent: Friday, August 20, 2021 2:13 AM
To: Mike Lothian 
Cc: Grodzovsky, Andrey ; Chen, Guchun 
; amd-gfx list ; 
Gao, Likun ; Koenig, Christian 
; Zhang, Hawking ; 
Deucher, Alexander 
Subject: Re: [PATCH] drm/amdgpu: avoid over-handle of fence driver 
fini in s3 test (v2)


Please go ahead.  Thanks!

Alex

On Thu, Aug 19, 2021 at 8:05 AM Mike Lothian  
wrote:

Hi

Do I need to open a new bug report for this?

Cheers

Mike

On Wed, 18 Aug 2021 at 06:26, Andrey Grodzovsky 
 wrote:


On 2021-08-02 1:16 a.m., Guchun Chen wrote:

In amdgpu_fence_driver_hw_fini, there is no need to call drm_sched_fini
to stop the scheduler in the S3 test; otherwise, fence-related failures
will show up after resume. To fix this, and for a cleaner teardown, move
drm_sched_fini from fence_hw_fini to fence_sw_fini, as it is part of
driver shutdown and should never be called in hw_fini.

v2: rename amdgpu_fence_driver_init to amdgpu_fence_driver_sw_init,
to keep sw_init and sw_fini paired.

Fixes: cd87a6dcf6af ("drm/amdgpu: adjust fence driver enable sequence")
Suggested-by: Christian König 
Signed-off-by: Guchun Chen 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 ++---
   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 12 +++-
   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  4 ++--
   3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b1d2dc39e8be..9e53ff851496 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3646,9 +3646,9 @@ int amdgpu_device_init(struct amdgpu_device
*adev,

   fence_driver_init:
   /* Fence driver */
- r = amdgpu_fence_driver_init(adev);
+ r = amdgpu_fence_driver_sw_init(adev);
   if (r) {
- dev_err(adev->dev, "amdgpu_fence_driver_init 
failed\n");

+ dev_err(adev->dev, "amdgpu_fence_driver_sw_init
+ failed\n");
   amdgpu_vf_error_put(adev, 
AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);

   goto failed;
   }
@@ -3988,7 +3988,6 @@ int amdgpu_device_resume(struct drm_device 
*dev, bool fbcon)

   }
   amdgpu_fence_driver_hw_init(adev);

-
   r = amdgpu_device_ip_late_init(adev);
   if (r)
   return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 49c5c7331c53..7495911516c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -498,7 +498,7 @@ int amdgpu_fence_driver_init_ring(struct 
amdgpu_ring *ring,

   }

   /**
- * amdgpu_fence_driver_init - init the fence driver
+ * amdgpu_fence_driver_sw_init - init the fence driver
    * for all possible rings.
    *
    * @adev: amdgpu device pointer
@@ -509,13 +509,13 @@ int amdgpu_fence_driver_init_ring(struct 
amdgpu_ring *ring,

    * amdgpu_fence_driver_start_ring().
    * Returns 0 for success.
    */
-int amdgpu_fence_driver_init(struct amdgpu_device *adev)
+int amdgpu_fence_driver_sw_init(struct amdgpu_device *adev)
   {
   return 0;
   }

   /**
- * amdgpu_fence_driver_fini - tear down the fence driver
+ * amdgpu_fence_driver_hw_fini - tear down the fence driver
    * for all possible rings.
    *
    * @adev: amdgpu device pointer
@@ -531,8 +531,7 @@ void amdgpu_fence_driver_hw_fini(struct
amdgpu_device *ad

Re: [PATCH v2] Revert "drm/scheduler: Avoid accessing freed bad job."

2021-08-24 Thread Andrey Grodzovsky
so, when the TO handler
does eventually execute, it's not because something went wrong but simply
because the TO has expired. If in this case the pending list is not empty,
a false TDR will be triggered. I think long ago we used a TO handler per
job and not per scheduler; this would solve this problem but hurt the
serialization issue we are trying to solve. So I'm not sure what to do.

[1] -
https://elixir.bootlin.com/linux/v5.14-rc1/source/kernel/workqueue.c#L1665

Andrey


Thanks

--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Liu, Monk
Sent: Thursday, August 19, 2021 6:26 PM
To: Daniel Vetter ; Grodzovsky, Andrey

Cc: Alex Deucher ; Chen, JingWen
; Maling list - DRI developers
; amd-gfx list
; Koenig, Christian

Subject: RE: [PATCH v2] Revert "drm/scheduler: Avoid accessing freed bad job."

[AMD Official Use Only]

Hi Daniel


Why can't we stop the scheduler thread first, so that there's guaranteed no 
race? I've recently had a lot of discussions with panfrost folks about their 
reset that spans across engines, and without stopping the scheduler thread
first before you touch anything it's just plain impossible.

Yeah, we had this thought in our minds as well.

Our second approach is to call kthread_stop() in the job_timedout() routine so that the
"bad" job is guaranteed to be usable without the scheduler touching or freeing it.
Please check this sample patch as well:

diff --git a/drivers/gpu/drm/scheduler/sched_main.c
b/drivers/gpu/drm/scheduler/sched_main.c
index a2a9536..50a49cb 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -319,17 +319,12 @@ static void drm_sched_job_timedout(struct work_struct 
*work)
  sched = container_of(work, struct drm_gpu_scheduler,
work_tdr.work);
   
  /* Protects against concurrent deletion in

drm_sched_get_cleanup_job */
+   kthread_park(sched->thread);
  spin_lock(&sched->job_list_lock);
  job = list_first_entry_or_null(&sched->pending_list,
 struct drm_sched_job, list);
   
  if (job) {

-   /*
-* Remove the bad job so it cannot be freed by concurrent
-* drm_sched_cleanup_jobs. It will be reinserted back after 
sched->thread
-* is parked at which point it's safe.
-*/
-   list_del_init(&job->list);
  spin_unlock(&sched->job_list_lock);
   
  status = job->sched->ops->timedout_job(job);

@@ -345,6 +340,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
  } else {
  spin_unlock(&sched->job_list_lock);
  }
+   kthread_unpark(sched->thread);
   
  if (status != DRM_GPU_SCHED_STAT_ENODEV) {

  spin_lock(&sched->job_list_lock); @@ -393,20 +389,6 @@ void 
drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
  kthread_park(sched->thread);
   
  /*

-* Reinsert back the bad job here - now it's safe as
-* drm_sched_get_cleanup_job cannot race against us and release the
-* bad job at this point - we parked (waited for) any in progress
-* (earlier) cleanups and drm_sched_get_cleanup_job will not be called
-* now until the scheduler thread is unparked.
-*/
-   if (bad && bad->sched == sched)
-   /*
-* Add at the head of the queue to reflect it was the earliest
-* job extracted.
-*/
-   list_add(&bad->list, &sched->pending_list);
-
-   /*
   * Iterate the job list from later to  earlier one and either deactive
   * their HW callbacks or remove them from pending list if they already
   * signaled.


Thanks

--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Daniel Vetter 
Sent: Thursday, August 19, 2021 5:31 PM
To: Grodzovsky, Andrey 
Cc: Daniel Vetter ; Alex Deucher
; Chen, JingWen ; Maling
list - DRI developers ; amd-gfx list
; Liu, Monk ; Koenig,
Christian 
Subject: Re: [PATCH v2] Revert "drm/scheduler: Avoid accessing freed bad job."

On Wed, Aug 18, 2021 at 10:51:00AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:42 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:36:32AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:32 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:26:25AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:02 a.m., Alex Deucher wrote:


+ dri-devel

Since scheduler is a shared component, please add dri-devel on
all scheduler patches.

On Wed, Aug 18, 2021 at 7:21 AM Jingwen Chen  wrote:

[Why]

Re: [PATCH] drm/sched: fix the bug of time out calculation

2021-08-24 Thread Andrey Grodzovsky



On 2021-08-24 10:46 a.m., Andrey Grodzovsky wrote:


On 2021-08-24 5:51 a.m., Monk Liu wrote:

the original logic is wrong in that the timeout will not be retriggered
after the previous job signaled, and that leads to the scenario that all
jobs in the same scheduler share the same timeout timer from the very
beginning job in this scheduler, which is wrong.

we should modify the timer every time a previous job signals.

Signed-off-by: Monk Liu 
---
  drivers/gpu/drm/scheduler/sched_main.c | 12 
  1 file changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c

index a2a9536..fb27025 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -235,6 +235,13 @@ static void drm_sched_start_timeout(struct 
drm_gpu_scheduler *sched)

  schedule_delayed_work(&sched->work_tdr, sched->timeout);
  }
  +static void drm_sched_restart_timeout(struct drm_gpu_scheduler 
*sched)

+{
+    if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
+    !list_empty(&sched->pending_list))
+    mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout);



3rd point - if the list is empty you need to cancel the timer and let the new job 
coming after that restart it.


Andrey



+}
+
  /**
   * drm_sched_fault - immediately start timeout handler
   *
@@ -693,6 +700,11 @@ drm_sched_get_cleanup_job(struct 
drm_gpu_scheduler *sched)

  if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
  /* remove job from pending_list */
  list_del_init(&job->list);
+
+    /* once the job deleted from pending list we should restart
+ * the timeout calculation for the next job.
+ */
+    drm_sched_restart_timeout(sched);



I think this should work, but 2 points -

1st - you should probably remove this now: 
https://elixir.bootlin.com/linux/v5.14-rc1/source/drivers/gpu/drm/scheduler/sched_main.c#L797


2nd - if you have two adjacent jobs started very close together you are 
effectively letting the second job hang for up to twice as long without TDR, 
because you reset the TDR timer for it when it's almost expired. If we could 
have a TTL (time to live) counter for each job and then do mod_delayed_work 
to the TTL of the following job instead of just a full timer reset, this 
would be more precise. But this is more of a recommendation for improvement.
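
A minimal sketch of that TTL idea, assuming a per-job timestamp (ts, in
jiffies) recorded when the job enters the pending list (as in the patch
attached later in this archive); illustrative only, not the actual
scheduler code:

#include <drm/gpu_scheduler.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

/*
 * Sketch: re-arm the TDR timer with the time the next job actually has
 * left instead of a full sched->timeout. next->ts is assumed to have
 * been set when the job was added to the pending list.
 */
static void drm_sched_rearm_timeout_ttl(struct drm_gpu_scheduler *sched,
					struct drm_sched_job *next)
{
	u64 elapsed = get_jiffies_64() - next->ts;
	unsigned long remaining = 0;

	if (elapsed < sched->timeout)
		remaining = sched->timeout - elapsed;

	mod_delayed_work(system_wq, &sched->work_tdr, remaining);
}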


Andrey



  /* make the scheduled timestamp more accurate */
  next = list_first_entry_or_null(&sched->pending_list,
  typeof(*next), list);


Re: [PATCH] drm/sched: fix the bug of time out calculation

2021-08-24 Thread Andrey Grodzovsky



On 2021-08-24 5:51 a.m., Monk Liu wrote:

the original logic is wrong in that the timeout will not be retriggered
after the previous job signaled, and that leads to the scenario that all
jobs in the same scheduler share the same timeout timer from the very
beginning job in this scheduler, which is wrong.

we should modify the timer every time a previous job signals.

Signed-off-by: Monk Liu 
---
  drivers/gpu/drm/scheduler/sched_main.c | 12 
  1 file changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index a2a9536..fb27025 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -235,6 +235,13 @@ static void drm_sched_start_timeout(struct 
drm_gpu_scheduler *sched)
schedule_delayed_work(&sched->work_tdr, sched->timeout);
  }
  
+static void drm_sched_restart_timeout(struct drm_gpu_scheduler *sched)

+{
+   if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
+   !list_empty(&sched->pending_list))
+   mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout);
+}
+
  /**
   * drm_sched_fault - immediately start timeout handler
   *
@@ -693,6 +700,11 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
/* remove job from pending_list */
list_del_init(&job->list);
+
+   /* once the job deleted from pending list we should restart
+* the timeout calculation for the next job.
+*/
+   drm_sched_restart_timeout(sched);



I think this should work, but 2 points -

1st - you should probably remove this now: 
https://elixir.bootlin.com/linux/v5.14-rc1/source/drivers/gpu/drm/scheduler/sched_main.c#L797


2nd - if you have two adjacent jobs started very close together you are 
effectively letting the second job hang for up to twice as long without TDR, 
because you reset the TDR timer for it when it's almost expired. If we could 
have a TTL (time to live) counter for each job and then do mod_delayed_work 
to the TTL of the following job instead of just a full timer reset, this 
would be more precise. But this is more of a recommendation for improvement.


Andrey



/* make the scheduled timestamp more accurate */
next = list_first_entry_or_null(&sched->pending_list,
typeof(*next), list);


[PATCH 0/4] Various fixes to pass libdrm hotunplug tests

2021-08-24 Thread Andrey Grodzovsky
Bunch of fixes to enable passing the hotplug tests I previously added
here[1] with the latest code. 
Once accepted I will enable the tests on libdrm side.

[1] - https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/172

Andrey Grodzovsky (4):
  drm/amdgpu: Move flush VCE idle_work during HW fini
  drm/ttm: Create pinned list
  drm/amdgpu: drm/amdgpu: Handle IOMMU enabled case
  drm/amdgpu: Add a UAPI flag for hot plug/unplug

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c| 50 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h|  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c|  1 -
 drivers/gpu/drm/amd/amdgpu/vce_v2_0.c  |  4 ++
 drivers/gpu/drm/amd/amdgpu/vce_v3_0.c  |  5 ++-
 drivers/gpu/drm/amd/amdgpu/vce_v4_0.c  |  2 +
 drivers/gpu/drm/ttm/ttm_bo.c   | 24 +--
 drivers/gpu/drm/ttm/ttm_resource.c |  1 +
 include/drm/ttm/ttm_resource.h |  1 +
 11 files changed, 88 insertions(+), 6 deletions(-)

-- 
2.25.1



[PATCH 2/4] drm/ttm: Create pinned list

2021-08-24 Thread Andrey Grodzovsky
This list will be used to capture all non-VRAM BOs not
on the LRU so when the device is hot unplugged we can iterate
the list and unmap their DMA mappings before the device is removed.

Signed-off-by: Andrey Grodzovsky 
Suggested-by: Christian König 
---
 drivers/gpu/drm/ttm/ttm_bo.c   | 24 +---
 drivers/gpu/drm/ttm/ttm_resource.c |  1 +
 include/drm/ttm/ttm_resource.h |  1 +
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 1b950b45cf4b..84ba76ace58f 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -69,16 +69,34 @@ static void ttm_bo_mem_space_debug(struct ttm_buffer_object 
*bo,
}
 }
 
-static void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
+static void ttm_bo_del_from_lru_imp(struct ttm_buffer_object *bo, bool final)
 {
struct ttm_device *bdev = bo->bdev;
+   struct ttm_resource_manager *man = NULL;
 
-   list_del_init(&bo->lru);
+   if (bo->resource)
+   man = ttm_manager_type(bdev, bo->resource->mem_type);
+
+
+   if (!final && man && man->use_tt)
+   list_move_tail(&bo->lru, &man->pinned);
+   else
+   list_del_init(&bo->lru);
 
if (bdev->funcs->del_from_lru_notify)
bdev->funcs->del_from_lru_notify(bo);
 }
 
+static inline void ttm_bo_del_from_lru_final(struct ttm_buffer_object *bo)
+{
+   ttm_bo_del_from_lru_imp(bo, true);
+}
+
+static inline void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
+{
+   ttm_bo_del_from_lru_imp(bo, false);
+}
+
 static void ttm_bo_bulk_move_set_pos(struct ttm_lru_bulk_move_pos *pos,
 struct ttm_buffer_object *bo)
 {
@@ -453,7 +471,7 @@ static void ttm_bo_release(struct kref *kref)
}
 
spin_lock(&bo->bdev->lru_lock);
-   ttm_bo_del_from_lru(bo);
+   ttm_bo_del_from_lru_final(bo);
list_del(&bo->ddestroy);
spin_unlock(&bo->bdev->lru_lock);
 
diff --git a/drivers/gpu/drm/ttm/ttm_resource.c 
b/drivers/gpu/drm/ttm/ttm_resource.c
index 2431717376e7..91165f77fe0e 100644
--- a/drivers/gpu/drm/ttm/ttm_resource.c
+++ b/drivers/gpu/drm/ttm/ttm_resource.c
@@ -85,6 +85,7 @@ void ttm_resource_manager_init(struct ttm_resource_manager 
*man,
 
for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i)
INIT_LIST_HEAD(&man->lru[i]);
+   INIT_LIST_HEAD(&man->pinned);
man->move = NULL;
 }
 EXPORT_SYMBOL(ttm_resource_manager_init);
diff --git a/include/drm/ttm/ttm_resource.h b/include/drm/ttm/ttm_resource.h
index 140b6b9a8bbe..1ec0d5ebb59f 100644
--- a/include/drm/ttm/ttm_resource.h
+++ b/include/drm/ttm/ttm_resource.h
@@ -130,6 +130,7 @@ struct ttm_resource_manager {
 */
 
struct list_head lru[TTM_MAX_BO_PRIORITY];
+   struct list_head pinned;
 
/*
 * Protected by @move_lock.
-- 
2.25.1



[PATCH 1/4] drm/amdgpu: Move flush VCE idle_work during HW fini

2021-08-24 Thread Andrey Grodzovsky
Attempts to powergate after the device is removed lead to a crash.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 1 -
 drivers/gpu/drm/amd/amdgpu/vce_v2_0.c   | 4 
 drivers/gpu/drm/amd/amdgpu/vce_v3_0.c   | 5 -
 drivers/gpu/drm/amd/amdgpu/vce_v4_0.c   | 2 ++
 4 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
index 1ae7f824adc7..8e8dee9fac9f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
@@ -218,7 +218,6 @@ int amdgpu_vce_sw_fini(struct amdgpu_device *adev)
if (adev->vce.vcpu_bo == NULL)
return 0;
 
-   cancel_delayed_work_sync(&adev->vce.idle_work);
drm_sched_entity_destroy(&adev->vce.entity);
 
amdgpu_bo_free_kernel(&adev->vce.vcpu_bo, &adev->vce.gpu_addr,
diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c 
b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
index c7d28c169be5..716dfdd020b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
@@ -477,6 +477,10 @@ static int vce_v2_0_hw_init(void *handle)
 
 static int vce_v2_0_hw_fini(void *handle)
 {
+   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
+   cancel_delayed_work_sync(&adev->vce.idle_work);
+
return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c 
b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
index 3b82fb289ef6..49581c6e0cea 100644
--- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
@@ -495,7 +495,10 @@ static int vce_v3_0_hw_fini(void *handle)
return r;
 
vce_v3_0_stop(adev);
-   return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
+   r =  vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
+   cancel_delayed_work_sync(&adev->vce.idle_work);
+
+   return r;
 }
 
 static int vce_v3_0_suspend(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
index 90910d19db12..3297405fd32d 100644
--- a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
@@ -550,6 +550,8 @@ static int vce_v4_0_hw_fini(void *handle)
DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
}
 
+   cancel_delayed_work_sync(&adev->vce.idle_work);
+
return 0;
 }
 
-- 
2.25.1



[PATCH 3/4] drm/amdgpu: drm/amdgpu: Handle IOMMU enabled case

2021-08-24 Thread Andrey Grodzovsky
Handle all DMA IOMMU group related dependencies before the
group is removed, since otherwise we try to access it after free.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c| 50 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h|  1 +
 3 files changed, 53 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0b5764aa98a4..288a465b8101 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3860,6 +3860,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 
amdgpu_device_ip_fini_early(adev);
 
+   amdgpu_ttm_clear_dma_mappings(adev);
+
amdgpu_gart_dummy_page_fini(adev);
 
amdgpu_device_unmap_mmio(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 446943e32e3e..f73d807db3b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -64,6 +64,7 @@
 static int amdgpu_ttm_backend_bind(struct ttm_device *bdev,
   struct ttm_tt *ttm,
   struct ttm_resource *bo_mem);
+
 static void amdgpu_ttm_backend_unbind(struct ttm_device *bdev,
  struct ttm_tt *ttm);
 
@@ -2293,6 +2294,55 @@ static ssize_t amdgpu_iomem_write(struct file *f, const 
char __user *buf,
return result;
 }
 
+void amdgpu_ttm_clear_dma_mappings(struct amdgpu_device *adev)
+{
+   struct ttm_device *bdev = &adev->mman.bdev;
+   struct ttm_resource_manager *man;
+   struct ttm_buffer_object *bo;
+   unsigned int i, j;
+
+   spin_lock(&bdev->lru_lock);
+   for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
+   man = ttm_manager_type(bdev, i);
+   if (!man || !man->use_tt)
+   continue;
+
+   while (!list_empty(&man->pinned)) {
+   bo = list_first_entry(&man->pinned, struct 
ttm_buffer_object, lru);
+   /* Take ref against racing releases once lru_lock is 
unlocked */
+   ttm_bo_get(bo);
+   list_del_init(&bo->lru);
+   spin_unlock(&bdev->lru_lock);
+
+   if (bo->ttm) {
+   amdgpu_ttm_backend_unbind(bo->bdev, bo->ttm);
+   ttm_tt_destroy_common(bo->bdev, bo->ttm);
+   }
+
+   ttm_bo_put(bo);
+   spin_lock(&bdev->lru_lock);
+   }
+
+   for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
+   while (!list_empty(&man->lru[j])) {
+   bo = list_first_entry(&man->lru[j], struct 
ttm_buffer_object, lru);
+   ttm_bo_get(bo);
+   list_del_init(&bo->lru);
+   spin_unlock(&bdev->lru_lock);
+
+   if (bo->ttm) {
+   amdgpu_ttm_backend_unbind(bo->bdev, 
bo->ttm);
+   ttm_tt_destroy_common(bo->bdev, 
bo->ttm);
+   }
+   ttm_bo_put(bo);
+   spin_lock(&bdev->lru_lock);
+   }
+   }
+   }
+   spin_unlock(&bdev->lru_lock);
+
+}
+
 static const struct file_operations amdgpu_ttm_iomem_fops = {
.owner = THIS_MODULE,
.read = amdgpu_iomem_read,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index e69f3e8e06e5..02c8eac48a64 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -190,6 +190,7 @@ bool amdgpu_ttm_tt_is_readonly(struct ttm_tt *ttm);
 uint64_t amdgpu_ttm_tt_pde_flags(struct ttm_tt *ttm, struct ttm_resource *mem);
 uint64_t amdgpu_ttm_tt_pte_flags(struct amdgpu_device *adev, struct ttm_tt 
*ttm,
 struct ttm_resource *mem);
+void amdgpu_ttm_clear_dma_mappings(struct amdgpu_device *adev);
 
 void amdgpu_ttm_debugfs_init(struct amdgpu_device *adev);
 
-- 
2.25.1



[PATCH 4/4] drm/amdgpu: Add a UAPI flag for hot plug/unplug

2021-08-24 Thread Andrey Grodzovsky
To support libdrm tests.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 6400259a7c4b..c2fdf67ff551 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -96,9 +96,10 @@
  * - 3.40.0 - Add AMDGPU_IDS_FLAGS_TMZ
  * - 3.41.0 - Add video codec query
  * - 3.42.0 - Add 16bpc fixed point display support
+ * - 3.43.0 - Add device hot plug/unplug support
  */
 #define KMS_DRIVER_MAJOR   3
-#define KMS_DRIVER_MINOR   42
+#define KMS_DRIVER_MINOR   43
 #define KMS_DRIVER_PATCHLEVEL  0
 
 int amdgpu_vram_limit;
-- 
2.25.1



Re: [PATCH 1/4] drm/amdgpu: Move flush VCE idle_work during HW fini

2021-08-24 Thread Andrey Grodzovsky
Right, they will cover my use case. When are they landing? I rebased 
today and haven't seen them.


Andrey

On 2021-08-24 9:41 p.m., Quan, Evan wrote:

[AMD Official Use Only]

Hi Andrey,

I sent out a similar patch set to address S3 issue. And I believe it should be 
able to address the issue here too.
https://lists.freedesktop.org/archives/amd-gfx/2021-August/067972.html
https://lists.freedesktop.org/archives/amd-gfx/2021-August/067967.html

BR
Evan

-Original Message-
From: amd-gfx  On Behalf Of
Andrey Grodzovsky
Sent: Wednesday, August 25, 2021 5:01 AM
To: dri-de...@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: ckoenig.leichtzumer...@gmail.com; Grodzovsky, Andrey

Subject: [PATCH 1/4] drm/amdgpu: Move flush VCE idle_work during HW fini

Attempts to powergate after the device is removed lead to a crash.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 1 -
  drivers/gpu/drm/amd/amdgpu/vce_v2_0.c   | 4 
  drivers/gpu/drm/amd/amdgpu/vce_v3_0.c   | 5 -
  drivers/gpu/drm/amd/amdgpu/vce_v4_0.c   | 2 ++
  4 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
index 1ae7f824adc7..8e8dee9fac9f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
@@ -218,7 +218,6 @@ int amdgpu_vce_sw_fini(struct amdgpu_device
*adev)
if (adev->vce.vcpu_bo == NULL)
return 0;

-   cancel_delayed_work_sync(&adev->vce.idle_work);
drm_sched_entity_destroy(&adev->vce.entity);

amdgpu_bo_free_kernel(&adev->vce.vcpu_bo, &adev-

vce.gpu_addr,

diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
index c7d28c169be5..716dfdd020b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
@@ -477,6 +477,10 @@ static int vce_v2_0_hw_init(void *handle)

  static int vce_v2_0_hw_fini(void *handle)
  {
+   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
+   cancel_delayed_work_sync(&adev->vce.idle_work);
+
return 0;
  }

diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
index 3b82fb289ef6..49581c6e0cea 100644
--- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
@@ -495,7 +495,10 @@ static int vce_v3_0_hw_fini(void *handle)
return r;

vce_v3_0_stop(adev);
-   return vce_v3_0_set_clockgating_state(adev,
AMD_CG_STATE_GATE);
+   r =  vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
+   cancel_delayed_work_sync(&adev->vce.idle_work);
+
+   return r;
  }

  static int vce_v3_0_suspend(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
index 90910d19db12..3297405fd32d 100644
--- a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
@@ -550,6 +550,8 @@ static int vce_v4_0_hw_fini(void *handle)
DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
}

+   cancel_delayed_work_sync(&adev->vce.idle_work);
+
return 0;
  }

--
2.25.1


Re: [PATCH 3/4] drm/amdgpu: drm/amdgpu: Handle IOMMU enabled case

2021-08-25 Thread Andrey Grodzovsky



On 2021-08-25 2:43 a.m., Christian König wrote:



Am 24.08.21 um 23:01 schrieb Andrey Grodzovsky:

Handle all DMA IOMMU group related dependencies before the
group is removed, since otherwise we try to access it after free.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c    | 50 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h    |  1 +
  3 files changed, 53 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 0b5764aa98a4..288a465b8101 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3860,6 +3860,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device 
*adev)

    amdgpu_device_ip_fini_early(adev);
  +    amdgpu_ttm_clear_dma_mappings(adev);
+
  amdgpu_gart_dummy_page_fini(adev);
    amdgpu_device_unmap_mmio(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c

index 446943e32e3e..f73d807db3b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -64,6 +64,7 @@
  static int amdgpu_ttm_backend_bind(struct ttm_device *bdev,
 struct ttm_tt *ttm,
 struct ttm_resource *bo_mem);
+
  static void amdgpu_ttm_backend_unbind(struct ttm_device *bdev,
    struct ttm_tt *ttm);
  @@ -2293,6 +2294,55 @@ static ssize_t amdgpu_iomem_write(struct 
file *f, const char __user *buf,

  return result;
  }
  +void amdgpu_ttm_clear_dma_mappings(struct amdgpu_device *adev)


I strongly think that this function should be part of TTM. Something 
like ttm_device_force_unpopulate.



Yes, this is something I also wanted, but see below





+{
+    struct ttm_device *bdev = &adev->mman.bdev;
+    struct ttm_resource_manager *man;
+    struct ttm_buffer_object *bo;
+    unsigned int i, j;
+
+    spin_lock(&bdev->lru_lock);
+    for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
+    man = ttm_manager_type(bdev, i);
+    if (!man || !man->use_tt)
+    continue;
+
+    while (!list_empty(&man->pinned)) {
+    bo = list_first_entry(&man->pinned, struct 
ttm_buffer_object, lru);
+    /* Take ref against racing releases once lru_lock is 
unlocked */

+    ttm_bo_get(bo);
+    list_del_init(&bo->lru);
+    spin_unlock(&bdev->lru_lock);
+
+    if (bo->ttm) {
+    amdgpu_ttm_backend_unbind(bo->bdev, bo->ttm);



amdgpu_ttm_backend_unbind needs to be called separately from 
ttm_tt_unpopulate to take care of code flows that do DMA mapping 
through the gart bind and not through ttm_tt_populate. Since it's 
inside amdgpu I had to place the entire function in amdgpu. Any 
suggestions?

Andrey



+ ttm_tt_destroy_common(bo->bdev, bo->ttm);


Then you can also cleanly use ttm_tt_unpopulate here, because this will 
result in incorrect statistics inside TTM atm.


Regards,
Christian.


+    }
+
+    ttm_bo_put(bo);
+    spin_lock(&bdev->lru_lock);
+    }
+
+    for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
+    while (!list_empty(&man->lru[j])) {
+    bo = list_first_entry(&man->lru[j], struct 
ttm_buffer_object, lru);

+    ttm_bo_get(bo);
+    list_del_init(&bo->lru);
+    spin_unlock(&bdev->lru_lock);
+
+    if (bo->ttm) {
+    amdgpu_ttm_backend_unbind(bo->bdev, bo->ttm);
+    ttm_tt_destroy_common(bo->bdev, bo->ttm);
+    }
+    ttm_bo_put(bo);
+    spin_lock(&bdev->lru_lock);
+    }
+    }
+    }
+    spin_unlock(&bdev->lru_lock);
+
+}
+
  static const struct file_operations amdgpu_ttm_iomem_fops = {
  .owner = THIS_MODULE,
  .read = amdgpu_iomem_read,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h

index e69f3e8e06e5..02c8eac48a64 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -190,6 +190,7 @@ bool amdgpu_ttm_tt_is_readonly(struct ttm_tt *ttm);
  uint64_t amdgpu_ttm_tt_pde_flags(struct ttm_tt *ttm, struct 
ttm_resource *mem);
  uint64_t amdgpu_ttm_tt_pte_flags(struct amdgpu_device *adev, struct 
ttm_tt *ttm,

   struct ttm_resource *mem);
+void amdgpu_ttm_clear_dma_mappings(struct amdgpu_device *adev);
    void amdgpu_ttm_debugfs_init(struct amdgpu_device *adev);




Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)

2021-08-25 Thread Andrey Grodzovsky



On 2021-08-25 8:11 a.m., Christian König wrote:

No, this would break that logic here.

See drm_sched_start_timeout() can be called multiple times, this is 
intentional and very important!


The logic in queue_delayed_work() makes sure that the timer is only 
started once and then never again.


All we need to take care of is to cancel_delayed_work() when we know 
that the job is completed.



Seems to me you can only do it for an empty pending list; otherwise you risk 
cancelling a legit new timer that was started by the next job, or not 
restarting the timer at all since your timer was still pending when the next 
job tried to start it again (the common case).
For a non-empty pending list you have to adjust the currently active TDR's 
timer from your job's TTL to the TTL of the next job after you, or just 
restart it as Monk does here, which prolongs the timeout more than required 
but is still OK I guess.
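
For reference, a condensed sketch of the semantics under discussion, based
on the drm_sched_start_timeout() shape visible in the diffs earlier in this
archive (a delayed work that is already pending is left untouched by
schedule_delayed_work(), while mod_delayed_work() re-arms it):

static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
{
	/*
	 * Because schedule_delayed_work() is a no-op when the work is
	 * already pending, calling this for every new job arms the TDR
	 * timer only once, for the current head of the pending list.
	 */
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    !list_empty(&sched->pending_list))
		schedule_delayed_work(&sched->work_tdr, sched->timeout);
}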


What about returning to the old scheme of a timer sched_work per job, so 
each job has its own timer, we don't share it, and everything is precise 
for each job? Using the locking scheme we already have today, the actual 
TDR handler will execute only once while all the others arising from the 
guilty job's hang will be rejected (for amdgpu; for other drivers it 
probably requires the same locking, or we can move this to the scheduler 
layer).
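
A rough sketch of what such a per-job scheme could look like; the type and
function names below are invented for illustration and are not the current
scheduler API:

#include <drm/gpu_scheduler.h>
#include <linux/workqueue.h>

/*
 * Hypothetical per-job TDR: each job carries its own delayed work
 * (set up with INIT_DELAYED_WORK() at job creation), so every job is
 * timed independently and no timer is shared across the pending list.
 */
struct my_sched_job {
	struct drm_sched_job	base;
	struct delayed_work	tdr_work;	/* per-job timeout timer */
};

static void my_job_begin(struct my_sched_job *job, unsigned long timeout)
{
	/* armed when the job is pushed to the hardware ring */
	schedule_delayed_work(&job->tdr_work, timeout);
}

static void my_job_signaled(struct my_sched_job *job)
{
	/* the job completed in time; drop its private timer */
	cancel_delayed_work(&job->tdr_work);
}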


Andrey




This here works as intended as far as I can see and if you start to 
use mod_delayed_work() you actually break it.


Regards,
Christian.

Am 25.08.21 um 14:01 schrieb Liu, Monk:

[AMD Official Use Only]

I think we should remove the cancel_delayed_work() in the beginning 
of cleanup_job().


Because with my patch the mod_delayed_work() in cleanup_job is already 
doing its duty to retrigger the TO timer accordingly.


Thanks

--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Liu, Monk
Sent: Wednesday, August 25, 2021 7:55 PM
To: 'Christian König' ; 
amd-gfx@lists.freedesktop.org

Subject: RE: [PATCH] drm/sched: fix the bug of time out calculation(v2)

[AMD Official Use Only]

The timeout started by queue_delayed_work() in 
drm_sched_start_timeout() is paired with the cancel_delayed_work() 
in drm_sched_get_cleanup_job().
No, that's wrong: see that when we are in cleanup_job(), assume we do 
not have a timeout on this sched (we just keep submitting new jobs to 
this sched). Then the work_tdr is cancelled, and then we get the heading 
job; let's assume the job is not signaled, then we run to the "queue 
timeout for next job" path, thus drm_sched_start_timeout() is called, so 
this heading job's TO timer is actually retriggered ... which is totally 
wrong.


With my patch the timer is only retriggered after the previous job 
really signaled.


Can you be more specific on the incorrect part ?

Thanks
--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Christian König 
Sent: Wednesday, August 25, 2021 2:32 PM
To: Liu, Monk ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)

Well, NAK to that approach. First of all, your bug analysis is incorrect.

The timeout started by queue_delayed_work() in 
drm_sched_start_timeout() is paired with the cancel_delayed_work() in 
drm_sched_get_cleanup_job().


So you must have something else going on here.

Then please don't use mod_delayed_work(), instead always cancel it 
and restart it.


Regards,
Christian.

Am 25.08.21 um 06:14 schrieb Monk Liu:

the original logic is wrong in that the timeout will not be retriggered
after the previous job signaled, and that leads to the scenario that
all jobs in the same scheduler share the same timeout timer from the
very beginning job in this scheduler, which is wrong.

we should modify the timer every time a previous job signals.

v2:
further cleanup the logic, and do the TDR timer cancelling if the
signaled job is the last one in its scheduler.

Signed-off-by: Monk Liu 
---
   drivers/gpu/drm/scheduler/sched_main.c | 29 
-

   1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c
b/drivers/gpu/drm/scheduler/sched_main.c
index a2a9536..8c102ac 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -305,8 +305,17 @@ static void drm_sched_job_begin(struct 
drm_sched_job *s_job)

   struct drm_gpu_scheduler *sched = s_job->sched;
      spin_lock(&sched->job_list_lock);
-    list_add_tail(&s_job->list, &sched->pending_list);
-    drm_sched_start_timeout(sched);
+    if (list_empty(&sched->pending_list)) {
+    list_add_tail(&s_job->list, &sched->pending_list);
+    drm_sched_start_timeout(sched);
+    } else {
+    /* the old jobs in pending list are not finished yet
+ * no need to restart TDR timer here, it is already
+  

Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)

2021-08-25 Thread Andrey Grodzovsky



On 2021-08-25 10:31 p.m., Liu, Monk wrote:

[AMD Official Use Only]

Hi Andrey

I'm not quite sure if I read you correctly


Seems to me you can only do it for an empty pending list; otherwise you risk 
cancelling a legit new timer that was started by the next job, or not 
restarting the timer at all since your timer was still pending when the next 
job tried to start it again (the common case).

I don't understand the above sentence. From my understanding, for the common 
case, if the timer is pending, the cancel_delayed_work in the beginning will 
cancel it and then we will get to the line of "queue timeout for next job" 
since the heading job is not signaled (consistent with the timer being 
pending); then the timer will be restarted (for the next job)



Ignore it, I realized from looking that I missed the timer restart in 
the end of drm_sched_get_cleanup_job or the alternative one in 
drm_sched_main





And the above sequence is actually wrong to me, because we cancelled a pending 
timer and restarted the timer for a scheduler whose heading job is still 
running; the whole counting is repeated from zero and is not accurate at all



But for the timer pending case (the common case) your mod_delayed_work will 
effectively do exactly the same if you don't use per-job TTLs - you mod 
it to the sched->timeout value, which resets the pending timer

to again count from 0.

I just wonder why we stopped using per-job TDR timers in the first place? 
Isn't the simplest way to count accurate timeouts for each job to 
actually measure the timeouts for each job separately?


Andrey


  


Thanks




--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Grodzovsky, Andrey 
Sent: Thursday, August 26, 2021 2:20 AM
To: Christian König ; Liu, Monk ; 
amd-gfx@lists.freedesktop.org; dri-devel 
Subject: Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)


On 2021-08-25 8:11 a.m., Christian König wrote:

No, this would break that logic here.

See drm_sched_start_timeout() can be called multiple times, this is
intentional and very important!

The logic in queue_delayed_work() makes sure that the timer is only
started once and then never again.

All we need to take care of is to cancel_delayed_work() when we know
that the job is completed.


Seems to me you can only do it for an empty pending list; otherwise you risk 
cancelling a legit new timer that was started by the next job, or not restarting 
the timer at all since your timer was still pending when the next job tried to 
start it again (the common case).
For a non-empty pending list you have to adjust the currently active TDR's timer 
from your job's TTL to the TTL of the next job after you, or just restart it as 
Monk does here, which prolongs the timeout more than required but is still OK I guess.

What about returning to the old scheme of a timer sched_work per job, so each job 
has its own timer, we don't share it, and everything is precise for each job? 
Using the locking scheme we already have today, the actual TDR handler will 
execute only once while all the others arising from the guilty job's hang will be 
rejected (for amdgpu; for other drivers it probably requires the same locking or we 
can move this to the scheduler layer)

Andrey



This here works as intended as far as I can see and if you start to
use mod_delayed_work() you actually break it.

Regards,
Christian.

Am 25.08.21 um 14:01 schrieb Liu, Monk:

[AMD Official Use Only]

I think we should remove the cancel_delayed_work() in the beginning
of cleanup_job().

Because with my patch the mod_delayed_work() in cleanup_job is already
doing its duty to retrigger the TO timer accordingly.

Thanks

--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Liu, Monk
Sent: Wednesday, August 25, 2021 7:55 PM
To: 'Christian König' ;
amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/sched: fix the bug of time out
calculation(v2)

[AMD Official Use Only]


The timeout started by queue_delayed_work() in
drm_sched_start_timeout() is paired with the cancel_delayed_work()
in drm_sched_get_cleanup_job().

No, that's wrong: see that when we are in cleanup_job(), assume we do
not have a timeout on this sched (we just keep submitting new jobs
to this sched). Then the work_tdr is cancelled, and then we get the
heading job; let's assume the job is not signaled, then we run to
the "queue timeout for next job" path, thus drm_sched_start_timeout() is
called, so this heading job's TO timer is actually retriggered ...
which is totally wrong.

With my patch the timer is only retriggered after the previous job
really signaled.

Can you be more specific on the incorrect part ?

Thanks
--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Christian König 
Sent: Wednesday, August 25, 2021 2:3

Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)

2021-08-25 Thread Andrey Grodzovsky



On 2021-08-26 12:55 a.m., Liu, Monk wrote:

[AMD Official Use Only]


But for the timer pending case (the common case) your mod_delayed_work will effectively 
do exactly the same if you don't use per-job TTLs - you mod it to the 
sched->timeout value, which resets the pending timer to again count from 0.

My patch will only modify the timer (restart it, actually) when the heading 
job is signaled, which means on the HW ring the next job is just about to 
start processing.


Not sure this is always true; see this specific test we added long ago: 
https://gitlab.freedesktop.org/mesa/drm/-/commit/bc21168fa924d3fc4a000492e861f50a1a135b25
AFAIK a ring doesn't have strict serialization of processing jobs one 
after another, especially when 2 jobs are scheduled from different 
contexts like in the example above, which means that in this case the 
second job might be well into execution for some time when the first 
finishes and restarts the TDR timer from scratch.




If the job is not signaled (your common case) the timer is still not touched at 
all ...


I just wonder why we stopped using per-job TDR timers in the first place? 
Isn't the simplest way to count accurate timeouts for each job to actually 
measure the timeouts for each job separately?

I'm not sure if Christian can recall something, and I believe it is due to some 
limitations we found (or some race issue, like two jobs on the same scheduler 
timing out at the same time, which is probable if they are scheduled to the 
ring almost in the same timeframe)

Anyway I have a V3 version of the patch, please take a look; it looks to be working for me



Will take a look tomorrow

Andrey


  


Thanks

--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Grodzovsky, Andrey 
Sent: Thursday, August 26, 2021 11:05 AM
To: Liu, Monk ; Christian König ; 
amd-gfx@lists.freedesktop.org; dri-devel 
Subject: Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)


On 2021-08-25 10:31 p.m., Liu, Monk wrote:

[AMD Official Use Only]

Hi Andrey

I'm not quite sure if I read you correctly


Seems to me you can only do it for an empty pending list; otherwise you risk 
cancelling a legit new timer that was started by the next job, or not restarting 
the timer at all since your timer was still pending when the next job tried to 
start it again (the common case).

I don't understand the above sentence. From my understanding, for the
common case, if the timer is pending, the cancel_delayed_work in the
beginning will cancel it and then we will get to the line of "queue
timeout for next job" since the heading job is not signaled (consistent
with the timer being pending); then the timer will be restarted (for the
next job)


Ignore it, I realized from looking that I missed the timer restart in the end 
of drm_sched_get_cleanup_job or the alternative one in drm_sched_main



And the above sequence is actually wrong to me, because we cancelled a
pending timer and restarted the timer for a scheduler whose heading
job is still running; the whole counting is repeated from zero
and is not accurate at all


But for the timer pending case (the common case) your mod_delayed_work will effectively 
do exactly the same if you don't use per-job TTLs - you mod it to the 
sched->timeout value, which resets the pending timer to again count from 0.

I just wonder why we stopped using per-job TDR timers in the first place? 
Isn't the simplest way to count accurate timeouts for each job to actually 
measure the timeouts for each job separately?

Andrey


   


Thanks
--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Grodzovsky, Andrey 
Sent: Thursday, August 26, 2021 2:20 AM
To: Christian König ; Liu, Monk
; amd-gfx@lists.freedesktop.org; dri-devel

Subject: Re: [PATCH] drm/sched: fix the bug of time out
calculation(v2)


On 2021-08-25 8:11 a.m., Christian König wrote:

No, this would break that logic here.

See drm_sched_start_timeout() can be called multiple times, this is
intentional and very important!

The logic in queue_delayed_work() makes sure that the timer is only
started once and then never again.

All we need to take care of is to cancel_delayed_work() when we know
that the job is completed.

Seems to me you can only do it for an empty pending list; otherwise you risk 
cancelling a legit new timer that was started by the next job, or not restarting 
the timer at all since your timer was still pending when the next job tried to 
start it again (the common case).
For a non-empty pending list you have to adjust the currently active TDR's timer 
from your job's TTL to the TTL of the next job after you, or just restart it as Monk 
does here, which prolongs the timeout more than required but is still OK I guess.

What about returning to the old scheme of a timer sched_work per job, so
each job has its own timer, we don't share it, and everything is
precise for each job? Using th

Re: [PATCH 3/4] drm/amdgpu: drm/amdgpu: Handle IOMMU enabled case

2021-08-26 Thread Andrey Grodzovsky

Ping

Andrey

On 2021-08-25 11:36 a.m., Andrey Grodzovsky wrote:


On 2021-08-25 2:43 a.m., Christian König wrote:



Am 24.08.21 um 23:01 schrieb Andrey Grodzovsky:

Handle all DMA IOMMU group related dependencies before the
group is removed, since otherwise we try to access it after free.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c    | 50 
++

  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h    |  1 +
  3 files changed, 53 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 0b5764aa98a4..288a465b8101 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3860,6 +3860,8 @@ void amdgpu_device_fini_hw(struct 
amdgpu_device *adev)

    amdgpu_device_ip_fini_early(adev);
  +    amdgpu_ttm_clear_dma_mappings(adev);
+
  amdgpu_gart_dummy_page_fini(adev);
    amdgpu_device_unmap_mmio(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c

index 446943e32e3e..f73d807db3b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -64,6 +64,7 @@
  static int amdgpu_ttm_backend_bind(struct ttm_device *bdev,
 struct ttm_tt *ttm,
 struct ttm_resource *bo_mem);
+
  static void amdgpu_ttm_backend_unbind(struct ttm_device *bdev,
    struct ttm_tt *ttm);
  @@ -2293,6 +2294,55 @@ static ssize_t amdgpu_iomem_write(struct 
file *f, const char __user *buf,

  return result;
  }
  +void amdgpu_ttm_clear_dma_mappings(struct amdgpu_device *adev)


I strongly think that this function should be part of TTM. Something 
like ttm_device_force_unpopulate.



Yes, this is something I also wanted, but see below





+{
+    struct ttm_device *bdev = &adev->mman.bdev;
+    struct ttm_resource_manager *man;
+    struct ttm_buffer_object *bo;
+    unsigned int i, j;
+
+    spin_lock(&bdev->lru_lock);
+    for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
+    man = ttm_manager_type(bdev, i);
+    if (!man || !man->use_tt)
+    continue;
+
+    while (!list_empty(&man->pinned)) {
+    bo = list_first_entry(&man->pinned, struct 
ttm_buffer_object, lru);
+    /* Take ref against racing releases once lru_lock is 
unlocked */

+    ttm_bo_get(bo);
+    list_del_init(&bo->lru);
+    spin_unlock(&bdev->lru_lock);
+
+    if (bo->ttm) {
+    amdgpu_ttm_backend_unbind(bo->bdev, bo->ttm);



amdgpu_ttm_backend_unbind needs to be called separately from 
ttm_tt_unpopulate to take care of code flows that do DMA mapping 
through the gart bind and not through ttm_tt_populate. Since it's 
inside amdgpu I had to place the entire function in amdgpu. Any 
suggestions?

Andrey



+ ttm_tt_destroy_common(bo->bdev, bo->ttm);


Then you can also cleanly use ttm_tt_unpopulate here, because this will 
result in incorrect statistics inside TTM atm.


Regards,
Christian.


+    }
+
+    ttm_bo_put(bo);
+    spin_lock(&bdev->lru_lock);
+    }
+
+    for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
+    while (!list_empty(&man->lru[j])) {
+    bo = list_first_entry(&man->lru[j], struct 
ttm_buffer_object, lru);

+    ttm_bo_get(bo);
+    list_del_init(&bo->lru);
+    spin_unlock(&bdev->lru_lock);
+
+    if (bo->ttm) {
+    amdgpu_ttm_backend_unbind(bo->bdev, bo->ttm);
+    ttm_tt_destroy_common(bo->bdev, bo->ttm);
+    }
+    ttm_bo_put(bo);
+    spin_lock(&bdev->lru_lock);
+    }
+    }
+    }
+    spin_unlock(&bdev->lru_lock);
+
+}
+
  static const struct file_operations amdgpu_ttm_iomem_fops = {
  .owner = THIS_MODULE,
  .read = amdgpu_iomem_read,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h

index e69f3e8e06e5..02c8eac48a64 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -190,6 +190,7 @@ bool amdgpu_ttm_tt_is_readonly(struct ttm_tt *ttm);
  uint64_t amdgpu_ttm_tt_pde_flags(struct ttm_tt *ttm, struct 
ttm_resource *mem);
  uint64_t amdgpu_ttm_tt_pte_flags(struct amdgpu_device *adev, 
struct ttm_tt *ttm,

   struct ttm_resource *mem);
+void amdgpu_ttm_clear_dma_mappings(struct amdgpu_device *adev);
    void amdgpu_ttm_debugfs_init(struct amdgpu_device *adev);




Re: [PATCH] drm/sched: fix the bug of time out calculation(v3)

2021-08-26 Thread Andrey Grodzovsky



On 2021-08-26 12:55 a.m., Monk Liu wrote:

issue:
in cleanup_job the cancel_delayed_work will cancel a TO timer
even though its corresponding job is still running.

fix:
do not cancel the timer in cleanup_job, instead do the cancelling
only when the heading job is signaled, and if there is a "next" job
we start_timeout again.

v2:
further cleanup the logic, and do the TDR timer cancelling if the signaled job
is the last one in its scheduler.

v3:
change the issue description
remove the cancel_delayed_work in the beginning of the cleanup_job
recover the implement of drm_sched_job_begin.

TODO:
1) introduce pause/resume scheduler in job_timeout to serialize the handling
of scheduler and job_timeout.
2)drop the bad job's del and insert in scheduler due to above serialization
(no race issue anymore with the serialization)

Signed-off-by: Monk Liu 
---
  drivers/gpu/drm/scheduler/sched_main.c | 25 ++---
  1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index a2a9536..ecf8140 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -676,13 +676,7 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
  {
struct drm_sched_job *job, *next;
  
-	/*

-* Don't destroy jobs while the timeout worker is running  OR thread
-* is being parked and hence assumed to not touch pending_list
-*/
-   if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-   !cancel_delayed_work(&sched->work_tdr)) ||
-   kthread_should_park())
+   if (kthread_should_park())
return NULL;



I actually don't see why we need to keep the above;
on the other side (in drm_sched_stop) we won't touch the pending list
anyway until the sched thread comes to a full stop (kthread_park). If you do 
see a reason why this is needed then a comment should be here I think.

Andrey


  
  	spin_lock(&sched->job_list_lock);

@@ -693,17 +687,21 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
/* remove job from pending_list */
list_del_init(&job->list);
+
+   /* cancel this job's TO timer */
+   cancel_delayed_work(&sched->work_tdr);
/* make the scheduled timestamp more accurate */
next = list_first_entry_or_null(&sched->pending_list,
typeof(*next), list);
-   if (next)
+
+   if (next) {
next->s_fence->scheduled.timestamp =
job->s_fence->finished.timestamp;
-
+   /* start TO timer for next job */
+   drm_sched_start_timeout(sched);
+   }
} else {
job = NULL;
-   /* queue timeout for next job */
-   drm_sched_start_timeout(sched);
}
  
  	spin_unlock(&sched->job_list_lock);

@@ -791,11 +789,8 @@ static int drm_sched_main(void *param)
  (entity = 
drm_sched_select_entity(sched))) ||
 kthread_should_stop());
  
-		if (cleanup_job) {

+   if (cleanup_job)
sched->ops->free_job(cleanup_job);
-   /* queue timeout for next job */
-   drm_sched_start_timeout(sched);
-   }
  
  		if (!entity)

continue;


[PATCH v2 1/4] drm/ttm: Create pinned list

2021-08-26 Thread Andrey Grodzovsky
This list will be used to capture all non-VRAM BOs not
on the LRU so when the device is hot unplugged we can iterate
the list and unmap their DMA mappings before the device is removed.

v2: 
Rename function to ttm_bo_move_to_pinned
Keep deleting BOs from LRU in the new function
if they have no resource struct assigned to them.

Signed-off-by: Andrey Grodzovsky 
Suggested-by: Christian König 
---
 drivers/gpu/drm/ttm/ttm_bo.c   | 30 ++
 drivers/gpu/drm/ttm/ttm_resource.c |  1 +
 include/drm/ttm/ttm_resource.h |  1 +
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 1b950b45cf4b..64594819e9e7 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -69,7 +69,29 @@ static void ttm_bo_mem_space_debug(struct ttm_buffer_object 
*bo,
}
 }
 
-static void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
+static inline void ttm_bo_move_to_pinned_or_del(struct ttm_buffer_object *bo)
+{
+   struct ttm_device *bdev = bo->bdev;
+   struct ttm_resource_manager *man = NULL;
+
+   if (bo->resource)
+   man = ttm_manager_type(bdev, bo->resource->mem_type);
+
+   /*
+* Some BOs might be in transient state where they don't belong
+* to any domain at the moment, simply remove them from whatever
+* LRU list they are still hanged on to keep previous functionality
+*/
+   if (man && man->use_tt)
+   list_move_tail(&bo->lru, &man->pinned);
+   else
+   list_del_init(&bo->lru);
+
+   if (bdev->funcs->del_from_lru_notify)
+   bdev->funcs->del_from_lru_notify(bo);
+}
+
+static inline void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
 {
struct ttm_device *bdev = bo->bdev;
 
@@ -98,7 +120,7 @@ void ttm_bo_move_to_lru_tail(struct ttm_buffer_object *bo,
dma_resv_assert_held(bo->base.resv);
 
if (bo->pin_count) {
-   ttm_bo_del_from_lru(bo);
+   ttm_bo_move_to_pinned_or_del(bo);
return;
}
 
@@ -339,7 +361,7 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
return ret;
}
 
-   ttm_bo_del_from_lru(bo);
+   ttm_bo_move_to_pinned_or_del(bo);
list_del_init(&bo->ddestroy);
spin_unlock(&bo->bdev->lru_lock);
ttm_bo_cleanup_memtype_use(bo);
@@ -1154,7 +1176,7 @@ int ttm_bo_swapout(struct ttm_buffer_object *bo, struct 
ttm_operation_ctx *ctx,
return 0;
}
 
-   ttm_bo_del_from_lru(bo);
+   ttm_bo_move_to_pinned_or_del(bo);
/* TODO: Cleanup the locking */
spin_unlock(&bo->bdev->lru_lock);
 
diff --git a/drivers/gpu/drm/ttm/ttm_resource.c 
b/drivers/gpu/drm/ttm/ttm_resource.c
index 2431717376e7..91165f77fe0e 100644
--- a/drivers/gpu/drm/ttm/ttm_resource.c
+++ b/drivers/gpu/drm/ttm/ttm_resource.c
@@ -85,6 +85,7 @@ void ttm_resource_manager_init(struct ttm_resource_manager 
*man,
 
for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i)
INIT_LIST_HEAD(&man->lru[i]);
+   INIT_LIST_HEAD(&man->pinned);
man->move = NULL;
 }
 EXPORT_SYMBOL(ttm_resource_manager_init);
diff --git a/include/drm/ttm/ttm_resource.h b/include/drm/ttm/ttm_resource.h
index 140b6b9a8bbe..1ec0d5ebb59f 100644
--- a/include/drm/ttm/ttm_resource.h
+++ b/include/drm/ttm/ttm_resource.h
@@ -130,6 +130,7 @@ struct ttm_resource_manager {
 */
 
struct list_head lru[TTM_MAX_BO_PRIORITY];
+   struct list_head pinned;
 
/*
 * Protected by @move_lock.
-- 
2.25.1



[PATCH v2 3/4] drm/amdgpu: drm/amdgpu: Handle IOMMU enabled case

2021-08-26 Thread Andrey Grodzovsky
Handle all DMA IOMMU group related dependencies before the
group is removed, since otherwise we try to access it after free.

v2:
Move the actual handling function to TTM

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0b5764aa98a4..653bd8fdaa33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3860,6 +3860,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 
amdgpu_device_ip_fini_early(adev);
 
+   ttm_device_clear_dma_mappings(&adev->mman.bdev);
+
amdgpu_gart_dummy_page_fini(adev);
 
amdgpu_device_unmap_mmio(adev);
-- 
2.25.1



[PATCH v2 0/4] Various fixes to pass libdrm hotunplug tests

2021-08-26 Thread Andrey Grodzovsky
Bunch of fixes to enable passing the hotplug tests I previously added
here[1] with the latest code. 
Once accepted I will enable the tests on libdrm side.

[1] - https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/172

v2:
Dropping the VCE patch since the relevant function is already fixed in the
latest code.
Moving IOMMU handling to the TTM layer.

Andrey Grodzovsky (4):
  drm/ttm: Create pinned list
  drm/ttm: Clear all DMA mappings on demand
  drm/amdgpu: drm/amdgpu: Handle IOMMU enabled case
  drm/amdgpu: Add a UAPI flag for hot plug/unplug

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  3 +-
 drivers/gpu/drm/ttm/ttm_bo.c   | 30 +--
 drivers/gpu/drm/ttm/ttm_device.c   | 45 ++
 drivers/gpu/drm/ttm/ttm_resource.c |  1 +
 include/drm/ttm/ttm_device.h   |  1 +
 include/drm/ttm/ttm_resource.h |  1 +
 7 files changed, 78 insertions(+), 5 deletions(-)

-- 
2.25.1



[PATCH v2 4/4] drm/amdgpu: Add a UAPI flag for hot plug/unplug

2021-08-26 Thread Andrey Grodzovsky
To support libdrm tests.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 6400259a7c4b..c2fdf67ff551 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -96,9 +96,10 @@
  * - 3.40.0 - Add AMDGPU_IDS_FLAGS_TMZ
  * - 3.41.0 - Add video codec query
  * - 3.42.0 - Add 16bpc fixed point display support
+ * - 3.43.0 - Add device hot plug/unplug support
  */
 #define KMS_DRIVER_MAJOR   3
-#define KMS_DRIVER_MINOR   42
+#define KMS_DRIVER_MINOR   43
 #define KMS_DRIVER_PATCHLEVEL  0
 
 int amdgpu_vram_limit;
-- 
2.25.1



[PATCH v2 2/4] drm/ttm: Clear all DMA mappings on demand

2021-08-26 Thread Andrey Grodzovsky
Used by drivers supporting hot unplug to handle all
DMA IOMMU group related dependencies before the group
is removed during device removal; otherwise we would try
to access it after free once the last device pointer from
user space is dropped.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/ttm/ttm_device.c | 45 
 include/drm/ttm/ttm_device.h |  1 +
 2 files changed, 46 insertions(+)

diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c
index 5f31acec3ad7..ea50aba13743 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -245,3 +245,48 @@ void ttm_device_fini(struct ttm_device *bdev)
ttm_global_release();
 }
 EXPORT_SYMBOL(ttm_device_fini);
+
+void ttm_device_clear_dma_mappings(struct ttm_device *bdev)
+{
+   struct ttm_resource_manager *man;
+   struct ttm_buffer_object *bo;
+   unsigned int i, j;
+
+   spin_lock(&bdev->lru_lock);
+   for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
+   man = ttm_manager_type(bdev, i);
+   if (!man || !man->use_tt)
+   continue;
+
+   while (!list_empty(&man->pinned)) {
+   bo = list_first_entry(&man->pinned, struct 
ttm_buffer_object, lru);
+   /* Take ref against racing releases once lru_lock is 
unlocked */
+   ttm_bo_get(bo);
+   list_del_init(&bo->lru);
+   spin_unlock(&bdev->lru_lock);
+
+   if (bo->ttm)
+   ttm_tt_destroy_common(bo->bdev, bo->ttm);
+
+   ttm_bo_put(bo);
+   spin_lock(&bdev->lru_lock);
+   }
+
+   for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
+   while (!list_empty(&man->lru[j])) {
+   bo = list_first_entry(&man->lru[j], struct 
ttm_buffer_object, lru);
+   ttm_bo_get(bo);
+   list_del_init(&bo->lru);
+   spin_unlock(&bdev->lru_lock);
+
+   if (bo->ttm)
+   ttm_tt_destroy_common(bo->bdev, 
bo->ttm);
+
+   ttm_bo_put(bo);
+   spin_lock(&bdev->lru_lock);
+   }
+   }
+   }
+   spin_unlock(&bdev->lru_lock);
+}
+EXPORT_SYMBOL(ttm_device_clear_dma_mappings);
diff --git a/include/drm/ttm/ttm_device.h b/include/drm/ttm/ttm_device.h
index cd592f8e941b..d2837decb49a 100644
--- a/include/drm/ttm/ttm_device.h
+++ b/include/drm/ttm/ttm_device.h
@@ -298,5 +298,6 @@ int ttm_device_init(struct ttm_device *bdev, struct 
ttm_device_funcs *funcs,
struct drm_vma_offset_manager *vma_manager,
bool use_dma_alloc, bool use_dma32);
 void ttm_device_fini(struct ttm_device *bdev);
+void ttm_device_clear_dma_mappings(struct ttm_device *bdev);
 
 #endif
-- 
2.25.1



Re: [PATCH] drm/sched: fix the bug of time out calculation(v3)

2021-08-26 Thread Andrey Grodzovsky
Attached is a quick patch for per-job TTL calculation to make the next 
timer expiration more precise. It's on top of the patch in this thread. 
Let me know if this makes sense.


Andrey

On 2021-08-26 10:03 a.m., Andrey Grodzovsky wrote:


On 2021-08-26 12:55 a.m., Monk Liu wrote:

issue:
in cleanup_job the cancel_delayed_work will cancel a TO timer
even though its corresponding job is still running.

fix:
do not cancel the timer in cleanup_job, instead do the cancelling
only when the heading job is signaled, and if there is a "next" job
we start_timeout again.

v2:
further cleanup the logic, and do the TDR timer cancelling if the 
signaled job

is the last one in its scheduler.

v3:
change the issue description
remove the cancel_delayed_work in the beginning of the cleanup_job
recover the implement of drm_sched_job_begin.

TODO:
1) introduce pause/resume scheduler in job_timeout to serialize the handling
of scheduler and job_timeout.
2)drop the bad job's del and insert in scheduler due to above 
serialization

(no race issue anymore with the serialization)

Signed-off-by: Monk Liu 
---
  drivers/gpu/drm/scheduler/sched_main.c | 25 ++---
  1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c

index a2a9536..ecf8140 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -676,13 +676,7 @@ drm_sched_get_cleanup_job(struct 
drm_gpu_scheduler *sched)

  {
  struct drm_sched_job *job, *next;
  -    /*
- * Don't destroy jobs while the timeout worker is running OR thread
- * is being parked and hence assumed to not touch pending_list
- */
-    if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-    !cancel_delayed_work(&sched->work_tdr)) ||
-    kthread_should_park())
+    if (kthread_should_park())
  return NULL;



I actually don't see why we need to keep the above; on the other side
(in drm_sched_stop) we won't touch the pending list anyway until the
sched thread has come to a full stop (kthread_park). If you do see a
reason why this is needed, then I think a comment should be here.

Andrey



spin_lock(&sched->job_list_lock);
@@ -693,17 +687,21 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
  if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
  /* remove job from pending_list */
  list_del_init(&job->list);
+
+    /* cancel this job's TO timer */
+    cancel_delayed_work(&sched->work_tdr);
  /* make the scheduled timestamp more accurate */
  next = list_first_entry_or_null(&sched->pending_list,
  typeof(*next), list);
-    if (next)
+
+    if (next) {
  next->s_fence->scheduled.timestamp =
  job->s_fence->finished.timestamp;
-
+    /* start TO timer for next job */
+    drm_sched_start_timeout(sched);
+    }
  } else {
  job = NULL;
-    /* queue timeout for next job */
-    drm_sched_start_timeout(sched);
  }
    spin_unlock(&sched->job_list_lock);
@@ -791,11 +789,8 @@ static int drm_sched_main(void *param)
    (entity = drm_sched_select_entity(sched))) ||
   kthread_should_stop());
  -    if (cleanup_job) {
+    if (cleanup_job)
  sched->ops->free_job(cleanup_job);
-    /* queue timeout for next job */
-    drm_sched_start_timeout(sched);
-    }
        if (!entity)
  continue;
From d4671ce3c3b18c369b512cd692aec3769f37e11a Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky 
Date: Thu, 26 Aug 2021 16:08:01 -0400
Subject: drm/sched: Add TTL per job for timeout handling.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/scheduler/sched_main.c | 16 ++--
 include/drm/gpu_scheduler.h|  2 ++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index ecf8140f6968..c8e31515803c 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -306,6 +306,7 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
 
 	spin_lock(&sched->job_list_lock);
 	list_add_tail(&s_job->list, &sched->pending_list);
+	s_job->ts = get_jiffies_64();
 	drm_sched_start_timeout(sched);
 	spin_unlock(&sched->job_list_lock);
 }
@@ -695,10 +696,21 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
 		typeof(*next), list);
 
 		if (next) {
+			uint64_t ttl;
+
 			next->s_fence->scheduled.timestamp =
 job->s_fence->finished.timestamp;
-			/* start TO timer for next job */
-			drm_sched_start_timeout(sched);
+
+			/*
+			 * Make precise calculation how much time shou

Re: [PATCH] drm/sched: fix the bug of time out calculation(v3)

2021-08-27 Thread Andrey Grodzovsky
The TS represents the point in time when the job was inserted into the
pending list.
I don't think it matters when it actually starts to be processed; what
matters is when the job was inserted into the pending list, because right
at that point you arm the TO timer (when no other is running already).
So when the previous job completes and you cancel and rearm, you can use
the TS from the next job in the pending list to calculate how much time
it actually has left to run before TDR must be initiated, instead of
giving it the full TO value again even if it has already been running
for a while.


Also, I am not sure about the assumption that what we measure is
processing by HW; what we measure is from the moment the job was
scheduled to the ring to the moment it completed (EOP event). At least
that is what our TDR timer measures, so it makes sense to set the TS at
this point.
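
For illustration only, a minimal sketch of the calculation (the helper name
is hypothetical; it assumes the s_job->ts field added by the attached patch
and the scheduler timeout in jiffies):

static unsigned long drm_sched_job_remaining_to(struct drm_gpu_scheduler *sched,
						struct drm_sched_job *next)
{
	/* time the next job has already spent on the pending list */
	u64 elapsed = get_jiffies_64() - next->ts;
	u64 budget = sched->timeout;

	/* rearm the TO timer with only what is left of its budget */
	return elapsed >= budget ? 0 : budget - elapsed;
}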


Andrey

On 2021-08-27 3:20 a.m., Liu, Monk wrote:

[AMD Official Use Only]

what is that 'ts' representing? It looks to me like the jiffies when the
job gets scheduled to the ring, but a job scheduled to the ring doesn't
mean it's being processed by hw.

Thanks

--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Grodzovsky, Andrey 
Sent: Friday, August 27, 2021 4:14 AM
To: Liu, Monk ; amd-gfx@lists.freedesktop.org; Koenig, Christian 

Cc: dri-de...@lists.freedesktop.org
Subject: Re: [PATCH] drm/sched: fix the bug of time out calculation(v3)

Attached is a quick patch for per-job TTL calculation to make the next timer
expiration more precise. It's on top of the patch in this thread. Let me know
if this makes sense.

Andrey

On 2021-08-26 10:03 a.m., Andrey Grodzovsky wrote:

On 2021-08-26 12:55 a.m., Monk Liu wrote:

issue:
in cleanup_job, cancel_delayed_work will cancel a TO timer even
if its corresponding job is still running.

fix:
do not cancel the timer in cleanup_job; instead do the cancelling
only when the heading job is signaled, and if there is a "next" job
we start_timeout again.

v2:
further clean up the logic, and do the TDR timer cancelling if the
signaled job is the last one in its scheduler.

v3:
change the issue description
remove the cancel_delayed_work at the beginning of cleanup_job
restore the implementation of drm_sched_job_begin.

TODO:
1) introduce pause/resume scheduler in job_timeout to serialize the
handling of scheduler and job_timeout.
2) drop the bad job's del and insert in scheduler due to the above
serialization (no race issue anymore with the serialization)

Signed-off-by: Monk Liu 
---
   drivers/gpu/drm/scheduler/sched_main.c | 25
++---
   1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index a2a9536..ecf8140 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -676,13 +676,7 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
   {
   struct drm_sched_job *job, *next;
   -    /*
- * Don't destroy jobs while the timeout worker is running OR
thread
- * is being parked and hence assumed to not touch pending_list
- */
-    if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-    !cancel_delayed_work(&sched->work_tdr)) ||
-    kthread_should_park())
+    if (kthread_should_park())
   return NULL;


I actually don't see why we need to keep the above; on the other side
(in drm_sched_stop) we won't touch the pending list anyway until the sched
thread has come to a full stop (kthread_park). If you do see a reason why
this is needed, then I think a comment should be here.

Andrey



spin_lock(&sched->job_list_lock);
@@ -693,17 +687,21 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
   if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
   /* remove job from pending_list */
   list_del_init(&job->list);
+
+    /* cancel this job's TO timer */
+    cancel_delayed_work(&sched->work_tdr);
   /* make the scheduled timestamp more accurate */
   next = list_first_entry_or_null(&sched->pending_list,
   typeof(*next), list);
-    if (next)
+
+    if (next) {
   next->s_fence->scheduled.timestamp =
   job->s_fence->finished.timestamp;
-
+    /* start TO timer for next job */
+    drm_sched_start_timeout(sched);
+    }
   } else {
   job = NULL;
-    /* queue timeout for next job */
-    drm_sched_start_timeout(sched);
   }
     spin_unlock(&sched->job_list_lock);
@@ -791,11 +789,8 @@ static int drm_sched_main(void *param)
     (entity = drm_sched_select_entity(sched))) ||
    kthr

Re: [PATCH] drm/sched: fix the bug of time out calculation(v3)

2021-08-27 Thread Andrey Grodzovsky

So we agree that the "if (kthread_should_park()) return NULL" should go away?

Andrey


On 2021-08-27 3:46 a.m., Liu, Monk wrote:

[AMD Official Use Only]

Yeah, that "kthread_should_park" also looks irrelevant to me, and it delays
the signaled job's cleanup/free.

Thanks

--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Christian König 
Sent: Friday, August 27, 2021 2:12 PM
To: Grodzovsky, Andrey ; Liu, Monk ; 
amd-gfx@lists.freedesktop.org; Koenig, Christian 
Cc: dri-de...@lists.freedesktop.org
Subject: Re: [PATCH] drm/sched: fix the bug of time out calculation(v3)

I don't think that this will be necessary nor desired.

See, the job should be cleaned up as soon as possible after it is finished,
or otherwise we won't cancel the timeout quickly enough either.

Christian.

Am 26.08.21 um 22:14 schrieb Andrey Grodzovsky:

Attached is a quick patch for per-job TTL calculation to make the next
timer expiration more precise. It's on top of the patch in this thread.
Let me know if this makes sense.

Andrey

On 2021-08-26 10:03 a.m., Andrey Grodzovsky wrote:

On 2021-08-26 12:55 a.m., Monk Liu wrote:

issue:
in cleanup_job, cancel_delayed_work will cancel a TO timer even
if its corresponding job is still running.

fix:
do not cancel the timer in cleanup_job; instead do the cancelling
only when the heading job is signaled, and if there is a "next" job
we start_timeout again.

v2:
further clean up the logic, and do the TDR timer cancelling if the
signaled job is the last one in its scheduler.

v3:
change the issue description
remove the cancel_delayed_work at the beginning of cleanup_job
restore the implementation of drm_sched_job_begin.

TODO:
1) introduce pause/resume scheduler in job_timeout to serialize the
handling of scheduler and job_timeout.
2) drop the bad job's del and insert in scheduler due to the above
serialization (no race issue anymore with the serialization)

Signed-off-by: Monk Liu 
---
   drivers/gpu/drm/scheduler/sched_main.c | 25
++---
   1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index a2a9536..ecf8140 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -676,13 +676,7 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
   {
   struct drm_sched_job *job, *next;
   -    /*
- * Don't destroy jobs while the timeout worker is running OR
thread
- * is being parked and hence assumed to not touch pending_list
- */
-    if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-    !cancel_delayed_work(&sched->work_tdr)) ||
-    kthread_should_park())
+    if (kthread_should_park())
   return NULL;


I actually don't see why we need to keep the above; on the other side
(in drm_sched_stop) we won't touch the pending list anyway until the
sched thread has come to a full stop (kthread_park). If you do see a
reason why this is needed, then I think a comment should be here.

Andrey



spin_lock(&sched->job_list_lock);
@@ -693,17 +687,21 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
   if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
   /* remove job from pending_list */
   list_del_init(&job->list);
+
+    /* cancel this job's TO timer */
+    cancel_delayed_work(&sched->work_tdr);
   /* make the scheduled timestamp more accurate */
   next = list_first_entry_or_null(&sched->pending_list,
   typeof(*next), list);
-    if (next)
+
+    if (next) {
   next->s_fence->scheduled.timestamp =
   job->s_fence->finished.timestamp;
-
+    /* start TO timer for next job */
+    drm_sched_start_timeout(sched);
+    }
   } else {
   job = NULL;
-    /* queue timeout for next job */
-    drm_sched_start_timeout(sched);
   }
     spin_unlock(&sched->job_list_lock);
@@ -791,11 +789,8 @@ static int drm_sched_main(void *param)
     (entity = drm_sched_select_entity(sched)))
||
    kthread_should_stop());
   -    if (cleanup_job) {
+    if (cleanup_job)
   sched->ops->free_job(cleanup_job);
-    /* queue timeout for next job */
-    drm_sched_start_timeout(sched);
-    }
     if (!entity)
   continue;


Re: [PATCH v2 0/4] Various fixes to pass libdrm hotunplug tests

2021-08-27 Thread Andrey Grodzovsky

Ping

Andrey

On 2021-08-26 1:27 p.m., Andrey Grodzovsky wrote:

A bunch of fixes to enable passing the hotplug tests I previously added
here [1] with the latest code.
Once accepted I will enable the tests on the libdrm side.

[1] - https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/172

v2:
Dropping the VCE patch since the relevant function is already fixed in
the latest code.
Moving IOMMU handling to the TTM layer.

Andrey Grodzovsky (4):
   drm/ttm: Create pinned list
   drm/ttm: Clear all DMA mappings on demand
   drm/amdgpu: drm/amdgpu: Handle IOMMU enabled case
   drm/amdgpu: Add a UAPI flag for hot plug/unplug

  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  3 +-
  drivers/gpu/drm/ttm/ttm_bo.c   | 30 +--
  drivers/gpu/drm/ttm/ttm_device.c   | 45 ++
  drivers/gpu/drm/ttm/ttm_resource.c |  1 +
  include/drm/ttm/ttm_device.h   |  1 +
  include/drm/ttm/ttm_resource.h |  1 +
  7 files changed, 78 insertions(+), 5 deletions(-)



Re: [PATCH] drm/sched: fix the bug of time out calculation(v3)

2021-08-27 Thread Andrey Grodzovsky
As I mentioned to Monk before - what about cases such as in this test -
https://gitlab.freedesktop.org/mesa/drm/-/commit/bc21168fa924d3fc4a000492e861f50a1a135b25
Here you don't have a serialized sequence where one job finishes
processing before the second starts; they execute concurrently. For those
cases it seems to me that restarting the timer for the second job from
scratch will let it hang much longer than allowed by the TO value, since
it has already been running for a while by then.


Andrey

On 2021-08-27 10:29 a.m., Christian König wrote:

I don't think that makes sense.

See, we don't want to start the timer when the job is inserted into the
ring buffer, but rather when it starts processing.


When processing starts is a bit vaguely defined, but just starting the
timer when the previous job completes should be good enough.


Christian.

Am 27.08.21 um 15:57 schrieb Andrey Grodzovsky:
The TS represents the point in time when the job was inserted into
the pending list.
I don't think it matters when it actually starts to be processed; what
matters is when the job was inserted into the pending list, because
right at that point you arm the TO timer (when no other is running
already). So when the previous job completes and you cancel and rearm,
you can use the TS from the next job in the pending list to calculate
how much time it actually has left to run before TDR must be initiated,
instead of giving it the full TO value again even if it has already been
running for a while.


Also, I am not sure about the assumption that what we measure is
processing by HW; what we measure is from the moment the job was
scheduled to the ring to the moment it completed (EOP event). At least
that is what our TDR timer measures, so it makes sense to set the TS at
this point.


Andrey

On 2021-08-27 3:20 a.m., Liu, Monk wrote:

[AMD Official Use Only]

what is that 'ts' representing? It looks to me like the jiffies when
the job gets scheduled to the ring, but a job scheduled to the ring
doesn't mean it's being processed by hw.


Thanks

--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Grodzovsky, Andrey 
Sent: Friday, August 27, 2021 4:14 AM
To: Liu, Monk ; amd-gfx@lists.freedesktop.org; 
Koenig, Christian 

Cc: dri-de...@lists.freedesktop.org
Subject: Re: [PATCH] drm/sched: fix the bug of time out calculation(v3)

Attached is a quick patch for per-job TTL calculation to make the next
timer expiration more precise. It's on top of the patch in this
thread. Let me know if this makes sense.


Andrey

On 2021-08-26 10:03 a.m., Andrey Grodzovsky wrote:

On 2021-08-26 12:55 a.m., Monk Liu wrote:

issue:
in cleanup_job, cancel_delayed_work will cancel a TO timer even
if its corresponding job is still running.

fix:
do not cancel the timer in cleanup_job; instead do the cancelling
only when the heading job is signaled, and if there is a "next" job
we start_timeout again.

v2:
further clean up the logic, and do the TDR timer cancelling if the
signaled job is the last one in its scheduler.

v3:
change the issue description
remove the cancel_delayed_work at the beginning of cleanup_job
restore the implementation of drm_sched_job_begin.

TODO:
1) introduce pause/resume scheduler in job_timeout to serialize the
handling of scheduler and job_timeout.
2) drop the bad job's del and insert in scheduler due to the above
serialization (no race issue anymore with the serialization)

Signed-off-by: Monk Liu 
---
   drivers/gpu/drm/scheduler/sched_main.c | 25
++---
   1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index a2a9536..ecf8140 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -676,13 +676,7 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
   {
   struct drm_sched_job *job, *next;
   -    /*
- * Don't destroy jobs while the timeout worker is running OR
thread
- * is being parked and hence assumed to not touch pending_list
- */
-    if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-    !cancel_delayed_work(&sched->work_tdr)) ||
-    kthread_should_park())
+    if (kthread_should_park())
   return NULL;


I actually don't see why we need to keep the above; on the other side
(in drm_sched_stop) we won't touch the pending list anyway until the
sched thread has come to a full stop (kthread_park). If you do see a
reason why this is needed, then I think a comment should be here.

Andrey



spin_lock(&sched->job_list_lock);
@@ -693,17 +687,21 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
   if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
   /* remove job from pending_list */
   list_del_init(&job->list);
+
+    /* cancel th

Re: [PATCH] drm/sched: fix the bug of time out calculation(v3)

2021-08-27 Thread Andrey Grodzovsky

Sure then.

Andrey

On 2021-08-27 2:30 p.m., Christian König wrote:
Yeah, that's what I meant: when processing of a job starts is a bit
vaguely defined.


Jobs overlap, but we simply don't have another good indicator that a
job started except that the previous one completed.


It's still better than starting the timer when pushing the job to the
ring buffer, because that is completely off.


Christian.

Am 27.08.21 um 20:22 schrieb Andrey Grodzovsky:
As I mentioned to Monk before - what about cases such as in this test -
https://gitlab.freedesktop.org/mesa/drm/-/commit/bc21168fa924d3fc4a000492e861f50a1a135b25

Here you don't have a serialized sequence where one job finishes
processing before the second starts; they execute concurrently. For
those cases it seems to me that restarting the timer for the second job
from scratch will let it hang much longer than allowed by the TO value.


Andrey

On 2021-08-27 10:29 a.m., Christian König wrote:

I don't think that makes sense.

See, we don't want to start the timer when the job is inserted into
the ring buffer, but rather when it starts processing.


When processing starts is a bit vaguely defined, but just starting the
timer when the previous job completes should be good enough.


Christian.

Am 27.08.21 um 15:57 schrieb Andrey Grodzovsky:
The TS represents the point in time when the job was inserted into
the pending list.
I don't think it matters when it actually starts to be processed;
what matters is when the job was inserted into the pending list,
because right at that point you arm the TO timer (when no other is
running already). So when the previous job completes and you cancel
and rearm, you can use the TS from the next job in the pending list
to calculate how much time it actually has left to run before TDR
must be initiated, instead of giving it the full TO value again even
if it has already been running for a while.


Also, I am not sure about the assumption that what we measure is
processing by HW; what we measure is from the moment the job was
scheduled to the ring to the moment it completed (EOP event). At
least that is what our TDR timer measures, so it makes sense to set
the TS at this point.


Andrey

On 2021-08-27 3:20 a.m., Liu, Monk wrote:

[AMD Official Use Only]

what is that 'ts' representing? It looks to me like the jiffies
when the job gets scheduled to the ring, but a job scheduled to the
ring doesn't mean it's being processed by hw.


Thanks

--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Grodzovsky, Andrey 
Sent: Friday, August 27, 2021 4:14 AM
To: Liu, Monk ; amd-gfx@lists.freedesktop.org; 
Koenig, Christian 

Cc: dri-de...@lists.freedesktop.org
Subject: Re: [PATCH] drm/sched: fix the bug of time out 
calculation(v3)


Attached is a quick patch for per-job TTL calculation to make the
next timer expiration more precise. It's on top of the patch in this
thread. Let me know if this makes sense.


Andrey

On 2021-08-26 10:03 a.m., Andrey Grodzovsky wrote:

On 2021-08-26 12:55 a.m., Monk Liu wrote:

issue:
in cleanup_job, cancel_delayed_work will cancel a TO timer even
if its corresponding job is still running.

fix:
do not cancel the timer in cleanup_job; instead do the cancelling
only when the heading job is signaled, and if there is a "next" job
we start_timeout again.

v2:
further clean up the logic, and do the TDR timer cancelling if the
signaled job is the last one in its scheduler.

v3:
change the issue description
remove the cancel_delayed_work at the beginning of cleanup_job
restore the implementation of drm_sched_job_begin.

TODO:
1) introduce pause/resume scheduler in job_timeout to serialize the
handling of scheduler and job_timeout.
2) drop the bad job's del and insert in scheduler due to the above
serialization (no race issue anymore with the serialization)

Signed-off-by: Monk Liu 
---
   drivers/gpu/drm/scheduler/sched_main.c | 25
++---
   1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index a2a9536..ecf8140 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -676,13 +676,7 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
   {
   struct drm_sched_job *job, *next;
   -    /*
- * Don't destroy jobs while the timeout worker is running OR
thread
- * is being parked and hence assumed to not touch pending_list
- */
-    if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
- !cancel_delayed_work(&sched->work_tdr)) ||
-    kthread_should_park())
+    if (kthread_should_park())
   return NULL;


I actually don't see why we need to keep the above, on the other 
side
(in drm_sched_stop) we won't touch the pending li

Re: [PATCH] drm/amdgpu: stop scheduler when calling hw_fini

2021-08-27 Thread Andrey Grodzovsky
I don't think it will start/stop twice because
amdgpu_fence_driver_hw_fini/init is not called during reset.


I am worried about calling drm_sched_start without calling
drm_sched_resubmit_jobs first, since that is the place where the jobs are
actually restarted. Also, calling drm_sched_start with the false flag is
wrong here, since it skips all the pending list handling.
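
For reference, a minimal sketch of the resume ordering being suggested
(resubmit before start, and start with full recovery so the pending list
is handled; ring/sched names as in the quoted patch):

	if (!ring->no_scheduler) {
		/* re-push the jobs from the pending list to the HW ring */
		drm_sched_resubmit_jobs(&ring->sched);
		/* 'true' = full recovery, including pending list handling */
		drm_sched_start(&ring->sched, true);
	}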

Andrey

On 2021-08-27 7:34 a.m., Christian König wrote:
In general that looks good to me, but what could be is that we now try 
to stop/start the scheduler during reset twice.


Andrey what do you think?

Christian.

Am 27.08.21 um 12:40 schrieb Guchun Chen:

This guarantees no more work on the ring can be submitted
to hardware in the suspend/resume case, otherwise the ring will
not be empty before suspend.

Suggested-by: Christian König 
Signed-off-by: Guchun Chen 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 6 ++
  1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index b439eb7d4177..d6e429e63604 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -552,6 +552,9 @@ void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
  if (!ring || !ring->fence_drv.initialized)
  continue;
  +    if (!ring->no_scheduler)
+    drm_sched_stop(&ring->sched, NULL);
+
  /* You can't wait for HW to signal if it's gone */
  if (!drm_dev_is_unplugged(&adev->ddev))
  r = amdgpu_fence_wait_empty(ring);
@@ -611,6 +614,9 @@ void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev)
  if (!ring || !ring->fence_drv.initialized)
  continue;
  +    if (!ring->no_scheduler)
+    drm_sched_start(&ring->sched, false);
+
  /* enable the interrupt */
  if (ring->fence_drv.irq_src)
  amdgpu_irq_get(adev, ring->fence_drv.irq_src,




[PATCH v3 0/4] Various fixes to pass libdrm hotunplug tests

2021-08-27 Thread Andrey Grodzovsky
A bunch of fixes to enable passing the hotplug tests I previously added
here [1] with the latest code.
Once accepted I will enable the tests on the libdrm side.

[1] - https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/172

v2:
Dropping the VCE patch since the relevant function is already fixed in
the latest code.
Moving IOMMU handling to the TTM layer.

v3:
Move pinned list to ttm device and a few others.

Andrey Grodzovsky (4):
  drm/ttm: Create pinned list
  drm/ttm: Clear all DMA mappings on demand
  drm/amdgpu: drm/amdgpu: Handle IOMMU enabled case
  drm/amdgpu: Add a UAPI flag for hot plug/unplug

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  3 +-
 drivers/gpu/drm/ttm/ttm_bo.c   | 18 ++--
 drivers/gpu/drm/ttm/ttm_device.c   | 48 ++
 include/drm/ttm/ttm_device.h   |  2 +
 5 files changed, 68 insertions(+), 5 deletions(-)

-- 
2.25.1



[PATCH v3 1/4] drm/ttm: Create pinned list

2021-08-27 Thread Andrey Grodzovsky
This list will be used to capture all non-VRAM BOs not
on the LRU, so when the device is hot unplugged we can iterate
the list and unmap DMA mappings before the device is removed.

v2: Rename function to ttm_bo_move_to_pinned
v3: Move the pinned list to ttm device

Signed-off-by: Andrey Grodzovsky 
Suggested-by: Christian König 
---
 drivers/gpu/drm/ttm/ttm_bo.c | 18 ++
 drivers/gpu/drm/ttm/ttm_device.c |  1 +
 include/drm/ttm/ttm_device.h |  1 +
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 1b950b45cf4b..1fedd0eb67ba 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -69,7 +69,17 @@ static void ttm_bo_mem_space_debug(struct ttm_buffer_object *bo,
}
 }
 
-static void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
+static inline void ttm_bo_move_to_pinned(struct ttm_buffer_object *bo)
+{
+   struct ttm_device *bdev = bo->bdev;
+
+   list_move_tail(&bo->lru, &bdev->pinned);
+
+   if (bdev->funcs->del_from_lru_notify)
+   bdev->funcs->del_from_lru_notify(bo);
+}
+
+static inline void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
 {
struct ttm_device *bdev = bo->bdev;
 
@@ -98,7 +108,7 @@ void ttm_bo_move_to_lru_tail(struct ttm_buffer_object *bo,
dma_resv_assert_held(bo->base.resv);
 
if (bo->pin_count) {
-   ttm_bo_del_from_lru(bo);
+   ttm_bo_move_to_pinned(bo);
return;
}
 
@@ -339,7 +349,7 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
return ret;
}
 
-   ttm_bo_del_from_lru(bo);
+   ttm_bo_move_to_pinned(bo);
list_del_init(&bo->ddestroy);
spin_unlock(&bo->bdev->lru_lock);
ttm_bo_cleanup_memtype_use(bo);
@@ -1154,7 +1164,7 @@ int ttm_bo_swapout(struct ttm_buffer_object *bo, struct ttm_operation_ctx *ctx,
return 0;
}
 
-   ttm_bo_del_from_lru(bo);
+   ttm_bo_move_to_pinned(bo);
/* TODO: Cleanup the locking */
spin_unlock(&bo->bdev->lru_lock);
 
diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c
index 5f31acec3ad7..530a9c36be37 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -208,6 +208,7 @@ int ttm_device_init(struct ttm_device *bdev, struct ttm_device_funcs *funcs,
INIT_DELAYED_WORK(&bdev->wq, ttm_device_delayed_workqueue);
spin_lock_init(&bdev->lru_lock);
INIT_LIST_HEAD(&bdev->ddestroy);
+   INIT_LIST_HEAD(&bdev->pinned);
bdev->dev_mapping = mapping;
mutex_lock(&ttm_global_mutex);
list_add_tail(&bdev->device_list, &glob->device_list);
diff --git a/include/drm/ttm/ttm_device.h b/include/drm/ttm/ttm_device.h
index cd592f8e941b..03fb44d061e0 100644
--- a/include/drm/ttm/ttm_device.h
+++ b/include/drm/ttm/ttm_device.h
@@ -265,6 +265,7 @@ struct ttm_device {
 */
spinlock_t lru_lock;
struct list_head ddestroy;
+   struct list_head pinned;
 
/*
 * Protected by load / firstopen / lastclose /unload sync.
-- 
2.25.1



[PATCH v3 4/4] drm/amdgpu: Add a UAPI flag for hot plug/unplug

2021-08-27 Thread Andrey Grodzovsky
To support libdrm tests.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 6400259a7c4b..c2fdf67ff551 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -96,9 +96,10 @@
  * - 3.40.0 - Add AMDGPU_IDS_FLAGS_TMZ
  * - 3.41.0 - Add video codec query
  * - 3.42.0 - Add 16bpc fixed point display support
+ * - 3.43.0 - Add device hot plug/unplug support
  */
 #define KMS_DRIVER_MAJOR   3
-#define KMS_DRIVER_MINOR   42
+#define KMS_DRIVER_MINOR   43
 #define KMS_DRIVER_PATCHLEVEL  0
 
 int amdgpu_vram_limit;
-- 
2.25.1
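
For illustration, a hedged sketch of how a libdrm test could gate on this
version bump from userspace (the helper name is hypothetical):

#include <stdbool.h>
#include <xf86drm.h>

static bool amdgpu_kms_supports_hotplug(int fd)
{
	drmVersionPtr ver = drmGetVersion(fd);
	bool ok;

	if (!ver)
		return false;
	/* device hot plug/unplug support was added in KMS driver 3.43 */
	ok = ver->version_major > 3 ||
	     (ver->version_major == 3 && ver->version_minor >= 43);
	drmFreeVersion(ver);
	return ok;
}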



[PATCH v3 2/4] drm/ttm: Clear all DMA mappings on demand

2021-08-27 Thread Andrey Grodzovsky
Used by drivers supporting hot unplug to handle all
DMA IOMMU group related dependencies before the group
is removed during device removal; otherwise we would try
to access it after free, once the last device pointer
from user space is dropped.

v3:
Switch to ttm_bo_get_unless_zero
Iterate bdev for the pinned list
Switch to ttm_tt_unpopulate

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/ttm/ttm_device.c | 47 
 include/drm/ttm/ttm_device.h |  1 +
 2 files changed, 48 insertions(+)

diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c
index 530a9c36be37..a691c89f5b20 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -246,3 +246,50 @@ void ttm_device_fini(struct ttm_device *bdev)
ttm_global_release();
 }
 EXPORT_SYMBOL(ttm_device_fini);
+
+void ttm_device_clear_dma_mappings(struct ttm_device *bdev)
+{
+   struct ttm_resource_manager *man;
+   struct ttm_buffer_object *bo;
+   unsigned int i, j;
+
+   spin_lock(&bdev->lru_lock);
+   while (!list_empty(&bdev->pinned)) {
+   bo = list_first_entry(&bdev->pinned, struct ttm_buffer_object, lru);
+   /* Take ref against racing releases once lru_lock is unlocked */
+   if (ttm_bo_get_unless_zero(bo)) {
+   list_del_init(&bo->lru);
+   spin_unlock(&bdev->lru_lock);
+
+   if (bo->ttm)
+   ttm_tt_unpopulate(bo->bdev, bo->ttm);
+
+   ttm_bo_put(bo);
+   spin_lock(&bdev->lru_lock);
+   }
+   }
+
+   for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
+   man = ttm_manager_type(bdev, i);
+   if (!man || !man->use_tt)
+   continue;
+
+   for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
+   while (!list_empty(&man->lru[j])) {
+   bo = list_first_entry(&man->lru[j], struct ttm_buffer_object, lru);
+   if (ttm_bo_get_unless_zero(bo)) {
+   list_del_init(&bo->lru);
+   spin_unlock(&bdev->lru_lock);
+
+   if (bo->ttm)
+   ttm_tt_unpopulate(bo->bdev, bo->ttm);
+
+   ttm_bo_put(bo);
+   spin_lock(&bdev->lru_lock);
+   }
+   }
+   }
+   }
+   spin_unlock(&bdev->lru_lock);
+}
+EXPORT_SYMBOL(ttm_device_clear_dma_mappings);
diff --git a/include/drm/ttm/ttm_device.h b/include/drm/ttm/ttm_device.h
index 03fb44d061e0..07d722950d5b 100644
--- a/include/drm/ttm/ttm_device.h
+++ b/include/drm/ttm/ttm_device.h
@@ -299,5 +299,6 @@ int ttm_device_init(struct ttm_device *bdev, struct ttm_device_funcs *funcs,
struct drm_vma_offset_manager *vma_manager,
bool use_dma_alloc, bool use_dma32);
 void ttm_device_fini(struct ttm_device *bdev);
+void ttm_device_clear_dma_mappings(struct ttm_device *bdev);
 
 #endif
-- 
2.25.1



[PATCH v3 3/4] drm/amdgpu: drm/amdgpu: Handle IOMMU enabled case

2021-08-27 Thread Andrey Grodzovsky
Handle all DMA IOMMU group related dependencies before the
group is removed; otherwise we would try to access it after free.

v2:
Move the actual handling function to TTM

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0b5764aa98a4..653bd8fdaa33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3860,6 +3860,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 
amdgpu_device_ip_fini_early(adev);
 
+   ttm_device_clear_dma_mappings(&adev->mman.bdev);
+
amdgpu_gart_dummy_page_fini(adev);
 
amdgpu_device_unmap_mmio(adev);
-- 
2.25.1



Re: [PATCH v3 1/4] drm/ttm: Create pinned list

2021-08-30 Thread Andrey Grodzovsky



On 2021-08-30 4:58 a.m., Christian König wrote:

Am 27.08.21 um 22:39 schrieb Andrey Grodzovsky:

This list will be used to capture all non-VRAM BOs not
on the LRU, so when the device is hot unplugged we can iterate
the list and unmap DMA mappings before the device is removed.

v2: Rename function to ttm_bo_move_to_pinned
v3: Move the pinned list to ttm device


As far as I can see there is no list_del() remaining. So this won't
work correctly.



It's in ttm_bo_release, there was no code change there hence it's not 
captured in the patch.


Andrey




I suggest rather rebasing on top of the stuff I've been working on for a
while to move the LRU into the resource instead.


I just sent out the latest patch set of this with you in CC.

Christian.



Signed-off-by: Andrey Grodzovsky 
Suggested-by: Christian König 
---
  drivers/gpu/drm/ttm/ttm_bo.c | 18 ++
  drivers/gpu/drm/ttm/ttm_device.c |  1 +
  include/drm/ttm/ttm_device.h |  1 +
  3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 1b950b45cf4b..1fedd0eb67ba 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -69,7 +69,17 @@ static void ttm_bo_mem_space_debug(struct 
ttm_buffer_object *bo,

  }
  }
  -static void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
+static inline void ttm_bo_move_to_pinned(struct ttm_buffer_object *bo)
+{
+    struct ttm_device *bdev = bo->bdev;
+
+    list_move_tail(&bo->lru, &bdev->pinned);
+
+    if (bdev->funcs->del_from_lru_notify)
+    bdev->funcs->del_from_lru_notify(bo);
+}
+
+static inline void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
  {
  struct ttm_device *bdev = bo->bdev;
  @@ -98,7 +108,7 @@ void ttm_bo_move_to_lru_tail(struct 
ttm_buffer_object *bo,

  dma_resv_assert_held(bo->base.resv);
    if (bo->pin_count) {
-    ttm_bo_del_from_lru(bo);
+    ttm_bo_move_to_pinned(bo);
  return;
  }
  @@ -339,7 +349,7 @@ static int ttm_bo_cleanup_refs(struct 
ttm_buffer_object *bo,

  return ret;
  }
  -    ttm_bo_del_from_lru(bo);
+    ttm_bo_move_to_pinned(bo);
  list_del_init(&bo->ddestroy);
  spin_unlock(&bo->bdev->lru_lock);
  ttm_bo_cleanup_memtype_use(bo);
@@ -1154,7 +1164,7 @@ int ttm_bo_swapout(struct ttm_buffer_object 
*bo, struct ttm_operation_ctx *ctx,

  return 0;
  }
  -    ttm_bo_del_from_lru(bo);
+    ttm_bo_move_to_pinned(bo);
  /* TODO: Cleanup the locking */
  spin_unlock(&bo->bdev->lru_lock);
  diff --git a/drivers/gpu/drm/ttm/ttm_device.c 
b/drivers/gpu/drm/ttm/ttm_device.c

index 5f31acec3ad7..530a9c36be37 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -208,6 +208,7 @@ int ttm_device_init(struct ttm_device *bdev, 
struct ttm_device_funcs *funcs,

  INIT_DELAYED_WORK(&bdev->wq, ttm_device_delayed_workqueue);
  spin_lock_init(&bdev->lru_lock);
  INIT_LIST_HEAD(&bdev->ddestroy);
+    INIT_LIST_HEAD(&bdev->pinned);
  bdev->dev_mapping = mapping;
  mutex_lock(&ttm_global_mutex);
  list_add_tail(&bdev->device_list, &glob->device_list);
diff --git a/include/drm/ttm/ttm_device.h b/include/drm/ttm/ttm_device.h
index cd592f8e941b..03fb44d061e0 100644
--- a/include/drm/ttm/ttm_device.h
+++ b/include/drm/ttm/ttm_device.h
@@ -265,6 +265,7 @@ struct ttm_device {
   */
  spinlock_t lru_lock;
  struct list_head ddestroy;
+    struct list_head pinned;
    /*
   * Protected by load / firstopen / lastclose /unload sync.




Re: [PATCH v3 1/4] drm/ttm: Create pinned list

2021-08-30 Thread Andrey Grodzovsky



On 2021-08-30 12:51 p.m., Christian König wrote:

Am 30.08.21 um 16:16 schrieb Andrey Grodzovsky:


On 2021-08-30 4:58 a.m., Christian König wrote:

Am 27.08.21 um 22:39 schrieb Andrey Grodzovsky:

This list will be used to capture all non-VRAM BOs not
on the LRU, so when the device is hot unplugged we can iterate
the list and unmap DMA mappings before the device is removed.

v2: Rename function to ttm_bo_move_to_pinned
v3: Move the pinned list to ttm device


As far as I can see there is no list_del() remaining. So this won't
work correctly.



It's in ttm_bo_release, there was no code change there hence it's not 
captured in the patch.


Ah! So you keep the logic as is there. Sorry totally missed that.

In this case the patch is Reviewed-by: Christian König 



Can you push this to drm-misc-next?

Thanks,
Christian.



I think it's supposed to go on top of the changes you mention here,
which are not pushed yet.
I will need to apply all the patches on top of yours and retest (I was
doing everything in amd-staging-drm-next until now).

Andrey






Andrey




I suggest rather rebasing on top of the stuff I've been working on for a
while to move the LRU into the resource instead.


I just sent out the latest patch set of this with you in CC.

Christian.



Signed-off-by: Andrey Grodzovsky 
Suggested-by: Christian König 
---
  drivers/gpu/drm/ttm/ttm_bo.c | 18 ++
  drivers/gpu/drm/ttm/ttm_device.c |  1 +
  include/drm/ttm/ttm_device.h |  1 +
  3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c 
b/drivers/gpu/drm/ttm/ttm_bo.c

index 1b950b45cf4b..1fedd0eb67ba 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -69,7 +69,17 @@ static void ttm_bo_mem_space_debug(struct 
ttm_buffer_object *bo,

  }
  }
  -static void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
+static inline void ttm_bo_move_to_pinned(struct ttm_buffer_object 
*bo)

+{
+    struct ttm_device *bdev = bo->bdev;
+
+    list_move_tail(&bo->lru, &bdev->pinned);
+
+    if (bdev->funcs->del_from_lru_notify)
+    bdev->funcs->del_from_lru_notify(bo);
+}
+
+static inline void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
  {
  struct ttm_device *bdev = bo->bdev;
  @@ -98,7 +108,7 @@ void ttm_bo_move_to_lru_tail(struct 
ttm_buffer_object *bo,

  dma_resv_assert_held(bo->base.resv);
    if (bo->pin_count) {
-    ttm_bo_del_from_lru(bo);
+    ttm_bo_move_to_pinned(bo);
  return;
  }
  @@ -339,7 +349,7 @@ static int ttm_bo_cleanup_refs(struct 
ttm_buffer_object *bo,

  return ret;
  }
  -    ttm_bo_del_from_lru(bo);
+    ttm_bo_move_to_pinned(bo);
  list_del_init(&bo->ddestroy);
  spin_unlock(&bo->bdev->lru_lock);
  ttm_bo_cleanup_memtype_use(bo);
@@ -1154,7 +1164,7 @@ int ttm_bo_swapout(struct ttm_buffer_object 
*bo, struct ttm_operation_ctx *ctx,

  return 0;
  }
  -    ttm_bo_del_from_lru(bo);
+    ttm_bo_move_to_pinned(bo);
  /* TODO: Cleanup the locking */
  spin_unlock(&bo->bdev->lru_lock);
  diff --git a/drivers/gpu/drm/ttm/ttm_device.c 
b/drivers/gpu/drm/ttm/ttm_device.c

index 5f31acec3ad7..530a9c36be37 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -208,6 +208,7 @@ int ttm_device_init(struct ttm_device *bdev, 
struct ttm_device_funcs *funcs,

  INIT_DELAYED_WORK(&bdev->wq, ttm_device_delayed_workqueue);
  spin_lock_init(&bdev->lru_lock);
  INIT_LIST_HEAD(&bdev->ddestroy);
+    INIT_LIST_HEAD(&bdev->pinned);
  bdev->dev_mapping = mapping;
  mutex_lock(&ttm_global_mutex);
  list_add_tail(&bdev->device_list, &glob->device_list);
diff --git a/include/drm/ttm/ttm_device.h 
b/include/drm/ttm/ttm_device.h

index cd592f8e941b..03fb44d061e0 100644
--- a/include/drm/ttm/ttm_device.h
+++ b/include/drm/ttm/ttm_device.h
@@ -265,6 +265,7 @@ struct ttm_device {
   */
  spinlock_t lru_lock;
  struct list_head ddestroy;
+    struct list_head pinned;
    /*
   * Protected by load / firstopen / lastclose /unload sync.






Re: [PATCH v3 1/4] drm/ttm: Create pinned list

2021-08-30 Thread Andrey Grodzovsky



On 2021-08-30 1:05 p.m., Christian König wrote:



Am 30.08.21 um 19:02 schrieb Andrey Grodzovsky:


On 2021-08-30 12:51 p.m., Christian König wrote:

Am 30.08.21 um 16:16 schrieb Andrey Grodzovsky:


On 2021-08-30 4:58 a.m., Christian König wrote:

Am 27.08.21 um 22:39 schrieb Andrey Grodzovsky:

This list will be used to capture all non-VRAM BOs not
on the LRU, so when the device is hot unplugged we can iterate
the list and unmap DMA mappings before the device is removed.

v2: Rename function to ttm_bo_move_to_pinned
v3: Move the pinned list to ttm device


As far as I can see there is no list_del() remaining. So this
won't work correctly.



It's in ttm_bo_release, there was no code change there hence it's 
not captured in the patch.


Ah! So you keep the logic as is there. Sorry totally missed that.

In this case the patch is Reviewed-by: Christian König 



Can you push this to drm-misc-next?

Thanks,
Christian.



I think it's supposed to go on top of the changes you mention here,
which are not pushed yet.
I will need to apply all the patches on top of yours and retest (I
was doing everything in amd-staging-drm-next until now).


Works for me as well. Alternatively you can just push this patch here 
to drm-misc-next so that I can rebase on top and merge the rest 
through amd-staging-drm-next.


The patch pushed to drm-misc-next should automatically fall out when 
Alex rebases his stuff on upstream the next time.


Christian.



So I can both push this specific patch to drm-misc-next and also push
the entire 4-patch series to amd-staging-drm-next (after the rest of the
patches are RBed)?


Andrey






Andrey






Andrey




I suggest rather rebasing on top of the stuff I've been working on
for a while to move the LRU into the resource instead.


I just sent out the latest patch set of this with you in CC.

Christian.



Signed-off-by: Andrey Grodzovsky 
Suggested-by: Christian König 
---
  drivers/gpu/drm/ttm/ttm_bo.c | 18 ++
  drivers/gpu/drm/ttm/ttm_device.c |  1 +
  include/drm/ttm/ttm_device.h |  1 +
  3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c 
b/drivers/gpu/drm/ttm/ttm_bo.c

index 1b950b45cf4b..1fedd0eb67ba 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -69,7 +69,17 @@ static void ttm_bo_mem_space_debug(struct 
ttm_buffer_object *bo,

  }
  }
  -static void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
+static inline void ttm_bo_move_to_pinned(struct 
ttm_buffer_object *bo)

+{
+    struct ttm_device *bdev = bo->bdev;
+
+    list_move_tail(&bo->lru, &bdev->pinned);
+
+    if (bdev->funcs->del_from_lru_notify)
+    bdev->funcs->del_from_lru_notify(bo);
+}
+
+static inline void ttm_bo_del_from_lru(struct ttm_buffer_object 
*bo)

  {
  struct ttm_device *bdev = bo->bdev;
  @@ -98,7 +108,7 @@ void ttm_bo_move_to_lru_tail(struct 
ttm_buffer_object *bo,

  dma_resv_assert_held(bo->base.resv);
    if (bo->pin_count) {
-    ttm_bo_del_from_lru(bo);
+    ttm_bo_move_to_pinned(bo);
  return;
  }
  @@ -339,7 +349,7 @@ static int ttm_bo_cleanup_refs(struct 
ttm_buffer_object *bo,

  return ret;
  }
  -    ttm_bo_del_from_lru(bo);
+    ttm_bo_move_to_pinned(bo);
  list_del_init(&bo->ddestroy);
  spin_unlock(&bo->bdev->lru_lock);
  ttm_bo_cleanup_memtype_use(bo);
@@ -1154,7 +1164,7 @@ int ttm_bo_swapout(struct ttm_buffer_object 
*bo, struct ttm_operation_ctx *ctx,

  return 0;
  }
  -    ttm_bo_del_from_lru(bo);
+    ttm_bo_move_to_pinned(bo);
  /* TODO: Cleanup the locking */
  spin_unlock(&bo->bdev->lru_lock);
  diff --git a/drivers/gpu/drm/ttm/ttm_device.c 
b/drivers/gpu/drm/ttm/ttm_device.c

index 5f31acec3ad7..530a9c36be37 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -208,6 +208,7 @@ int ttm_device_init(struct ttm_device *bdev, 
struct ttm_device_funcs *funcs,

  INIT_DELAYED_WORK(&bdev->wq, ttm_device_delayed_workqueue);
  spin_lock_init(&bdev->lru_lock);
  INIT_LIST_HEAD(&bdev->ddestroy);
+    INIT_LIST_HEAD(&bdev->pinned);
  bdev->dev_mapping = mapping;
  mutex_lock(&ttm_global_mutex);
  list_add_tail(&bdev->device_list, &glob->device_list);
diff --git a/include/drm/ttm/ttm_device.h 
b/include/drm/ttm/ttm_device.h

index cd592f8e941b..03fb44d061e0 100644
--- a/include/drm/ttm/ttm_device.h
+++ b/include/drm/ttm/ttm_device.h
@@ -265,6 +265,7 @@ struct ttm_device {
   */
  spinlock_t lru_lock;
  struct list_head ddestroy;
+    struct list_head pinned;
    /*
   * Protected by load / firstopen / lastclose /unload sync.








Re: [PATCH] drm/amdgpu: stop scheduler when calling hw_fini (v2)

2021-08-30 Thread Andrey Grodzovsky

Looks good to me too.

Andrey

On 2021-08-30 2:17 a.m., Christian König wrote:

Am 30.08.21 um 08:08 schrieb Guchun Chen:

This guarantees no more work on the ring can be submitted
to hardware in the suspend/resume case, otherwise a potential
race will occur and the ring will get no chance to stay
empty before suspend.

v2: Call drm_sched_resubmit_jobs before drm_sched_start to
restart jobs from the pending list.

Suggested-by: Andrey Grodzovsky 
Suggested-by: Christian König 
Signed-off-by: Guchun Chen 


Reviewed-by: Christian König 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 8 
  1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index b439eb7d4177..fd4ba076ff8a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -552,6 +552,9 @@ void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
  if (!ring || !ring->fence_drv.initialized)
  continue;
  +    if (!ring->no_scheduler)
+    drm_sched_stop(&ring->sched, NULL);
+
  /* You can't wait for HW to signal if it's gone */
  if (!drm_dev_is_unplugged(&adev->ddev))
  r = amdgpu_fence_wait_empty(ring);
@@ -611,6 +614,11 @@ void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev)
  if (!ring || !ring->fence_drv.initialized)
  continue;
  +    if (!ring->no_scheduler) {
+    drm_sched_resubmit_jobs(&ring->sched);
+    drm_sched_start(&ring->sched, true);
+    }
+
  /* enable the interrupt */
  if (ring->fence_drv.irq_src)
  amdgpu_irq_get(adev, ring->fence_drv.irq_src,




Re: [PATCH] drm/amdgpu: Fix a deadlock if previous GEM object allocation fails

2021-08-30 Thread Andrey Grodzovsky



On 2021-08-30 11:24 p.m., Pan, Xinhui wrote:

[AMD Official Use Only]

Unreserve the root BO before returning, otherwise the next allocation will
deadlock.

Signed-off-by: xinhui pan 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 11 +--
  1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 85b292ed5c43..c9db7d2c5816 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -355,19 +355,18 @@ int amdgpu_gem_create_ioctl(struct drm_device *dev, void *data,
 DRM_DEBUG("Failed to allocate GEM object (%llu, %d, %llu, %d)\n",
   size, initial_domain, args->in.alignment, r);
 }
+
+   if (flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID)
+   amdgpu_bo_unreserve(vm->root.bo);
 return r;
 }

 if (flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) {
-   if (!r) {
-   struct amdgpu_bo *abo = gem_to_amdgpu_bo(gobj);
+   struct amdgpu_bo *abo = gem_to_amdgpu_bo(gobj);

-   abo->parent = amdgpu_bo_ref(vm->root.bo);
-   }
+   abo->parent = amdgpu_bo_ref(vm->root.bo);
 amdgpu_bo_unreserve(vm->root.bo);
 }
-   if (r)
-   return r;



The above early return seems to be needed for the -ERESTARTSYS case.

Andrey




 r = drm_gem_handle_create(filp, gobj, &handle);
 /* drop reference from allocate - handle holds it now */
--
2.25.1


Re: [PATCH 2/2] drm/sched: serialize job_timeout and scheduler

2021-08-31 Thread Andrey Grodzovsky

It says patch [2/2], but I can't find patch 1.

On 2021-08-31 6:35 a.m., Monk Liu wrote:

tested-by: jingwen chen 
Signed-off-by: Monk Liu 
Signed-off-by: jingwen chen 
---
  drivers/gpu/drm/scheduler/sched_main.c | 24 
  1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index ecf8140..894fdb24 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -319,19 +319,17 @@ static void drm_sched_job_timedout(struct work_struct *work)
sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
  
  	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */

+   if (!__kthread_should_park(sched->thread))
+   kthread_park(sched->thread);
+



As mentioned before, without serializing against other TDR handlers from
other schedulers you just race here against them, e.g. you parked it now
but another one in progress will unpark it as part of calling
drm_sched_start for other rings [1].

Unless I am missing something, since I haven't found patch [1/2].

[1] - https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c#L5041


Andrey



spin_lock(&sched->job_list_lock);
job = list_first_entry_or_null(&sched->pending_list,
   struct drm_sched_job, list);
  
  	if (job) {

-   /*
-* Remove the bad job so it cannot be freed by concurrent
-* drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
-* is parked at which point it's safe.
-*/
-   list_del_init(&job->list);
spin_unlock(&sched->job_list_lock);
  
+		/* vendor's timeout_job should call drm_sched_start() */

status = job->sched->ops->timedout_job(job);
  
  		/*

@@ -393,20 +391,6 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
kthread_park(sched->thread);
  
  	/*

-* Reinsert back the bad job here - now it's safe as
-* drm_sched_get_cleanup_job cannot race against us and release the
-* bad job at this point - we parked (waited for) any in progress
-* (earlier) cleanups and drm_sched_get_cleanup_job will not be called
-* now until the scheduler thread is unparked.
-*/
-   if (bad && bad->sched == sched)
-   /*
-* Add at the head of the queue to reflect it was the earliest
-* job extracted.
-*/
-   list_add(&bad->list, &sched->pending_list);
-
-   /*
 * Iterate the job list from later to  earlier one and either deactive
 * their HW callbacks or remove them from pending list if they already
 * signaled.


Re: [PATCH 2/2] drm/sched: serialize job_timeout and scheduler

2021-08-31 Thread Andrey Grodzovsky



On 2021-08-31 10:03 a.m., Daniel Vetter wrote:

On Tue, Aug 31, 2021 at 09:53:36AM -0400, Andrey Grodzovsky wrote:

It says patch [2/2], but I can't find patch 1.

On 2021-08-31 6:35 a.m., Monk Liu wrote:

tested-by: jingwen chen 
Signed-off-by: Monk Liu 
Signed-off-by: jingwen chen 
---
   drivers/gpu/drm/scheduler/sched_main.c | 24 
   1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index ecf8140..894fdb24 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -319,19 +319,17 @@ static void drm_sched_job_timedout(struct work_struct 
*work)
sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
+   if (!__kthread_should_park(sched->thread))
+   kthread_park(sched->thread);
+


As mentioned before, without serializing against other TDR handlers from
other schedulers you just race here against them, e.g. you parked it now
but another one in progress will unpark it as part of calling
drm_sched_start for other rings [1].
Unless I am missing something, since I haven't found patch [1/2].

[1] - https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c#L5041

You need to have your own wq and run all your tdr work on the same wq if
your reset has any cross-engine impact.
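
(For illustration, a minimal sketch of the single-ordered-workqueue idea;
all names below are hypothetical:)

#include <linux/workqueue.h>

static struct workqueue_struct *my_tdr_wq;

static int my_tdr_wq_init(void)
{
	/* ordered: at most one TDR work item executes at any time */
	my_tdr_wq = alloc_ordered_workqueue("my-tdr", 0);
	return my_tdr_wq ? 0 : -ENOMEM;
}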



IMHO what is problematic in serializing vs. locking (with trylock and
bail out like we do in [1]) is the case of multiple TO events arising
from the same reason: maybe one job just waits for another, and once the
first hangs, the second will also appear to be hanged, triggering its own
TO event. In this case multiple TO events will trigger multiple resets
if we serialize, but if we use a lock with trylock the second one will
quietly bail out.


[1] 
https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c#L4903
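
To make the trylock-and-bail pattern concrete, a hypothetical sketch (names
are illustrative, not the actual amdgpu code in [1]):

static void my_handle_timeout(struct my_device *mdev)
{
	/* a concurrent TO handler already owns the reset - quietly bail */
	if (!mutex_trylock(&mdev->reset_lock))
		return;

	my_device_full_reset(mdev);
	mutex_unlock(&mdev->reset_lock);
}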


Andrey




See

https://dri.freedesktop.org/docs/drm/gpu/drm-mm.html#c.drm_sched_backend_ops

for the ->timedout_job callback docs. I thought I brought this up already?
-Daniel



Yes, this discussion is a continuation of your comment about
serializing; I mentioned before that you proposed it.


Andrey





Andrey



spin_lock(&sched->job_list_lock);
job = list_first_entry_or_null(&sched->pending_list,
   struct drm_sched_job, list);
if (job) {
-   /*
-* Remove the bad job so it cannot be freed by concurrent
-* drm_sched_cleanup_jobs. It will be reinserted back after 
sched->thread
-* is parked at which point it's safe.
-*/
-   list_del_init(&job->list);
spin_unlock(&sched->job_list_lock);
+   /* vendor's timeout_job should call drm_sched_start() */
status = job->sched->ops->timedout_job(job);
/*
@@ -393,20 +391,6 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, 
struct drm_sched_job *bad)
kthread_park(sched->thread);
/*
-* Reinsert back the bad job here - now it's safe as
-* drm_sched_get_cleanup_job cannot race against us and release the
-* bad job at this point - we parked (waited for) any in progress
-* (earlier) cleanups and drm_sched_get_cleanup_job will not be called
-* now until the scheduler thread is unparked.
-*/
-   if (bad && bad->sched == sched)
-   /*
-* Add at the head of the queue to reflect it was the earliest
-* job extracted.
-*/
-   list_add(&bad->list, &sched->pending_list);
-
-   /*
 * Iterate the job list from later to  earlier one and either deactive
 * their HW callbacks or remove them from pending list if they already
 * signaled.


Re: [PATCH 2/2] drm/sched: serialize job_timeout and scheduler

2021-08-31 Thread Andrey Grodzovsky



On 2021-08-31 10:38 a.m., Daniel Vetter wrote:

On Tue, Aug 31, 2021 at 10:20:40AM -0400, Andrey Grodzovsky wrote:

On 2021-08-31 10:03 a.m., Daniel Vetter wrote:

On Tue, Aug 31, 2021 at 09:53:36AM -0400, Andrey Grodzovsky wrote:

It says patch [2/2], but I can't find patch 1.

On 2021-08-31 6:35 a.m., Monk Liu wrote:

tested-by: jingwen chen 
Signed-off-by: Monk Liu 
Signed-off-by: jingwen chen 
---
drivers/gpu/drm/scheduler/sched_main.c | 24 
1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index ecf8140..894fdb24 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -319,19 +319,17 @@ static void drm_sched_job_timedout(struct work_struct 
*work)
sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
+   if (!__kthread_should_park(sched->thread))
+   kthread_park(sched->thread);
+

As mentioned before, without serializing against other TDR handlers from
other schedulers you just race here against them, e.g. you parked it now
but another one in progress will unpark it as part of calling
drm_sched_start for other rings [1].
Unless I am missing something, since I haven't found patch [1/2].

[1] - https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c#L5041

You need to have your own wq and run all your tdr work on the same wq if
your reset has any cross-engine impact.
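
As a minimal sketch of that wq idea (my_driver and my_ring_queue_tdr are
hypothetical names, not actual amdgpu code): allocate one ordered workqueue
and funnel every ring's timeout work through it:

#include <linux/workqueue.h>

struct my_driver {
	struct workqueue_struct *tdr_wq; /* single ordered queue for all rings */
};

static int my_driver_init(struct my_driver *drv)
{
	/* an ordered workqueue executes at most one work item at a time,
	 * which serializes the TDR handlers of all rings/schedulers */
	drv->tdr_wq = alloc_ordered_workqueue("my-tdr", 0);
	return drv->tdr_wq ? 0 : -ENOMEM;
}

static void my_ring_queue_tdr(struct my_driver *drv, struct work_struct *w)
{
	/* every ring funnels its timeout work through the same queue */
	queue_work(drv->tdr_wq, w);
}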


IMHO what is problematic in serializing vs. locking (with trylock and bail
out like we do in [1]) is multiple TO events arising from the same reason,
like maybe one job just waits for another, and once the first hangs the
second will also appear to be hung, triggering its own TO event.
In this case multiple TO events will trigger multiple resets if we
serialize, but if we use a lock with trylock the second one will quietly
bail out.

[1] 
https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c#L4903
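
The trylock-and-bail pattern described above, as a rough sketch (reset_lock
and my_driver are invented stand-ins for amdgpu's actual reset
serialization, which uses its own semaphore/flags):

static void my_gpu_recover(struct my_driver *drv)
{
	/* a second timeout handler racing for the same underlying hang
	 * fails the trylock and quietly bails instead of resetting twice */
	if (!mutex_trylock(&drv->reset_lock))
		return;

	/* ... full adapter reset and drm_sched_start() for all rings ... */

	mutex_unlock(&drv->reset_lock);
}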

Hm, so I guess a single wq here that will hold up all other TOs. And they
should recheck whether the job is moving meanwhile.
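
One possible reading of that recheck, sketched against the
drm_sched_backend_ops of the time (the early-return policy here is an
assumption for illustration, not existing scheduler code):

#include <drm/gpu_scheduler.h>
#include <linux/dma-fence.h>

static enum drm_gpu_sched_stat my_timedout_job(struct drm_sched_job *job)
{
	/* by the time a serialized TDR handler finally gets its turn on
	 * the shared queue, the job may have completed; skip the reset */
	if (dma_fence_is_signaled(&job->s_fence->finished))
		return DRM_GPU_SCHED_STAT_NOMINAL;

	/* ... otherwise do the actual recovery ... */
	return DRM_GPU_SCHED_STAT_NOMINAL;
}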



Can you clarify this? What job should be moving? The dependent job?




Also, unless you use hw semaphores the job shouldn't even start before the
deps are signalled, so not sure how this goes wrong?



What about a simple example where
we actually can submit a shader on one ring and a simple
WAIT_REG_MEM packet on another to wait for the shader to write
a specific value to a specific memory location. Here you have both of
them started
in close proximity and no explicit dependencies involved (at the
scheduler level),

and yet if the shader hangs, the WAIT_REG_MEM job will hang as well.




The vm_id flush stuff can make things a bit more fun for your specific
case, but there you have to run all TO handlers on the
same ordered workqueue anyway (because trying to paper over this in other
ways doesn't work imo).



I didn't get this one.

Andrey




So I think this should all work, no need for tricky cross-scheduler
locking.
-Daniel


Andrey



See

https://dri.freedesktop.org/docs/drm/gpu/drm-mm.html#c.drm_sched_backend_ops

for the ->timedout_job callback docs. I thought I brought this up already?
-Daniel


Yes, this discussion is a continuation of your comment about serializing;
I mentioned before that you proposed it.

Andrey



Andrey



spin_lock(&sched->job_list_lock);
job = list_first_entry_or_null(&sched->pending_list,
   struct drm_sched_job, list);
if (job) 

Re: [PATCH v2] Revert "drm/scheduler: Avoid accessing freed bad job."

2021-08-31 Thread Andrey Grodzovsky



On 2021-08-31 9:11 a.m., Daniel Vetter wrote:

On Thu, Aug 26, 2021 at 11:04:14AM +0200, Daniel Vetter wrote:

On Thu, Aug 19, 2021 at 11:25:09AM -0400, Andrey Grodzovsky wrote:

On 2021-08-19 5:30 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:51:00AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:42 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:36:32AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:32 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:26:25AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:02 a.m., Alex Deucher wrote:


+ dri-devel

Since scheduler is a shared component, please add dri-devel on all
scheduler patches.

On Wed, Aug 18, 2021 at 7:21 AM Jingwen Chen  wrote:

[Why]
For a bailing job, this commit will delete it from the pending list, thus
the bailing job will never have a chance to be resubmitted, even in advance
TDR mode.

[How]
After embedding the hw_fence into amdgpu_job is done, the race condition
that this commit tries to work around is completely solved. So revert this
commit.
This reverts commit 135517d3565b48f4def3b1b82008bc17eb5d1c90.
v2:
add dma_fence_get/put() around timedout_job to avoid concurrent delete
during processing timedout_job

Signed-off-by: Jingwen Chen 
---
  drivers/gpu/drm/scheduler/sched_main.c | 23 +--
  1 file changed, 5 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index a2a953693b45..f9b9b3aefc4a 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -314,6 +314,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
  {
 struct drm_gpu_scheduler *sched;
 struct drm_sched_job *job;
+   struct dma_fence *fence;
 enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL;

 sched = container_of(work, struct drm_gpu_scheduler, 
work_tdr.work);
@@ -325,11 +326,10 @@ static void drm_sched_job_timedout(struct work_struct 
*work)

 if (job) {
 /*
-* Remove the bad job so it cannot be freed by concurrent
-* drm_sched_cleanup_jobs. It will be reinserted back after 
sched->thread
-* is parked at which point it's safe.
+* Get job->s_fence->parent here to avoid concurrent delete 
during
+* processing timedout_job
  */
-   list_del_init(&job->list);
+   fence = dma_fence_get(job->s_fence->parent);

While this is true for amdgpu, it has no meaning for other drivers for whom
we haven't
done the refactoring of embedding the HW fence (parent) into the job
structure. In fact, thinking
about it, unless you do the HW fence embedding for all the drivers using
the scheduler, you cannot
revert this patch or you will just break them.

btw, why did you do that embedding? I do still have my patches with
dma_fence annotations floating around, but my idea at least was to fix
that issue with a mempool, not with embedding. What was the motivation
for embedding the hw fence?
-Daniel
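
For reference, the mempool direction would look roughly like the sketch
below; my_hw_fence and the reserve size are made up for illustration:

#include <linux/mempool.h>
#include <linux/dma-fence.h>

struct my_hw_fence {
	struct dma_fence base;
};

static mempool_t *hw_fence_pool;

static int my_fence_pool_init(void)
{
	/* keep a reserve of fence objects so the submission path never
	 * has to enter direct reclaim under memory pressure */
	hw_fence_pool = mempool_create_kmalloc_pool(16,
						    sizeof(struct my_hw_fence));
	return hw_fence_pool ? 0 : -ENOMEM;
}

static struct my_hw_fence *my_fence_alloc(void)
{
	/* mempool_alloc() first tries the allocator without direct
	 * reclaim, then falls back to the preallocated reserve */
	return mempool_alloc(hw_fence_pool, GFP_KERNEL);
}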

The motivation was twofold: avoid memory allocation during job submission
(HW fence allocation), because as Christian explained this leads to deadlock
with
mm code during evictions due to memory pressure (Christian can clarify if I
messed

Yeah, that's the exact same thing I've chased with my dma_fence
annotations, but thus far zero to no interest in getting it sorted. I
think it'd be good to have some cross-driver agreement on how this should
be solved before someone just charges ahead ...


this explanation). Second is to exactly revert this patch, because while it
solved the issue
described in the patch, it created another with drivers who bailed out early
during TDR handling
for various reasons, and the job would just leak because it was already
removed from the pending list.

Can't we reinsert it before we restart the scheduler thread? It might need
a separate list for that due to the lockless queue tricks. Or am I
thinking about the wrong kind of "we lost the job"?
-Daniel

If you look at the original patch, it would reinsert it even earlier - right
after stopping the SW scheduler thread, and even then it was too late for
some drivers as they would decide to return back from their TDR handler even
before that. It is solvable, but in an ugly way as far as I see: you need to
require each driver in its code to put the job back in the list if they do
it before reaching the place where the scheduler framework does it. Kind of
spaghetti code, it seems to me.

Hm, yeah, I didn't realize this all happens before we stop the scheduler
thread.

Why can't we stop the scheduler thread first, so that there's guaranteed
no race? I've recently had a lot of discussions with panfrost folks about
their reset that spans across engines, and without stopping the sch

Re: [PATCH 2/2] drm/sched: serialize job_timeout and scheduler

2021-08-31 Thread Andrey Grodzovsky



On 2021-08-31 12:01 p.m., Luben Tuikov wrote:

On 2021-08-31 11:23, Andrey Grodzovsky wrote:

On 2021-08-31 10:38 a.m., Daniel Vetter wrote:

On Tue, Aug 31, 2021 at 10:20:40AM -0400, Andrey Grodzovsky wrote:

On 2021-08-31 10:03 a.m., Daniel Vetter wrote:

On Tue, Aug 31, 2021 at 09:53:36AM -0400, Andrey Grodzovsky wrote:

It says patch [2/2] but I can't find patch 1

On 2021-08-31 6:35 a.m., Monk Liu wrote:

tested-by: jingwen chen 
Signed-off-by: Monk Liu 
Signed-off-by: jingwen chen 
---
 drivers/gpu/drm/scheduler/sched_main.c | 24 
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index ecf8140..894fdb24 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -319,19 +319,17 @@ static void drm_sched_job_timedout(struct work_struct 
*work)
sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
+   if (!__kthread_should_park(sched->thread))
+   kthread_park(sched->thread);
+

As mentioned before, without serializing against other TDR handlers from
other
schedulers you just race here against them, e.g. you parked it now but
another
one in progress will unpark it as part of calling drm_sched_start for other
rings [1].
Unless I am missing something, since I haven't found patch [1/2]

[1] - 
https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c#L5041

You need to have your own wq and run all your tdr work on the same wq if
your reset has any cross-engine impact.

IMHO what is problematic in serializing vs. locking (with trylock and bail
out like we do in [1]) is multiple TO events arising from the same reason,
like maybe one job just waits for another, and once the first hangs the
second will also appear to be hung, triggering its own TO event.
In this case multiple TO events will trigger multiple resets if we
serialize, but if we use a lock with trylock the second one will quietly
bail out.

[1] 
https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c#L4903

Hm, so I guess a single wq here that will hold up all other TOs. And they
should recheck whether the job is moving meanwhile.

Can you clarify this? What job should be moving? The dependent job?



Also, unless you use hw semaphores the job shouldn't even start before the
deps are signalled, so not sure how this goes wrong?

What about a simple example where
we actually can submit a shader on one ring and a simple
WAIT_REG_MEM packet on another to wait for the shader to write
a specific value to a specific memory location. Here you have both of them
started
in close proximity and no explicit dependencies involved (at the
scheduler level),
and yet if the shader hangs, the WAIT_REG_MEM job will hang as well.



The vm_id flush stuff can make things a bit more fun for your specific
case, but there you have to run all TO handlers on the
same ordered workqueue anyway (because trying to paper over this in other
ways doesn't work imo).

I didn't get this one.

So, a while back I tried to "serialize" this by moving timed-out jobs
into their own timed-out-dedicated list, then freeing them asynchronously,
but I never got it to work reliably due to races with low-level drivers and
assumptions made way back.

My idea was to atomically move timed-out jobs into their own list at the
time of timeout, and later asynchronously free them (or better yet, inquire
about their state, and free them or move them back--ideally the inquiry is
atomic and done at timeout time before being moved to the timeout list, as
sketched below). Anyway...
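
As a sketch, that atomic move maps onto the existing list helpers; the
caller-provided timed-out list is invented here for illustration:

#include <drm/gpu_scheduler.h>
#include <linux/list.h>

static struct drm_sched_job *
my_fetch_timedout_job(struct drm_gpu_scheduler *sched,
		      struct list_head *timedout_list)
{
	struct drm_sched_job *job;

	spin_lock(&sched->job_list_lock);
	job = list_first_entry_or_null(&sched->pending_list,
				       struct drm_sched_job, list);
	if (job)
		/* re-home the job under the lock so concurrent cleanup can
		 * no longer free it behind the timeout handler's back */
		list_move_tail(&job->list, timedout_list);
	spin_unlock(&sched->job_list_lock);

	return job;
}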

But I found out that all these knobs and levers weren't in place; I was
getting problems with it, and it never materialized.

The paradigm was loosely "let someone else do it", like, "on an event,
move it to a list, and let someone else handle it", or "on an event, mark
it, and let someone else handle it". (loosely borrowed from an iSC

Re: [diagnostic TDR mode patches] unify our solution opinions/suggestions in one thread

2021-08-31 Thread Andrey Grodzovsky

I will answer everything here -

On 2021-08-31 9:58 p.m., Liu, Monk wrote:


[AMD Official Use Only]

In the previous discussion, you guys stated that we should drop the 
“kthread_should_park” in cleanup_job.


@@ -676,15 +676,6 @@ drm_sched_get_cleanup_job(struct 
drm_gpu_scheduler *sched)


{

    struct drm_sched_job *job, *next;

-   /*

-    * Don't destroy jobs while the timeout worker is running  OR 
thread


-    * is being parked and hence assumed to not touch pending_list

-    */

-   if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&

- !cancel_delayed_work(&sched->work_tdr)) ||

-   kthread_should_park())

-   return NULL;

But I suddenly have a question here: if we return the timed-out job no
matter kthread_should_park() or not, then we are back to the original
problem again: the timed-out job suddenly signals, and cleanup_job still
returns it to sched_main, and the job is freed while it is still being
handled by the vendor's timeout callback


If we return NULL when kthread_should_park() in cleanup_job, we can
prevent the above scenario from happening: once a job is processed by
job_timedout we can stop its scheduler, and after that even if this job
suddenly signals, cleanup_job won't return it, so sched_main won't
free it in parallel …


What do you think?



Does your analysis above take into account that you also submit
'[PATCH 2/2] drm/sched: serialize job_timeout and scheduler'? Then I
don't see a problem -

I think that as long as you put kthread_park(sched->thread) BEFORE
fetching the next bad job from the pending list (under the spinlock) there
is no such issue as in the case you describe, because this potential bad
job that became signaled will be removed from the pending list before you
even fetch the next job, and by the time you fetch it the scheduler
thread is already stopped anyway
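
That ordering, sketched (this shows the shape of the argument, not the
actual patch):

#include <drm/gpu_scheduler.h>
#include <linux/kthread.h>

static void my_sched_job_timedout(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *job;

	/* 1. stop the scheduler thread first, so drm_sched_get_cleanup_job
	 * can no longer free anything behind us */
	kthread_park(sched->thread);

	/* 2. only now fetch the suspected bad job; a job that signaled
	 * meanwhile was already removed from the pending list */
	spin_lock(&sched->job_list_lock);
	job = list_first_entry_or_null(&sched->pending_list,
				       struct drm_sched_job, list);
	spin_unlock(&sched->job_list_lock);

	/* ... hand the job to the driver's timedout_job handler ... */
}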

If you don't submit it and we keep the removal hack for now, then there is
also no problem, because
we temporarily remove the job we fetch for TDR from the pending list under
the spinlock,

exactly to avoid this race



Thanks

--

Monk Liu | Cloud-GPU Core team

--

*From:* Liu, Monk
*Sent:* Wednesday, September 1, 2021 9:23 AM
*To:* Koenig, Christian ; Grodzovsky, Andrey 
; Daniel Vetter ; Chen, 
JingWen 
*Cc:* DRI Development ; 
amd-gfx@lists.freedesktop.org
*Subject:* [diagnostic TDR mode patches] unify our solution 
opinions/suggestions in one thread


[AMD Official Use Only]

Hi Daniel/Christian/Andrey

It looks like the voices from you three are spread over those email floods
to me; the feature we are working on (diagnostic TDR scheme) has been
pending for more than 6 months (we started it in Feb 2021).


Honestly speaking, the email workflow we are using now is not friendly and
is quite painful to me ...


Can we try to put all our opinions, suggestions, or even objections here
together, and go through them one by one? It's too hard for us to
reply to each email on different questions.


For [PATCH 1/2] drm/sched: fix the bug of time out calculation(v4)

This is a fix for the timeout timer in the scheduler; can we
complete this one first? It should have already resolved all the questions
and suggestions.




I have no objections to this one besides getting rid of the
kthread_should_park() return-NULL part;

if my answer above is not wrong then it seems superfluous to me



For [PATCH 2/2] drm/sched: serialize job_timeout and scheduler

I think I already explained the questions raised by Daniel in the other
thread, regarding why I use __kthread_should_park()




Is this race free? Can't the other thread execute kthread_park after
the check?




For other aspects, can we put all our opinions synthesized here?



So to summarize from previous threads: I think the best solution
to the problem being solved in this patch is to do the HW fence embedding
at the drm_sched_job level instead of doing it only for amdgpu. By modifying
all the drivers to support this we can both remove this hack and solve the
race against concurrent drm_sched_cleanup_jobs job freeing, just by taking a
reference to the job's hw fence at the beginning of drm_sched_job_timedout.
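
The core of that, as a sketch (assuming the embedding refactoring is done
for every driver, which is exactly what is being proposed):

#include <drm/gpu_scheduler.h>
#include <linux/dma-fence.h>

static void sketch_timedout_entry(struct drm_gpu_scheduler *sched,
				  struct drm_sched_job *job)
{
	struct dma_fence *fence = NULL;

	if (job) {
		/* with the HW fence embedded in every driver's job, holding
		 * a fence reference keeps the job memory alive throughout */
		fence = dma_fence_get(job->s_fence->parent);
		sched->ops->timedout_job(job);
	}

	/* dropping the reference may now free the embedding job;
	 * dma_fence_put(NULL) is a safe no-op */
	dma_fence_put(fence);
}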

If doing this refactoring for all the drivers is not an option now and you
need a quick solution such as the serialization you do here, then take into
account other concurrent TDR handlers that might run. As mentioned before,
without serializing against other TDR handlers from other
schedulers you just race here against them, e.g. you parked it now but
another
one in progress will unpark it as part of calling drm_sched_start for
other rings.
So you either need a global lock or a dedicated single-threaded queue as
Daniel suggested.
At minimum we should change cancel_delayed_work in drm_sched_stop to
cancel_delayed_work_sync
to cancel and flush all concurrent TDRs, and probably move it to the
beginning of the function, after kthread_park

and before we start playi
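
A sketch of the cancel_delayed_work_sync ordering suggested above (hedged;
not a tested patch, and drm_sched_stop does more than what is shown here):

#include <drm/gpu_scheduler.h>
#include <linux/kthread.h>

void drm_sched_stop_sketch(struct drm_gpu_scheduler *sched,
			   struct drm_sched_job *bad)
{
	kthread_park(sched->thread);

	/* the _sync variant also waits for a timeout handler that is
	 * already running elsewhere, instead of only cancelling a
	 * not-yet-started one */
	cancel_delayed_work_sync(&sched->work_tdr);

	/* ... then iterate the pending list as the function does today ... */
}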

Re: [diagnostic TDR mode patches] unify our solution opinions/suggestions in one thread

2021-08-31 Thread Andrey Grodzovsky



On 2021-09-01 12:25 a.m., Jingwen Chen wrote:

On Wed Sep 01, 2021 at 12:04:47AM -0400, Andrey Grodzovsky wrote:

I will answer everything here -

On 2021-08-31 9:58 p.m., Liu, Monk wrote:


 [AMD Official Use Only]

  


 In the previous discussion, you guys stated that we should drop the
 “kthread_should_park” in cleanup_job.

  


 @@ -676,15 +676,6 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler
 *sched)

 {

 struct drm_sched_job *job, *next;

  


 -   /*

 -* Don't destroy jobs while the timeout worker is running  OR
 thread

 -* is being parked and hence assumed to not touch pending_list

 -*/

 -   if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&

 -   !cancel_delayed_work(&sched->work_tdr)) ||

 -   kthread_should_park())

 -   return NULL;

  


 But I suddenly have a question here: if we return the timed-out job no
 matter kthread_should_park() or not, then we are back to the original
 problem again: the timed-out job suddenly signals, and cleanup_job still
 returns it to sched_main, and the job is freed while it is still being
 handled by the vendor's timeout callback

  


 If we return NULL when kthread_should_park() in cleanup_job, we can prevent
 the above scenario from happening: once a job is processed by job_timedout
 we can stop its scheduler, and after that even if this job suddenly signals,
 cleanup_job won't return it, so sched_main won't free it in parallel …

  


 What do you think?


Does your analysis above take into account that you also submit
'[PATCH 2/2] drm/sched: serialize job_timeout and scheduler'? Then I don't
see a problem -

Hi Andrey,
Monk has talked to me, and we agreed that since there are multiple opinions
about '[PATCH 2/2] drm/sched: serialize job_timeout and scheduler' and patch
1 is an independent patch that fixes a separate error, we should not take
patch 2 into
the analysis.


I think that as long as you put kthread_park(sched->thread) BEFORE
fetching the next bad job from the pending list (under the spinlock) there
is no such issue as in the case you describe, because this potential bad
job that became signaled will be removed from the pending list before you
even fetch the next job, and by the time you fetch it the scheduler
thread is already stopped anyway

If you don't submit it and we keep the removal hack for now, then there is
also no problem, because
we temporarily remove the job we fetch for TDR from the pending list under
the spinlock, exactly to avoid this race


So can you help review [PATCH 1/2] drm/sched: fix the bug of time out
calculation (v3)?
Patch v3 keeps this kthread_should_park check.



But since in both cases it looks like there is no danger of use-after-free,
I see no reason to keep the kthread_should_park check.

Andrey




Best Regards,
JingWen


 Thanks

  


 --

 Monk Liu | Cloud-GPU Core team

 --

  


 From: Liu, Monk
 Sent: Wednesday, September 1, 2021 9:23 AM
 To: Koenig, Christian ; Grodzovsky, Andrey
 ; Daniel Vetter ; Chen, JingWen
 
 Cc: DRI Development ;
 amd-gfx@lists.freedesktop.org
 Subject: [diagnostic TDR mode patches] unify our solution opinions/
 suggestions in one thread

  


 [AMD Official Use Only]

  


 Hi Daniel/Christian/Andrey

  


 It looks like the voices from you three are spread over those email floods
 to me; the feature we are working on (diagnostic TDR scheme) has been
 pending for more than 6 months (we started it in Feb 2021).

  


 Honestly speaking, the email workflow we are using now is not friendly and
 is quite painful to me ...

 Can we try to put all our opinions, suggestions, or even objections here
 together, and go through them one by one? It's too hard for us to reply
 to each email on different questions.

  


 For [PATCH 1/2] drm/sched: fix the bug of time out calculation(v4)

  


 This is a fix for the timeout timer in the scheduler; can we complete
 this one first? It should have already resolved all the questions and
 suggestions.


I have no objections to this one besides getting rid of the
kthread_should_park() return-NULL part;
if my answer above is not wrong then it seems superfluous to me


  


 For [PATCH 2/2] drm/sched: serialize job_timeout and scheduler

  


 I think I already explained the questions raised by Daniel in the other
 thread, regarding why I use __kthread_should_park()


Is this race free? Can't the other thread execute kthread_park after the
check?


 For other aspects, can we put all our opinions synthesized here?


So to summarize from previous threads I think that the best solution
to the problem being solved in th

Re: [diagnostic TDR mode patches] unify our solution opinions/suggestions in one thread

2021-08-31 Thread Andrey Grodzovsky



On 2021-09-01 12:40 a.m., Jingwen Chen wrote:

On Wed Sep 01, 2021 at 12:28:59AM -0400, Andrey Grodzovsky wrote:

On 2021-09-01 12:25 a.m., Jingwen Chen wrote:

On Wed Sep 01, 2021 at 12:04:47AM -0400, Andrey Grodzovsky wrote:

I will answer everything here -

On 2021-08-31 9:58 p.m., Liu, Monk wrote:


  [AMD Official Use Only]


  In the previous discussion, you guys stated that we should drop the
  “kthread_should_park” in cleanup_job.


  @@ -676,15 +676,6 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler
  *sched)

  {

  struct drm_sched_job *job, *next;


  -   /*

  -* Don't destroy jobs while the timeout worker is running  OR
  thread

  -* is being parked and hence assumed to not touch pending_list

  -*/

  -   if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&

  -   !cancel_delayed_work(&sched->work_tdr)) ||

  -   kthread_should_park())

  -   return NULL;


  But I suddenly have a question here: if we return the timed-out job no
  matter kthread_should_park() or not, then we are back to the original
  problem again: the timed-out job suddenly signals, and cleanup_job still
  returns it to sched_main, and the job is freed while it is still being
  handled by the vendor's timeout callback


  If we return NULL when kthread_should_park() in cleanup_job, we can
  prevent the above scenario from happening: once a job is processed by
  job_timedout we can stop its scheduler, and after that even if this job
  suddenly signals, cleanup_job won't return it, so sched_main won't free
  it in parallel …


  What do you think?


Does your analysis above take into account that you also submit
'[PATCH 2/2] drm/sched: serialize job_timeout and scheduler'? Then I don't
see a problem -

Hi Andrey,
Monk has talked to me, and we agreed that since there are multiple opinions
about '[PATCH 2/2] drm/sched: serialize job_timeout and scheduler' and patch
1 is an independent patch that fixes a separate error, we should not take
patch 2 into
the analysis.


I think that as long as you put kthread_park(sched->thread) BEFORE
fetching the next bad job from the pending list (under the spinlock) there
is no such issue as in the case you describe, because this potential bad
job that became signaled will be removed from the pending list before you
even fetch the next job, and by the time you fetch it the scheduler
thread is already stopped anyway

If you don't submit it and we keep the removal hack for now, then there is
also no problem, because
we temporarily remove the job we fetch for TDR from the pending list under
the spinlock, exactly to avoid this race


So can you help review [PATCH 1/2] drm/sched: fix the bug of time out
calculation (v3)?
Patch v3 keeps this kthread_should_park check.


But since in both cases it looks like there is no danger of use-after-free,
I see no reason to keep the kthread_should_park check.

Andrey

OK, I get it. So patch v4 has removed this check; can you help review
[PATCH 1/2] drm/sched: fix the bug of time out calculation (v4)?



Sure

Andrey





Best Regards,
JingWen

  Thanks


  --

  Monk Liu | Cloud-GPU Core team

  --


  From: Liu, Monk
  Sent: Wednesday, September 1, 2021 9:23 AM
  To: Koenig, Christian ; Grodzovsky, Andrey
  ; Daniel Vetter ; Chen, 
JingWen
  
  Cc: DRI Development ;
  amd-gfx@lists.freedesktop.org
  Subject: [diagnostic TDR mode patches] unify our solution opinions/
  suggestions in one thread


  [AMD Official Use Only]


  Hi Daniel/Christian/Andrey


  It looks like the voices from you three are spread over those email
  floods to me; the feature we are working on (diagnostic TDR scheme) has
  been pending for more than 6 months (we started it in Feb 2021).


  Honestly speaking, the email workflow we are using now is not friendly
  and is quite painful to me ...

  Can we try to put all our opinions, suggestions, or even objections here
  together, and go through them one by one? It's too hard for us to reply
  to each email on different questions.


  For [PATCH 1/2] drm/sched: fix the bug of time out calculation(v4)


  This is a fix for the timeout timer in the scheduler; can we complete
  this one first? It should have already resolved all the questions and
  suggestions.


I have no objections to this one besides getting rid of the
kthread_should_park() return-NULL part;
if my answer above is not wrong then it seems superfluous to me



  For [PATCH 2/2] drm/sched: serialize job_timeout and scheduler


  I think I already explained the questions raised by Daniel in the other
  thread, regarding why I use __kthread_should_park()


Is this race free? Can't the other thread execute kthread_park after the check?

Re: [PATCH v2] Revert "drm/scheduler: Avoid accessing freed bad job."

2021-09-02 Thread Andrey Grodzovsky



On 2021-09-02 10:28 a.m., Daniel Vetter wrote:

On Tue, Aug 31, 2021 at 02:24:52PM -0400, Andrey Grodzovsky wrote:

On 2021-08-31 9:11 a.m., Daniel Vetter wrote:

On Thu, Aug 26, 2021 at 11:04:14AM +0200, Daniel Vetter wrote:

On Thu, Aug 19, 2021 at 11:25:09AM -0400, Andrey Grodzovsky wrote:

On 2021-08-19 5:30 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:51:00AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:42 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:36:32AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:32 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:26:25AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:02 a.m., Alex Deucher wrote:


+ dri-devel

Since scheduler is a shared component, please add dri-devel on all
scheduler patches.

On Wed, Aug 18, 2021 at 7:21 AM Jingwen Chen  wrote:

[Why]
For a bailing job, this commit will delete it from the pending list, thus
the bailing job will never have a chance to be resubmitted, even in advance
TDR mode.

[How]
After embedding the hw_fence into amdgpu_job is done, the race condition
that this commit tries to work around is completely solved. So revert this
commit.
This reverts commit 135517d3565b48f4def3b1b82008bc17eb5d1c90.
v2:
add dma_fence_get/put() around timedout_job to avoid concurrent delete
during processing timedout_job

Signed-off-by: Jingwen Chen 
---
   drivers/gpu/drm/scheduler/sched_main.c | 23 +--
   1 file changed, 5 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index a2a953693b45..f9b9b3aefc4a 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -314,6 +314,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
   {
  struct drm_gpu_scheduler *sched;
  struct drm_sched_job *job;
+   struct dma_fence *fence;
  enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL;

  sched = container_of(work, struct drm_gpu_scheduler, 
work_tdr.work);
@@ -325,11 +326,10 @@ static void drm_sched_job_timedout(struct work_struct 
*work)

  if (job) {
  /*
-* Remove the bad job so it cannot be freed by concurrent
-* drm_sched_cleanup_jobs. It will be reinserted back after 
sched->thread
-* is parked at which point it's safe.
+* Get job->s_fence->parent here to avoid concurrent delete 
during
+* processing timedout_job
   */
-   list_del_init(&job->list);
+   fence = dma_fence_get(job->s_fence->parent);

While this is true for amdgpu, it has no meaning for other drivers for whom
we haven't
done the refactoring of embedding the HW fence (parent) into the job
structure. In fact, thinking
about it, unless you do the HW fence embedding for all the drivers using
the scheduler, you cannot
revert this patch or you will just break them.

btw, why did you do that embedding? I do still have my patches with
dma_fence annotations floating around, but my idea at least was to fix
that issue with a mempool, not with embedding. What was the motivation
for embedding the hw fence?
-Daniel

The motivation was twofold: avoid memory allocation during job submission
(HW fence allocation), because as Christian explained this leads to deadlock
with
mm code during evictions due to memory pressure (Christian can clarify if I
messed

Yeah, that's the exact same thing I've chased with my dma_fence
annotations, but thus far zero to no interest in getting it sorted. I
think it'd be good to have some cross-driver agreement on how this should
be solved before someone just charges ahead ...


this explanation). Second is to exactly revert this patch, because while it
solved the issue
described in the patch, it created another with drivers who bailed out early
during TDR handling
for various reasons, and the job would just leak because it was already
removed from the pending list.

Can't we reinsert it before we restart the scheduler thread? It might need
a separate list for that due to the lockless queue tricks. Or am I
thinking about the wrong kind of "we lost the job"?
-Daniel

If you look at the original patch, it would reinsert it even earlier - right
after stopping the SW scheduler thread, and even then it was too late for
some drivers as they would decide to return back from their TDR handler even
before that. It is solvable, but in an ugly way as far as I see: you need to
require each driver in its code to put the job back in the list if they do
it before reaching the place where the scheduler framework does it. Kind of
spaghetti code, it seems to me.

Hm, yeah, I didn't realize this all happens before we stop the scheduler
thread.

Why can't we stop the scheduler thread first, so that there's guaranteed
no race? I've r

Re: [PATCH v2] drm/amdgpu: Fix a race of IB test

2021-09-13 Thread Andrey Grodzovsky

Please add a V2 tag in the description explaining the delta from V1.
Other than that, looks good to me.

Andrey

On 2021-09-12 7:48 p.m., xinhui pan wrote:

Direct IB submission should be exclusive. So use write lock.

Signed-off-by: xinhui pan 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 19323b4cce7b..be5d12ed3db1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1358,7 +1358,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
}
  
  	/* Avoid accidently unparking the sched thread during GPU reset */

-   r = down_read_killable(&adev->reset_sem);
+   r = down_write_killable(&adev->reset_sem);
if (r)
return r;
  
@@ -1387,7 +1387,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)

kthread_unpark(ring->sched.thread);
}
  
-	up_read(&adev->reset_sem);

+   up_write(&adev->reset_sem);
  
  	pm_runtime_mark_last_busy(dev->dev);

pm_runtime_put_autosuspend(dev->dev);


Re: [PATCH] drm/amdgpu: Put drm_dev_enter/exit outside hot codepath

2021-09-14 Thread Andrey Grodzovsky



On 2021-09-14 9:42 p.m., xinhui pan wrote:

We hit a soft hang while doing a memory pressure test on one NUMA system.
After a quick look, this is because kfd invalidates/validates userptr memory
frequently with the process_info lock held.

perf top says below,
75.81%  [kernel]   [k] __srcu_read_unlock



Do you have any idea why most of the CPU cycles would be spent in SRCU
unlock? It's not waiting on anything within this function and only does
some simple arithmetic inside,

as far as I see.


  6.19%  [amdgpu]   [k] amdgpu_gmc_set_pte_pde
  3.56%  [kernel]   [k] __srcu_read_lock
  2.20%  [amdgpu]   [k] amdgpu_vm_cpu_update
  2.20%  [kernel]   [k] __sg_page_iter_dma_next
  2.15%  [drm]  [k] drm_dev_enter
  1.70%  [drm]  [k] drm_prime_sg_to_dma_addr_array
  1.18%  [kernel]   [k] __sg_alloc_table_from_pages
  1.09%  [drm]  [k] drm_dev_exit

So move drm_dev_enter/exit outside the gmc code; instead let the caller do it.



It's not clear from the explanation here how the soft hang with the
process_info lock being held

is related to the SRCU lock of drm_dev_enter/exit.



They are gart_unbind, gart_map, vm_cpu_update (already held in its
caller)



Where in the caller?



and gmc_init_pdb0 (no need)



Why no need? Those guards protect against accessing MMIO ranges after the
device is hot removed, at which point they don't belong to the driver
anymore. The function above is also called during device resume from S3,
and it's possible to hot unplug the device during S3, so this might be
called with an extracted device.

Is it possible to run the libdrm amdgpu hotplug test suite on this change
(before and after)
to verify whether this actually breaks hot unplug? The suite is committed
into the latest libdrm but disabled
until the latest fixes from amd-staging-drm-next reach upstream drm-next.
So to enable it, this code
https://gitlab.freedesktop.org/mesa/drm/-/blob/main/tests/amdgpu/hotunplug_tests.c#L65

needs to be commented out.

Andrey




Signed-off-by: xinhui pan 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c | 11 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  |  7 ---
  2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index 76efd5f8950f..d7e4f4660acf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -34,6 +34,7 @@
  #include 
  #endif
  #include "amdgpu.h"
+#include 
  
  /*

   * GART
@@ -230,12 +231,16 @@ int amdgpu_gart_unbind(struct amdgpu_device *adev, 
uint64_t offset,
u64 page_base;
/* Starting from VEGA10, system bit must be 0 to mean invalid. */
uint64_t flags = 0;
+   int idx;
  
  	if (!adev->gart.ready) {

WARN(1, "trying to unbind memory from uninitialized GART !\n");
return -EINVAL;
}
  
+	if (!drm_dev_enter(&adev->ddev, &idx))

+   return 0;
+
t = offset / AMDGPU_GPU_PAGE_SIZE;
p = t / AMDGPU_GPU_PAGES_IN_CPU_PAGE;
for (i = 0; i < pages; i++, p++) {
@@ -254,6 +259,7 @@ int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t 
offset,
for (i = 0; i < adev->num_vmhubs; i++)
amdgpu_gmc_flush_gpu_tlb(adev, 0, i, 0);
  
+	drm_dev_exit(idx);

return 0;
  }
  
@@ -276,12 +282,16 @@ int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset,

  {
uint64_t page_base;
unsigned i, j, t;
+   int idx;
  
  	if (!adev->gart.ready) {

WARN(1, "trying to bind memory to uninitialized GART !\n");
return -EINVAL;
}
  
+	if (!drm_dev_enter(&adev->ddev, &idx))

+   return 0;
+
t = offset / AMDGPU_GPU_PAGE_SIZE;
  
  	for (i = 0; i < pages; i++) {

@@ -291,6 +301,7 @@ int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t 
offset,
page_base += AMDGPU_GPU_PAGE_SIZE;
}
}
+   drm_dev_exit(idx);
return 0;
  }
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c

index 54f059501a33..e973488250e8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -31,7 +31,6 @@
  #include "amdgpu_ras.h"
  #include "amdgpu_xgmi.h"
  
-#include 
  
  /**

   * amdgpu_gmc_pdb0_alloc - allocate vram for pdb0
@@ -153,10 +152,6 @@ int amdgpu_gmc_set_pte_pde(struct amdgpu_device *adev, 
void *cpu_pt_addr,
  {
void __iomem *ptr = (void *)cpu_pt_addr;
uint64_t value;
-   int idx;
-
-   if (!drm_dev_enter(&adev->ddev, &idx))
-   return 0;
  
  	/*

 * The following is for PTE only. GART does not have PDEs.
@@ -165,8 +160,6 @@ int amdgpu_gmc_set_pte_pde(struct amdgpu_device *adev, void 
*cpu_pt_addr,
value |= flags;
writeq(value, ptr + (gpu_page_idx * 8));
  
-	drm_dev_exit(idx);

-
return 0;
  }
  


Re: Re: [PATCH] drm/amdgpu: Put drm_dev_enter/exit outside hot codepath

2021-09-14 Thread Andrey Grodzovsky

I think you missed 'reply all', so bringing this back to public

On 2021-09-14 11:40 p.m., Pan, Xinhui wrote:

[AMD Official Use Only]

perf says it is the lock addl $0x0,-0x4(%rsp) instruction;
details are below. The contention is huge, maybe.



Yes - that makes sense to me too, as long as the lock here is some kind of
busy wait (spinlock),
because with a mutex you would sleep and hence not count toward the CPU
time spent here.





│void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
│{
│→ callq  __fentry__
│  push   %rbp
   0.46 │  mov%rsp,%rbp
│smp_mb(); /* C */  /* Avoid leaking the critical section. */
  99.10 │  lock   addl   $0x0,-0x4(%rsp)
│this_cpu_inc(ssp->sda->srcu_unlock_count[idx]);
   0.01 │  movslq %esi,%rsi
│  mov0xc500(%rdi),%rax
   0.22 │  incq   %gs:0x10(%rax,%rsi,8)
│}
│  pop%rbp
   0.21 │← retq



As for the soft lockup, a kfd ioctl would try to lock process_info, which is
held by the kernel restore thread, and that runs for a long time. Maybe the
debug logs below help.

[  637.463063] XH: lock &process_info->lock for 24s
[  637.463070] CPU: 42 PID: 450 Comm: kworker/42:1 Not tainted 5.13.0+ #5
[  637.463072] Hardware name: Supermicro SYS-4028GR-TRT2/X10DRG-OT+-CPU, BIOS 
2.0c 07/21/2017
[  637.463074] Workqueue: events amdgpu_amdkfd_restore_userptr_worker [amdgpu]
[  637.463416] Call Trace:
[  637.463418]  dump_stack+0x7d/0x9c
[  637.463422]  mutex_unlock_xh+0x7e/0xb0 [amdgpu]
[  637.463652]  amdgpu_amdkfd_restore_userptr_worker+0x470/0x790 [amdgpu]
[  637.463878]  process_one_work+0x236/0x420
[  637.463882]  worker_thread+0x34/0x400
[  637.463884]  ? process_one_work+0x420/0x420
[  637.463887]  kthread+0x126/0x140
[  637.463890]  ? kthread_park+0x90/0x90
[  637.463892]  ret_from_fork+0x22/0x30
[  637.463908]  mutex_lock_xh+0x32/0x60 [amdgpu]
[  637.464134]  amdgpu_amdkfd_restore_userptr_worker+0xd1/0x790 [amdgpu]
[  637.464360]  process_one_work+0x236/0x420
[  637.464362]  worker_thread+0x34/0x400
[  637.464364]  kthread+0x126/0x140
[  637.464366]  ret_from_fork+0x22/0x30
[  637.468717] XH: lock &p->mutex for 24s
[  637.468722] CPU: 14 PID: 2104 Comm: kfdtest Not tainted 5.13.0+ #5
[  637.468726] Hardware name: Supermicro SYS-4028GR-TRT2/X10DRG-OT+-CPU, BIOS 
2.0c 07/21/2017
[  637.468728] Call Trace:
[  637.468730]  dump_stack+0x7d/0x9c
[  637.468735]  mutex_unlock_xh+0x7e/0xb0 [amdgpu]
[  637.469251]  kfd_ioctl_map_memory_to_gpu+0x38a/0x5a0 [amdgpu]
[  637.469780]  kfd_ioctl+0x51f/0x6f0 [amdgpu]
[  637.470308]  ? kfd_ioctl_unmap_memory_from_gpu+0x520/0x520 [amdgpu]
[  637.470836]  ? __vm_munmap+0xa0/0x130
[  637.470842]  __x64_sys_ioctl+0x96/0xd0
[  637.470849]  ? exit_to_user_mode_prepare+0x32/0x1d0
[  637.470858]  do_syscall_64+0x3c/0xb0
[  637.470864]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  637.470869] RIP: 0033:0x7f3e6bac3317
[  637.470875] Code: b3 66 90 48 8b 05 71 4b 2d 00 64 c7 00 26 00 00 00 48 c7 c0 ff 
ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff 
ff 73 01 c3 48 8b 0d 41 4b 2d 00 f7 d8 64 89 01 48
[  637.470879] RSP: 002b:7ffcfea0f668 EFLAGS: 0202 ORIG_RAX: 
0010
[  637.470884] RAX: ffda RBX: 0001 RCX: 7f3e6bac3317
[  637.470887] RDX: 7ffcfea0f6f0 RSI: c0184b18 RDI: 0003
[  637.470889] RBP: 7ffcfea0f6a0 R08:  R09: 
[  637.470892] R10: 55afe3e7a010 R11: 0202 R12: 55afe2e4f3ba
[  637.470894] R13:  R14: 0021 R15: 
[  637.470919]  mutex_lock_xh+0x32/0x60 [amdgpu]
[  637.471455]  kfd_ioctl_map_memory_to_gpu+0x1eb/0x5a0 [amdgpu]
[  637.472006]  kfd_ioctl+0x51f/0x6f0 [amdgpu]
[  637.472544]  __x64_sys_ioctl+0x96/0xd0
[  637.472550]  do_syscall_64+0x3c/0xb0
[  637.472554]  entry_SYSCALL_64_after_hwframe+0x44/0xae



vm_cpu_update is called by amdgpu_vm_bo_update_mapping which already calls 
drm_dev_enter/exit.



What about other callers of vm_cpu_update, such as amdgpu_vm_update_pde and
amdgpu_vm_clear_bo?

Also, in general - I am not clear which code path that uses
amdgpu_gmc_set_pte_pde
was the problematic one that this change fixed. Is it one of the two
I mentioned above?


Andrey



Yes, gmc_init_pdb0 could be called during S3; I will add the enter/exit
there too.
I will do the plug/unplug test to verify it.

___
From: Grodzovsky, Andrey
Sent: September 15, 2021 11:02
To: Pan, Xinhui; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander; Koenig, Christian
Subject: Re: [PATCH] drm/amdgpu: Put drm_dev_enter/exit outside hot codepath


On 2021-09-14 9:42 p.m., xinhui pan wrote:

We hit a soft hang while doing a memory pressure test on one NUMA system.
After a quick look, this is because kfd invalidates/validates userptr memory
frequently with pro

Re: Re: [PATCH v2] drm/amdgpu: Put drm_dev_enter/exit outside hot codepath

2021-09-15 Thread Andrey Grodzovsky

On 2021-09-15 2:42 a.m., Pan, Xinhui wrote:

[AMD Official Use Only]

Andrey
I hit a panic with the plug/unplug test without this patch.



Can you please tell me which ASIC you are using, which kernel branch, and
what the tip commit is?




But as we add enter/exit in all its callers, maybe it would not impact
plug/unplug.



If you add enter/exit in all callers, then why does this solve the problem?
Is it because in one or more callers
amdgpu_gmc_set_pte_pde is called many times, and so calling enter/exit
many times creates the problematic

contention from multiple threads?

Andrey



[ 1109.041095] BUG: unable to handle page fault for address: 10e1
[ 1109.086353] RIP: 0010:vega10_power_gate_vce+0x15/0x40 [amdgpu]
[ 1109.196706] Call Trace:
[ 1109.199374]  ? pp_set_powergating_by_smu+0x1f9/0x4a0 [amdgpu]
[ 1109.205843]  amdgpu_dpm_set_powergating_by_smu+0xa6/0x150 [amdgpu]
[ 1109.212776]  amdgpu_dpm_enable_vce+0x36/0x100 [amdgpu]
[ 1109.218563]  vce_v4_0_hw_fini+0xe1/0xf0 [amdgpu]
[ 1109.223747]  amdgpu_device_fini_hw+0x333/0x483 [amdgpu]
[ 1109.229650]  amdgpu_driver_unload_kms+0x80/0xe0 [amdgpu]
[ 1109.235577]  amdgpu_pci_remove+0x37/0x70 [amdgpu]
[ 1109.240853]  pci_device_remove+0x3b/0xb0
[ 1109.245127]  device_release_driver_internal+0x100/0x1d0
[ 1109.250857]  device_release_driver+0x12/0x20
[ 1109.255535]  pci_stop_bus_device+0x79/0xa0
[ 1109.260016]  pci_stop_and_remove_bus_device_locked+0x1b/0x30
[ 1109.266197]  remove_store+0x7b/0x90
[ 1109.269990]  dev_attr_store+0x14/0x30
[ 1109.274002]  sysfs_kf_write+0x48/0x60
[ 1109.277998]  kernfs_fop_write_iter+0x14e/0x1e0


From: Pan, Xinhui
Sent: September 15, 2021 14:37
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander; Koenig, Christian; Grodzovsky, Andrey; Pan, Xinhui
Subject: [PATCH v2] drm/amdgpu: Put drm_dev_enter/exit outside hot codepath

We hit a soft hang while doing a memory pressure test on one NUMA system.
After a quick look, this is because kfd invalidates/validates userptr memory
frequently with the process_info lock held.
Looks like updating the page table mapping uses too much CPU time.

perf top says below,
75.81%  [kernel]   [k] __srcu_read_unlock
  6.19%  [amdgpu]   [k] amdgpu_gmc_set_pte_pde
  3.56%  [kernel]   [k] __srcu_read_lock
  2.20%  [amdgpu]   [k] amdgpu_vm_cpu_update
  2.20%  [kernel]   [k] __sg_page_iter_dma_next
  2.15%  [drm]  [k] drm_dev_enter
  1.70%  [drm]  [k] drm_prime_sg_to_dma_addr_array
  1.18%  [kernel]   [k] __sg_alloc_table_from_pages
  1.09%  [drm]  [k] drm_dev_exit

So move drm_dev_enter/exit outside the gmc code; instead let the caller do it.
They are gart_unbind, gart_map, vm_clear_bo, vm_update_pdes and
gmc_init_pdb0. vm_bo_update_mapping already calls it.

Signed-off-by: xinhui pan 
---
change from v1:
add enter/exit in more gmc_set_pte_pde callers
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c | 11 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  | 11 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 28 +---
  3 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index 76efd5f8950f..d7e4f4660acf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -34,6 +34,7 @@
  #include 
  #endif
  #include "amdgpu.h"
+#include 

  /*
   * GART
@@ -230,12 +231,16 @@ int amdgpu_gart_unbind(struct amdgpu_device *adev, 
uint64_t offset,
 u64 page_base;
 /* Starting from VEGA10, system bit must be 0 to mean invalid. */
 uint64_t flags = 0;
+   int idx;

 if (!adev->gart.ready) {
 WARN(1, "trying to unbind memory from uninitialized GART !\n");
 return -EINVAL;
 }

+   if (!drm_dev_enter(&adev->ddev, &idx))
+   return 0;
+
 t = offset / AMDGPU_GPU_PAGE_SIZE;
 p = t / AMDGPU_GPU_PAGES_IN_CPU_PAGE;
 for (i = 0; i < pages; i++, p++) {
@@ -254,6 +259,7 @@ int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t 
offset,
 for (i = 0; i < adev->num_vmhubs; i++)
 amdgpu_gmc_flush_gpu_tlb(adev, 0, i, 0);

+   drm_dev_exit(idx);
 return 0;
  }

@@ -276,12 +282,16 @@ int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t 
offset,
  {
 uint64_t page_base;
 unsigned i, j, t;
+   int idx;

 if (!adev->gart.ready) {
 WARN(1, "trying to bind memory to uninitialized GART !\n");
 return -EINVAL;
 }

+   if (!drm_dev_enter(&adev->ddev, &idx))
+   return 0;
+
 t = offset / AMDGPU_GPU_PAGE_SIZE;

 for (i = 0; i < pages; i++) {
@@ -291,6 +301,7 @@ int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t 
offset,
 page_base += AMDGPU_GPU_PAGE_SIZE;
 }
 }
+   drm_dev_exit(idx);
 return 0;
  }

di

Re: Re: [PATCH v2] drm/amdgpu: Put drm_dev_enter/exit outside hot codepath

2021-09-15 Thread Andrey Grodzovsky



On 2021-09-15 9:57 a.m., Christian König wrote:

Am 15.09.21 um 15:52 schrieb Andrey Grodzovsky:

On 2021-09-15 2:42 a.m., Pan, Xinhui wrote:

[AMD Official Use Only]

Andrey
I hit a panic with the plug/unplug test without this patch.



Can you please tell me which ASIC you are using, which kernel branch,
and what the tip commit is?



But as we add enter/exit in all its callers, maybe it would not
impact plug/unplug.



If you add enter/exit in all callers, then why does this solve the problem?
Is it because in one or more callers
amdgpu_gmc_set_pte_pde is called many times, and so calling enter/exit
many times creates the problematic

contention from multiple threads?


I think the most likely cause of this is cache line bouncing, and yes,
moving the enter/exit a level up should fix this.


But I strongly suggest testing this with lockdep enabled and doing
hotplug/GPU reset a couple of times.



Xinhui already tried testing with hotplug and there is a regression
crash, as you can see below. That's
why I wanted to know which kernel and which ASIC, to see if I can
reproduce and fix the regression quickly.


I will try on my side with the latest kernel and Vega 10 anyway, since I
see Vega 10 functions in the trace.


Andrey




Christian.



Andrey



[ 1109.041095] BUG: unable to handle page fault for address: 
10e1

[ 1109.086353] RIP: 0010:vega10_power_gate_vce+0x15/0x40 [amdgpu]
[ 1109.196706] Call Trace:
[ 1109.199374]  ? pp_set_powergating_by_smu+0x1f9/0x4a0 [amdgpu]
[ 1109.205843]  amdgpu_dpm_set_powergating_by_smu+0xa6/0x150 [amdgpu]
[ 1109.212776]  amdgpu_dpm_enable_vce+0x36/0x100 [amdgpu]
[ 1109.218563]  vce_v4_0_hw_fini+0xe1/0xf0 [amdgpu]
[ 1109.223747]  amdgpu_device_fini_hw+0x333/0x483 [amdgpu]
[ 1109.229650]  amdgpu_driver_unload_kms+0x80/0xe0 [amdgpu]
[ 1109.235577]  amdgpu_pci_remove+0x37/0x70 [amdgpu]
[ 1109.240853]  pci_device_remove+0x3b/0xb0
[ 1109.245127]  device_release_driver_internal+0x100/0x1d0
[ 1109.250857]  device_release_driver+0x12/0x20
[ 1109.255535]  pci_stop_bus_device+0x79/0xa0
[ 1109.260016] pci_stop_and_remove_bus_device_locked+0x1b/0x30
[ 1109.266197]  remove_store+0x7b/0x90
[ 1109.269990]  dev_attr_store+0x14/0x30
[ 1109.274002]  sysfs_kf_write+0x48/0x60
[ 1109.277998]  kernfs_fop_write_iter+0x14e/0x1e0


From: Pan, Xinhui
Sent: September 15, 2021 14:37
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander; Koenig, Christian; Grodzovsky, Andrey; Pan, Xinhui
Subject: [PATCH v2] drm/amdgpu: Put drm_dev_enter/exit outside hot codepath


We hit a soft hang while doing a memory pressure test on one NUMA system.
After a quick look, this is because kfd invalidates/validates userptr memory
frequently with the process_info lock held.
Looks like updating the page table mapping uses too much CPU time.

perf top says below,
75.81%  [kernel]   [k] __srcu_read_unlock
  6.19%  [amdgpu]   [k] amdgpu_gmc_set_pte_pde
  3.56%  [kernel]   [k] __srcu_read_lock
  2.20%  [amdgpu]   [k] amdgpu_vm_cpu_update
  2.20%  [kernel]   [k] __sg_page_iter_dma_next
  2.15%  [drm]  [k] drm_dev_enter
  1.70%  [drm]  [k] drm_prime_sg_to_dma_addr_array
  1.18%  [kernel]   [k] __sg_alloc_table_from_pages
  1.09%  [drm]  [k] drm_dev_exit

So move drm_dev_enter/exit outside the gmc code; instead let the caller do it.
They are gart_unbind, gart_map, vm_clear_bo, vm_update_pdes and
gmc_init_pdb0. vm_bo_update_mapping already calls it.

Signed-off-by: xinhui pan 
---
change from v1:
add enter/exit in more gmc_set_pte_pde callers
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c | 11 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  | 11 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 28 
+---

  3 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c

index 76efd5f8950f..d7e4f4660acf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -34,6 +34,7 @@
  #include 
  #endif
  #include "amdgpu.h"
+#include 

  /*
   * GART
@@ -230,12 +231,16 @@ int amdgpu_gart_unbind(struct amdgpu_device 
*adev, uint64_t offset,

 u64 page_base;
 /* Starting from VEGA10, system bit must be 0 to mean 
invalid. */

 uint64_t flags = 0;
+   int idx;

 if (!adev->gart.ready) {
 WARN(1, "trying to unbind memory from uninitialized 
GART !\n");

 return -EINVAL;
 }

+   if (!drm_dev_enter(&adev->ddev, &idx))
+   return 0;
+
 t = offset / AMDGPU_GPU_PAGE_SIZE;
 p = t / AMDGPU_GPU_PAGES_IN_CPU_PAGE;
 for (i = 0; i < pages; i++, p++) {
@@ -254,6 +259,7 @@ int amdgpu_gart_unbind(struct amdgpu_device 
*adev, uint64_t offset,

 for (i = 0; i < adev->num_vmhubs; i++)
 amdgpu_gmc_flush_gpu_tlb(adev, 0, i, 0);

+   drm_dev_exit(idx);
  

Re: [PATCH 1/2] drm/sched: fix the bug of time out calculation(v4)

2021-09-15 Thread Andrey Grodzovsky

Pushed

Andrey

On 2021-09-15 7:45 a.m., Christian König wrote:

Yes, I think so as well. Andrey, can you push this?

Christian.

Am 15.09.21 um 00:59 schrieb Grodzovsky, Andrey:

AFAIK this one is independent.

Christian, can you confirm?

Andrey

*From:* amd-gfx  on behalf of 
Alex Deucher 

*Sent:* 14 September 2021 15:33
*To:* Christian König 
*Cc:* Liu, Monk ; amd-gfx list 
; Maling list - DRI developers 

*Subject:* Re: [PATCH 1/2] drm/sched: fix the bug of time out 
calculation(v4)

Was this fix independent of the other discussions?  Should this be
applied to drm-misc?

Alex

On Wed, Sep 1, 2021 at 4:42 PM Alex Deucher  
wrote:

>
> On Wed, Sep 1, 2021 at 2:50 AM Christian König
>  wrote:
> >
> > Am 01.09.21 um 02:46 schrieb Monk Liu:
> > > issue:
> > > in cleanup_job the cancel_delayed_work will cancel a TO timer
> > > even though its corresponding job is still running.
> > >
> > > fix:
> > > do not cancel the timer in cleanup_job, instead do the cancelling
> > > only when the heading job is signaled, and if there is a "next" job
> > > we start_timeout again.
> > >
> > > v2:
> > > further cleanup the logic, and do the TDR timer cancelling if 
the signaled job

> > > is the last one in its scheduler.
> > >
> > > v3:
> > > change the issue description
> > > remove the cancel_delayed_work in the beginning of the cleanup_job
> > > recover the implement of drm_sched_job_begin.
> > >
> > > v4:
> > > remove the kthread_should_park() checking in cleanup_job routine,
> > > we should cleanup the signaled job asap
> > >
> > > TODO:
> > > 1) introduce pause/resume scheduler in job_timeout to serialize the
handling

> > > of scheduler and job_timeout.
> > > 2)drop the bad job's del and insert in scheduler due to above 
serialization

> > > (no race issue anymore with the serialization)
> > >
> > > tested-by: jingwen 
> > > Signed-off-by: Monk Liu 
> >
> > Reviewed-by: Christian König 
> >
>
> Are you planning to push this to drm-misc?
>
> Alex
>
>
> > > ---
> > >   drivers/gpu/drm/scheduler/sched_main.c | 26 
+-

> > >   1 file changed, 9 insertions(+), 17 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c

> > > index a2a9536..3e0bbc7 100644
> > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > @@ -676,15 +676,6 @@ drm_sched_get_cleanup_job(struct 
drm_gpu_scheduler *sched)

> > >   {
> > >   struct drm_sched_job *job, *next;
> > >
> > > - /*
> > > -  * Don't destroy jobs while the timeout worker is 
running  OR thread
> > > -  * is being parked and hence assumed to not touch 
pending_list

> > > -  */
> > > - if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
> > > - !cancel_delayed_work(&sched->work_tdr)) ||
> > > - kthread_should_park())
> > > - return NULL;
> > > -
> > > spin_lock(&sched->job_list_lock);
> > >
> > >   job = list_first_entry_or_null(&sched->pending_list,
> > > @@ -693,17 +684,21 @@ drm_sched_get_cleanup_job(struct 
drm_gpu_scheduler *sched)

> > >   if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
> > >   /* remove job from pending_list */
> > > list_del_init(&job->list);
> > > +
> > > + /* cancel this job's TO timer */
> > > + cancel_delayed_work(&sched->work_tdr);
> > >   /* make the scheduled timestamp more accurate */
> > >   next = list_first_entry_or_null(&sched->pending_list,
> > > typeof(*next), list);
> > > - if (next)
> > > +
> > > + if (next) {
> > > next->s_fence->scheduled.timestamp =
> > > job->s_fence->finished.timestamp;
> > > -
> > > + /* start TO timer for next job */
> > > + drm_sched_start_timeout(sched);
> > > + }
> > >   } else {
> > >   job = NULL;
> > > - /* queue timeout for next job */
> > > - drm_sched_start_timeout(sched);
> > >   }
> > >
> > > spin_unlock(&sched->job_list_lock);
> > > @@ -791,11 +786,8 @@ static int drm_sched_main(void *param)
> > > (entity = drm_sched_select_entity(sched))) ||
> > > kthread_should_stop());
> > >
> > > - if (cleanup_job) {
> > > + if (cleanup_job)
> > > sched->ops->free_job(cleanup_job);
> > > - /* queue timeout for next job */
> > > - drm_sched_start_timeout(sched);
> > > - }
> > >
> > >   if (!entity)
> > >   continue;
> >




[PATCH] drm/amdgpu: Fix crash on device remove/driver unload

2021-09-15 Thread Andrey Grodzovsky
Crash:
BUG: unable to handle page fault for address: 10e1
RIP: 0010:vega10_power_gate_vce+0x26/0x50 [amdgpu]
Call Trace:
pp_set_powergating_by_smu+0x16a/0x2b0 [amdgpu]
amdgpu_dpm_set_powergating_by_smu+0x92/0xf0 [amdgpu]
amdgpu_dpm_enable_vce+0x2e/0xc0 [amdgpu]
vce_v4_0_hw_fini+0x95/0xa0 [amdgpu]
amdgpu_device_fini_hw+0x232/0x30d [amdgpu]
amdgpu_driver_unload_kms+0x5c/0x80 [amdgpu]
amdgpu_pci_remove+0x27/0x40 [amdgpu]
pci_device_remove+0x3e/0xb0
device_release_driver_internal+0x103/0x1d0
device_release_driver+0x12/0x20
pci_stop_bus_device+0x79/0xa0
pci_stop_and_remove_bus_device_locked+0x1b/0x30
remove_store+0x7b/0x90
dev_attr_store+0x17/0x30
sysfs_kf_write+0x4b/0x60
kernfs_fop_write_iter+0x151/0x1e0

Why:
VCE/UVD had a dependency on the SMC block for their suspend, but
the SMC block is the first to do HW fini due to some constraints

How:
Since the original patch was dealing with suspend issues
move the SMC block dependency back into suspend hooks as
was done in V1 of the original patches.
Keep flushing idle work in both the suspend and HW fini sequences
since it's essential in both cases.

Fixes:
2178d3c189b9 drm/amdgpu: add missing cleanups for more ASICs on UVD/VCE suspend
ee6679aaa61c drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE on suspend
Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 24 ---
 drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 24 ---
 drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 24 ---
 drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 32 ++-
 drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 19 +++-
 drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 28 +
 drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 44 ++-
 7 files changed, 105 insertions(+), 90 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c 
b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
index 7232241e3bfb..0fef925b6602 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
@@ -698,6 +698,19 @@ static int uvd_v3_1_hw_fini(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+   cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+   if (RREG32(mmUVD_STATUS) != 0)
+   uvd_v3_1_stop(adev);
+
+   return 0;
+}
+
+static int uvd_v3_1_suspend(void *handle)
+{
+   int r;
+   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
/*
 * Proper cleanups before halting the HW engine:
 *   - cancel the delayed idle work
@@ -722,17 +735,6 @@ static int uvd_v3_1_hw_fini(void *handle)
   AMD_CG_STATE_GATE);
}
 
-   if (RREG32(mmUVD_STATUS) != 0)
-   uvd_v3_1_stop(adev);
-
-   return 0;
-}
-
-static int uvd_v3_1_suspend(void *handle)
-{
-   int r;
-   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
r = uvd_v3_1_hw_fini(adev);
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c 
b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
index 52d6de969f46..c108b8381795 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
@@ -212,6 +212,19 @@ static int uvd_v4_2_hw_fini(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+   cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+   if (RREG32(mmUVD_STATUS) != 0)
+   uvd_v4_2_stop(adev);
+
+   return 0;
+}
+
+static int uvd_v4_2_suspend(void *handle)
+{
+   int r;
+   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
/*
 * Proper cleanups before halting the HW engine:
 *   - cancel the delayed idle work
@@ -236,17 +249,6 @@ static int uvd_v4_2_hw_fini(void *handle)
   AMD_CG_STATE_GATE);
}
 
-   if (RREG32(mmUVD_STATUS) != 0)
-   uvd_v4_2_stop(adev);
-
-   return 0;
-}
-
-static int uvd_v4_2_suspend(void *handle)
-{
-   int r;
-   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
r = uvd_v4_2_hw_fini(adev);
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c 
b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
index db6d06758e4d..563493d1f830 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
@@ -210,6 +210,19 @@ static int uvd_v5_0_hw_fini(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+   cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+   if (RREG32(mmUVD_STATUS) != 0)
+   uvd_v5_0_stop(adev);
+
+   return 0;
+}
+
+static int uvd_v5_0_suspend(void *handle)
+{
+   int r;
+   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
/*
 * Proper cleanups before halting the HW engine:
 *   - ca

[PATCH] drm/amd/display: Fix crash on device remove/driver unload

2021-09-15 Thread Andrey Grodzovsky
Why:
DC core is being released from DM before the hpd_rx wq destruction
code, which still references it, has run.

How: Move hpd_rx destruction before DC core destruction.

Signed-off-by: Andrey Grodzovsky 
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 24 +--
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 5d3679bd6b29..1c0547bb09de 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -1572,6 +1572,18 @@ static void amdgpu_dm_fini(struct amdgpu_device *adev)
  &adev->dm.dmub_bo_gpu_addr,
  &adev->dm.dmub_bo_cpu_addr);
 
+   if (adev->dm.hpd_rx_offload_wq) {
+   for (i = 0; i < adev->dm.dc->caps.max_links; i++) {
+   if (adev->dm.hpd_rx_offload_wq[i].wq) {
+   destroy_workqueue(adev->dm.hpd_rx_offload_wq[i].wq);
+   adev->dm.hpd_rx_offload_wq[i].wq = NULL;
+   }
+   }
+
+   kfree(adev->dm.hpd_rx_offload_wq);
+   adev->dm.hpd_rx_offload_wq = NULL;
+   }
+
/* DC Destroy TODO: Replace destroy DAL */
if (adev->dm.dc)
dc_destroy(&adev->dm.dc);
@@ -1590,18 +1602,6 @@ static void amdgpu_dm_fini(struct amdgpu_device *adev)
adev->dm.freesync_module = NULL;
}
 
-   if (adev->dm.hpd_rx_offload_wq) {
-   for (i = 0; i < adev->dm.dc->caps.max_links; i++) {
-   if (adev->dm.hpd_rx_offload_wq[i].wq) {
-   destroy_workqueue(adev->dm.hpd_rx_offload_wq[i].wq);
-   adev->dm.hpd_rx_offload_wq[i].wq = NULL;
-   }
-   }
-
-   kfree(adev->dm.hpd_rx_offload_wq);
-   adev->dm.hpd_rx_offload_wq = NULL;
-   }
-
mutex_destroy(&adev->dm.audio_lock);
mutex_destroy(&adev->dm.dc_lock);
 
-- 
2.25.1



Re: [PATCH v2] drm/amdgpu: Put drm_dev_enter/exit outside hot codepath

2021-09-15 Thread Andrey Grodzovsky
I fixed 2 regressions on latest code, applied your patch on top and
passed the libdrm tests
on Vega 10. You can pick up those 2 patches and try too if you have time.
In any case -


Reviewed-and-tested-by: Andrey Grodzovsky 

Andrey

On 2021-09-15 2:37 a.m., xinhui pan wrote:

We hit a soft hang while doing memory pressure tests on one NUMA system.
After a quick look, this is because kfd invalidates/validates userptr memory
frequently with the process_info lock held.
It looks like updating page table mappings uses too much CPU time.

perf top says below,
75.81%  [kernel]   [k] __srcu_read_unlock
  6.19%  [amdgpu]   [k] amdgpu_gmc_set_pte_pde
  3.56%  [kernel]   [k] __srcu_read_lock
  2.20%  [amdgpu]   [k] amdgpu_vm_cpu_update
  2.20%  [kernel]   [k] __sg_page_iter_dma_next
  2.15%  [drm]  [k] drm_dev_enter
  1.70%  [drm]  [k] drm_prime_sg_to_dma_addr_array
  1.18%  [kernel]   [k] __sg_alloc_table_from_pages
  1.09%  [drm]  [k] drm_dev_exit

So move drm_dev_enter/exit outside the gmc code and instead let the callers
do it. Those are gart_unbind, gart_map, vm_clear_bo, vm_update_pdes and
gmc_init_pdb0. vm_bo_update_mapping already calls it.
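
For illustration, the caller-side bracket this introduces looks roughly
like this (a minimal sketch mirroring the hunks below):

	int idx;

	if (!drm_dev_enter(&adev->ddev, &idx))
		return 0;	/* device is gone, skip the HW work */

	/* ... write the GART/page-table entries ... */

	drm_dev_exit(idx);
	return 0;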

Signed-off-by: xinhui pan 
---
change from v1:
add enter/exit in more gmc_set_pte_pde callers
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c | 11 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  | 11 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 28 +---
  3 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index 76efd5f8950f..d7e4f4660acf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -34,6 +34,7 @@
  #include 
  #endif
  #include "amdgpu.h"
+#include 
  
  /*

   * GART
@@ -230,12 +231,16 @@ int amdgpu_gart_unbind(struct amdgpu_device *adev, 
uint64_t offset,
u64 page_base;
/* Starting from VEGA10, system bit must be 0 to mean invalid. */
uint64_t flags = 0;
+   int idx;
  
  	if (!adev->gart.ready) {

WARN(1, "trying to unbind memory from uninitialized GART !\n");
return -EINVAL;
}
  
+	if (!drm_dev_enter(&adev->ddev, &idx))

+   return 0;
+
t = offset / AMDGPU_GPU_PAGE_SIZE;
p = t / AMDGPU_GPU_PAGES_IN_CPU_PAGE;
for (i = 0; i < pages; i++, p++) {
@@ -254,6 +259,7 @@ int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t 
offset,
for (i = 0; i < adev->num_vmhubs; i++)
amdgpu_gmc_flush_gpu_tlb(adev, 0, i, 0);
  
+	drm_dev_exit(idx);

return 0;
  }
  
@@ -276,12 +282,16 @@ int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset,

  {
uint64_t page_base;
unsigned i, j, t;
+   int idx;
  
  	if (!adev->gart.ready) {

WARN(1, "trying to bind memory to uninitialized GART !\n");
return -EINVAL;
}
  
+	if (!drm_dev_enter(&adev->ddev, &idx))

+   return 0;
+
t = offset / AMDGPU_GPU_PAGE_SIZE;
  
  	for (i = 0; i < pages; i++) {

@@ -291,6 +301,7 @@ int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t 
offset,
page_base += AMDGPU_GPU_PAGE_SIZE;
}
}
+   drm_dev_exit(idx);
return 0;
  }
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c

index 54f059501a33..1427fd70310c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -153,10 +153,6 @@ int amdgpu_gmc_set_pte_pde(struct amdgpu_device *adev, 
void *cpu_pt_addr,
  {
void __iomem *ptr = (void *)cpu_pt_addr;
uint64_t value;
-   int idx;
-
-   if (!drm_dev_enter(&adev->ddev, &idx))
-   return 0;
  
  	/*

 * The following is for PTE only. GART does not have PDEs.
@@ -165,8 +161,6 @@ int amdgpu_gmc_set_pte_pde(struct amdgpu_device *adev, void 
*cpu_pt_addr,
value |= flags;
writeq(value, ptr + (gpu_page_idx * 8));
  
-	drm_dev_exit(idx);

-
return 0;
  }
  
@@ -752,6 +746,10 @@ void amdgpu_gmc_init_pdb0(struct amdgpu_device *adev)

adev->gmc.xgmi.physical_node_id * 
adev->gmc.xgmi.node_segment_size;
u64 vram_end = vram_addr + vram_size;
u64 gart_ptb_gpu_pa = amdgpu_gmc_vram_pa(adev, adev->gart.bo);
+   int idx;
+
+   if (!drm_dev_enter(&adev->ddev, &idx))
+   return;
  
  	flags |= AMDGPU_PTE_VALID | AMDGPU_PTE_READABLE;

flags |= AMDGPU_PTE_WRITEABLE;
@@ -773,6 +771,7 @@ void amdgpu_gmc_init_pdb0(struct amdgpu_device *adev)
flags |= AMDGPU_PDE_BFS(0) | AMDGPU_PTE_SNOOPED;
/* Requires gart_ptb_gpu_pa to be 4K aligned */
amdgpu_gmc_set_pte_pde(adev, adev->gmc.ptr_pdb0, i, gart_ptb_gpu_pa, 
flags);
+  

Re: [PATCH] drm/amdgpu: Fix crash on device remove/driver unload

2021-09-16 Thread Andrey Grodzovsky



On 2021-09-16 4:20 a.m., Lazar, Lijo wrote:

A minor comment below.

On 9/16/2021 1:11 AM, Andrey Grodzovsky wrote:

Crash:
BUG: unable to handle page fault for address: 10e1
RIP: 0010:vega10_power_gate_vce+0x26/0x50 [amdgpu]
Call Trace:
pp_set_powergating_by_smu+0x16a/0x2b0 [amdgpu]
amdgpu_dpm_set_powergating_by_smu+0x92/0xf0 [amdgpu]
amdgpu_dpm_enable_vce+0x2e/0xc0 [amdgpu]
vce_v4_0_hw_fini+0x95/0xa0 [amdgpu]
amdgpu_device_fini_hw+0x232/0x30d [amdgpu]
amdgpu_driver_unload_kms+0x5c/0x80 [amdgpu]
amdgpu_pci_remove+0x27/0x40 [amdgpu]
pci_device_remove+0x3e/0xb0
device_release_driver_internal+0x103/0x1d0
device_release_driver+0x12/0x20
pci_stop_bus_device+0x79/0xa0
pci_stop_and_remove_bus_device_locked+0x1b/0x30
remove_store+0x7b/0x90
dev_attr_store+0x17/0x30
sysfs_kf_write+0x4b/0x60
kernfs_fop_write_iter+0x151/0x1e0

Why:
VCE/UVD had dependency on SMC block for their suspend but
SMC block is the first to do HW fini due to some constraints

How:
Since the original patch was dealing with suspend issues
move the SMC block dependency back into suspend hooks as
was done in V1 of the original patches.
Keep flushing idle work in both the suspend and HW fini sequences
since it's essential in both cases.

Fixes:
2178d3c189b9 drm/amdgpu: add missing cleanups for more ASICs on 
UVD/VCE suspend
ee6679aaa61c drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE 
on suspend

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 24 ---
  drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 24 ---
  drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 24 ---
  drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 32 ++-
  drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 19 +++-
  drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 28 +
  drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 44 ++-
  7 files changed, 105 insertions(+), 90 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c 
b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c

index 7232241e3bfb..0fef925b6602 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
@@ -698,6 +698,19 @@ static int uvd_v3_1_hw_fini(void *handle)
  {
  struct amdgpu_device *adev = (struct amdgpu_device *)handle;
  +    cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+    if (RREG32(mmUVD_STATUS) != 0)
+    uvd_v3_1_stop(adev);
+
+    return 0;
+}
+
+static int uvd_v3_1_suspend(void *handle)
+{
+    int r;
+    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
  /*
   * Proper cleanups before halting the HW engine:
   *   - cancel the delayed idle work
@@ -722,17 +735,6 @@ static int uvd_v3_1_hw_fini(void *handle)
 AMD_CG_STATE_GATE);
  }
  -    if (RREG32(mmUVD_STATUS) != 0)
-    uvd_v3_1_stop(adev);
-
-    return 0;
-}
-
-static int uvd_v3_1_suspend(void *handle)
-{
-    int r;
-    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
  r = uvd_v3_1_hw_fini(adev);


"cosmetic change" comment - hw_fini is supposed to be the final tear 
down call. So instead of suspend calling hw_fini, perhaps it makes 
sense to read the other way - hw_fini just suspends the hardware?


Thanks,
Lijo



Not sure what you mean ?

Andrey





  if (r)
  return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c 
b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c

index 52d6de969f46..c108b8381795 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
@@ -212,6 +212,19 @@ static int uvd_v4_2_hw_fini(void *handle)
  {
  struct amdgpu_device *adev = (struct amdgpu_device *)handle;
  +    cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+    if (RREG32(mmUVD_STATUS) != 0)
+    uvd_v4_2_stop(adev);
+
+    return 0;
+}
+
+static int uvd_v4_2_suspend(void *handle)
+{
+    int r;
+    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
  /*
   * Proper cleanups before halting the HW engine:
   *   - cancel the delayed idle work
@@ -236,17 +249,6 @@ static int uvd_v4_2_hw_fini(void *handle)
 AMD_CG_STATE_GATE);
  }
  -    if (RREG32(mmUVD_STATUS) != 0)
-    uvd_v4_2_stop(adev);
-
-    return 0;
-}
-
-static int uvd_v4_2_suspend(void *handle)
-{
-    int r;
-    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
  r = uvd_v4_2_hw_fini(adev);
  if (r)
  return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c 
b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c

index db6d06758e4d..563493d1f830 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
@@ -210,6 +210,19 @@ static int uvd_v5_0_hw_fini(void *handle)
  {
  struct amdgpu_device *adev = (struct amdgpu_device *)handle;
  +    cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+    if (RREG32(mmUVD_STATUS) != 0)
+    uvd_v5_0_stop(adev);
+
+    return 

Re: [PATCH] drm/amdgpu: Fix crash on device remove/driver unload

2021-09-16 Thread Andrey Grodzovsky



On 2021-09-16 11:51 a.m., Lazar, Lijo wrote:



On 9/16/2021 9:15 PM, Andrey Grodzovsky wrote:


On 2021-09-16 4:20 a.m., Lazar, Lijo wrote:

A minor comment below.

On 9/16/2021 1:11 AM, Andrey Grodzovsky wrote:

Crash:
BUG: unable to handle page fault for address: 10e1
RIP: 0010:vega10_power_gate_vce+0x26/0x50 [amdgpu]
Call Trace:
pp_set_powergating_by_smu+0x16a/0x2b0 [amdgpu]
amdgpu_dpm_set_powergating_by_smu+0x92/0xf0 [amdgpu]
amdgpu_dpm_enable_vce+0x2e/0xc0 [amdgpu]
vce_v4_0_hw_fini+0x95/0xa0 [amdgpu]
amdgpu_device_fini_hw+0x232/0x30d [amdgpu]
amdgpu_driver_unload_kms+0x5c/0x80 [amdgpu]
amdgpu_pci_remove+0x27/0x40 [amdgpu]
pci_device_remove+0x3e/0xb0
device_release_driver_internal+0x103/0x1d0
device_release_driver+0x12/0x20
pci_stop_bus_device+0x79/0xa0
pci_stop_and_remove_bus_device_locked+0x1b/0x30
remove_store+0x7b/0x90
dev_attr_store+0x17/0x30
sysfs_kf_write+0x4b/0x60
kernfs_fop_write_iter+0x151/0x1e0

Why:
VCE/UVD had dependency on SMC block for their suspend but
SMC block is the first to do HW fini due to some constraints

How:
Since the original patch was dealing with suspend issues
move the SMC block dependency back into suspend hooks as
was done in V1 of the original patches.
Keep flushing idle work in both the suspend and HW fini sequences
since it's essential in both cases.

Fixes:
2178d3c189b9 drm/amdgpu: add missing cleanups for more ASICs on 
UVD/VCE suspend
ee6679aaa61c drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE 
on suspend

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 24 ---
  drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 24 ---
  drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 24 ---
  drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 32 ++-
  drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 19 +++-
  drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 28 +
  drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 44 
++-

  7 files changed, 105 insertions(+), 90 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c 
b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c

index 7232241e3bfb..0fef925b6602 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
@@ -698,6 +698,19 @@ static int uvd_v3_1_hw_fini(void *handle)
  {
  struct amdgpu_device *adev = (struct amdgpu_device *)handle;
  + cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+    if (RREG32(mmUVD_STATUS) != 0)
+    uvd_v3_1_stop(adev);
+
+    return 0;
+}
+
+static int uvd_v3_1_suspend(void *handle)
+{
+    int r;
+    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
  /*
   * Proper cleanups before halting the HW engine:
   *   - cancel the delayed idle work
@@ -722,17 +735,6 @@ static int uvd_v3_1_hw_fini(void *handle)
 AMD_CG_STATE_GATE);
  }
  -    if (RREG32(mmUVD_STATUS) != 0)
-    uvd_v3_1_stop(adev);
-
-    return 0;
-}
-
-static int uvd_v3_1_suspend(void *handle)
-{
-    int r;
-    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
  r = uvd_v3_1_hw_fini(adev);


"cosmetic change" comment - hw_fini is supposed to be the final tear 
down call. So instead of suspend calling hw_fini, perhaps it makes 
sense to read the other way - hw_fini just suspends the hardware?


Thanks,
Lijo



Not sure what you mean ?


Now it is: suspend() calls hw_fini().

What I meant is hw_fini() calls suspend(), and that reads as "to
do hw_fini(), only suspend the hardware and nothing extra is needed".

In short, the implementation stays in suspend() and hw_fini() calls suspend().
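
One way to read that suggestion, as a purely hypothetical sketch for
uvd_v3_1 (illustrative only, not a posted patch):

	static int uvd_v3_1_suspend(void *handle)
	{
		struct amdgpu_device *adev = (struct amdgpu_device *)handle;

		cancel_delayed_work_sync(&adev->uvd.idle_work);

		/* ... power/clock gating cleanups ... */

		if (RREG32(mmUVD_STATUS) != 0)
			uvd_v3_1_stop(adev);

		return 0;
	}

	static int uvd_v3_1_hw_fini(void *handle)
	{
		/* "hw_fini just suspends the hardware" */
		return uvd_v3_1_suspend(handle);
	}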



Sorry, still confused - what about amdgpu_vce_suspend being called from
vce_v4_0_suspend, for example? We don't want that to be called from hw_fini.
Can you maybe show a draft change of what you mean for one specific UVD or
VCE version?


Andrey




Thanks,
Lijo



Andrey





  if (r)
  return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c 
b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c

index 52d6de969f46..c108b8381795 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
@@ -212,6 +212,19 @@ static int uvd_v4_2_hw_fini(void *handle)
  {
  struct amdgpu_device *adev = (struct amdgpu_device *)handle;
  + cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+    if (RREG32(mmUVD_STATUS) != 0)
+    uvd_v4_2_stop(adev);
+
+    return 0;
+}
+
+static int uvd_v4_2_suspend(void *handle)
+{
+    int r;
+    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
  /*
   * Proper cleanups before halting the HW engine:
   *   - cancel the delayed idle work
@@ -236,17 +249,6 @@ static int uvd_v4_2_hw_fini(void *handle)
 AMD_CG_STATE_GATE);
  }
  -    if (RREG32(mmUVD_STATUS) != 0)
-    uvd_v4_2_stop(adev);
-
-    return 0;
-}
-
-static int uvd_v4_2_suspend(void *handle)
-{
-  

[PATCH 1/2] drm/amdgpu: Fix MMIO access page fault

2021-09-17 Thread Andrey Grodzovsky
Add more guards to MMIO access post device
unbind/unplug

Bug: https://bugs.archlinux.org/task/72092?project=1&order=dateopened&sort=desc&pagenum=1
Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c |  8 ++--
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c | 17 +++--
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
index e6e9ef50719e..a03c0fc8338f 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
@@ -22,6 +22,7 @@
  */
 
 #include 
+#include 
 
 #include "amdgpu.h"
 #include "amdgpu_vcn.h"
@@ -194,11 +195,14 @@ static int vcn_v2_0_sw_init(void *handle)
  */
 static int vcn_v2_0_sw_fini(void *handle)
 {
-   int r;
+   int r, idx;
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
volatile struct amdgpu_fw_shared *fw_shared = 
adev->vcn.inst->fw_shared_cpu_addr;
 
-   fw_shared->present_flag_0 = 0;
+   if (drm_dev_enter(&adev->ddev, &idx)) {
+   fw_shared->present_flag_0 = 0;
+   drm_dev_exit(idx);
+   }
 
amdgpu_virt_free_mm_table(adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
index 2e6b7913bf6c..1780ad1eacd6 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
@@ -22,6 +22,7 @@
  */
 
 #include 
+#include 
 
 #include "amdgpu.h"
 #include "amdgpu_vcn.h"
@@ -235,17 +236,21 @@ static int vcn_v2_5_sw_init(void *handle)
  */
 static int vcn_v2_5_sw_fini(void *handle)
 {
-   int i, r;
+   int i, r, idx;
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
volatile struct amdgpu_fw_shared *fw_shared;
 
-   for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
-   if (adev->vcn.harvest_config & (1 << i))
-   continue;
-   fw_shared = adev->vcn.inst[i].fw_shared_cpu_addr;
-   fw_shared->present_flag_0 = 0;
+   if (drm_dev_enter(&adev->ddev, &idx)) {
+   for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
+   if (adev->vcn.harvest_config & (1 << i))
+   continue;
+   fw_shared = adev->vcn.inst[i].fw_shared_cpu_addr;
+   fw_shared->present_flag_0 = 0;
+   }
+   drm_dev_exit(idx);
}
 
+
if (amdgpu_sriov_vf(adev))
amdgpu_virt_free_mm_table(adev);
 
-- 
2.25.1



[PATCH 2/2] drm/amdgpu: Fix resume failures when device is gone

2021-09-17 Thread Andrey Grodzovsky
Problem:
When the device goes into suspend and is unplugged during it,
all HW programming during resume fails, leading
to a bad SW state during the pci remove handling which follows.
Because the device is first resumed and only later removed,
we cannot rely on drm_dev_enter/exit here.

Fix:
Use the flag we use for PCIe error recovery to avoid
accessing registers. This allows the pm resume sequence
to complete successfully and the pci remove to finish.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index db21af5e84ed..04fb4e74fb20 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1522,6 +1522,10 @@ static int amdgpu_pmops_resume(struct device *dev)
struct amdgpu_device *adev = drm_to_adev(drm_dev);
int r;
 
+   /* Avoids registers access if device is physically gone */
+   if (!pci_device_is_present(adev->pdev))
+   adev->no_hw_access = true;
+
r = amdgpu_device_resume(drm_dev, true);
if (amdgpu_acpi_is_s0ix_active(adev))
adev->in_s0ix = false;
-- 
2.25.1



Re: [PATCH 1/2] drm/amdgpu: Fix MMIO access page fault

2021-09-17 Thread Andrey Grodzovsky

Note that it already has this protection.

Andrey

On 2021-09-17 8:04 a.m., James Zhu wrote:

typo. vcn_v3_0_sw_init   -->  vcn_v3_0_sw_fini

On 2021-09-17 8:00 a.m., James Zhu wrote:

Hi Andrey

Can you apply this improvement to vcn_v3_0_sw_init also?

With this addition, this patch is Reviewed-by: James Zhu 



Thanks & Best Regards!

James

On 2021-09-17 7:30 a.m., Andrey Grodzovsky wrote:

Add more guards to MMIO access post device
unbind/unplug

Bug: https://bugs.archlinux.org/task/72092?project=1&order=dateopened&sort=desc&pagenum=1

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c |  8 ++--
  drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c | 17 +++--
  2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c

index e6e9ef50719e..a03c0fc8338f 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
@@ -22,6 +22,7 @@
   */
    #include 
+#include 
    #include "amdgpu.h"
  #include "amdgpu_vcn.h"
@@ -194,11 +195,14 @@ static int vcn_v2_0_sw_init(void *handle)
   */
  static int vcn_v2_0_sw_fini(void *handle)
  {
-    int r;
+    int r, idx;
  struct amdgpu_device *adev = (struct amdgpu_device *)handle;
  volatile struct amdgpu_fw_shared *fw_shared = 
adev->vcn.inst->fw_shared_cpu_addr;

  -    fw_shared->present_flag_0 = 0;
+    if (drm_dev_enter(&adev->ddev, &idx)) {
+    fw_shared->present_flag_0 = 0;
+    drm_dev_exit(idx);
+    }
    amdgpu_virt_free_mm_table(adev);
  diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c

index 2e6b7913bf6c..1780ad1eacd6 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
@@ -22,6 +22,7 @@
   */
    #include 
+#include 
    #include "amdgpu.h"
  #include "amdgpu_vcn.h"
@@ -235,17 +236,21 @@ static int vcn_v2_5_sw_init(void *handle)
   */
  static int vcn_v2_5_sw_fini(void *handle)
  {
-    int i, r;
+    int i, r, idx;
  struct amdgpu_device *adev = (struct amdgpu_device *)handle;
  volatile struct amdgpu_fw_shared *fw_shared;
  -    for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
-    if (adev->vcn.harvest_config & (1 << i))
-    continue;
-    fw_shared = adev->vcn.inst[i].fw_shared_cpu_addr;
-    fw_shared->present_flag_0 = 0;
+    if (drm_dev_enter(&adev->ddev, &idx)) {
+    for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
+    if (adev->vcn.harvest_config & (1 << i))
+    continue;
+    fw_shared = adev->vcn.inst[i].fw_shared_cpu_addr;
+    fw_shared->present_flag_0 = 0;
+    }
+    drm_dev_exit(idx);
  }
  +
  if (amdgpu_sriov_vf(adev))
  amdgpu_virt_free_mm_table(adev);


Re: [PATCH 2/2] drm/amdgpu: Fix resume failures when device is gone

2021-09-17 Thread Andrey Grodzovsky

Ping

Andrey

On 2021-09-17 7:30 a.m., Andrey Grodzovsky wrote:

Problem:
When the device goes into suspend and is unplugged during it,
all HW programming during resume fails, leading
to a bad SW state during the pci remove handling which follows.
Because the device is first resumed and only later removed,
we cannot rely on drm_dev_enter/exit here.

Fix:
Use the flag we use for PCIe error recovery to avoid
accessing registers. This allows the pm resume sequence
to complete successfully and the pci remove to finish.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 
  1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index db21af5e84ed..04fb4e74fb20 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1522,6 +1522,10 @@ static int amdgpu_pmops_resume(struct device *dev)
struct amdgpu_device *adev = drm_to_adev(drm_dev);
int r;
  
+	/* Avoids registers access if device is physically gone */

+   if (!pci_device_is_present(adev->pdev))
+   adev->no_hw_access = true;
+
r = amdgpu_device_resume(drm_dev, true);
if (amdgpu_acpi_is_s0ix_active(adev))
adev->in_s0ix = false;


Re: [PATCH] drm/amdkfd: fix svm_migrate_fini warning

2021-09-21 Thread Andrey Grodzovsky

In any case, once you converge on a solution please include
the relevant ticket in the commit description -

https://gitlab.freedesktop.org/drm/amd/-/issues/1718

Andrey

On 2021-09-20 10:20 p.m., Felix Kuehling wrote:

Am 2021-09-20 um 5:55 p.m. schrieb Philip Yang:

Don't use devm_request_free_mem_region to alloc VRAM region for pgmap,

devm_... refers to a device manager that automatically releases
device-specific resources when a driver disconnects from a device. So
maybe that just means our devm_memunmap_pages and
devm_release_mem_region calls in svm_migrate_fini are redundant, and the
best solution is to remove those calls.

See Documentation/driver-api/driver-model/devres.rst
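
For illustration, the difference boils down to who owns the teardown (a
minimal sketch of the two styles, mirroring the patch below; not meant
as drop-in code):

	/* devm_ style: the driver core releases the pages automatically on
	 * driver detach, so an explicit devm_memunmap_pages() in our own
	 * fini path is redundant and can trigger the double-release
	 * warning quoted below. */
	r = devm_memremap_pages(adev->dev, pgmap);

	/* manual style: the driver owns the lifetime and cleans up itself,
	 * e.g. in svm_migrate_fini(): */
	r = memremap_pages(pgmap, dev_to_node(adev->dev));
	...
	memunmap_pages(pgmap);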

Regards,
   Felix



because driver_detach releases all device resource regions, then calls
amdgpu_device_fini_sw, which calls devm_memunmap_pages, generating the
warning trace below:

WARNING: CPU: 1 PID: 3646 at drivers/base/devres.c:795
devm_release_action+0x51/0x60
Call Trace:
 ? memunmap_pages+0x360/0x360
 svm_migrate_fini+0x2d/0x60 [amdgpu]
 kgd2kfd_device_exit+0x23/0xa0 [amdgpu]
 amdgpu_amdkfd_device_fini_sw+0x1d/0x30 [amdgpu]
 amdgpu_device_fini_sw+0x45/0x290 [amdgpu]
 amdgpu_driver_release_kms+0x12/0x30 [amdgpu]
 drm_dev_release+0x20/0x40 [drm]
 release_nodes+0x196/0x1e0
 device_release_driver_internal+0x104/0x1d0
 driver_detach+0x47/0x90
 bus_remove_driver+0x7a/0xd0
 pci_unregister_driver+0x3d/0x90
 amdgpu_exit+0x11/0x20 [amdgpu]
Trying to free nonexistent resource <07fc-07fd>

Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 13 ++---
  1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 21f745e0b86c..aa96767920a9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -880,7 +880,7 @@ int svm_migrate_init(struct amdgpu_device *adev)
 * should remove reserved size
 */
size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20);
-   res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
+   res = request_free_mem_region(&iomem_resource, size, "amdgpu_vram");
if (IS_ERR(res))
return -ENOMEM;
  
@@ -891,14 +891,13 @@ int svm_migrate_init(struct amdgpu_device *adev)

pgmap->ops = &svm_migrate_pgmap_ops;
pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev);
pgmap->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
-   r = devm_memremap_pages(adev->dev, pgmap);
+   r = memremap_pages(pgmap, dev_to_node(adev->dev));
if (IS_ERR(r)) {
pr_err("failed to register HMM device memory\n");
  
  		/* Disable SVM support capability */

pgmap->type = 0;
-   devm_release_mem_region(adev->dev, res->start,
-   res->end - res->start + 1);
+   release_mem_region(res->start, res->end - res->start + 1);
return PTR_ERR(r);
}
  
@@ -919,7 +918,7 @@ void svm_migrate_fini(struct amdgpu_device *adev)

if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev))
return;
  
-	devm_memunmap_pages(adev->dev, pgmap);

-   devm_release_mem_region(adev->dev, pgmap->range.start,
-   pgmap->range.end - pgmap->range.start + 1);
+   memunmap_pages(pgmap);
+   release_mem_region(pgmap->range.start,
+  pgmap->range.end - pgmap->range.start + 1);
  }


Re: [PATCH] drm/amdgpu: move amdgpu_virt_release_full_gpu to fini_early stage

2021-09-21 Thread Andrey Grodzovsky

Reviewed-by: Andrey Grodzovsky 

Andrey

On 2021-09-21 9:11 a.m., Chen, Guchun wrote:

[Public]

Ping...

Regards,
Guchun

-Original Message-
From: Chen, Guchun 
Sent: Saturday, September 18, 2021 2:09 PM
To: amd-gfx@lists.freedesktop.org; Koenig, Christian ; Pan, Xinhui 
; Deucher, Alexander ; Grodzovsky, Andrey 
; Liu, Monk 
Cc: Chen, Guchun ; Shi, Leslie 
Subject: [PATCH] drm/amdgpu: move amdgpu_virt_release_full_gpu to fini_early 
stage

adev->rmmio is set to NULL in amdgpu_device_unmap_mmio to prevent
access after pci_remove; however, in the SRIOV case, amdgpu_virt_release_full_gpu
will still use adev->rmmio after amdgpu_device_unmap_mmio.
This patch moves that SRIOV call earlier, to the fini_early stage.

Fixes: 07775fc13878 ("drm/amdgpu: Unmap all MMIO mappings")
Cc: Andrey Grodzovsky 
Signed-off-by: Leslie Shi 
Signed-off-by: Guchun Chen 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +
  1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f3da97086f7d..2a75c09c4884 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2810,6 +2810,11 @@ static int amdgpu_device_ip_fini_early(struct 
amdgpu_device *adev)
adev->ip_blocks[i].status.hw = false;
}
  
+	if (amdgpu_sriov_vf(adev)) {

+   if (amdgpu_virt_release_full_gpu(adev, false))
+   DRM_ERROR("failed to release exclusive mode on fini\n");
+   }
+
return 0;
  }
  
@@ -2870,10 +2875,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
  
  	amdgpu_ras_fini(adev);
  
-	if (amdgpu_sriov_vf(adev))

-   if (amdgpu_virt_release_full_gpu(adev, false))
-   DRM_ERROR("failed to release exclusive mode on fini\n");
-
return 0;
  }
  
--

2.17.1


Re: [PATCH v2 1/2] drm/amdkfd: handle svm migrate init error

2021-09-21 Thread Andrey Grodzovsky

Series is Acked-by: Andrey Grodzovsky 

Andrey

On 2021-09-21 2:53 p.m., Philip Yang wrote:

If svm migration init failed to create pgmap for device memory, set
pgmap type to 0 to disable device SVM support capability.

Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index dab290a4d19d..165e0ebb619d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -894,6 +894,9 @@ int svm_migrate_init(struct amdgpu_device *adev)
r = devm_memremap_pages(adev->dev, pgmap);
if (IS_ERR(r)) {
pr_err("failed to register HMM device memory\n");
+
+   /* Disable SVM support capability */
+   pgmap->type = 0;
devm_release_mem_region(adev->dev, res->start,
res->end - res->start + 1);
return PTR_ERR(r);


Re: [PATCH] drm/amd/amdgpu: Do irq_fini_hw after ip_fini_early

2021-09-29 Thread Andrey Grodzovsky

Can you test this change with the hotunplug tests in libdrm?
Since the tests are still disabled until the latest fixes propagate
to drm-next upstream, you will need to comment out
https://gitlab.freedesktop.org/mesa/drm/-/blob/main/tests/amdgpu/hotunplug_tests.c#L65
I recently fixed a few regressions in amdgpu, so hopefully there are no
more regressions which will interfere with your testing.

Andrey

On 2021-09-29 5:22 a.m., YuBiao Wang wrote:

Some IPs, such as SMU, need irq_put to perform hw_fini.
So move irq_fini_hw after ip_fini_early.

Signed-off-by: YuBiao Wang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4c8f2f4647c0..18e26a78ef82 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3864,10 +3864,10 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
amdgpu_ucode_sysfs_fini(adev);
sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
  
-	amdgpu_irq_fini_hw(adev);

-
amdgpu_device_ip_fini_early(adev);
  
+	amdgpu_irq_fini_hw(adev);

+
ttm_device_clear_dma_mappings(&adev->mman.bdev);
  
  	amdgpu_gart_dummy_page_fini(adev);


Re: [PATCH] drm/amdgpu: add missed write lock for pci detected state pci_channel_io_normal

2021-09-30 Thread Andrey Grodzovsky

On 2021-09-30 10:00 p.m., Guchun Chen wrote:


When a PCI error state pci_channel_io_normal is detected, it will
report PCI_ERS_RESULT_CAN_RECOVER status to the PCI driver, and the PCI driver
will continue the execution of the PCI resume callback report_resume via
pci_walk_bridge, and the callback will finally go into amdgpu_pci_resume,
where the write lock is released unconditionally without having been
acquired first.



Good catch, but the issue is even wider in scope - what about
drm_sched_resubmit_jobs
and drm_sched_start being called without the scheduler having been stopped
before? Better to put the entire scope
of code in this function under a flag that is set only in
pci_channel_io_frozen. As far as I remember,
we don't need to do anything in the case of pci_channel_io_normal.

Andrey




Fixes: c9a6b82f45e2 ("drm/amdgpu: Implement DPC recovery")
Signed-off-by: Guchun Chen 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bb5ad2b6ca13..12f822d51de2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5370,6 +5370,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev 
*pdev, pci_channel_sta
  
  	switch (state) {

case pci_channel_io_normal:
+   amdgpu_device_lock_adev(adev, NULL);
return PCI_ERS_RESULT_CAN_RECOVER;
/* Fatal error, prepare for slot reset */
case pci_channel_io_frozen:


Re: [PATCH] drm/amdgpu: add missed write lock for pci detected state pci_channel_io_normal

2021-10-01 Thread Andrey Grodzovsky
No, scheduler restart and device unlock must take place
in amdgpu_pci_resume (see struct pci_error_handlers for the various
states of PCI recovery). So just add a flag (probably in amdgpu_device)
so we can remember what pci_channel_state_t we came from (unfortunately
it's not passed to us in amdgpu_pci_resume) and, unless it's set, don't
do anything in amdgpu_pci_resume.
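
A minimal sketch of that flag idea (the member name here is
hypothetical; an actual patch may differ):

	/* hypothetical new member in struct amdgpu_device */
	pci_channel_state_t pci_channel_state;

	/* amdgpu_pci_error_detected(): remember where we came from */
	adev->pci_channel_state = state;

	/* amdgpu_pci_resume(): nothing was stopped unless we were frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;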


Andrey

On 2021-10-01 4:21 a.m., Chen, Guchun wrote:

[Public]

Hi Andrey,

Do you mean to move the code of drm_sched_resubmit_jobs and drm_sched_start in
amdgpu_pci_resume to amdgpu_pci_error_detected, under the case
pci_channel_io_frozen?
Then leave amdgpu_pci_resume as a null function, and in this way, we can drop
acquiring/releasing the write lock for the case of pci_channel_io_normal as well?

Regards,
Guchun

-Original Message-
From: Grodzovsky, Andrey 
Sent: Friday, October 1, 2021 10:22 AM
To: Chen, Guchun ; amd-gfx@lists.freedesktop.org; Koenig, Christian 
; Pan, Xinhui ; Deucher, Alexander 

Subject: Re: [PATCH] drm/amdgpu: add missed write lock for pci detected state 
pci_channel_io_normal

On 2021-09-30 10:00 p.m., Guchun Chen wrote:


When a PCI error state pci_channel_io_normal is detected, it will
report PCI_ERS_RESULT_CAN_RECOVER status to the PCI driver, and the PCI driver
will continue the execution of the PCI resume callback report_resume via
pci_walk_bridge, and the callback will finally go into amdgpu_pci_resume,
where the write lock is released unconditionally without having been
acquired first.


Good catch, but the issue is even wider in scope - what about
drm_sched_resubmit_jobs and drm_sched_start being called without the scheduler
having been stopped before? Better to put the entire scope of code in this
function under a flag that is set only in pci_channel_io_frozen. As far as I
remember, we don't need to do anything in the case of pci_channel_io_normal.

Andrey



Fixes: c9a6b82f45e2 ("drm/amdgpu: Implement DPC recovery")
Signed-off-by: Guchun Chen 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 +
   1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bb5ad2b6ca13..12f822d51de2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5370,6 +5370,7 @@ pci_ers_result_t
amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
   
   	switch (state) {

case pci_channel_io_normal:
+   amdgpu_device_lock_adev(adev, NULL);
return PCI_ERS_RESULT_CAN_RECOVER;
/* Fatal error, prepare for slot reset */
case pci_channel_io_frozen:


Re: Lockdep splat on killing a process

2021-10-01 Thread Andrey Grodzovsky
From what I see here you are supposed to have an actual deadlock and not only
a warning: sched_fence->finished is first signaled from within the
hw fence done callback (drm_sched_job_done_cb), but then again from
within its own callback (drm_sched_entity_kill_jobs_cb), so it
looks like the same fence object is recursively signaled twice. This leads
to an attempt to lock fence->lock a second time while it's already
locked. I don't see a need to call drm_sched_fence_finished from within
drm_sched_entity_kill_jobs_cb, as this callback is already registered
on the sched_fence->finished fence (entity->last_scheduled ==
s_fence->finished) and hence the signaling already took place.
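
For illustration, the recursion visible in the trace below is effectively:

	dma_fence_signal(hw fence)
	  -> drm_sched_job_done_cb()
	    -> drm_sched_fence_finished()
	      -> dma_fence_signal(s_fence->finished)    /* takes fence->lock */
	        -> drm_sched_entity_kill_jobs_cb()
	          -> drm_sched_fence_finished()
	            -> dma_fence_signal(...)             /* fence->lock again */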


Andrey

On 2021-10-01 6:50 a.m., Christian König wrote:

Hey, Andrey.

while investigating some memory management problems I've got the
lockdep splat below.


Looks like something is wrong with drm_sched_entity_kill_jobs_cb(), 
can you investigate?


Thanks,
Christian.

[11176.741052] 
[11176.741056] WARNING: possible recursive locking detected
[11176.741060] 5.15.0-rc1-00031-g9d546d600800 #171 Not tainted
[11176.741066] 
[11176.741070] swapper/12/0 is trying to acquire lock:
[11176.741074] 9c337ed175a8 (&fence->lock){-.-.}-{3:3}, at: 
dma_fence_signal+0x28/0x80

[11176.741088]
   but task is already holding lock:
[11176.741092] 9c337ed172a8 (&fence->lock){-.-.}-{3:3}, at: 
dma_fence_signal+0x28/0x80

[11176.741100]
   other info that might help us debug this:
[11176.741104]  Possible unsafe locking scenario:

[11176.741108]    CPU0
[11176.741110]    
[11176.741113]   lock(&fence->lock);
[11176.741118]   lock(&fence->lock);
[11176.741122]
    *** DEADLOCK ***

[11176.741125]  May be due to missing lock nesting notation

[11176.741128] 2 locks held by swapper/12/0:
[11176.741133]  #0: 9c339c30f768 
(&ring->fence_drv.lock){-.-.}-{3:3}, at: dma_fence_signal+0x28/0x80
[11176.741142]  #1: 9c337ed172a8 (&fence->lock){-.-.}-{3:3}, at: 
dma_fence_signal+0x28/0x80

[11176.741151]
   stack backtrace:
[11176.741155] CPU: 12 PID: 0 Comm: swapper/12 Not tainted 
5.15.0-rc1-00031-g9d546d600800 #171
[11176.741160] Hardware name: System manufacturer System Product 
Name/PRIME X399-A, BIOS 0808 10/12/2018

[11176.741165] Call Trace:
[11176.741169]  
[11176.741173]  dump_stack_lvl+0x5b/0x74
[11176.741181]  dump_stack+0x10/0x12
[11176.741186]  __lock_acquire.cold+0x208/0x2df
[11176.741197]  lock_acquire+0xc6/0x2d0
[11176.741204]  ? dma_fence_signal+0x28/0x80
[11176.741212]  _raw_spin_lock_irqsave+0x4d/0x70
[11176.741219]  ? dma_fence_signal+0x28/0x80
[11176.741225]  dma_fence_signal+0x28/0x80
[11176.741230]  drm_sched_fence_finished+0x12/0x20 [gpu_sched]
[11176.741240]  drm_sched_entity_kill_jobs_cb+0x1c/0x50 [gpu_sched]
[11176.741248]  dma_fence_signal_timestamp_locked+0xac/0x1a0
[11176.741254]  dma_fence_signal+0x3b/0x80
[11176.741260]  drm_sched_fence_finished+0x12/0x20 [gpu_sched]
[11176.741268]  drm_sched_job_done.isra.0+0x7f/0x1a0 [gpu_sched]
[11176.741277]  drm_sched_job_done_cb+0x12/0x20 [gpu_sched]
[11176.741284]  dma_fence_signal_timestamp_locked+0xac/0x1a0
[11176.741290]  dma_fence_signal+0x3b/0x80
[11176.741296]  amdgpu_fence_process+0xd1/0x140 [amdgpu]
[11176.741504]  sdma_v4_0_process_trap_irq+0x8c/0xb0 [amdgpu]
[11176.741731]  amdgpu_irq_dispatch+0xce/0x250 [amdgpu]
[11176.741954]  amdgpu_ih_process+0x81/0x100 [amdgpu]
[11176.742174]  amdgpu_irq_handler+0x26/0xa0 [amdgpu]
[11176.742393]  __handle_irq_event_percpu+0x4f/0x2c0
[11176.742402]  handle_irq_event_percpu+0x33/0x80
[11176.742408]  handle_irq_event+0x39/0x60
[11176.742414]  handle_edge_irq+0x93/0x1d0
[11176.742419]  __common_interrupt+0x50/0xe0
[11176.742426]  common_interrupt+0x80/0x90
[11176.742431]  
[11176.742436]  asm_common_interrupt+0x1e/0x40
[11176.742442] RIP: 0010:cpuidle_enter_state+0xff/0x470
[11176.742449] Code: 0f a3 05 04 54 24 01 0f 82 70 02 00 00 31 ff e8 
37 5d 6f ff 80 7d d7 00 0f 85 e9 01 00 00 e8 58 a2 7f ff fb 66 0f 1f 
44 00 00 <45> 85 ff 0f 88 01 01 00 00 49 63 c7 4c 2b 75 c8 48 8d 14 40 
48 8d

[11176.742455] RSP: 0018:b6970021fe48 EFLAGS: 0202
[11176.742461] RAX: 0059be25 RBX: 0002 RCX: 

[11176.742465] RDX:  RSI:  RDI: 
9efeed78
[11176.742470] RBP: b6970021fe80 R08: 0001 R09: 
0001
[11176.742473] R10: 0001 R11: 0001 R12: 
9c3350b0e800
[11176.742477] R13: a00e9680 R14: 0a2a49ada060 R15: 
0002

[11176.742483]  ? cpuidle_enter_state+0xf8/0x470
[11176.742489]  ? cpuidle_enter_state+0xf8/0x470
[11176.742495]  cpuidle_enter+0x2e/0x40
[11176.742500]  call_cpuidle+0x23/0x40
[11176.742506]  do_idle+0x201/0x280
[11176.742512]  cpu_startup_entry+0x20/0x30
[11176.742517]  start_secondary+0x11f/0x160
[11176.742523]  secondary_startup_64_no_verify+0xb0/0xbb



Re: Lockdep splat on killing a process

2021-10-04 Thread Andrey Grodzovsky
I see my confusion - we hang all unsubmitted jobs on the last job submitted
to HW.
Yes, in this case rescheduling to a different thread context will indeed
avoid the splat, but
the schedule work cannot be done for each dependency signalling; rather it
should be done the way we do it
for ttm_bo_delayed_delete, with a list of dependencies to signal.
Otherwise some of the scheduled
work will be dropped because a previous invocation is still pending execution.
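
For illustration, the irq_work-based deferral mentioned below might look
roughly like this (a hypothetical sketch - the finished_work member and
helper name are made up here, not the actual fix):

	#include <linux/irq_work.h>

	/* assumes a new irq_work member, e.g. s_fence->finished_work */
	static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
	{
		struct drm_sched_fence *s_fence =
			container_of(wrk, struct drm_sched_fence, finished_work);

		/* now runs outside the signalling fence's locked section */
		drm_sched_fence_finished(s_fence);
	}

	static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
						  struct dma_fence_cb *cb)
	{
		struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
							 finish_cb);

		/* defer the nested signal instead of taking fence->lock
		 * recursively right here */
		init_irq_work(&job->s_fence->finished_work,
			      drm_sched_entity_kill_jobs_irq_work);
		irq_work_queue(&job->s_fence->finished_work);
	}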

Andrey

On 2021-10-04 4:14 a.m., Christian König wrote:

The problem is a bit different.

The callback is on the dependent fence, while we need to signal the 
scheduler fence.


Daniel is right that this needs an irq_work struct to handle this 
properly.


Christian.

On 01.10.21 17:10, Andrey Grodzovsky wrote:
From what I see here you are supposed to have an actual deadlock and not only
a warning: sched_fence->finished is first signaled from within the
hw fence done callback (drm_sched_job_done_cb), but then again from
within its own callback (drm_sched_entity_kill_jobs_cb), so it
looks like the same fence object is recursively signaled twice. This
leads to an attempt to lock fence->lock a second time while it's already
locked. I don't see a need to call drm_sched_fence_finished from
within drm_sched_entity_kill_jobs_cb, as this callback is already registered
on the sched_fence->finished fence (entity->last_scheduled ==
s_fence->finished) and hence the signaling already took place.


Andrey

On 2021-10-01 6:50 a.m., Christian König wrote:

Hey, Andrey.

while investigating some memory management problems I've got the
lockdep splat below.


Looks like something is wrong with drm_sched_entity_kill_jobs_cb(), 
can you investigate?


Thanks,
Christian.

[11176.741052] 
[11176.741056] WARNING: possible recursive locking detected
[11176.741060] 5.15.0-rc1-00031-g9d546d600800 #171 Not tainted
[11176.741066] 
[11176.741070] swapper/12/0 is trying to acquire lock:
[11176.741074] 9c337ed175a8 (&fence->lock){-.-.}-{3:3}, at: 
dma_fence_signal+0x28/0x80

[11176.741088]
   but task is already holding lock:
[11176.741092] 9c337ed172a8 (&fence->lock){-.-.}-{3:3}, at: 
dma_fence_signal+0x28/0x80

[11176.741100]
   other info that might help us debug this:
[11176.741104]  Possible unsafe locking scenario:

[11176.741108]    CPU0
[11176.741110]    
[11176.741113]   lock(&fence->lock);
[11176.741118]   lock(&fence->lock);
[11176.741122]
    *** DEADLOCK ***

[11176.741125]  May be due to missing lock nesting notation

[11176.741128] 2 locks held by swapper/12/0:
[11176.741133]  #0: 9c339c30f768 
(&ring->fence_drv.lock){-.-.}-{3:3}, at: dma_fence_signal+0x28/0x80
[11176.741142]  #1: 9c337ed172a8 (&fence->lock){-.-.}-{3:3}, at: 
dma_fence_signal+0x28/0x80

[11176.741151]
   stack backtrace:
[11176.741155] CPU: 12 PID: 0 Comm: swapper/12 Not tainted 
5.15.0-rc1-00031-g9d546d600800 #171
[11176.741160] Hardware name: System manufacturer System Product 
Name/PRIME X399-A, BIOS 0808 10/12/2018

[11176.741165] Call Trace:
[11176.741169]  
[11176.741173]  dump_stack_lvl+0x5b/0x74
[11176.741181]  dump_stack+0x10/0x12
[11176.741186]  __lock_acquire.cold+0x208/0x2df
[11176.741197]  lock_acquire+0xc6/0x2d0
[11176.741204]  ? dma_fence_signal+0x28/0x80
[11176.741212]  _raw_spin_lock_irqsave+0x4d/0x70
[11176.741219]  ? dma_fence_signal+0x28/0x80
[11176.741225]  dma_fence_signal+0x28/0x80
[11176.741230]  drm_sched_fence_finished+0x12/0x20 [gpu_sched]
[11176.741240]  drm_sched_entity_kill_jobs_cb+0x1c/0x50 [gpu_sched]
[11176.741248]  dma_fence_signal_timestamp_locked+0xac/0x1a0
[11176.741254]  dma_fence_signal+0x3b/0x80
[11176.741260]  drm_sched_fence_finished+0x12/0x20 [gpu_sched]
[11176.741268]  drm_sched_job_done.isra.0+0x7f/0x1a0 [gpu_sched]
[11176.741277]  drm_sched_job_done_cb+0x12/0x20 [gpu_sched]
[11176.741284]  dma_fence_signal_timestamp_locked+0xac/0x1a0
[11176.741290]  dma_fence_signal+0x3b/0x80
[11176.741296]  amdgpu_fence_process+0xd1/0x140 [amdgpu]
[11176.741504]  sdma_v4_0_process_trap_irq+0x8c/0xb0 [amdgpu]
[11176.741731]  amdgpu_irq_dispatch+0xce/0x250 [amdgpu]
[11176.741954]  amdgpu_ih_process+0x81/0x100 [amdgpu]
[11176.742174]  amdgpu_irq_handler+0x26/0xa0 [amdgpu]
[11176.742393]  __handle_irq_event_percpu+0x4f/0x2c0
[11176.742402]  handle_irq_event_percpu+0x33/0x80
[11176.742408]  handle_irq_event+0x39/0x60
[11176.742414]  handle_edge_irq+0x93/0x1d0
[11176.742419]  __common_interrupt+0x50/0xe0
[11176.742426]  common_interrupt+0x80/0x90
[11176.742431]  
[11176.742436]  asm_common_interrupt+0x1e/0x40
[11176.742442] RIP: 0010:cpuidle_enter_state+0xff/0x470
[11176.742449] Code: 0f a3 05 04 54 24 01 0f 82 70 02 00 00 31 ff e8 
37 5d 6f ff 80 7d d7 00 0f 85 e9 01 00 00 e8 58 a2 7f ff fb 66 0f 1f 
44 00 00 <45> 85 ff 0f 88 01 01 00 00 49 63 c7 4c 2b 75 c8 48 8d 14 
40 4

Re: [PATCH] drm/amdgpu: handle the case of pci_channel_io_frozen only in amdgpu_pci_resume

2021-10-04 Thread Andrey Grodzovsky



On 2021-10-02 11:18 a.m., Guchun Chen wrote:

In the current code, when a PCI error state pci_channel_io_normal is detected,
it will report PCI_ERS_RESULT_CAN_RECOVER status to the PCI driver, and the PCI
driver will continue the execution of the PCI resume callback report_resume via
pci_walk_bridge, and the callback will finally go into amdgpu_pci_resume,
where the write lock is released unconditionally without acquiring
such a lock first. In this case, a deadlock will happen when other threads
start to acquire the read lock.

To fix this, add a member in the amdgpu_device structure to cache the
pci_channel_state, and only continue the execution in amdgpu_pci_resume
when it's pci_channel_io_frozen.

Fixes: c9a6b82f45e2 ("drm/amdgpu: Implement DPC recovery")
Suggested-by: Andrey Grodzovsky 
Signed-off-by: Guchun Chen 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h| 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++
  2 files changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f4bceb2624fb..720d0ccecfe0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1094,6 +1094,7 @@ struct amdgpu_device {
  
  	boolno_hw_access;

struct pci_saved_state  *pci_state;
+   pci_channel_state_t cached_state;



I would give a more descriptive name to this (e.g. pci_channel_state).
Other than that, Reviewed-by: Andrey Grodzovsky 

Andrey


  
  	struct amdgpu_reset_control *reset_cntl;

uint32_t
ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bb5ad2b6ca13..1aaeb4b30edc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5368,6 +5368,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev 
*pdev, pci_channel_sta
return PCI_ERS_RESULT_DISCONNECT;
}
  
+	adev->cached_state = state;

+
switch (state) {
case pci_channel_io_normal:
return PCI_ERS_RESULT_CAN_RECOVER;
@@ -5510,6 +5512,10 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
  
  	DRM_INFO("PCI error: resume callback!!\n");
  
+	/* Only continue execution for the case of pci_channel_io_frozen */

+   if (adev->cached_state != pci_channel_io_frozen)
+   return;
+
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
  


Re: [PATCH 1/1] drm/amdgpu: recover gart table at resume

2021-10-19 Thread Andrey Grodzovsky



On 2021-10-19 9:22 a.m., Nirmoy Das wrote:

Get rid of pin/unpin and of evicting and swapping back the gart
page table, which should make things less likely to break.


+Christian

Could you guys also clarify what exactly the stability issues are
that this fixes?


Andrey




Also remove the 2nd call to amdgpu_device_evict_resources(),
as we don't need it.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 -
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 16 
  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 17 +
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 16 
  4 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41ce86244144..22ff229ab981 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3941,11 +3941,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
amdgpu_fence_driver_hw_fini(adev);
  
  	amdgpu_device_ip_suspend_phase2(adev);

-   /* This second call to evict device resources is to evict
-* the gart page table using the CPU.
-*/
-   amdgpu_device_evict_resources(adev);
-
return 0;
  }
  
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c

index 3ec5ff5a6dbe..18e3f3c5aae6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -992,9 +992,16 @@ static int gmc_v10_0_gart_enable(struct amdgpu_device 
*adev)
return -EINVAL;
}
  
-	r = amdgpu_gart_table_vram_pin(adev);

-   if (r)
-   return r;
+   if (!adev->in_suspend) {
+   r = amdgpu_gart_table_vram_pin(adev);
+   if (r)
+   return r;
+   } else {
+   r = amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev,
+   TTM_PL_TT));
+   if (r)
+   return r;
+   }
  
  	r = adev->gfxhub.funcs->gart_enable(adev);

if (r)
@@ -1062,7 +1069,8 @@ static void gmc_v10_0_gart_disable(struct amdgpu_device 
*adev)
  {
adev->gfxhub.funcs->gart_disable(adev);
adev->mmhub.funcs->gart_disable(adev);
-   amdgpu_gart_table_vram_unpin(adev);
+   if (!adev->in_suspend)
+   amdgpu_gart_table_vram_unpin(adev);
  }
  
  static int gmc_v10_0_hw_fini(void *handle)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index 492ebed2915b..0ef50ad3d7d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -837,9 +837,17 @@ static int gmc_v8_0_gart_enable(struct amdgpu_device *adev)
dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
return -EINVAL;
}
-   r = amdgpu_gart_table_vram_pin(adev);
-   if (r)
-   return r;
+
+   if (!adev->in_suspend) {
+   r = amdgpu_gart_table_vram_pin(adev);
+   if (r)
+   return r;
+   } else {
+   r = amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev,
+   TTM_PL_TT));
+   if (r)
+   return r;
+   }
  
  	table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
  
@@ -992,7 +1000,8 @@ static void gmc_v8_0_gart_disable(struct amdgpu_device *adev)

tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
WREG32(mmVM_L2_CNTL, tmp);
WREG32(mmVM_L2_CNTL2, 0);
-   amdgpu_gart_table_vram_unpin(adev);
+   if (!adev->in_suspend)
+   amdgpu_gart_table_vram_unpin(adev);
  }
  
  /**

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index cb82404df534..1bbcefd53974 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1714,9 +1714,16 @@ static int gmc_v9_0_gart_enable(struct amdgpu_device 
*adev)
return -EINVAL;
}
  
-	r = amdgpu_gart_table_vram_pin(adev);

-   if (r)
-   return r;
+   if (!adev->in_suspend) {
+   r = amdgpu_gart_table_vram_pin(adev);
+   if (r)
+   return r;
+   } else {
+   r = amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev,
+   TTM_PL_TT));
+   if (r)
+   return r;
+   }
  
  	r = adev->gfxhub.funcs->gart_enable(adev);

if (r)
@@ -1793,7 +1800,8 @@ static void gmc_v9_0_gart_disable(struct amdgpu_device 
*adev)
  {
adev->gfxhub.funcs->gart_disable(adev);
adev->mmhub.funcs->gart_disable(adev);
-   amdgpu_gart_table_vram_unpin(adev);
+   if (!adev->in_suspend)
+   amdgpu_gar

Re: [PATCH 1/1] drm/amdgpu: recover gart table at resume

2021-10-19 Thread Andrey Grodzovsky



On 2021-10-19 11:54 a.m., Christian König wrote:

Am 19.10.21 um 17:41 schrieb Andrey Grodzovsky:


On 2021-10-19 9:22 a.m., Nirmoy Das wrote:

Get rid of pin/unpin and of evicting and swapping back the gart
page table, which should make things less likely to break.


+Christian

Could you guys also clarify what exactly the stability issues are
that this fixes?


When we evict the GART table during suspend it is theoretically
possible that we run into an OOM situation.

But since the OOM killer and the buffer move functions are already
disabled, that is basically not gracefully handleable.

When we just keep the GART pinned all the time and restore its
content during resume from the metadata, we should be able to avoid any
memory allocation for the move.
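
Conceptually, that "restore from metadata" pass amounts to re-writing the
GART entries from the GTT manager's bookkeeping instead of moving the
table itself; roughly (a sketch only, not the actual
amdgpu_gtt_mgr_recover() implementation, and the node list name is
hypothetical):

	struct amdgpu_gtt_node *node;

	/* re-bind every BO the manager still considers allocated;
	 * no buffer move and no memory allocation is needed for this */
	list_for_each_entry(node, &mgr->node_list, list)
		amdgpu_ttm_recover_gart(node->tbo);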


Christian.



Got it.

Andrey






Andrey




Also remove the 2nd call to amdgpu_device_evict_resources(),
as we don't need it.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 -
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 16 
  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 17 +
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 16 
  4 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 41ce86244144..22ff229ab981 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3941,11 +3941,6 @@ int amdgpu_device_suspend(struct drm_device 
*dev, bool fbcon)

  amdgpu_fence_driver_hw_fini(adev);
    amdgpu_device_ip_suspend_phase2(adev);
-    /* This second call to evict device resources is to evict
- * the gart page table using the CPU.
- */
-    amdgpu_device_evict_resources(adev);
-
  return 0;
  }
  diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c

index 3ec5ff5a6dbe..18e3f3c5aae6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -992,9 +992,16 @@ static int gmc_v10_0_gart_enable(struct 
amdgpu_device *adev)

  return -EINVAL;
  }
  -    r = amdgpu_gart_table_vram_pin(adev);
-    if (r)
-    return r;
+    if (!adev->in_suspend) {
+    r = amdgpu_gart_table_vram_pin(adev);
+    if (r)
+    return r;
+    } else {
+    r = amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev,
+    TTM_PL_TT));
+    if (r)
+    return r;
+    }
    r = adev->gfxhub.funcs->gart_enable(adev);
  if (r)
@@ -1062,7 +1069,8 @@ static void gmc_v10_0_gart_disable(struct 
amdgpu_device *adev)

  {
  adev->gfxhub.funcs->gart_disable(adev);
  adev->mmhub.funcs->gart_disable(adev);
-    amdgpu_gart_table_vram_unpin(adev);
+    if (!adev->in_suspend)
+    amdgpu_gart_table_vram_unpin(adev);
  }
    static int gmc_v10_0_hw_fini(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c

index 492ebed2915b..0ef50ad3d7d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -837,9 +837,17 @@ static int gmc_v8_0_gart_enable(struct 
amdgpu_device *adev)

  dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
  return -EINVAL;
  }
-    r = amdgpu_gart_table_vram_pin(adev);
-    if (r)
-    return r;
+
+    if (!adev->in_suspend) {
+    r = amdgpu_gart_table_vram_pin(adev);
+    if (r)
+    return r;
+    } else {
+    r = amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev,
+    TTM_PL_TT));
+    if (r)
+    return r;
+    }
    table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
  @@ -992,7 +1000,8 @@ static void gmc_v8_0_gart_disable(struct 
amdgpu_device *adev)

  tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
  WREG32(mmVM_L2_CNTL, tmp);
  WREG32(mmVM_L2_CNTL2, 0);
-    amdgpu_gart_table_vram_unpin(adev);
+    if (!adev->in_suspend)
+    amdgpu_gart_table_vram_unpin(adev);
  }
    /**
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

index cb82404df534..1bbcefd53974 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1714,9 +1714,16 @@ static int gmc_v9_0_gart_enable(struct 
amdgpu_device *adev)

  return -EINVAL;
  }
  -    r = amdgpu_gart_table_vram_pin(adev);
-    if (r)
-    return r;
+    if (!adev->in_suspend) {
+    r = amdgpu_gart_table_vram_pin(adev);
+    if (r)
+    return r;
+    } else {
+    r = amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev,
+    TTM_PL_TT));
+    if (r)
+    return r;
+    }
    r = adev->gfxhub.funcs->gart_enable(adev);
  if (r)
@@ -1793,7 +1800,8 @@ static void gmc_v9_0_gart_disable(struct 
amdgpu_device *adev)

  {
  adev->gfxhub.funcs->gart_disable(adev);
  adev->mmhub.funcs->gart_disable(adev);
-    amdgpu_gart_table_vram_unpin(adev);
+    if (!adev->in_suspend)
+    amdgpu_gart_table_vram_unpin(adev);
  }

Re: Lockdep splat on killing a process

2021-10-20 Thread Andrey Grodzovsky

On 2021-10-04 4:14 a.m., Christian König wrote:


The problem is a bit different.

The callback is on the dependent fence, while we need to signal the 
scheduler fence.


Daniel is right that this needs an irq_work struct to handle this 
properly.


Christian.



So we had some discussions with Christian regarding irq_work and agreed
I should look into doing it, but stepping back for a sec -


Why do we insist on calling the dma_fence cb with fence->lock held? Is
it because of dma_fence_add_callback?
Because we first test for DMA_FENCE_FLAG_SIGNALED_BIT and only after
that take fence->lock? If so, can't we
move the DMA_FENCE_FLAG_SIGNALED_BIT check inside the locked section?
Because if, in theory,
we could call the cb with fence->lock unlocked (i.e. this kind of
iteration:
https://elixir.bootlin.com/linux/v5.15-rc6/source/drivers/gpu/drm/ttm/ttm_resource.c#L117)

we wouldn't have the lockdep splat. And in general, is it really
the correct approach to call third-party code from a callback with a
spinlock held? We don't know what the cb does inside,
and I don't see any explicit restrictions in the documentation of
dma_fence_func_t on what can and cannot be done there.


Andrey




Am 01.10.21 um 17:10 schrieb Andrey Grodzovsky:
From what I see here you are supposed to get an actual deadlock and not
only a warning: sched_fence->finished is first signaled from within the
hw fence done callback (drm_sched_job_done_cb), but then again from
within its own callback (drm_sched_entity_kill_jobs_cb), so it
looks like the same fence object is recursively signaled twice. This
leads to an attempt to take fence->lock a second time while it's already
held. I don't see a need to call drm_sched_fence_finished from
within drm_sched_entity_kill_jobs_cb, as this callback is already registered
on the sched_fence->finished fence (entity->last_scheduled ==
s_fence->finished) and hence the signaling already took place.


Andrey

On 2021-10-01 6:50 a.m., Christian König wrote:

Hey, Andrey.

while investigating some memory management problems I've got the
lockdep splat below.


Looks like something is wrong with drm_sched_entity_kill_jobs_cb(), 
can you investigate?


Thanks,
Christian.

[11176.741052] 
[11176.741056] WARNING: possible recursive locking detected
[11176.741060] 5.15.0-rc1-00031-g9d546d600800 #171 Not tainted
[11176.741066] 
[11176.741070] swapper/12/0 is trying to acquire lock:
[11176.741074] 9c337ed175a8 (&fence->lock){-.-.}-{3:3}, at: 
dma_fence_signal+0x28/0x80

[11176.741088]
   but task is already holding lock:
[11176.741092] 9c337ed172a8 (&fence->lock){-.-.}-{3:3}, at: 
dma_fence_signal+0x28/0x80

[11176.741100]
   other info that might help us debug this:
[11176.741104]  Possible unsafe locking scenario:

[11176.741108]    CPU0
[11176.741110]    
[11176.741113]   lock(&fence->lock);
[11176.741118]   lock(&fence->lock);
[11176.741122]
    *** DEADLOCK ***

[11176.741125]  May be due to missing lock nesting notation

[11176.741128] 2 locks held by swapper/12/0:
[11176.741133]  #0: 9c339c30f768 
(&ring->fence_drv.lock){-.-.}-{3:3}, at: dma_fence_signal+0x28/0x80
[11176.741142]  #1: 9c337ed172a8 (&fence->lock){-.-.}-{3:3}, at: 
dma_fence_signal+0x28/0x80

[11176.741151]
   stack backtrace:
[11176.741155] CPU: 12 PID: 0 Comm: swapper/12 Not tainted 
5.15.0-rc1-00031-g9d546d600800 #171
[11176.741160] Hardware name: System manufacturer System Product 
Name/PRIME X399-A, BIOS 0808 10/12/2018

[11176.741165] Call Trace:
[11176.741169]  
[11176.741173]  dump_stack_lvl+0x5b/0x74
[11176.741181]  dump_stack+0x10/0x12
[11176.741186]  __lock_acquire.cold+0x208/0x2df
[11176.741197]  lock_acquire+0xc6/0x2d0
[11176.741204]  ? dma_fence_signal+0x28/0x80
[11176.741212]  _raw_spin_lock_irqsave+0x4d/0x70
[11176.741219]  ? dma_fence_signal+0x28/0x80
[11176.741225]  dma_fence_signal+0x28/0x80
[11176.741230]  drm_sched_fence_finished+0x12/0x20 [gpu_sched]
[11176.741240]  drm_sched_entity_kill_jobs_cb+0x1c/0x50 [gpu_sched]
[11176.741248]  dma_fence_signal_timestamp_locked+0xac/0x1a0
[11176.741254]  dma_fence_signal+0x3b/0x80
[11176.741260]  drm_sched_fence_finished+0x12/0x20 [gpu_sched]
[11176.741268]  drm_sched_job_done.isra.0+0x7f/0x1a0 [gpu_sched]
[11176.741277]  drm_sched_job_done_cb+0x12/0x20 [gpu_sched]
[11176.741284]  dma_fence_signal_timestamp_locked+0xac/0x1a0
[11176.741290]  dma_fence_signal+0x3b/0x80
[11176.741296]  amdgpu_fence_process+0xd1/0x140 [amdgpu]
[11176.741504]  sdma_v4_0_process_trap_irq+0x8c/0xb0 [amdgpu]
[11176.741731]  amdgpu_irq_dispatch+0xce/0x250 [amdgpu]
[11176.741954]  amdgpu_ih_process+0x81/0x100 [amdgpu]
[11176.742174]  amdgpu_irq_handler+0x26/0xa0 [amdgpu]
[11176.742393]  __handle_irq_event_percpu+0x4f/0x2c0
[11176.742402]  handle_irq_ev

Re: FW: [PATCH 1/3] drm/amdgpu: fix a potential memory leak in amdgpu_device_fini_sw()

2021-10-21 Thread Andrey Grodzovsky

On 2021-10-21 3:19 a.m., Yu, Lang wrote:


[AMD Official Use Only]




-Original Message-
From: Yu, Lang 
Sent: Thursday, October 21, 2021 3:18 PM
To: Grodzovsky, Andrey 
Cc: Deucher, Alexander ; Koenig, Christian
; Huang, Ray ; Yu, Lang

Subject: [PATCH 1/3] drm/amdgpu: fix a potential memory leak in
amdgpu_device_fini_sw()

amdgpu_fence_driver_sw_fini() should be executed before
amdgpu_device_ip_fini(), otherwise the fence driver resources won't be properly
freed because adev->rings has already been torn down.



Can you clarify where exactly the memleak happens?

Andrey




Fixes: 72c8c97b1522 ("drm/amdgpu: Split amdgpu_device_fini into early and late")

Signed-off-by: Lang Yu 
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41ce86244144..5654c4790773 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3843,8 +3843,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device
*adev)

void amdgpu_device_fini_sw(struct amdgpu_device *adev)  {
-   amdgpu_device_ip_fini(adev);
amdgpu_fence_driver_sw_fini(adev);
+   amdgpu_device_ip_fini(adev);
release_firmware(adev->firmware.gpu_info_fw);
adev->firmware.gpu_info_fw = NULL;
adev->accel_working = false;
--
2.25.1


Re: FW: [PATCH 1/3] drm/amdgpu: fix a potential memory leak in amdgpu_device_fini_sw()

2021-10-22 Thread Andrey Grodzovsky



On 2021-10-21 7:33 p.m., Yu, Lang wrote:

[AMD Official Use Only]




-Original Message-
From: Grodzovsky, Andrey 
Sent: Thursday, October 21, 2021 11:18 PM
To: Yu, Lang ; amd-gfx@lists.freedesktop.org
Subject: Re: FW: [PATCH 1/3] drm/amdgpu: fix a potential memory leak in
amdgpu_device_fini_sw()

On 2021-10-21 3:19 a.m., Yu, Lang wrote:


[AMD Official Use Only]




-Original Message-
From: Yu, Lang 
Sent: Thursday, October 21, 2021 3:18 PM
To: Grodzovsky, Andrey 
Cc: Deucher, Alexander ; Koenig, Christian
; Huang, Ray ; Yu, Lang

Subject: [PATCH 1/3] drm/amdgpu: fix a potential memory leak in
amdgpu_device_fini_sw()

amdgpu_fence_driver_sw_fini() should be executed before
amdgpu_device_ip_fini(), otherwise the fence driver resources won't be
properly freed because adev->rings has already been torn down.


Can you clarify where exactly the memleak happens?

Andrey

See amdgpu_fence_driver_sw_fini(), ring->fence_drv.fences will only be freed
when adev->rings[i] is not NULL.

void amdgpu_fence_driver_sw_fini(struct amdgpu_device *adev)
{
unsigned int i, j;

for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
struct amdgpu_ring *ring = adev->rings[i];

if (!ring || !ring->fence_drv.initialized)
continue;

if (!ring->no_scheduler)
drm_sched_fini(&ring->sched);

for (j = 0; j <= ring->fence_drv.num_fences_mask; ++j)
dma_fence_put(ring->fence_drv.fences[j]);
kfree(ring->fence_drv.fences);
ring->fence_drv.fences = NULL;
ring->fence_drv.initialized = false;
}
}

If amdgpu_device_ip_fini() is executed before amdgpu_fence_driver_sw_fini(),
it will call gfx_vX_0_sw_fini(), which in turn calls
amdgpu_ring_fini() and sets adev->rings[i] to NULL.
Nothing will then be freed in amdgpu_fence_driver_sw_fini(),
so the ring->fence_drv.fences memory is leaked!

void amdgpu_ring_fini(struct amdgpu_ring *ring)
{
..
ring->adev->rings[ring->idx] = NULL;
}

Regards,
Lang



Got it, Looks good to me.

Reviewed-by: Andrey Grodzovsky 

Andrey






Fixes: 72c8c97b1522 ("drm/amdgpu: Split amdgpu_device_fini into early
and late")

Signed-off-by: Lang Yu 
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41ce86244144..5654c4790773 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3843,8 +3843,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device
*adev)

void amdgpu_device_fini_sw(struct amdgpu_device *adev)  {
-   amdgpu_device_ip_fini(adev);
amdgpu_fence_driver_sw_fini(adev);
+   amdgpu_device_ip_fini(adev);
release_firmware(adev->firmware.gpu_info_fw);
adev->firmware.gpu_info_fw = NULL;
adev->accel_working = false;
--
2.25.1


Re: [PATCH] drm/amd/amdgpu: fix potential bad job hw_fence underflow

2021-10-22 Thread Andrey Grodzovsky



What do you mean by underflow in this case? Do you mean use-after-free
because of an extra dma_fence_put()?


On 2021-10-22 4:14 a.m., JingWen Chen wrote:

ping

On 2021/10/22 AM11:33, Jingwen Chen wrote:

[Why]
In advanced TDR mode, the real bad job will be resubmitted twice, while
in drm_sched_resubmit_jobs_ext, there's a dma_fence_put, so the bad job
is put one more time than other jobs.

[How]
Adding a dma_fence_get before resubmitting the job in
amdgpu_device_recheck_guilty_jobs and put the fence for normal jobs

Signed-off-by: Jingwen Chen 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 
  1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41ce86244144..975f069f6fe8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4841,6 +4841,9 @@ static void amdgpu_device_recheck_guilty_jobs(
  
  		/* clear job's guilty and depend the folowing step to decide the real one */

drm_sched_reset_karma(s_job);
+   /* for the real bad job, it will be resubmitted twice, adding a 
dma_fence_get
+* to make sure fence is balanced */



But that put in drm_sched_resubmit_jobs_ext is for the previous parent 
fence.
fence = sched->ops->run_job(s_job); returns a new HW fence and the put 
drops the refcount on the old one.


Andrey



+   dma_fence_get(s_job->s_fence->parent);
drm_sched_resubmit_jobs_ext(&ring->sched, 1);
  
  		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);

@@ -4876,6 +4879,7 @@ static void amdgpu_device_recheck_guilty_jobs(
  
  		/* got the hw fence, signal finished fence */

atomic_dec(ring->sched.score);
+   dma_fence_put(s_job->s_fence->parent);
dma_fence_get(&s_job->s_fence->finished);
dma_fence_signal(&s_job->s_fence->finished);
dma_fence_put(&s_job->s_fence->finished);


Re: [PATCH] drm/amd/amdgpu: fix potential bad job hw_fence underflow

2021-10-25 Thread Andrey Grodzovsky



On 2021-10-24 10:56 p.m., JingWen Chen wrote:

On 2021/10/23 4:41 a.m., Andrey Grodzovsky wrote:

What do you mean by underflow in this case ? You mean use after free because of 
extra dma_fence_put() ?

yes



Then maybe update the description, because 'underflow' is very confusing.



On 2021-10-22 4:14 a.m., JingWen Chen wrote:

ping

On 2021/10/22 AM11:33, Jingwen Chen wrote:

[Why]
In advanced TDR mode, the real bad job will be resubmitted twice, while
in drm_sched_resubmit_jobs_ext, there's a dma_fence_put, so the bad job
is put one more time than other jobs.

[How]
Adding a dma_fence_get before resubmitting the job in
amdgpu_device_recheck_guilty_jobs and put the fence for normal jobs

Signed-off-by: Jingwen Chen 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 
   1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41ce86244144..975f069f6fe8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4841,6 +4841,9 @@ static void amdgpu_device_recheck_guilty_jobs(
     /* clear job's guilty and depend the folowing step to decide the 
real one */
   drm_sched_reset_karma(s_job);
+    /* for the real bad job, it will be resubmitted twice, adding a 
dma_fence_get
+ * to make sure fence is balanced */


But that put in drm_sched_resubmit_jobs_ext is for the previous parent fence.
fence = sched->ops->run_job(s_job); returns a new HW fence and the put drops 
the refcount on the old one.

Andrey



Hi Andrey,

If I remember correctly, after we embedded the hw_fence into amdgpu_job, there
will be no fence replacement in amdgpu_job_run.



Right, I forgot that... What about removing line 
https://elixir.bootlin.com/linux/v5.15-rc6/source/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c#L265 
?

What if you make the dma_fence_get unconditional instead?

Andrey





+    dma_fence_get(s_job->s_fence->parent);
   drm_sched_resubmit_jobs_ext(&ring->sched, 1);
     ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, 
ring->sched.timeout);
@@ -4876,6 +4879,7 @@ static void amdgpu_device_recheck_guilty_jobs(
     /* got the hw fence, signal finished fence */
   atomic_dec(ring->sched.score);
+    dma_fence_put(s_job->s_fence->parent);
   dma_fence_get(&s_job->s_fence->finished);
   dma_fence_signal(&s_job->s_fence->finished);
   dma_fence_put(&s_job->s_fence->finished);


Re: Lockdep splat on killing a process

2021-10-25 Thread Andrey Grodzovsky
Adding back Daniel (somehow he got off the addresses list) and Chris who 
worked a lot in this area.


On 2021-10-21 2:34 a.m., Christian König wrote:



Am 20.10.21 um 21:32 schrieb Andrey Grodzovsky:

On 2021-10-04 4:14 a.m., Christian König wrote:


The problem is a bit different.

The callback is on the dependent fence, while we need to signal the 
scheduler fence.


Daniel is right that this needs an irq_work struct to handle this 
properly.


Christian.



So we had some discussions with Christian regarding irq_work and 
agreed I should look into doing it but stepping back for a sec -


Why we insist on calling the dma_fence_cb  with fence->lock locked ? 
Is it because of dma_fence_add_callback ?
Because we first test for DMA_FENCE_FLAG_SIGNALED_BIT and only after 
that lock the fence->lock ? If so, can't we
move DMA_FENCE_FLAG_SIGNALED_BIT  check inside the locked section ? 
Because if in theory
we could call the cb with unlocked fence->lock (i.e. this kind of 
iteration 
https://elixir.bootlin.com/linux/v5.15-rc6/source/drivers/gpu/drm/ttm/ttm_resource.c#L117)

we wouldn't have the lockdep splat. And in general, is it really
the correct approach to call a third party code from a call back with 
locked spinlock ? We don't know what the cb does inside
and I don't see any explicit restrictions in documentation of 
dma_fence_func_t what can and cannot be done there.


Yeah, that's exactly what I meant with using the irq_work directly in 
the fence code.



My idea is not to use irq_work at all, but instead to implement unlocked
dma_fence cb execution as an iteration
which drops the spinlock before each cb is executed and reacquires it
afterwards (until cb_list is empty).






The problem is dma_fence_signal_locked() which is used by quite a 
number of drivers to signal the fence while holding the lock.



For this I think we should not reuse dma_fence_signal_locked inside
dma_fence_signal and instead implement the latter using the
unlocked iteration I mentioned above. I looked a bit into the code and the
history, and I see that until some time ago
(this commit by Chris: 0fc89b6802ba1fcc561b0c906e0cefd384e3b2e5),
dma_fence_signal indeed did its own, locked iteration
and wasn't reusing dma_fence_signal_locked. This way, whoever relies on
dma_fence_signal_locked won't be impacted,
and whoever is not (like us in
drm_sched_fence_scheduled/drm_sched_fence_finished) should also not be
impacted by the narrower
scope of the lock. I also looked at dma_fence_default_wait and how it
takes fence->lock and checks whether the fence is signaled

before the wait starts, and I don't see a problem there either.
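
For illustration only - a hedged sketch of the shape such an unlocked
dma_fence_signal() could take (hypothetical code; it ignores the
timestamp/rcu reuse of cb_list and error handling, and the attached
draft below remains the authoritative version):

#include <linux/dma-fence.h>
#include <linux/list.h>
#include <linux/spinlock.h>

static void dma_fence_signal_unlocked_sketch(struct dma_fence *fence)
{
	unsigned long flags;

	spin_lock_irqsave(fence->lock, flags);

	/* The SIGNALED_BIT test moves inside the locked section. */
	if (test_and_set_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
		spin_unlock_irqrestore(fence->lock, flags);
		return;
	}

	while (!list_empty(&fence->cb_list)) {
		struct dma_fence_cb *cur =
			list_first_entry(&fence->cb_list,
					 struct dma_fence_cb, node);

		list_del_init(&cur->node);

		/* Run the callback with fence->lock dropped. */
		spin_unlock_irqrestore(fence->lock, flags);
		cur->func(fence, cur);
		spin_lock_irqsave(fence->lock, flags);
	}

	spin_unlock_irqrestore(fence->lock, flags);
}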

I attached quick draft of this proposal to clarify.

Andrey




Otherwise we could indeed simplify the fence handling a lot.

Christian.



Andrey




Am 01.10.21 um 17:10 schrieb Andrey Grodzovsky:
From what I see here you supposed to have actual deadlock and not 
only warning, sched_fence->finished is  first signaled from within
hw fence done callback (drm_sched_job_done_cb) but then again from 
within it's own callback (drm_sched_entity_kill_jobs_cb) and so
looks like same fence  object is recursively signaled twice. This 
leads to attempt to lock fence->lock second time while it's already
locked. I don't see a need to call drm_sched_fence_finished from 
within drm_sched_entity_kill_jobs_cb as this callback already 
registered
on sched_fence->finished fence (entity->last_scheduled == 
s_fence->finished) and hence the signaling already took place.


Andrey

On 2021-10-01 6:50 a.m., Christian König wrote:

Hey, Andrey.

while investigating some memory management problems I've got the
lockdep splat below.


Looks like something is wrong with 
drm_sched_entity_kill_jobs_cb(), can you investigate?


Thanks,
Christian.

[11176.741052] 
[11176.741056] WARNING: possible recursive locking detected
[11176.741060] 5.15.0-rc1-00031-g9d546d600800 #171 Not tainted
[11176.741066] 
[11176.741070] swapper/12/0 is trying to acquire lock:
[11176.741074] 9c337ed175a8 (&fence->lock){-.-.}-{3:3}, at: 
dma_fence_signal+0x28/0x80

[11176.741088]
   but task is already holding lock:
[11176.741092] 9c337ed172a8 (&fence->lock){-.-.}-{3:3}, at: 
dma_fence_signal+0x28/0x80

[11176.741100]
   other info that might help us debug this:
[11176.741104]  Possible unsafe locking scenario:

[11176.741108]    CPU0
[11176.741110]    
[11176.741113]   lock(&fence->lock);
[11176.741118]   lock(&fence->lock);
[11176.741122]
    *** DEADLOCK ***

[11176.741125]  May be due to missing lock nesting notation

[11176.741128] 2 locks held by swapper/12/0:
[11176.741133]  #0: 9c339c30f768 
(&ring->fence_drv.lock){-.-.}-{3:3}, at: dma_fence_signal+0x28/0x80
[11176.741142]  #1: 9c337ed172a8 (&a

Re: Lockdep splat on killing a process

2021-10-25 Thread Andrey Grodzovsky

On 2021-10-25 3:56 p.m., Christian König wrote:

In general I'm all there to get this fixed, but there is one major 
problem: Drivers don't expect the lock to be dropped.



I am probably missing something, but in my approach we only modify the
code for those clients that call dma_fence_signal,
not dma_fence_signal_locked. In those cases the drivers are agnostic to
lock behavior (or should be at least), since the lock
is acquired within the dma-fence code. Note that if you are worried
about calling the callback without the lock, then the same
concern applies to using irq_work directly in the fence code,
since the irq_work will execute at a later time without
fence->lock held (which is the point of using irq_work).



What we could do is change all drivers so they always call the
dma_fence_signal functions and drop the _locked variants. This way we
could move calling the callback out of the spinlock.


But that requires an audit of all drivers, so quite a lot of work to do.



As I said earlier - if we only modify dma_fence_signal and don't touch
dma_fence_signal_locked, then our only concern should be the users of
dma_fence_signal.

Please let me know if I am still missing some point of yours.

Andrey




Regards,
Christian.

Am 25.10.21 um 21:10 schrieb Andrey Grodzovsky:
Adding back Daniel (somehow he got off the addresses list) and Chris 
who worked a lot in this area.


On 2021-10-21 2:34 a.m., Christian König wrote:



Am 20.10.21 um 21:32 schrieb Andrey Grodzovsky:

On 2021-10-04 4:14 a.m., Christian König wrote:


The problem is a bit different.

The callback is on the dependent fence, while we need to signal 
the scheduler fence.


Daniel is right that this needs an irq_work struct to handle this 
properly.


Christian.



So we had some discussions with Christian regarding irq_work and 
agreed I should look into doing it but stepping back for a sec -


Why we insist on calling the dma_fence_cb  with fence->lock locked 
? Is it because of dma_fence_add_callback ?
Because we first test for DMA_FENCE_FLAG_SIGNALED_BIT and only 
after that lock the fence->lock ? If so, can't we
move DMA_FENCE_FLAG_SIGNALED_BIT  check inside the locked section ? 
Because if in theory
we could call the cb with unlocked fence->lock (i.e. this kind of 
iteration 
https://elixir.bootlin.com/linux/v5.15-rc6/source/drivers/gpu/drm/ttm/ttm_resource.c#L117)

we wouldn't have the lockdep splat. And in general, is it really
the correct approach to call a third party code from a call back 
with locked spinlock ? We don't know what the cb does inside
and I don't see any explicit restrictions in documentation of 
dma_fence_func_t what can and cannot be done there.


Yeah, that's exactly what I meant with using the irq_work directly 
in the fence code.



My idea is not to use irq work at all but instead to implement 
unlocked dma_fence cb execution using iteration
which drops the spinlock each time next cb is executed and acquiring 
it again after (until cb_list is empy).






The problem is dma_fence_signal_locked() which is used by quite a 
number of drivers to signal the fence while holding the lock.



For this I think we should not reuse dma_fence_signal_locked inside 
dma_fence_signal and instead implement it using the
unlocked iteration I mentioned above. I looked a bit in the code and 
the history and I see that until some time ago
(this commit by Chris 0fc89b6802ba1fcc561b0c906e0cefd384e3b2e5), 
indeed dma_fence_signal was doing it's own, locked iteration
and wasn't reusing dma_fence_signal_locked. This way whoever relies 
on the dma_fence_signal_locked won't be impacted
an who is not (like us in 
drm_sched_fence_scheduled/drm_sched_fence_finished) should also not 
be impacted by more narrow
scope of the lock. I also looked at dma_fence_default_wait and how it 
locks the fence->lock and check if fence is signaled

before wait start and I don't see a problem there either.

I attached quick draft of this proposal to clarify.

Andrey




Otherwise we could indeed simplify the fence handling a lot.

Christian.



Andrey




Am 01.10.21 um 17:10 schrieb Andrey Grodzovsky:
From what I see here you supposed to have actual deadlock and not 
only warning, sched_fence->finished is  first signaled from within
hw fence done callback (drm_sched_job_done_cb) but then again 
from within it's own callback (drm_sched_entity_kill_jobs_cb) and so
looks like same fence  object is recursively signaled twice. This 
leads to attempt to lock fence->lo

Re: Lockdep splat on killing a process

2021-10-27 Thread Andrey Grodzovsky



On 2021-10-26 6:54 a.m., Christian König wrote:

Am 26.10.21 um 04:33 schrieb Andrey Grodzovsky:

On 2021-10-25 3:56 p.m., Christian König wrote:

In general I'm all there to get this fixed, but there is one major 
problem: Drivers don't expect the lock to be dropped.



I am probably missing something but in my approach we only modify the 
code for those clients that call dma_fence_signal,
not dma_fence_signal_locked. In those cases the drivers are agnostic 
to lock behavior (or should be at least) since the lock
is acquired within the dma fence code. Note that if you are worried 
about calling the callback without lock then same exact
concern is relevant to using the irq_work directly in the fence code 
since the irq_work will execute at a later time without locked

fence->lock (which is the point of using irq_work).


Yeah, I've seen that it just doesn't make much sense to me.



Not clear what doesn't make sense ?






What we could do is to change all drivers so they call always call 
the dma_fence_signal functions and drop the _locked variants. This 
way we could move calling the callback out of the spinlock.


But that requires audit of all drivers, so quite a lot of work to do.



As i said earlier - if we only modify dma_fence_signal and don't 
touch dma_fence_signal_locked then our only concern should the users 
of dma_fence_signal.


Yes, but what do you do with the drivers that call the _locked variant?



IMHO we don't touch them at all; they stay as is. We only re-implement
dma_fence_signal, because drivers that use the locked variant take the
lock explicitly and so intend for their callbacks to run
under the lock; if they don't, it's a problem local to their
code and they should fix it there. Drivers that call
dma_fence_signal are our only concern, because they didn't take the lock
explicitly but were forced to run the callback under the lock by the
dma-fence framework, and that is something we can change.
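
To spell out the two classes of callers being distinguished (a hedged
illustration only; the driver functions below are made up, while the
dma-fence calls are the real API):

#include <linux/dma-fence.h>
#include <linux/spinlock.h>

/* Caller-locked style: the driver took fence->lock itself and
 * knowingly runs its callbacks under it - untouched by the proposal. */
static void driver_signal_locked_style(struct dma_fence *fence)
{
	unsigned long flags;

	spin_lock_irqsave(fence->lock, flags);
	dma_fence_signal_locked(fence);
	spin_unlock_irqrestore(fence->lock, flags);
}

/* Unlocked style: the driver never touches fence->lock; whether the
 * callbacks run under the lock is purely a dma-fence framework
 * decision, which is the only thing the proposal would change. */
static void driver_signal_style(struct dma_fence *fence)
{
	dma_fence_signal(fence);
}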






Let me please know if I am still missing some point of yours.


Well, I mean we need to be able to handle this for all drivers.



For sure, but as i said above in my opinion we need to change only for 
those drivers that don't use the _locked version.


Andrey




Regards,
Christian.



Andrey




Regards,
Christian.

Am 25.10.21 um 21:10 schrieb Andrey Grodzovsky:
Adding back Daniel (somehow he got off the addresses list) and 
Chris who worked a lot in this area.


On 2021-10-21 2:34 a.m., Christian König wrote:



Am 20.10.21 um 21:32 schrieb Andrey Grodzovsky:

On 2021-10-04 4:14 a.m., Christian König wrote:


The problem is a bit different.

The callback is on the dependent fence, while we need to signal 
the scheduler fence.


Daniel is right that this needs an irq_work struct to handle 
this properly.


Christian.



So we had some discussions with Christian regarding irq_work and 
agreed I should look into doing it but stepping back for a sec -


Why we insist on calling the dma_fence_cb  with fence->lock 
locked ? Is it because of dma_fence_add_callback ?
Because we first test for DMA_FENCE_FLAG_SIGNALED_BIT and only 
after that lock the fence->lock ? If so, can't we
move DMA_FENCE_FLAG_SIGNALED_BIT  check inside the locked section 
? Because if in theory
we could call the cb with unlocked fence->lock (i.e. this kind of 
iteration 
https://elixir.bootlin.com/linux/v5.15-rc6/source/drivers/gpu/drm/ttm/ttm_resource.c#L117)

we wouldn't have the lockdep splat. And in general, is it really
the correct approach to call a third party code from a call back 
with locked spinlock ? We don't know what the cb does inside
and I don't see any explicit restrictions in documentation of 
dma_fence_func_t what can and cannot be done there.


Yeah, that's exactly what I meant with using the irq_work directly 
in the fence code.



My idea is not to use irq work at all but instead to implement 
unlocked dma_fence cb execution using iteration
which drops the spinlock each time next cb is executed and 
acquiring it again after (until cb_list is empty).






The problem is dma_fence_signal_locked() which is used by quite a 
number of drivers to signal the fence while holding the lock.



For this I think we should not reuse dma_fence_signal_locked inside 
dma_fence_signal and instead implement it using the
unlocked iteration I mentioned above. I looked a bit in the code 
and the history and I see that until some time ago
(this commit by Chris 0fc89b6802ba1fcc561b0c906e0cefd384e3b2e5), 
indeed dm

Re: Lockdep splat on killing a process

2021-10-27 Thread Andrey Grodzovsky



On 2021-10-27 10:34 a.m., Christian König wrote:

Am 27.10.21 um 16:27 schrieb Andrey Grodzovsky:

[SNIP]



Let me please know if I am still missing some point of yours.


Well, I mean we need to be able to handle this for all drivers.



For sure, but as i said above in my opinion we need to change only 
for those drivers that don't use the _locked version.


And that absolutely won't work.

See the dma_fence is a contract between drivers, so you need the same 
calling convention between all drivers.


Either we always call the callback with the lock held or we always 
call it without the lock, but sometimes like that and sometimes 
otherwise won't work.


Christian.



I am not sure I fully understand what problems this will cause, but
anyway, then we are back to irq_work. We cannot embed an irq_work as a
union within dma_fence's cb_list,
because that storage is already reused as the timestamp and as the rcu
head after the fence is signaled. So I will do it within drm_scheduler,
with a single irq_work per drm_sched_entity,

as we discussed before.

Andrey






Andrey




Re: [PATCH] drm/amd/amdgpu: fix potential bad job hw_fence underflow

2021-10-27 Thread Andrey Grodzovsky



On 2021-10-25 10:57 p.m., JingWen Chen wrote:

On 2021/10/25 11:18 p.m., Andrey Grodzovsky wrote:

On 2021-10-24 10:56 p.m., JingWen Chen wrote:

On 2021/10/23 4:41 a.m., Andrey Grodzovsky wrote:

What do you mean by underflow in this case ? You mean use after free because of 
extra dma_fence_put() ?

yes


Then maybe update the description  because 'underflow' is very confusing


will do

On 2021-10-22 4:14 a.m., JingWen Chen wrote:

ping

On 2021/10/22 AM11:33, Jingwen Chen wrote:

[Why]
In advanced TDR mode, the real bad job will be resubmitted twice, while
in drm_sched_resubmit_jobs_ext, there's a dma_fence_put, so the bad job
is put one more time than other jobs.

[How]
Adding a dma_fence_get before resubmitting the job in
amdgpu_device_recheck_guilty_jobs and put the fence for normal jobs

Signed-off-by: Jingwen Chen 
---
    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 
    1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41ce86244144..975f069f6fe8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4841,6 +4841,9 @@ static void amdgpu_device_recheck_guilty_jobs(
      /* clear job's guilty and depend the folowing step to decide the 
real one */
    drm_sched_reset_karma(s_job);
+    /* for the real bad job, it will be resubmitted twice, adding a 
dma_fence_get
+ * to make sure fence is balanced */

But that put in drm_sched_resubmit_jobs_ext is for the previous parent fence.
fence = sched->ops->run_job(s_job); returns a new HW fence and the put drops 
the refcount on the old one.

Andrey



Hi Andrey,

If I remember correctly, after we embedded the hw_fence into amdgpu_job, there 
will be no fence replacement in amdgpu_job_run.


Right, I forgot that... What about removing line 
https://elixir.bootlin.com/linux/v5.15-rc6/source/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c#L265
 ?
What if you make the dma_fence_get unconditional instead?

Andrey



Hi Andrey,

I have tried this, and it causes normal jobs to never be freed (lacks a
dma_fence_put).



I can't see it - can you point me to where you get an unbalanced
refcount in that case? As far as I see, for a normal job
being run in amdgpu_device_recheck_guilty_jobs, the refcounting on the hw_fence
is -


drm_sched_resubmit_jobs_ext->dma_fence_put -> refcount decrease by 1
drm_sched_resubmit_jobs_ext->amdgpu_job_run->dma_fence_get increase by 1

In total refcount didn't change until now

Next, dma_fence_wait_timeout completes successfully because the job is
normal, and then you delete that job from the pending list and call the

free_job cb, which drops the remaining refcounts on the hw_fence.

I am probably missing some dma_fence_get since you checked it on a
device, but I wonder where my mistake is?


Andrey




I have figured out all the gets/puts

for sched_jobs, and only the bad job lacks a dma_fence_get; other jobs are just
fine.


+    dma_fence_get(s_job->s_fence->parent);
    drm_sched_resubmit_jobs_ext(&ring->sched, 1);
      ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, 
ring->sched.timeout);
@@ -4876,6 +4879,7 @@ static void amdgpu_device_recheck_guilty_jobs(
      /* got the hw fence, signal finished fence */
    atomic_dec(ring->sched.score);
+    dma_fence_put(s_job->s_fence->parent);
    dma_fence_get(&s_job->s_fence->finished);
    dma_fence_signal(&s_job->s_fence->finished);
    dma_fence_put(&s_job->s_fence->finished);


Re: Lockdep splat on killing a process

2021-10-27 Thread Andrey Grodzovsky



On 2021-10-27 10:50 a.m., Christian König wrote:

Am 27.10.21 um 16:47 schrieb Andrey Grodzovsky:


On 2021-10-27 10:34 a.m., Christian König wrote:

Am 27.10.21 um 16:27 schrieb Andrey Grodzovsky:

[SNIP]



Let me please know if I am still missing some point of yours.


Well, I mean we need to be able to handle this for all drivers.



For sure, but as i said above in my opinion we need to change only 
for those drivers that don't use the _locked version.


And that absolutely won't work.

See the dma_fence is a contract between drivers, so you need the 
same calling convention between all drivers.


Either we always call the callback with the lock held or we always 
call it without the lock, but sometimes like that and sometimes 
otherwise won't work.


Christian.



I am not sure I fully understand what problems this will cause but 
anyway, then we are back to irq_work. We cannot embed irq_work as 
union within dma_fenc's cb_list
because it's already reused as timestamp and as rcu head after the 
fence is signaled. So I will do it within drm_scheduler with single 
irq_work per drm_sched_entity

as we discussed before.


That won't work either. We free up the entity after the cleanup 
function. That's the reason we use the callback on the job in the 
first place.



Yep, missed it.




We could overload the cb structure in the job though.



I guess so, since no one else is using this member after the cb has executed.

Andrey
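
For clarity, a sketch of the overloading being agreed on here (field
and struct names are assumptions; the real layout is whatever ends up
in include/drm/gpu_scheduler.h):

#include <linux/dma-fence.h>
#include <linux/irq_work.h>

/* Hedged sketch: only the relevant members of the job structure.
 * The dma_fence_cb fires exactly once and nothing touches it
 * afterwards, so its storage can be reused for the irq_work that
 * moves the signaling out of the fence callback context. */
struct sched_job_sketch {
	union {
		struct dma_fence_cb	finish_cb; /* valid until the cb runs */
		struct irq_work		work;      /* reused after that */
	};
};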




Christian.



Andrey






Andrey






Re: [PATCH] drm/amd/amdgpu: fix potential bad job hw_fence underflow

2021-10-28 Thread Andrey Grodzovsky



On 2021-10-27 10:43 p.m., JingWen Chen wrote:

On 2021/10/28 3:43 a.m., Andrey Grodzovsky wrote:

On 2021-10-25 10:57 p.m., JingWen Chen wrote:

On 2021/10/25 11:18 p.m., Andrey Grodzovsky wrote:

On 2021-10-24 10:56 p.m., JingWen Chen wrote:

On 2021/10/23 4:41 a.m., Andrey Grodzovsky wrote:

What do you mean by underflow in this case ? You mean use after free because of 
extra dma_fence_put() ?

yes

Then maybe update the description  because 'underflow' is very confusing


will do

On 2021-10-22 4:14 a.m., JingWen Chen wrote:

ping

On 2021/10/22 AM11:33, Jingwen Chen wrote:

[Why]
In advanced TDR mode, the real bad job will be resubmitted twice, while
in drm_sched_resubmit_jobs_ext, there's a dma_fence_put, so the bad job
is put one more time than other jobs.

[How]
Adding a dma_fence_get before resubmitting the job in
amdgpu_device_recheck_guilty_jobs and put the fence for normal jobs

Signed-off-by: Jingwen Chen 
---
     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 
     1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41ce86244144..975f069f6fe8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4841,6 +4841,9 @@ static void amdgpu_device_recheck_guilty_jobs(
       /* clear job's guilty and depend the folowing step to decide the 
real one */
     drm_sched_reset_karma(s_job);
+    /* for the real bad job, it will be resubmitted twice, adding a 
dma_fence_get
+ * to make sure fence is balanced */

But that put in drm_sched_resubmit_jobs_ext is for the previous parent fence.
fence = sched->ops->run_job(s_job); returns a new HW fence and the put drops 
the refcount on the old one.

Andrey



Hi Andrey,

If I remember correctly, after we embedded the hw_fence into amdgpu_job, there 
will be no fence replacement in amdgpu_job_run.

Right, I forgot that... What about removing line 
https://elixir.bootlin.com/linux/v5.15-rc6/source/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c#L265
 ?
What if you make the dma_fence_get unconditional instead?

Andrey



Hi Andrey,

I have tried this, and it causes normal jobs to never be freed (lacks a
dma_fence_put).


I can't see it - can you point me to where you get an unbalanced
refcount in that case? As far as I see, for a normal job
being run in amdgpu_device_recheck_guilty_jobs the refcount on the hw_fence is -

drm_sched_resubmit_jobs_ext->dma_fence_put -> refcount decrease by 1
drm_sched_resubmit_jobs_ext->amdgpu_job_run->dma_fence_get increase by 1

In total refcount didn't change until now

Next,  dma_fence_wait_timeout completed successfully because the job is normal 
and then you delete that job from pending list and call the
free_job cb which drops remaining refcounts on the hw_fence.

I am probably missing some  dma_fence_get since you checked it on a device but 
I wonder where is my mistake ?

Andrey



Hi Andrey,

The thing is, the put/get is balanced right now for normal jobs in TDR. Changing
this dma_fence_get to unconditional simply adds one dma_fence_get, but there's no
corresponding dma_fence_put for normal jobs.

And if this can be helpful, I tried to track down all dma_fence_get/put calls for a
normal job in advanced TDR, based on the latest drm-next.

amdgpu_fence_emit -> dma_fence_init              ref_count = 1
amdgpu_fence_emit -> add into rcu                ref_count = 2
amdgpu_job_run -> get after ib_schedule          ref_count = 3
drm_sched_main -> add fence callback get         ref_count = 4
drm_sched_main -> add fence callback put         ref_count = 3
drm_sched_resubmit_jobs_ext                      ref_count = 2
amdgpu_fence_emit -> add into rcu                ref_count = 3



Now I see that the put in drm_sched_resubmit_jobs_ext is there to drop
the refcount from the previous
'amdgpu_fence_emit -> add into rcu' get... This is all very convoluted
and confusing and probably requires some
rework to make the code clearer, but for now we need the bug fixed, so
with the title changed

the patch is Reviewed-by: Andrey Grodzovsky 

Andrey




amdgpu_fence_process -> put after signal         ref_count = 2
drm_sched_main -> free_job                       ref_count = 1
drm_sched_fence_release_finished                 ref_count = 0

If we do unconditional get, this sequence will turn into:

amdgpu_fence_emit -> dma_fence_init              ref_count = 1
amdgpu_fence_emit -> add into rcu                ref_count = 2
amdgpu_job_run -> get after ib_schedule          ref_count = 3
drm_sched_main -> add fence callback get         ref_count = 4
drm_sched_main -> add fence callback put         ref_count = 3
drm_sched_resubmit_jobs_ext                      ref_count = 2
amdgpu_fence_emit -> add into rcu                ref_count = 3
+  amdgpu_job_run -> get after ib_schedule       ref_count = 4
amdgpu_fence_process -> put after signal         ref_count = 3
drm_sched_main -> free_job                       ref_count = 2
drm_sched_fence_release_finished                 ref_count = 1

(i.e. the refcount never drops to zero, so the hw_fence of a normal job
would leak)

I have figured out all the gets/puts for sched_jobs, and only the bad
job lacks a dma_fence_get; other jobs are just fine.

Re: Lockdep splat on killing a process

2021-10-28 Thread Andrey Grodzovsky


On 2021-10-27 3:58 p.m., Andrey Grodzovsky wrote:


On 2021-10-27 10:50 a.m., Christian König wrote:

Am 27.10.21 um 16:47 schrieb Andrey Grodzovsky:


On 2021-10-27 10:34 a.m., Christian König wrote:

Am 27.10.21 um 16:27 schrieb Andrey Grodzovsky:

[SNIP]



Let me please know if I am still missing some point of yours.


Well, I mean we need to be able to handle this for all drivers.



For sure, but as i said above in my opinion we need to change only 
for those drivers that don't use the _locked version.


And that absolutely won't work.

See the dma_fence is a contract between drivers, so you need the 
same calling convention between all drivers.


Either we always call the callback with the lock held or we always 
call it without the lock, but sometimes like that and sometimes 
otherwise won't work.


Christian.



I am not sure I fully understand what problems this will cause but 
anyway, then we are back to irq_work. We cannot embed irq_work as 
union within dma_fence's cb_list
because it's already reused as timestamp and as rcu head after the 
fence is signaled. So I will do it within drm_scheduler with single 
irq_work per drm_sched_entity

as we discussed before.


That won't work either. We free up the entity after the cleanup 
function. That's the reason we use the callback on the job in the 
first place.



Yep, missed it.




We could overload the cb structure in the job though.



I guess so, since no one else is using this member after the cb has executed.

Andrey



Attached a patch. Please give it a try. I tested it on my side and tried
to generate the right conditions to trigger this code path by repeatedly
submitting commands while issuing a GPU reset to stop the scheduler and
then killing the command submission process in the middle. But for some
reason it looks like the job_queue was always empty already at the time
of the entity kill.


Andrey







Christian.



Andrey






Andrey




From 8ba5c089939b79a6567411c33d4db40e5846eef3 Mon Sep 17 00:00:00 2001
From: Andrey Grodzovsky 
Date: Thu, 28 Oct 2021 12:24:03 -0400
Subject: [PATCH] drm/sched: Avoid lockdep splat on killing a process
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Problem:
Signaling one sched fence from within another sched
fence's signal callback generates a lockdep splat because
both have the same lockdep class for their fence->lock.

Fix:
Fix the below stack by rescheduling the signaling and killing
of leftover jobs to an irq_work when the entity is killed.

[11176.741181]  dump_stack+0x10/0x12
[11176.741186] __lock_acquire.cold+0x208/0x2df
[11176.741197]  lock_acquire+0xc6/0x2d0
[11176.741204]  ? dma_fence_signal+0x28/0x80
[11176.741212] _raw_spin_lock_irqsave+0x4d/0x70
[11176.741219]  ? dma_fence_signal+0x28/0x80
[11176.741225]  dma_fence_signal+0x28/0x80
[11176.741230] drm_sched_fence_finished+0x12/0x20 [gpu_sched]
[11176.741240] drm_sched_entity_kill_jobs_cb+0x1c/0x50 [gpu_sched]
[11176.741248] dma_fence_signal_timestamp_locked+0xac/0x1a0
[11176.741254]  dma_fence_signal+0x3b/0x80
[11176.741260] drm_sched_fence_finished+0x12/0x20 [gpu_sched]
[11176.741268] drm_sched_job_done.isra.0+0x7f/0x1a0 [gpu_sched]
[11176.741277] drm_sched_job_done_cb+0x12/0x20 [gpu_sched]
[11176.741284] dma_fence_signal_timestamp_locked+0xac/0x1a0
[11176.741290]  dma_fence_signal+0x3b/0x80
[11176.741296] amdgpu_fence_process+0xd1/0x140 [amdgpu]
[11176.741504] sdma_v4_0_process_trap_irq+0x8c/0xb0 [amdgpu]
[11176.741731]  amdgpu_irq_dispatch+0xce/0x250 [amdgpu]
[11176.741954]  amdgpu_ih_process+0x81/0x100 [amdgpu]
[11176.742174]  amdgpu_irq_handler+0x26/0xa0 [amdgpu]
[11176.742393] __handle_irq_event_percpu+0x4f/0x2c0
[11176.742402] handle_irq_event_percpu+0x33/0x80
[11176.742408]  handle_irq_event+0x39/0x60
[11176.742414]  handle_edge_irq+0x93/0x1d0
[11176.742419]  __common_interrupt+0x50/0xe0
[11176.742426]  common_interrupt+0x80/0x90

Signed-off-by: Andrey Grodzovsky 
Suggested-by: Daniel Vetter  
Suggested-by: Christian König 
---
 drivers/gpu/drm/scheduler/sched_entity.c | 15 ++++++++++++---
 include/drm/gpu_scheduler.h              | 12 +++++++++++-
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 27e1573af96e..191c56064f19 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -190,6 +190,16 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
 }
 EXPORT_SYMBOL(drm_sched_entity_flush);
 
+static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
+{
+	struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
+
+	drm_sched_fence_finished(job->s_fence);
+	WARN_ON(job->s_fence->parent);
+	job->sched->ops->free_job(job);
+}
+
+
 /* Signal the scheduler finished fence when the entity in question is killed. */
 static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
 					  struct dma_fence_cb *cb)
 {
 	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
 						 finish_cb);
 
-	drm_sched_fence_finished(job->s_fence);
-	WARN_ON(job->s_fence->parent);
-	job->sched->ops->free_job(job);
+	init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
+	irq_work_queue(&job->work);
 }

Re: Lockdep splat on killing a process

2021-11-01 Thread Andrey Grodzovsky

Pushed to drm-misc-next

Andrey

On 2021-10-29 3:07 a.m., Christian König wrote:


Attached a patch. Give it a try please, I tested it on my side and 
tried to generate the right conditions to trigger this code path by 
repeatedly submitting commands while issuing GPU reset to stop the 
scheduler and then killing command submissions process in the middle. 
But for some reason looks like the job_queue was always empty already 
at the time of entity kill.


It was trivial to trigger with the stress utility I've hacked together:

amdgpu_stress -b v 1g -b g 1g -c 1 2 1g 1k

Then, while it is copying, just ctrl+c to kill it.

The patch itself is:

Tested-by: Christian König 
Reviewed-by: Christian König 

Thanks,
Christian. 


Re: [PATCH 2/2] drm/sched: serialize job_timeout and scheduler

2021-11-10 Thread Andrey Grodzovsky



On 2021-11-10 5:09 a.m., Christian König wrote:

Am 10.11.21 um 10:50 schrieb Daniel Vetter:

On Tue, Nov 09, 2021 at 08:17:01AM -0800, Rob Clark wrote:

On Tue, Nov 9, 2021 at 1:07 AM Daniel Vetter  wrote:

On Mon, Nov 08, 2021 at 03:39:17PM -0800, Rob Clark wrote:

I stumbled across this thread when I ran into the same issue, while
working out how to move drm/msm to use scheduler's retire +
timeout/recovery (and get rid of our own mirror list of in-flight
jobs).  We already have hw error detection enabled, and it can signal
quite fast, so assuming the first job on the list is the guilty job
just won't work.

But I was considering a slightly different approach to fixing this,
instead just handling it all in drm_sched_main() and getting rid of
the complicated kthread parking gymnastics.  Ie. something along the
lines of:

So handling timeouts in the main sched thread won't work as soon as you
have multiple engines and a reset that impacts across engines:

- Nothing is simplified, since you still need to stop the other
  scheduler threads.

- You get deadlocks if 2 schedulers time out at the same time and both
  want to stop the other one.

Hence the workqueue. Now the rule for the wq is that you can only have
one per reset domain, so:
- single engine: you just take the one drm/sched provides
- if reset affects all your engines in the chip, then you allocate one
  in the drm_device and pass that to all
- if you have a complex of gpus all interconnected (e.g. an xgmi hive
  for amd), then it's one wq for the entire hive

_All_ reset-related things must be run on that workqueue or things
break, which means that if you get a hw fault, its handling also needs
to run there. I guess we should either patch drm/sched to check that
you call that function from the right workqueue, or just handle it
internally.
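
As a concrete illustration of the "one wq per reset domain" rule (a
hedged sketch: the drm_sched_init() signature shown is the one with the
timeout_wq parameter from Boris's work; the function, variable names
and parameter values here are otherwise made up):

#include <linux/jiffies.h>
#include <linux/workqueue.h>
#include <drm/gpu_scheduler.h>

/* One ordered (single-threaded) wq shared by every engine in the
 * reset domain, so at most one timeout handler runs at a time. */
static struct workqueue_struct *domain_reset_wq;

static int sketch_init_domain_scheds(struct drm_gpu_scheduler *scheds,
				     const struct drm_sched_backend_ops *ops,
				     unsigned int num_engines)
{
	unsigned int i;
	int r;

	domain_reset_wq = alloc_ordered_workqueue("gpu-reset-domain", 0);
	if (!domain_reset_wq)
		return -ENOMEM;

	for (i = 0; i < num_engines; i++) {
		/* Every scheduler in the domain gets the *same* timeout wq. */
		r = drm_sched_init(&scheds[i], ops, 64, 0,
				   msecs_to_jiffies(10000),
				   domain_reset_wq, NULL, "sketch-ring");
		if (r)
			return r;
	}

	return 0;
}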

Hmm, ok.. I guess it would be useful to better document the reasoning
for the current design, that would have steered me more towards the
approach taken in this patch.

Maybe this was because you worked on an old kernel? Boris did update the
kerneldoc as part of making gpu reset work for panfrost, which has this
multi-engine reset problem. If that's not yet clear then we need to
improve the docs further.

AMD's problem is even worse, because their reset domain is the entire 
xgmi

hive, so multiple pci devices.


I've been pushing for quite a while to get something like an
amdgpu_reset_domain structure or similar for this, but we
unfortunately don't have that yet.


Maybe it would be a good idea to have something like a
drm_sched_domain or similar with all the necessary information for the
inter-scheduler handling.


E.g. a workqueue for reset etc...
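
A hedged sketch of what such a drm_sched_domain could carry (the name
and layout are pure speculation at this point in the thread, not an
existing API):

#include <linux/list.h>
#include <linux/workqueue.h>

struct drm_sched_domain_sketch {
	/* ordered wq: serializes all timeout/reset handling in the domain */
	struct workqueue_struct	*reset_wq;
	/* schedulers that must be stopped together during a reset */
	struct list_head	schedulers;
	/* ... whatever else the inter-scheduler handling needs ... */
};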

Regards,
Christian.



I think Monk and Jingwen already switched the SRIOV case to using
Boris's single-threaded queue
interface; we can try to expand this to the general bare-metal
case for AMD and, along the way,

add drm_sched_domain to the scheduler.

Andrey






Also there might be more issues in drm/sched ofc, e.g. I've looked a bit at
ordering/barriers and I'm pretty sure a lot are still missing. Or at
least we should have comments in the code explaining why it all works.
-Daniel


BR,
-R


-Daniel


-
diff --git a/drivers/gpu/drm/scheduler/sched_main.c
b/drivers/gpu/drm/scheduler/sched_main.c
index 67382621b429..4d6ce775c316 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -764,6 +764,45 @@ static bool drm_sched_blocked(struct
drm_gpu_scheduler *sched)
 return false;
  }

+static bool handle_timeout(struct drm_gpu_scheduler *sched)
+{
+   struct drm_sched_job *bad;
+
+   if (!sched->has_timeout)
+   return false;
+
+   sched->has_timeout = false;
+
+   spin_lock(&sched->job_list_lock);
+   bad = list_first_entry_or_null(&sched->pending_list,
+  struct drm_sched_job, list);
+
+   if (!bad) {
+   spin_unlock(&sched->job_list_lock);
+   return false;
+   }
+
+   spin_unlock(&sched->job_list_lock);
+
+   if (sched->timeout_wq == system_wq) {
+   /*
+    * If driver has no specific requirements about serializing
+    * reset wrt. other engines, just call timedout_job() directly
+    */
+   sched->ops->timedout_job(bad);
+   } else {
+   /*
+    * Otherwise queue it on timeout_wq and wait for it to complete
+    */
+   ... more typing needed here ...
+   }
+
+   if (sched->free_guilty) {
+   sched->ops->free_job(bad);
+   sched->free_guilty = false;
+   }
+}
+
  /**
   * drm_sched_main - main scheduler thread
   *
@@ -787,6 +826,7 @@ static int drm_sched_main(void *param)

 wait_event_interruptible(sched->wake_up_worker,
  (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
+ handle_timeout(sched) ||
