On 25/08/2025 17:53, Maíra Canal wrote:
Hi Melissa,

On 25/08/25 17:03, Melissa Wen wrote:


On 15/08/2025 16:58, Maíra Canal wrote:
When the file descriptor is closed while a job is still running,
there's a race condition between the job completion callback and the
file descriptor cleanup. This can lead to accessing freed memory when
updating per-fd GPU stats, such as the following example:

[56120.512903] Unable to handle kernel paging request at virtual address 0000330a92b9688a
[56120.520881] Mem abort info:
[56120.523687] ESR = 0x0000000096000005
[56120.527454] EC = 0x25: DABT (current EL), IL = 32 bits
[56120.532785] SET = 0, FnV = 0
[56120.535847] EA = 0, S1PTW = 0
[56120.538995] FSC = 0x05: level 1 translation fault
[56120.543891] Data abort info:
[56120.546778] ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
[56120.552289] CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[56120.557362] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[56120.562690] user pgtable: 16k pages, 47-bit VAs, pgdp=0000000023f54000 [56120.569239] [0000330a92b9688a] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000
[56120.577975] Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP
             CPU: 0 UID: 1000 PID: 1497409 Comm: mpv Not tainted 6.12.37-ncvm5+ #1
             Hardware name: Raspberry Pi 5 Model B Rev 1.0 (DT)
             pstate: 604000c9 (nZCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
             pc : v3d_job_update_stats+0x64/0x168 [v3d]
             lr : v3d_job_update_stats+0x40/0x168 [v3d]
             sp : ffffc00080003e60
             x29: ffffc00080003e60 x28: ffff800002860000 x27: 0000000000000000              x26: 0000000000000000 x25: ffff800002860000 x24: ffff800002630800              x23: ffff800060786000 x22: 0000330a933c31fb x21: 0000000000000001              x20: 0000330a92b96302 x19: ffff800060786b10 x18: 0000000000000000              x17: ffffaf90506a0000 x16: ffffd06fce57c360 x15: 0000000000000000              x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000              x11: 0000000000000000 x10: 0000000000000000 x9 : ffffd06f5d0fec40              x8 : 0000000000000000 x7 : 0000000000000000 x6 : 000002978dbd535a              x5 : 00ffffffffffffff x4 : 0000000000000015 x3 : 0000300001fddf88              x2 : 0000000000000020 x1 : 0000000000010001 x0 : 0000330a92b96872
             Call trace:
         v3d_job_update_stats+0x64/0x168 [v3d]
         v3d_irq+0x118/0x2e0 [v3d]
         __handle_irq_event_percpu+0x60/0x220

Fix such an issue by protecting all accesses to `job->file_priv` with
the queue's lock. With that, we can clear `job->file_priv` before the
V3D per-fd structure is freed and assure that `job->file_priv` exists
during the per-fd GPU stats updates.

Fixes: e1bc3a13bd77 ("drm/v3d: Avoid NULL pointer dereference in `v3d_job_update_stats()`")
Signed-off-by: Maíra Canal <mca...@igalia.com>
Reviewed-by: Iago Toral Quiroga <ito...@igalia.com>
---
  drivers/gpu/drm/v3d/v3d_drv.c   | 14 +++++++++++++-
  drivers/gpu/drm/v3d/v3d_drv.h   |  2 +-
  drivers/gpu/drm/v3d/v3d_sched.c | 14 +++++++-------
  3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/v3d/v3d_drv.c b/drivers/gpu/drm/v3d/ v3d_drv.c index 2def155ce496ec5f159f6bda9929aeaae141d1a6..c5a3bbbc74c5c6c9a34a6e6111b4e4e21bee0b34 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.c
+++ b/drivers/gpu/drm/v3d/v3d_drv.c
@@ -157,12 +157,24 @@ v3d_open(struct drm_device *dev, struct drm_file *file)
  static void
  v3d_postclose(struct drm_device *dev, struct drm_file *file)
  {
+    struct v3d_dev *v3d = to_v3d_dev(dev);
      struct v3d_file_priv *v3d_priv = file->driver_priv;
+    unsigned long irqflags;
      enum v3d_queue q;
-    for (q = 0; q < V3D_MAX_QUEUES; q++)
+    for (q = 0; q < V3D_MAX_QUEUES; q++) {
+        struct v3d_queue_state *queue = &v3d->queue[q];
+        struct v3d_job *job = queue->active_job;
+
drm_sched_entity_destroy(&v3d_priv->sched_entity[q]);
+        if (job && job->base.entity == &v3d_priv->sched_entity[q]) {
+            spin_lock_irqsave(&queue->queue_lock, irqflags);
+            job->file_priv = NULL;
+            spin_unlock_irqrestore(&queue->queue_lock, irqflags);
+        }
+    }
+
      v3d_perfmon_close_file(v3d_priv);
      kfree(v3d_priv);
  }
diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/ v3d_drv.h index cfc2f9c123aa99f6f1875b297eaf6c226b03d4ec..0317f3d7452a3f01b91bfdc691b5a98e54b3a4ec 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.h
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
@@ -608,7 +608,7 @@ void v3d_timestamp_query_info_free(struct v3d_timestamp_query_info *query_info,
                     unsigned int count);
  void v3d_performance_query_info_free(struct v3d_performance_query_info *query_info,
                       unsigned int count);
-void v3d_job_update_stats(struct v3d_job *job, enum v3d_queue queue);
+void v3d_job_update_stats(struct v3d_job *job, enum v3d_queue q);
  int v3d_sched_init(struct v3d_dev *v3d);
  void v3d_sched_fini(struct v3d_dev *v3d);
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/ v3d_sched.c index e348816b691ef05909828accbe15399816e69369..60e251367f42170b30de968682deb6370604db44 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -194,11 +194,11 @@ v3d_stats_update(struct v3d_stats *stats, u64 now)
  }
  void
-v3d_job_update_stats(struct v3d_job *job, enum v3d_queue queue)
+v3d_job_update_stats(struct v3d_job *job, enum v3d_queue q)
  {
      struct v3d_dev *v3d = job->v3d;
-    struct v3d_file_priv *file = job->file_priv;
-    struct v3d_stats *global_stats = &v3d->queue[queue].stats;
+    struct v3d_queue_state *queue = &v3d->queue[q];
+    struct v3d_stats *global_stats = &queue->stats;
      u64 now = local_clock();
      unsigned long flags;
@@ -209,10 +209,10 @@ v3d_job_update_stats(struct v3d_job *job, enum v3d_queue queue)
          preempt_disable();
      /* Don't update the local stats if the file context has already closed */
-    if (file)
-        v3d_stats_update(&file->stats[queue], now);
-    else
-        drm_dbg(&v3d->drm, "The file descriptor was closed before job completion\n");
+    spin_lock(&queue->queue_lock);
+    if (job->file_priv)
+        v3d_stats_update(&job->file_priv->stats[q], now);
Why not keep the drm_dbg() if !job->file_priv?

After some thought, I concluded that this debug message won't be useful
anymore. After this patch, it doesn't really matter if !job->file_priv,
the job will still end gracefully.

I think having a dbg message can be helpful to understand why the driver doesn't update local stats but global one in a given situation (?) Probably a slightly different message like "file context already closed, local stats not updated".

Melissa

Best Regards,
- Maíra


Reply via email to