On 5/21/25 11:49, Pierre-Eric Pelloux-Prayer wrote: > The coredump needs to contain accurate data and reporting a page > fault from a previous issue is incorrect. > > Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-pra...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 13 ++++++++----- > drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 5 +++++ > 3 files changed, 14 insertions(+), 5 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c > index de70747a099d..6fa53e070b50 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c > @@ -273,11 +273,13 @@ __amdgpu_devcoredump_read(char *buffer, size_t count, > struct amdgpu_coredump_inf > } > > /* Add page fault information */ > - fault_info = &coredump->adev->vm_manager.fault_info; > - drm_printf(&p, "\n[%s] Page fault observed\n", > - fault_info->vmhub ? "mmhub" : "gfxhub"); > - drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", > fault_info->addr); > - drm_printf(&p, "Protection fault status register: 0x%x\n\n", > fault_info->status); > + fault_info = &coredump->fault_info; > + if (fault_info->status != 0) { > + drm_printf(&p, "\n[%s] Page fault observed\n", > + fault_info->vmhub ? "mmhub" : "gfxhub"); > + drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", > fault_info->addr); > + drm_printf(&p, "Protection fault status register: 0x%x\n\n", > fault_info->status); > + } > > /* dump the ip state for each ip */ > drm_printf(&p, "IP Dump\n"); > @@ -377,6 +379,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool > skip_vram_check, > > coredump->skip_vram_check = skip_vram_check; > coredump->reset_vram_lost = vram_lost; > + coredump->fault_info = adev->vm_manager.fault_info; > > if (job && job->pasid) { > struct amdgpu_task_info *ti; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h > index 33f2f6fdfcf7..38ccdd3d6213 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h > @@ -37,6 +37,7 @@ struct amdgpu_coredump_info { > struct timespec64 reset_time; > bool skip_vram_check; > bool reset_vram_lost; > + struct amdgpu_vm_fault_info fault_info; > struct amdgpu_ring *ring; > /* Readable form of coredevdump, generate once to speed up > * reading it (see drm_coredump_printer's documentation). > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index acb21fc8b3ce..5ee9d2cd74e5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -199,6 +199,11 @@ static enum drm_gpu_sched_stat > amdgpu_job_timedout(struct drm_sched_job *s_job) > > exit: > drm_dev_exit(idx); > + > + /* Clear fault info to avoid reporting the same fault. */ > + adev->vm_manager.fault_info.status = 0; > + adev->vm_manager.fault_info.addr = 0; > +
That needs to come much earlier and prefereable while holding a lock. Apart from that looks good to me. Christian. > return DRM_GPU_SCHED_STAT_NOMINAL; > } >