On Thu, Apr 03, 2025 at 03:41:16PM -0700, John Harrison wrote:
> On 4/3/2025 3:32 PM, Matthew Brost wrote:
> > On Thu, Apr 03, 2025 at 02:39:16PM -0700, John Harrison wrote:
> > > On 4/3/2025 1:27 PM, Matthew Brost wrote:
> > > > Chunk the devcoredump into 1.5G pieces to avoid hitting the kvmalloc
> > > > limit of 2G. A simple algorithm reads 1.5G at a time in the
> > > > xe_devcoredump_read callback as needed.
> > > > 
> > > > Signed-off-by: Matthew Brost <matthew.br...@intel.com>
> > > > ---
> > > >    drivers/gpu/drm/xe/xe_devcoredump.c       | 59 ++++++++++++++++++-----
> > > >    drivers/gpu/drm/xe/xe_devcoredump_types.h |  2 +
> > > >    drivers/gpu/drm/xe/xe_guc_hwconfig.c      |  2 +-
> > > >    3 files changed, 50 insertions(+), 13 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> > > > index 81b9d9bb3f57..a9e618abf8ac 100644
> > > > --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> > > > +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> > > > @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
> > > >         return &q->gt->uc.guc;
> > > >    }
> > > > -static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> > > > +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
> > > > +                                    ssize_t start,
> > > >                                      struct xe_devcoredump *coredump)
> > > >    {
> > > >         struct xe_device *xe;
> > > > @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> > > >         ss = &coredump->snapshot;
> > > >         iter.data = buffer;
> > > > -       iter.start = 0;
> > > > +       iter.start = start;
> > > >         iter.remain = count;
> > > >         p = drm_coredump_printer(&iter);
> > > > @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
> > > >         ss->vm = NULL;
> > > >    }
> > > > +#define XE_DEVCOREDUMP_CHUNK_MAX       (SZ_512M + SZ_1G)
> > > > +
> > > >    static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> > > >                                    size_t count, void *data, size_t datalen)
> > > >    {
> > > > @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> > > >         /* Ensure delayed work is captured before continuing */
> > > >         flush_work(&ss->work);
> > > > +       if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> > > > +               xe_pm_runtime_get(gt_to_xe(ss->gt));
> > > > +
> > > >         mutex_lock(&coredump->lock);
> > > >         if (!ss->read.buffer) {
> > > > @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> > > >                 return 0;
> > > >         }
> > > > +       if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
> > > > +           offset < ss->read.chunk_position) {
> > > > +               ss->read.chunk_position =
> > > > +                       ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
> > > > +
> > > > +               __xe_devcoredump_read(ss->read.buffer,
> > > > +                                     XE_DEVCOREDUMP_CHUNK_MAX,
> > > > +                                     ss->read.chunk_position, coredump);
> > > > +       }
> > > > +
> > > >         byte_copied = count < ss->read.size - offset ? count :
> > > >                 ss->read.size - offset;
> > > > -       memcpy(buffer, ss->read.buffer + offset, byte_copied);
> > > > +       memcpy(buffer, ss->read.buffer +
> > > > +              (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
> > > >         mutex_unlock(&coredump->lock);
> > > > +       if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
> > > > +               xe_pm_runtime_put(gt_to_xe(ss->gt));
> > > > +
> > > >         return byte_copied;
> > > >    }
> > > > @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
> > > >         xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
> > > >         xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
> > > > -       xe_pm_runtime_put(xe);
> > > > +       ss->read.chunk_position = 0;
> > > >         /* Calculate devcoredump size */
> > > > -       ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
> > > > -
> > > > -       ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> > > > -       if (!ss->read.buffer)
> > > > -               return;
> > > > +       ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
> > > > +
> > > > +       if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
> > > > +               ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
> > > > +                                          GFP_USER);
> > > > +               if (!ss->read.buffer)
> > > > +                       goto put_pm;
> > > > +
> > > > +               __xe_devcoredump_read(ss->read.buffer,
> > > > +                                     XE_DEVCOREDUMP_CHUNK_MAX,
> > > > +                                     0, coredump);
> > > > +       } else {
> > > > +               ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
> > > > +               if (!ss->read.buffer)
> > > > +                       goto put_pm;
> > > > +
> > > > +               __xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
> > > > +                                     coredump);
> > > > +               xe_devcoredump_snapshot_free(ss);
> > > > +       }
> > > > -       __xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
> > > > -       xe_devcoredump_snapshot_free(ss);
> > > > +put_pm:
> > > > +       xe_pm_runtime_put(xe);
> > > >    }
> > > >    static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> > > > @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
> > > >         if (offset & 3)
> > > >                 drm_printf(p, "Offset not word aligned: %zu", offset);
> > > > -       line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
> > > > +       line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
> > > Is this related? Or should it be a separate patch?
> > > 
> > It is related. Now that __xe_devcoredump_read is called under
> > 'coredump->lock', we are in the reclaim path, and __xe_devcoredump_read
> > calls xe_print_blob_ascii85.
> > 
> > In both cases the allocation is relatively small, so it would be fairly
> > unlikely to fail.
> > 
> > I could make this a separate prep patch if you think that would be
> > better.
> Not worth splitting if this is the requirement. But maybe just add a comment
> to the commit message about why it is necessary.
> 

Sure.
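
For reference, roughly the shape of the note I'd add (condensed from the
diff above, not new code): the line-buffer allocation now runs with
coredump->lock held, which as mentioned puts it in the reclaim path, so
GFP_KERNEL is no longer safe there.

	/* xe_devcoredump_read() */
	mutex_lock(&coredump->lock);
	...
	__xe_devcoredump_read(ss->read.buffer, XE_DEVCOREDUMP_CHUNK_MAX,
			      ss->read.chunk_position, coredump);
		/* -> xe_print_blob_ascii85()
		 *    -> kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC) */
	...
	mutex_unlock(&coredump->lock);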

> > 
> > > >         if (!line_buff) {
> > > >                 drm_printf(p, "Failed to allocate line buffer\n");
> > > >                 return;
> > > > diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> > > > index 1a1d16a96b2d..a174385a6d83 100644
> > > > --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> > > > +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> > > > @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
> > > >         struct {
> > > >                 /** @read.size: size of devcoredump in human readable format */
> > > >                 ssize_t size;
> > > > +               /** @read.chunk_position: position of devcoredump chunk */
> > > > +               ssize_t chunk_position;
> > > >                 /** @read.buffer: buffer of devcoredump in human readable format */
> > > >                 char *buffer;
> > > >         } read;
> > > > diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> > > > index af2c817d552c..21403a250834 100644
> > > > --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> > > > +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
> > > > @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
> > > >         if (num_dw == 0)
> > > >                 return -EINVAL;
> > > > -       hwconfig = kzalloc(size, GFP_KERNEL);
> > > > +       hwconfig = kzalloc(size, GFP_ATOMIC);
> > > Likewise this?
> Hmm, hadn't realised we were doing an alloc/copy/free for every config table
> access. Is that just to avoid doing iosys type reads because the table is in
> LMEM? Seems like a lot of overhead for each individual u32 read!
> 
> But also why are we doing table reads during core dump -> text printing?
> AFAICT, the only reads are in the steering code for doing EU register reads.
> Don't all the reg reads happen up front and then we stop accessing the
> hardware while doing the chunked dump? If we are re-reading hardware state
> when doing the text conversion we can end up with inconsistent dumps as the
> state changes from one chunk to the next?
> 

Honestly, I'm not familiar with this code at all; lockdep puked and I just
fixed it. :)

On the surface, it does look like an unnecessary allocation.
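
IIUC, the pattern John is describing is something like the below (made-up
names just to spell it out; not the actual xe helpers):

	/* every u32 lookup copies the whole table out just to scan it once */
	static int lookup_u32_by_copy(const struct hwconfig_table *tbl,
				      u32 attribute, u32 *val)
	{
		u32 *copy = kzalloc(tbl->size, GFP_ATOMIC);
		u32 i = 0;
		int ret = -ENOENT;

		if (!copy)
			return -ENOMEM;

		copy_table_from_lmem(tbl, copy);	/* hypothetical helper */

		while (i < tbl->num_dw) {	/* key, len, value... entries */
			if (copy[i] == attribute) {
				*val = copy[i + 2];
				ret = 0;
				break;
			}
			i += 2 + copy[i + 1];
		}

		kfree(copy);
		return ret;
	}

Whether reading the table directly instead would be better, I'll leave to
folks who know this code.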

Matt
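
P.S. For anyone following along, a tiny standalone (userspace) sketch of
the chunk-window arithmetic the read callback relies on; CHUNK_MAX mirrors
XE_DEVCOREDUMP_CHUNK_MAX, everything else is illustrative:

	#include <stdio.h>
	#include <stddef.h>

	#define CHUNK_MAX ((size_t)1536 << 20)	/* 1.5G */

	int main(void)
	{
		size_t offsets[] = { 0, CHUNK_MAX - 1, CHUNK_MAX,
				     2 * CHUNK_MAX + 4096 };

		for (size_t i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++) {
			size_t off = offsets[i];
			/* what ALIGN_DOWN(offset, CHUNK_MAX) is meant to give us */
			size_t chunk_position = off / CHUNK_MAX * CHUNK_MAX;
			/* index into the 1.5G read buffer for this offset */
			size_t idx = off % CHUNK_MAX;

			printf("offset %zu -> chunk_position %zu, buffer index %zu\n",
			       off, chunk_position, idx);
		}

		return 0;
	}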

> John.
> 
> 
> > > 
> > Same as above.
> > 
> > Matt
> > 
> > > John.
> > > 
> > > >         if (!hwconfig)
> > > >                 return -ENOMEM;
> 
