On Tue, 2 Sept 2025 at 04:18, M Henning <mhenn...@darkrefraction.com> wrote:
>
> On Thu, Aug 28, 2025 at 10:17 PM Dave Airlie <airl...@gmail.com> wrote:
> >
> > From: Dave Airlie <airl...@redhat.com>
> >
> > Nouveau has code that, when it gets an IRQ with no allowed handler,
> > disables that IRQ to avoid storms.
> >
> > However with nonstall interrupts, we often disable them from
> > the drm driver, but still request their emission via the push submission.
> >
> > Just don't disable nonstall irqs ever in normal operation, the
> > event handling code will filter them out, and the driver will
> > just enable/disable them at load time.
> >
> > This fixes timeouts we've been seeing on/off for a long time,
> > but they became a lot more noticeable on Blackwell.
> >
> > This doesn't fix all of them, there is a subsequent fence emission
> > fix to fix the last few.
> >
> > Fixes: 3ebd64aa3c4f ("drm/nouveau/intr: support multiple trees, and explicit interfaces")
> > Cc: sta...@vger.kernel.org
> > Signed-off-by: Dave Airlie <airl...@redhat.com>
> >
> > ---
> > v2: add missing ga102.
> > ---
> >  .../gpu/drm/nouveau/nvkm/engine/fifo/base.c   |  2 ++
> >  .../gpu/drm/nouveau/nvkm/engine/fifo/ga100.c  | 22 ++++++++++++-------
> >  .../gpu/drm/nouveau/nvkm/engine/fifo/ga102.c  |  1 +
> >  .../gpu/drm/nouveau/nvkm/engine/fifo/priv.h   |  2 ++
> >  .../nouveau/nvkm/subdev/gsp/rm/r535/fifo.c    |  2 +-
> >  5 files changed, 20 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c 
> > b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c
> > index fdffa0391b31..6fd4e60634fb 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c
> > @@ -350,6 +350,8 @@ nvkm_fifo_dtor(struct nvkm_engine *engine)
> >         nvkm_chid_unref(&fifo->chid);
> >
> >         nvkm_event_fini(&fifo->nonstall.event);
> > +       if (fifo->func->nonstall_dtor)
> > +               fifo->func->nonstall_dtor(fifo);
> >         mutex_destroy(&fifo->mutex);
> >
> >         if (fifo->func->dtor)
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga100.c 
> > b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga100.c
> > index e74493a4569e..81beae473122 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga100.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga100.c
> > @@ -517,19 +517,11 @@ ga100_fifo_nonstall_intr(struct nvkm_inth *inth)
> >  static void
> >  ga100_fifo_nonstall_block(struct nvkm_event *event, int type, int index)
> >  {
> > -       struct nvkm_fifo *fifo = container_of(event, typeof(*fifo), 
> > nonstall.event);
> > -       struct nvkm_runl *runl = nvkm_runl_get(fifo, index, 0);
> > -
> > -       nvkm_inth_block(&runl->nonstall.inth);
> >  }
> >
> >  static void
> >  ga100_fifo_nonstall_allow(struct nvkm_event *event, int type, int index)
> >  {
> > -       struct nvkm_fifo *fifo = container_of(event, typeof(*fifo), 
> > nonstall.event);
> > -       struct nvkm_runl *runl = nvkm_runl_get(fifo, index, 0);
> > -
> > -       nvkm_inth_allow(&runl->nonstall.inth);
> >  }
> >
> >  const struct nvkm_event_func
> > @@ -564,12 +556,25 @@ ga100_fifo_nonstall_ctor(struct nvkm_fifo *fifo)
> >                 if (ret)
> >                         return ret;
> >
> > +               nvkm_inth_allow(&runl->nonstall.inth);
> > +
> >                 nr = max(nr, runl->id + 1);
> >         }
> >
> >         return nr;
> >  }
> >
> > +void
> > +ga100_fifo_nonstall_dtor(struct nvkm_fifo *fifo)
> > +{
> > +       struct nvkm_runl *runl;
> > +       nvkm_runl_foreach(runl, fifo) {
> > +               if (runl->nonstall.vector < 0)
> > +                       continue;
> > +               nvkm_inth_block(&runl->nonstall.inth);
> > +       }
> > +}
> > +
> >  int
> >  ga100_fifo_runl_ctor(struct nvkm_fifo *fifo)
> >  {
> > @@ -599,6 +604,7 @@ ga100_fifo = {
> >         .runl_ctor = ga100_fifo_runl_ctor,
> >         .mmu_fault = &tu102_fifo_mmu_fault,
> >         .nonstall_ctor = ga100_fifo_nonstall_ctor,
> > +       .nonstall_dtor = ga100_fifo_nonstall_dtor,
> >         .nonstall = &ga100_fifo_nonstall,
> >         .runl = &ga100_runl,
> >         .runq = &ga100_runq,
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c 
> > b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c
> > index 755235f55b3a..18a0b1f4eab7 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c
> > @@ -30,6 +30,7 @@ ga102_fifo = {
> >         .runl_ctor = ga100_fifo_runl_ctor,
> >         .mmu_fault = &tu102_fifo_mmu_fault,
> >         .nonstall_ctor = ga100_fifo_nonstall_ctor,
> > +       .nonstall_dtor = ga100_fifo_nonstall_dtor,
> >         .nonstall = &ga100_fifo_nonstall,
> >         .runl = &ga100_runl,
> >         .runq = &ga100_runq,
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h 
> > b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h
> > index 5e81ae195329..fff1428ef267 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h
> > +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h
> > @@ -41,6 +41,7 @@ struct nvkm_fifo_func {
> >         void (*start)(struct nvkm_fifo *, unsigned long *);
> >
> >         int (*nonstall_ctor)(struct nvkm_fifo *);
> > +       void (*nonstall_dtor)(struct nvkm_fifo *);
> >         const struct nvkm_event_func *nonstall;
> >
> >         const struct nvkm_runl_func *runl;
> > @@ -200,6 +201,7 @@ u32 tu102_chan_doorbell_handle(struct nvkm_chan *);
> >
> >  int ga100_fifo_runl_ctor(struct nvkm_fifo *);
> >  int ga100_fifo_nonstall_ctor(struct nvkm_fifo *);
> > +void ga100_fifo_nonstall_dtor(struct nvkm_fifo *);
> >  extern const struct nvkm_event_func ga100_fifo_nonstall;
> >  extern const struct nvkm_runl_func ga100_runl;
> >  extern const struct nvkm_runq_func ga100_runq;
> > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/fifo.c 
> > b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/fifo.c
> > index 1ac5628c5140..b8be0a872e7a 100644
> > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/fifo.c
> > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/fifo.c
> > @@ -601,7 +601,7 @@ r535_fifo_new(const struct nvkm_fifo_func *hw, struct 
> > nvkm_device *device,
> >         rm->chan.func = &r535_chan;
> >         rm->nonstall = &ga100_fifo_nonstall;
> >         rm->nonstall_ctor = ga100_fifo_nonstall_ctor;
> > -
> > +       rm->nonstall_dtor = ga100_fifo_nonstall_dtor;
> >         return nvkm_fifo_new_(rm, device, type, inst, pfifo);
> >  }
> >
> > --
> > 2.50.1
> >
>
> Maybe we should also do this for older GPUs? E.g., perhaps we should
> also update gf100_fifo_nonstall_allow / gf100_fifo_nonstall_block?

Those actually turn off the irq at the hardware, and therefore
shouldn't hit the allowed path check. I'm not touching that without
someone showing it's broken.

Dave.

Reply via email to