[PATCH] drm/msm: Add fence->wait() op
From: Rob Clark Somehow we had neither ->wait() nor dma_fence_signal() calls, and no one noticed. Oops. Note that this removes the !timeout case, which has not been used in a long time. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_fence.c | 59 +++-- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_fence.c b/drivers/gpu/drm/msm/msm_fence.c index cd59a5918038..8ee96b90ded6 100644 --- a/drivers/gpu/drm/msm/msm_fence.c +++ b/drivers/gpu/drm/msm/msm_fence.c @@ -38,11 +38,10 @@ static inline bool fence_completed(struct msm_fence_context *fctx, uint32_t fenc return (int32_t)(fctx->completed_fence - fence) >= 0; } -/* legacy path for WAIT_FENCE ioctl: */ -int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence, - ktime_t *timeout, bool interruptible) +static signed long wait_fence(struct msm_fence_context *fctx, uint32_t fence, + signed long remaining_jiffies, bool interruptible) { - int ret; + signed long ret; if (fence > fctx->last_fence) { DRM_ERROR_RATELIMITED("%s: waiting on invalid fence: %u (of %u)\n", @@ -50,33 +49,34 @@ int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence, return -EINVAL; } - if (!timeout) { - /* no-wait: */ - ret = fence_completed(fctx, fence) ? 
0 : -EBUSY; + if (interruptible) { + ret = wait_event_interruptible_timeout(fctx->event, + fence_completed(fctx, fence), + remaining_jiffies); } else { - unsigned long remaining_jiffies = timeout_to_jiffies(timeout); - - if (interruptible) - ret = wait_event_interruptible_timeout(fctx->event, - fence_completed(fctx, fence), - remaining_jiffies); - else - ret = wait_event_timeout(fctx->event, - fence_completed(fctx, fence), - remaining_jiffies); - - if (ret == 0) { - DBG("timeout waiting for fence: %u (completed: %u)", - fence, fctx->completed_fence); - ret = -ETIMEDOUT; - } else if (ret != -ERESTARTSYS) { - ret = 0; - } + ret = wait_event_timeout(fctx->event, + fence_completed(fctx, fence), + remaining_jiffies); + } + + if (ret == 0) { + DBG("timeout waiting for fence: %u (completed: %u)", + fence, fctx->completed_fence); + ret = -ETIMEDOUT; + } else if (ret != -ERESTARTSYS) { + ret = 0; } return ret; } +/* legacy path for WAIT_FENCE ioctl: */ +int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence, + ktime_t *timeout, bool interruptible) +{ + return wait_fence(fctx, fence, timeout_to_jiffies(timeout), interruptible); +} + /* called from workqueue */ void msm_update_fence(struct msm_fence_context *fctx, uint32_t fence) { @@ -114,10 +114,19 @@ static bool msm_fence_signaled(struct dma_fence *fence) return fence_completed(f->fctx, f->base.seqno); } +static signed long msm_fence_wait(struct dma_fence *fence, bool intr, + signed long timeout) +{ + struct msm_fence *f = to_msm_fence(fence); + + return wait_fence(f->fctx, fence->seqno, timeout, intr); +} + static const struct dma_fence_ops msm_fence_ops = { .get_driver_name = msm_fence_get_driver_name, .get_timeline_name = msm_fence_get_timeline_name, .signaled = msm_fence_signaled, + .wait = msm_fence_wait, }; struct dma_fence * -- 2.31.1
Re: [Linaro-mm-sig] [PATCH] drm/msm: Add fence->wait() op
On Tue, Jul 20, 2021 at 11:03 AM Christian König wrote: > > Hi Rob, > > Am 20.07.21 um 17:07 schrieb Rob Clark: > > From: Rob Clark > > > > Somehow we had neither ->wait() nor dma_fence_signal() calls, and no > > one noticed. Oops. > > > I'm not sure if that is a good idea. > > The dma_fence->wait() callback is pretty much deprecated and should not > be used any more. > > What exactly do you need that for? Well, the alternative is to track the set of fences which have signalling enabled, and then figure out which ones to signal, which seems like a lot more work, vs just re-purposing the wait implementation we already have for non-dma_fence cases ;-) Why is the ->wait() callback (pretty much) deprecated? BR, -R > Regards, > Christian. > > > > > Note that this removes the !timeout case, which has not been used in > > a long time. > > > > > > Signed-off-by: Rob Clark > > --- > > drivers/gpu/drm/msm/msm_fence.c | 59 +++-- > > 1 file changed, 34 insertions(+), 25 deletions(-) > > > > diff --git a/drivers/gpu/drm/msm/msm_fence.c > > b/drivers/gpu/drm/msm/msm_fence.c > > index cd59a5918038..8ee96b90ded6 100644 > > --- a/drivers/gpu/drm/msm/msm_fence.c > > +++ b/drivers/gpu/drm/msm/msm_fence.c > > @@ -38,11 +38,10 @@ static inline bool fence_completed(struct > > msm_fence_context *fctx, uint32_t fenc > > return (int32_t)(fctx->completed_fence - fence) >= 0; > > } > > > > -/* legacy path for WAIT_FENCE ioctl: */ > > -int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence, > > - ktime_t *timeout, bool interruptible) > > +static signed long wait_fence(struct msm_fence_context *fctx, uint32_t > > fence, > > + signed long remaining_jiffies, bool interruptible) > > { > > - int ret; > > + signed long ret; > > > > if (fence > fctx->last_fence) { > > DRM_ERROR_RATELIMITED("%s: waiting on invalid fence: %u (of > > %u)\n", > > @@ -50,33 +49,34 @@ int msm_wait_fence(struct msm_fence_context *fctx, > > uint32_t fence, > > return -EINVAL; > > } > > > > - if (!timeout) { > > - 
/* no-wait: */ > > - ret = fence_completed(fctx, fence) ? 0 : -EBUSY; > > + if (interruptible) { > > + ret = wait_event_interruptible_timeout(fctx->event, > > + fence_completed(fctx, fence), > > + remaining_jiffies); > > } else { > > - unsigned long remaining_jiffies = timeout_to_jiffies(timeout); > > - > > - if (interruptible) > > - ret = wait_event_interruptible_timeout(fctx->event, > > - fence_completed(fctx, fence), > > - remaining_jiffies); > > - else > > - ret = wait_event_timeout(fctx->event, > > - fence_completed(fctx, fence), > > - remaining_jiffies); > > - > > - if (ret == 0) { > > - DBG("timeout waiting for fence: %u (completed: %u)", > > - fence, fctx->completed_fence); > > - ret = -ETIMEDOUT; > > - } else if (ret != -ERESTARTSYS) { > > - ret = 0; > > - } > > + ret = wait_event_timeout(fctx->event, > > + fence_completed(fctx, fence), > > + remaining_jiffies); > > + } > > + > > + if (ret == 0) { > > + DBG("timeout waiting for fence: %u (completed: %u)", > > + fence, fctx->completed_fence); > > + ret = -ETIMEDOUT; > > + } else if (ret != -ERESTARTSYS) { > > + ret = 0; > > } > > > > return ret; > > } > > > > +/* legacy path for WAIT_FENCE ioctl: */ > > +int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence, > > + ktime_t *timeout, bool interruptible) > > +{ > > + return wait_fence(fctx, fence, timeout_to_jiffies(timeout), > > interruptible); > > +} > > + > > /* called from workqueue */ > > void msm_update_fence(struct msm_fence_context *fctx, uint32_t fence) > > { > > @@ -114,10 +114,19 @@ static bool msm_fence_signaled(struct dma_fence > > *fence) > > return fence_completed(f->fctx, f->base.seqno); > > } > > > > +static signed long msm_fence_wait(struct dma_fence *fence, bool intr, > > + signed long timeout) > > +{ > > + struct msm_fence *f = to_msm_fence(fence); > > + > > + return wait_fence(f->fctx, fence->seqno, timeout, intr); > > +} > > + > > static const struct dma_fence_ops msm_fence_ops = { > > .get_driver_name = 
msm_fence_get_driver_name, > > .get_timeline_name = msm_fence_get_timeline_name, > > .signaled = msm_fence_signaled, > > + .wait = msm_fence_wait, > > }; > > > > struct dma_fence * >
Re: [Linaro-mm-sig] [PATCH] drm/msm: Add fence->wait() op
On Tue, Jul 20, 2021 at 1:55 PM Daniel Vetter wrote: > > On Tue, Jul 20, 2021 at 8:26 PM Rob Clark wrote: > > > > On Tue, Jul 20, 2021 at 11:03 AM Christian König > > wrote: > > > > > > Hi Rob, > > > > > > Am 20.07.21 um 17:07 schrieb Rob Clark: > > > > From: Rob Clark > > > > > > > > Somehow we had neither ->wait() nor dma_fence_signal() calls, and no > > > > one noticed. Oops. > > > > > > > > > I'm not sure if that is a good idea. > > > > > > The dma_fence->wait() callback is pretty much deprecated and should not > > > be used any more. > > > > > > What exactly do you need that for? > > > > Well, the alternative is to track the set of fences which have > > signalling enabled, and then figure out which ones to signal, which > > seems like a lot more work, vs just re-purposing the wait > > implementation we already have for non-dma_fence cases ;-) > > > > Why is the ->wait() callback (pretty much) deprecated? > > Because if you need it that means for your driver dma_fence_add_cb is > broken, which means a _lot_ of things don't work. Like dma_buf poll > (compositors have patches to start using that), and I think > drm/scheduler also becomes rather unhappy. I'm starting to page back in how this works.. fence cb's aren't broken (which is also why dma_fence_wait() was not completely broken), because in retire_submits() we call dma_fence_is_signaled(submit->hw_fence). But the reason that the custom wait function cleans up a tiny bit of jank is that the wait_queue_head_t gets signaled earlier, before we start iterating the submits and doing all that retire_submit() stuff (unpin/unref bo's, etc). I suppose I could just split things up to call dma_fence_signal() earlier, and *then* do the retire_submits() stuff. BR, -R > It essentially exists only for old drivers where ->enable_signalling > is unreliable and we paper over that with a retry loop in ->wait and > pray no one notices that it's too butchered. 
The proper fix is to have > a driver thread to guarantee that ->enable_signalling works reliable, > so you don't need a ->wait. > > Can you type up a kerneldoc patch for dma_fence_ops->wait to hammer > this in please? > -Daniel > > > > > BR, > > -R > > > > > Regards, > > > Christian. > > > > > > > > > > > Note that this removes the !timeout case, which has not been used in > > > > a long time. > > > > > > > > > > > > > > Signed-off-by: Rob Clark > > > > --- > > > > drivers/gpu/drm/msm/msm_fence.c | 59 +++-- > > > > 1 file changed, 34 insertions(+), 25 deletions(-) > > > > > > > > diff --git a/drivers/gpu/drm/msm/msm_fence.c > > > > b/drivers/gpu/drm/msm/msm_fence.c > > > > index cd59a5918038..8ee96b90ded6 100644 > > > > --- a/drivers/gpu/drm/msm/msm_fence.c > > > > +++ b/drivers/gpu/drm/msm/msm_fence.c > > > > @@ -38,11 +38,10 @@ static inline bool fence_completed(struct > > > > msm_fence_context *fctx, uint32_t fenc > > > > return (int32_t)(fctx->completed_fence - fence) >= 0; > > > > } > > > > > > > > -/* legacy path for WAIT_FENCE ioctl: */ > > > > -int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence, > > > > - ktime_t *timeout, bool interruptible) > > > > +static signed long wait_fence(struct msm_fence_context *fctx, uint32_t > > > > fence, > > > > + signed long remaining_jiffies, bool interruptible) > > > > { > > > > - int ret; > > > > + signed long ret; > > > > > > > > if (fence > fctx->last_fence) { > > > > DRM_ERROR_RATELIMITED("%s: waiting on invalid fence: %u > > > > (of %u)\n", > > > > @@ -50,33 +49,34 @@ int msm_wait_fence(struct msm_fence_context *fctx, > > > > uint32_t fence, > > > > return -EINVAL; > > > > } > > > > > > > > - if (!timeout) { > > > > - /* no-wait: */ > > > > - ret = fence_completed(fctx, fence) ? 0 : -EBUSY; > > > > + if (interruptible) { > > > > + ret = wait_event_interruptible_timeout(fctx->event, > > > > + fence_completed(fctx, fence), > > >
Re: [Linaro-mm-sig] [PATCH] drm/msm: Add fence->wait() op
On Wed, Jul 21, 2021 at 12:59 AM Daniel Vetter wrote: > > On Wed, Jul 21, 2021 at 12:32 AM Rob Clark wrote: > > > > On Tue, Jul 20, 2021 at 1:55 PM Daniel Vetter wrote: > > > > > > On Tue, Jul 20, 2021 at 8:26 PM Rob Clark wrote: > > > > > > > > On Tue, Jul 20, 2021 at 11:03 AM Christian König > > > > wrote: > > > > > > > > > > Hi Rob, > > > > > > > > > > Am 20.07.21 um 17:07 schrieb Rob Clark: > > > > > > From: Rob Clark > > > > > > > > > > > > Somehow we had neither ->wait() nor dma_fence_signal() calls, and no > > > > > > one noticed. Oops. > > > > > > > > > > > > > > > I'm not sure if that is a good idea. > > > > > > > > > > The dma_fence->wait() callback is pretty much deprecated and should > > > > > not > > > > > be used any more. > > > > > > > > > > What exactly do you need that for? > > > > > > > > Well, the alternative is to track the set of fences which have > > > > signalling enabled, and then figure out which ones to signal, which > > > > seems like a lot more work, vs just re-purposing the wait > > > > implementation we already have for non-dma_fence cases ;-) > > > > > > > > Why is the ->wait() callback (pretty much) deprecated? > > > > > > Because if you need it that means for your driver dma_fence_add_cb is > > > broken, which means a _lot_ of things don't work. Like dma_buf poll > > > (compositors have patches to start using that), and I think > > > drm/scheduler also becomes rather unhappy. > > > > I'm starting to page back in how this works.. fence cb's aren't broken > > (which is also why dma_fence_wait() was not completely broken), > > because in retire_submits() we call > > dma_fence_is_signaled(submit->hw_fence). > > > > But the reason that the custom wait function cleans up a tiny bit of > > jank is that the wait_queue_head_t gets signaled earlier, before we > > start iterating the submits and doing all that retire_submit() stuff > > (unpin/unref bo's, etc). 
I suppose I could just split things up to > > call dma_fence_signal() earlier, and *then* do the retire_submits() > > stuff. > > Yeah reducing the latency there sounds like a good idea. > -Daniel > Hmm, no, turns out that isn't the problem.. or, well, it is probably a good idea to call dma_fence_signal() earlier. But it seems like waking up from wait_event_* is faster than wake_up_state(wait->task, TASK_NORMAL). I suppose the wake_up_state() approach still needs for the scheduler to get around to schedule the runnable task. So for now, I'm going back to my own wait function (plus earlier dma_fence_signal()) Before removing dma_fence_ops::wait(), I guess we want to re-think dma_fence_default_wait().. but I think that would require a dma_fence_context base class (rather than just a raw integer). BR, -R > > > > BR, > > -R > > > > > It essentially exists only for old drivers where ->enable_signalling > > > is unreliable and we paper over that with a retry loop in ->wait and > > > pray no one notices that it's too butchered. The proper fix is to have > > > a driver thread to guarantee that ->enable_signalling works reliable, > > > so you don't need a ->wait. > > > > > > Can you type up a kerneldoc patch for dma_fence_ops->wait to hammer > > > this in please? > > > -Daniel > > > > > > > > > > > BR, > > > > -R > > > > > > > > > Regards, > > > > > Christian. > > > > > > > > > > > > > > > > > Note that this removes the !timeout case, which has not been used in > > > > > > a long time. > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Rob Clark > > > > > > --- > > > > > > drivers/gpu/drm/msm/msm_fence.c | 59 > > > > > > +++-- > > > > > > 1 file changed, 34 insertions(+), 25 deletions(-) > > > > > > > > > > > > diff --git a/drivers/gpu/drm/msm/msm_fence.c > > > > > > b/drivers/gpu/drm/msm/msm_fence.c > > > > > > index cd59a5918038..8ee96b90ded6 100644
Re: [Linaro-mm-sig] [PATCH] drm/msm: Add fence->wait() op
On Thu, Jul 22, 2021 at 1:42 AM Christian König wrote: > > Am 21.07.21 um 21:03 schrieb Daniel Vetter: > > On Wed, Jul 21, 2021 at 09:34:43AM -0700, Rob Clark wrote: > >> On Wed, Jul 21, 2021 at 12:59 AM Daniel Vetter wrote: > >>> On Wed, Jul 21, 2021 at 12:32 AM Rob Clark wrote: > >>>> On Tue, Jul 20, 2021 at 1:55 PM Daniel Vetter wrote: > >>>>> On Tue, Jul 20, 2021 at 8:26 PM Rob Clark wrote: > >>>>>> On Tue, Jul 20, 2021 at 11:03 AM Christian König > >>>>>> wrote: > >>>>>>> Hi Rob, > >>>>>>> > >>>>>>> Am 20.07.21 um 17:07 schrieb Rob Clark: > >>>>>>>> From: Rob Clark > >>>>>>>> > >>>>>>>> Somehow we had neither ->wait() nor dma_fence_signal() calls, and no > >>>>>>>> one noticed. Oops. > >>>>>>> > >>>>>>> I'm not sure if that is a good idea. > >>>>>>> > >>>>>>> The dma_fence->wait() callback is pretty much deprecated and should > >>>>>>> not > >>>>>>> be used any more. > >>>>>>> > >>>>>>> What exactly do you need that for? > >>>>>> Well, the alternative is to track the set of fences which have > >>>>>> signalling enabled, and then figure out which ones to signal, which > >>>>>> seems like a lot more work, vs just re-purposing the wait > >>>>>> implementation we already have for non-dma_fence cases ;-) > >>>>>> > >>>>>> Why is the ->wait() callback (pretty much) deprecated? > >>>>> Because if you need it that means for your driver dma_fence_add_cb is > >>>>> broken, which means a _lot_ of things don't work. Like dma_buf poll > >>>>> (compositors have patches to start using that), and I think > >>>>> drm/scheduler also becomes rather unhappy. > >>>> I'm starting to page back in how this works.. fence cb's aren't broken > >>>> (which is also why dma_fence_wait() was not completely broken), > >>>> because in retire_submits() we call > >>>> dma_fence_is_signaled(submit->hw_fence). 
> >>>> > >>>> But the reason that the custom wait function cleans up a tiny bit of > >>>> jank is that the wait_queue_head_t gets signaled earlier, before we > >>>> start iterating the submits and doing all that retire_submit() stuff > >>>> (unpin/unref bo's, etc). I suppose I could just split things up to > >>>> call dma_fence_signal() earlier, and *then* do the retire_submits() > >>>> stuff. > >>> Yeah reducing the latency there sounds like a good idea. > >>> -Daniel > >>> > >> Hmm, no, turns out that isn't the problem.. or, well, it is probably a > >> good idea to call drm_fence_signal() earlier. But it seems like > >> waking up from wait_event_* is faster than wake_up_state(wait->task, > >> TASK_NORMAL). I suppose the wake_up_state() approach still needs for > >> the scheduler to get around to schedule the runnable task. > > As far as I know wake_up_state() tries to run the thread on the CPU it > was scheduled last, while wait_event_* makes the thread run on the CPU > who issues the wake by default. > > And yes I've also noticed this already and it was one of the reason why > I suggested to use a wait_queue instead of the hand wired dma_fence_wait > implementation. > > >> > >> So for now, I'm going back to my own wait function (plus earlier > >> drm_fence_signal()) > >> > >> Before removing dma_fence_opps::wait(), I guess we want to re-think > >> dma_fence_default_wait().. but I think that would require a > >> dma_fence_context base class (rather than just a raw integer). > > Uh that's not great ... can't we fix this instead of papering over it in > > drivers? Aside from maybe different wakeup flags it all is supposed to > > work exactly the same underneath, and whether using a wait queue or not > > really shouldn't matter. > > Well it would have been nicer if we used the existing infrastructure > instead of re-inventing stuff for dma_fence, but that chance is long gone. 
> > And you don't need a dma_fence_context base class, but rather just a > flag in the dma_fence_ops if you want to change the behavior. Hmm, I was thinking dma_fence_context to have a place for the wait_queue_head, but I guess that could also be per-dma_fence
Re: [Linaro-mm-sig] [PATCH] drm/msm: Add fence->wait() op
On Thu, Jul 22, 2021 at 2:28 AM Christian König wrote: > > Am 22.07.21 um 11:08 schrieb Daniel Vetter: > > [SNIP] > >> As far as I know wake_up_state() tries to run the thread on the CPU it was > >> scheduled last, while wait_event_* makes the thread run on the CPU who > >> issues the wake by default. > >> > >> And yes I've also noticed this already and it was one of the reason why I > >> suggested to use a wait_queue instead of the hand wired dma_fence_wait > >> implementation. > > The first versions had used wait_queue, but iirc we had some issues with > > the callbacks and stuff and that was the reasons for hand-rolling. Or > > maybe it was the integration of the lockless fastpath for > > dma_fence_is_signalled(). > > > >> [SNIP] > >> Well it would have been nicer if we used the existing infrastructure > >> instead > >> of re-inventing stuff for dma_fence, but that chance is long gone. > >> > >> And you don't need a dma_fence_context base class, but rather just a flag > >> in > >> the dma_fence_ops if you want to change the behavior. > > If there's something broken we should just fix it, not force everyone to > > set a random flag. dma_fence work like special wait_queues, so if we > > differ then we should go back to that. > > Wait a second with that, this is not broken. It's just different > behavior and there are good arguments for both sides. > > If a wait is short you can have situations where you want to start the > thread on the original CPU. > This is because you can assume that the caches on that CPU are > still hot and heating up the caches on the local CPU would take longer > than an inter CPU interrupt. > > But if the wait is long it makes more sense to run the thread on the CPU > where you noticed the wake up event. > This is because you can assume that the caches are cold anyway and > starting the thread on the current CPU (most likely from an interrupt > handler) gives you the absolutely best latency. 
> In other words you usually return from the interrupt handler and > just directly switch to the now running thread. > > I'm not sure if all drivers want the same behavior. Rob here seems to > prefer number 2, but we have used 1 for dma_fence for a rather long time > now and it could be that some people start to complain when we switch > unconditionally. > Hmm, I wonder if it would make sense to have a dma_fence_wait() flag to control the behavior, since it is maybe more about the waiter (and perhaps how long the waiter expects to wait) than the signaler.. BR, -R
[PATCH 0/3] drm/msm: Improved devfreq tuning
From: Rob Clark This is the outcome of trying to fix some bad gpu freq behavior seen in some use-cases, in particular mobile games that throttle themselves to 30fps. With the existing tuning, we'd end up spending most of the time that we should be running fast at a low freq, and most of the idle time at a high freq. First two patches are prep, 3/3 is the interesting bit. See the patch description in 3/3 for more details. Rob Clark (3): drm/msm: Split out devfreq handling drm/msm: Split out get_freq() helper drm/msm: Devfreq tuning drivers/gpu/drm/msm/Makefile | 1 + drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 +- drivers/gpu/drm/msm/msm_gpu.c | 124 ++-- drivers/gpu/drm/msm/msm_gpu.h | 27 +++- drivers/gpu/drm/msm/msm_gpu_devfreq.c | 203 ++ 5 files changed, 238 insertions(+), 121 deletions(-) create mode 100644 drivers/gpu/drm/msm/msm_gpu_devfreq.c -- 2.31.1
[PATCH 1/3] drm/msm: Split out devfreq handling
From: Rob Clark Before we start adding more cleverness, split it into it's own file. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/Makefile | 1 + drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 +- drivers/gpu/drm/msm/msm_gpu.c | 116 +- drivers/gpu/drm/msm/msm_gpu.h | 18 ++-- drivers/gpu/drm/msm/msm_gpu_devfreq.c | 133 ++ 5 files changed, 151 insertions(+), 121 deletions(-) create mode 100644 drivers/gpu/drm/msm/msm_gpu_devfreq.c diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile index 2c00aa70b708..904535eda0c4 100644 --- a/drivers/gpu/drm/msm/Makefile +++ b/drivers/gpu/drm/msm/Makefile @@ -90,6 +90,7 @@ msm-y := \ msm_gem_submit.o \ msm_gem_vma.o \ msm_gpu.o \ + msm_gpu_devfreq.o \ msm_iommu.o \ msm_perf.o \ msm_rd.o \ diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index a7df02298479..55ea136b8933 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1477,7 +1477,7 @@ static int a6xx_pm_resume(struct msm_gpu *gpu) if (ret) return ret; - msm_gpu_resume_devfreq(gpu); + msm_devfreq_resume(gpu); a6xx_llc_activate(a6xx_gpu); @@ -1494,7 +1494,7 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu) a6xx_llc_deactivate(a6xx_gpu); - devfreq_suspend_device(gpu->devfreq.devfreq); + msm_devfreq_suspend(gpu); ret = a6xx_gmu_stop(a6xx_gpu); if (ret) diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index c4e202f0366c..70d8610b1b73 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -13,8 +13,6 @@ #include #include -#include -#include #include #include @@ -22,106 +20,6 @@ * Power Management: */ -static int msm_devfreq_target(struct device *dev, unsigned long *freq, - u32 flags) -{ - struct msm_gpu *gpu = dev_to_gpu(dev); - struct dev_pm_opp *opp; - - opp = devfreq_recommended_opp(dev, freq, flags); - - if (IS_ERR(opp)) - return PTR_ERR(opp); - - trace_msm_gpu_freq_change(dev_pm_opp_get_freq(opp)); - - if 
(gpu->funcs->gpu_set_freq) - gpu->funcs->gpu_set_freq(gpu, opp); - else - clk_set_rate(gpu->core_clk, *freq); - - dev_pm_opp_put(opp); - - return 0; -} - -static int msm_devfreq_get_dev_status(struct device *dev, - struct devfreq_dev_status *status) -{ - struct msm_gpu *gpu = dev_to_gpu(dev); - ktime_t time; - - if (gpu->funcs->gpu_get_freq) - status->current_frequency = gpu->funcs->gpu_get_freq(gpu); - else - status->current_frequency = clk_get_rate(gpu->core_clk); - - status->busy_time = gpu->funcs->gpu_busy(gpu); - - time = ktime_get(); - status->total_time = ktime_us_delta(time, gpu->devfreq.time); - gpu->devfreq.time = time; - - return 0; -} - -static int msm_devfreq_get_cur_freq(struct device *dev, unsigned long *freq) -{ - struct msm_gpu *gpu = dev_to_gpu(dev); - - if (gpu->funcs->gpu_get_freq) - *freq = gpu->funcs->gpu_get_freq(gpu); - else - *freq = clk_get_rate(gpu->core_clk); - - return 0; -} - -static struct devfreq_dev_profile msm_devfreq_profile = { - .polling_ms = 10, - .target = msm_devfreq_target, - .get_dev_status = msm_devfreq_get_dev_status, - .get_cur_freq = msm_devfreq_get_cur_freq, -}; - -static void msm_devfreq_init(struct msm_gpu *gpu) -{ - /* We need target support to do devfreq */ - if (!gpu->funcs->gpu_busy) - return; - - msm_devfreq_profile.initial_freq = gpu->fast_rate; - - /* -* Don't set the freq_table or max_state and let devfreq build the table -* from OPP -* After a deferred probe, these may have be left to non-zero values, -* so set them back to zero before creating the devfreq device -*/ - msm_devfreq_profile.freq_table = NULL; - msm_devfreq_profile.max_state = 0; - - gpu->devfreq.devfreq = devm_devfreq_add_device(&gpu->pdev->dev, - &msm_devfreq_profile, DEVFREQ_GOV_SIMPLE_ONDEMAND, - NULL); - - if (IS_ERR(gpu->devfreq.devfreq)) { - DRM_DEV_ERROR(&gpu->pdev->dev, "Couldn't initialize GPU devfreq\n"); - gpu->devfreq.devfreq = NULL; - return; - } - - devfreq_suspend_device(gpu->devfreq.devfreq); - - gpu->cooling = 
of_devfreq_cooling_register(gpu->pdev->dev.of_node, - gpu->devfreq.devfreq); - if (IS_ERR(gpu->cooli
[PATCH 2/3] drm/msm: Split out get_freq() helper
From: Rob Clark In the next patch, it grows a bit more, so lets not duplicate the logic in multiple places. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gpu_devfreq.c | 21 ++--- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gpu_devfreq.c b/drivers/gpu/drm/msm/msm_gpu_devfreq.c index 3bcea0baddab..2e24a97be624 100644 --- a/drivers/gpu/drm/msm/msm_gpu_devfreq.c +++ b/drivers/gpu/drm/msm/msm_gpu_devfreq.c @@ -37,17 +37,21 @@ static int msm_devfreq_target(struct device *dev, unsigned long *freq, return 0; } +static unsigned long get_freq(struct msm_gpu *gpu) +{ + if (gpu->funcs->gpu_get_freq) + return gpu->funcs->gpu_get_freq(gpu); + + return clk_get_rate(gpu->core_clk); +} + static int msm_devfreq_get_dev_status(struct device *dev, struct devfreq_dev_status *status) { struct msm_gpu *gpu = dev_to_gpu(dev); ktime_t time; - if (gpu->funcs->gpu_get_freq) - status->current_frequency = gpu->funcs->gpu_get_freq(gpu); - else - status->current_frequency = clk_get_rate(gpu->core_clk); - + status->current_frequency = get_freq(gpu); status->busy_time = gpu->funcs->gpu_busy(gpu); time = ktime_get(); @@ -59,12 +63,7 @@ static int msm_devfreq_get_dev_status(struct device *dev, static int msm_devfreq_get_cur_freq(struct device *dev, unsigned long *freq) { - struct msm_gpu *gpu = dev_to_gpu(dev); - - if (gpu->funcs->gpu_get_freq) - *freq = gpu->funcs->gpu_get_freq(gpu); - else - *freq = clk_get_rate(gpu->core_clk); + *freq = get_freq(dev_to_gpu(dev)); return 0; } -- 2.31.1
[PATCH 3/3] drm/msm: Devfreq tuning
From: Rob Clark This adds a few things to try and make frequency scaling better match the workload: 1) Longer polling interval to avoid whip-lashing between too-high and too-low frequencies in certain workloads, like mobile games which throttle themselves to 30fps. Previously our polling interval was short enough to let things ramp down to minimum freq in the "off" frame, but long enough to not react quickly enough when rendering started on the next frame, leading to uneven frame times. (Ie. rather than a consistent 33ms it would alternate between 16/33/48ms.) 2) Awareness of when the GPU is active vs idle. Since we know when the GPU is active vs idle, we can clamp the frequency down to the minimum while it is idle. (If it is idle for long enough, then the autosuspend delay will eventually kick in and power down the GPU.) Since devfreq has no knowledge of powered-but-idle, this takes a small bit of trickery to maintain a "fake" frequency while idle. This, combined with the longer polling period allows devfreq to arrive at a reasonable "active" frequency, while still clamping to minimum freq when idle to reduce power draw. 3) Boost. Because simple_ondemand needs to see a certain threshold of busyness to ramp up, we could end up needing multiple polling cycles before it reacts appropriately on interactive workloads (ex. scrolling a web page after reading for some time), on top of the already lengthened polling interval. So when we see an idle-to-active transition after a period of idle time, we boost the frequency that we return to. 
Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gpu.c | 8 +++ drivers/gpu/drm/msm/msm_gpu.h | 9 drivers/gpu/drm/msm/msm_gpu_devfreq.c | 73 ++- 3 files changed, 89 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 70d8610b1b73..68d2df590054 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -667,6 +667,10 @@ static void retire_submit(struct msm_gpu *gpu, struct msm_ringbuffer *ring, list_del(&submit->node); spin_unlock(&ring->submit_lock); + /* Update devfreq on transition from active->idle: */ + if (atomic_dec_return(&gpu->active_submits) == 0) + msm_devfreq_idle(gpu); + msm_gem_submit_put(submit); } @@ -747,6 +751,10 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) list_add_tail(&submit->node, &ring->submits); spin_unlock(&ring->submit_lock); + /* Update devfreq on transition from idle->active: */ + if (atomic_inc_return(&gpu->active_submits) == 1) + msm_devfreq_active(gpu); + gpu->funcs->submit(gpu, submit); priv->lastctx = submit->queue->ctx; diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index ada15e28f251..e14edda3d778 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -84,6 +84,10 @@ struct msm_gpu_devfreq { struct devfreq *devfreq; u64 busy_cycles; ktime_t time; + + /* Time and freq of last transition to idle: */ + ktime_t idle_time; + unsigned long idle_freq; }; struct msm_gpu { @@ -115,6 +119,9 @@ struct msm_gpu { */ struct list_head active_list; + /* number of in-flight submits: */ + atomic_t active_submits; + /* does gpu need hw_init? 
*/ bool needs_hw_init; @@ -384,6 +391,8 @@ void msm_devfreq_init(struct msm_gpu *gpu); void msm_devfreq_cleanup(struct msm_gpu *gpu); void msm_devfreq_resume(struct msm_gpu *gpu); void msm_devfreq_suspend(struct msm_gpu *gpu); +void msm_devfreq_active(struct msm_gpu *gpu); +void msm_devfreq_idle(struct msm_gpu *gpu); int msm_gpu_hw_init(struct msm_gpu *gpu); diff --git a/drivers/gpu/drm/msm/msm_gpu_devfreq.c b/drivers/gpu/drm/msm/msm_gpu_devfreq.c index 2e24a97be624..0a1ee20296a2 100644 --- a/drivers/gpu/drm/msm/msm_gpu_devfreq.c +++ b/drivers/gpu/drm/msm/msm_gpu_devfreq.c @@ -22,6 +22,15 @@ static int msm_devfreq_target(struct device *dev, unsigned long *freq, opp = devfreq_recommended_opp(dev, freq, flags); + /* +* If the GPU is idle, devfreq is not aware, so just ignore +* it's requests +*/ + if (gpu->devfreq.idle_freq) { + gpu->devfreq.idle_freq = *freq; + return 0; + } + if (IS_ERR(opp)) return PTR_ERR(opp); @@ -39,6 +48,9 @@ static int msm_devfreq_target(struct device *dev, unsigned long *freq, static unsigned long get_freq(struct msm_gpu *gpu) { + if (gpu->devfreq.idle_freq) + return gpu->devfreq.idle_freq; + if (gpu->funcs->gpu_get_freq)
Re: [Freedreno] [PATCH 2/2] drm/msm/a6xx: Add support for Adreno 7c Gen 3 gpu
() On Fri, Jul 23, 2021 at 3:38 AM Akhil P Oommen wrote: > > This patch adds support for the gpu found in the Snapdragon 7c Gen 3 > compute platform. This gpu is similar to the exisiting a660 gpu with > minor delta in the programing sequence. As the Adreno GPUs are moving > away from a numeric chipid based naming scheme to a string, it was > decided to use 0x06030500 as the gpu id of this gpu to communicate > to the userspace driver. > > Signed-off-by: Akhil P Oommen > --- > drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 20 ++- > drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 1 + > drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h | 2 ++ > drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 21 ++-- > drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 32 > ++ > drivers/gpu/drm/msm/adreno/adreno_device.c | 12 +++ > drivers/gpu/drm/msm/adreno/adreno_gpu.h| 11 -- > 7 files changed, 90 insertions(+), 9 deletions(-) > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > index b349692..332301f 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > @@ -933,6 +933,7 @@ int a6xx_gmu_resume(struct a6xx_gpu *a6xx_gpu) > > /* Use a known rate to bring up the GMU */ > clk_set_rate(gmu->core_clk, 2); > + clk_set_rate(gmu->hub_clk, 15000); > ret = clk_bulk_prepare_enable(gmu->nr_clocks, gmu->clocks); > if (ret) { > pm_runtime_put(gmu->gxpd); > @@ -1094,6 +1095,7 @@ static void a6xx_gmu_shutdown(struct a6xx_gmu *gmu) > > int a6xx_gmu_stop(struct a6xx_gpu *a6xx_gpu) > { > + struct adreno_gpu *adreno_gpu = &a6xx_gpu->base; > struct a6xx_gmu *gmu = &a6xx_gpu->gmu; > struct msm_gpu *gpu = &a6xx_gpu->base.base; > > @@ -1117,9 +1119,22 @@ int a6xx_gmu_stop(struct a6xx_gpu *a6xx_gpu) > * domain. 
Usually the GMU does this but only if the shutdown sequence > * was successful > */ > - if (!IS_ERR_OR_NULL(gmu->gxpd)) > + if (!IS_ERR_OR_NULL(gmu->gxpd)) { > + /* > +* Toggle the loop_en bit, across disabling the gx gdsc, > +* with a delay of 10 XO cycles before disabling gx > +* gdsc. This is to prevent CPR measurements from > +* failing. > +*/ > + if (adreno_is_a660(adreno_gpu)) > + gmu_rmw(gmu, REG_A6XX_GPU_CPR_FSM_CTL, 1, 0); > + pm_runtime_put_sync(gmu->gxpd); > > + if (adreno_is_a660(adreno_gpu)) > + gmu_rmw(gmu, REG_A6XX_GPU_CPR_FSM_CTL, 1, 1); This kinda seems like it should be a separate patch.. but I noticed you silently turned adreno_is_a660() into what should probably be adreno_is_a660_family() I'd suggest to break this out into its own patch, so it is clear that it affects a660 as well, and then a next patch to rename adreno_is_a660_family() Longer term, we might want to think about refactoring all the if(adreno_is_xyz()) into a features table (see i915_pci.c for ideas) > + } > + > clk_bulk_disable_unprepare(gmu->nr_clocks, gmu->clocks); > > pm_runtime_put_sync(gmu->dev); > @@ -1393,6 +1408,9 @@ static int a6xx_gmu_clocks_probe(struct a6xx_gmu *gmu) > gmu->core_clk = msm_clk_bulk_get_clock(gmu->clocks, > gmu->nr_clocks, "gmu"); > > + gmu->hub_clk = msm_clk_bulk_get_clock(gmu->clocks, > + gmu->nr_clocks, "hub"); > + > return 0; > } > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h > b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h > index 71dfa600..3c74f64 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h > @@ -66,6 +66,7 @@ struct a6xx_gmu { > int nr_clocks; > struct clk_bulk_data *clocks; > struct clk *core_clk; > + struct clk *hub_clk; > > /* current performance index set externally */ > int current_perf_index; > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h > b/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h > index 8115892..d46733f 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h > +++ 
b/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h > @@ -479,5 +479,7 @@ static inline uint32_t A6XX_GMU_GPU_NAP_CTRL_SID(uint32_t > val) > > #define REG_A6XX_RSCC_TCS3_DRV0_STATUS 0x053e > > +#define REG_A6XX_GPU_CPR_FSM_CTL 0xc001 > + > > #endif /* A6XX_GMU_XML */ > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > index 183b9f9..c0882536 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > @@ -694,6 +694,13 @@ static void a6xx_set_ubwc_config(struct msm_gpu *gpu) > uavflagprd_inv = 2; > } > > + if (adreno_is_7c3
[PATCH 0/2] drm/msm: Reduce fence signal latency
From: Rob Clark A couple tweaks to reduce fence signal latency. Rob Clark (2): drm/msm: Let fences read directly from memptrs drm/msm: Signal fences sooner drivers/gpu/drm/msm/msm_fence.c | 11 +-- drivers/gpu/drm/msm/msm_fence.h | 41 +++--- drivers/gpu/drm/msm/msm_gpu.c| 44 drivers/gpu/drm/msm/msm_ringbuffer.c | 2 +- 4 files changed, 73 insertions(+), 25 deletions(-) -- 2.31.1
[PATCH 1/2] drm/msm: Let fences read directly from memptrs
From: Rob Clark Let dma_fence::signaled, etc, read directly from the address that the hw is writing with updated completed fence seqno, so we can potentially notice that the fence is signaled sooner. Plus add some docs. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_fence.c | 11 ++-- drivers/gpu/drm/msm/msm_fence.h | 41 +--- drivers/gpu/drm/msm/msm_ringbuffer.c | 2 +- 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_fence.c b/drivers/gpu/drm/msm/msm_fence.c index cd59a5918038..b92a9091a1e2 100644 --- a/drivers/gpu/drm/msm/msm_fence.c +++ b/drivers/gpu/drm/msm/msm_fence.c @@ -11,7 +11,8 @@ struct msm_fence_context * -msm_fence_context_alloc(struct drm_device *dev, const char *name) +msm_fence_context_alloc(struct drm_device *dev, volatile uint32_t *fenceptr, + const char *name) { struct msm_fence_context *fctx; @@ -22,6 +23,7 @@ msm_fence_context_alloc(struct drm_device *dev, const char *name) fctx->dev = dev; strncpy(fctx->name, name, sizeof(fctx->name)); fctx->context = dma_fence_context_alloc(1); + fctx->fenceptr = fenceptr; init_waitqueue_head(&fctx->event); spin_lock_init(&fctx->spinlock); @@ -35,7 +37,12 @@ void msm_fence_context_free(struct msm_fence_context *fctx) static inline bool fence_completed(struct msm_fence_context *fctx, uint32_t fence) { - return (int32_t)(fctx->completed_fence - fence) >= 0; + /* +* Note: Check completed_fence first, as fenceptr is in a write-combine +* mapping, so it will be more expensive to read. 
+*/ + return (int32_t)(fctx->completed_fence - fence) >= 0 || + (int32_t)(*fctx->fenceptr - fence) >= 0; } /* legacy path for WAIT_FENCE ioctl: */ diff --git a/drivers/gpu/drm/msm/msm_fence.h b/drivers/gpu/drm/msm/msm_fence.h index 2d9af66dcca5..6ab97062ff1a 100644 --- a/drivers/gpu/drm/msm/msm_fence.h +++ b/drivers/gpu/drm/msm/msm_fence.h @@ -9,19 +9,52 @@ #include "msm_drv.h" +/** + * struct msm_fence_context - fence context for gpu + * + * Each ringbuffer has a single fence context, with the GPU writing an + * incrementing fence seqno at the end of each submit + */ struct msm_fence_context { struct drm_device *dev; + /** name: human readable name for fence timeline */ char name[32]; + /** context: see dma_fence_context_alloc() */ unsigned context; - /* last_fence == completed_fence --> no pending work */ - uint32_t last_fence; /* last assigned fence */ - uint32_t completed_fence; /* last completed fence */ + + /** +* last_fence: +* +* Last assigned fence, incremented each time a fence is created +* on this fence context. If last_fence == completed_fence, +* there is no remaining pending work +*/ + uint32_t last_fence; + + /** +* completed_fence: +* +* The last completed fence, updated from the CPU after interrupt +* from GPU +*/ + uint32_t completed_fence; + + /** +* fenceptr: +* +* The address that the GPU directly writes with completed fence +* seqno. This can be ahead of completed_fence. 
We can peek at +* this to see if a fence has already signaled but the CPU hasn't +* gotten around to handling the irq and updating completed_fence +*/ + volatile uint32_t *fenceptr; + wait_queue_head_t event; spinlock_t spinlock; }; struct msm_fence_context * msm_fence_context_alloc(struct drm_device *dev, - const char *name); + volatile uint32_t *fenceptr, const char *name); void msm_fence_context_free(struct msm_fence_context *fctx); int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence, diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 4d2a2a4abef8..7e92d9532454 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -51,7 +51,7 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id, snprintf(name, sizeof(name), "gpu-ring-%d", ring->id); - ring->fctx = msm_fence_context_alloc(gpu->dev, name); + ring->fctx = msm_fence_context_alloc(gpu->dev, &ring->memptrs->fence, name); return ring; -- 2.31.1
[PATCH 2/2] drm/msm: Signal fences sooner
From: Rob Clark Nothing we do in update_fences() can't be done in an atomic context, so move this into the GPU's irq context to reduce latency (and call dma_fence_signal() so we aren't relying on dma_fence_is_signaled() which would defeat the purpose). Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gpu.c | 44 +-- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 0ebf7bc6ad09..647af45cf892 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -278,16 +278,18 @@ static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring, uint32_t fence) { struct msm_gem_submit *submit; + unsigned long flags; - spin_lock(&ring->submit_lock); + spin_lock_irqsave(&ring->submit_lock, flags); list_for_each_entry(submit, &ring->submits, node) { if (submit->seqno > fence) break; msm_update_fence(submit->ring->fctx, submit->fence->seqno); + dma_fence_signal(submit->fence); } - spin_unlock(&ring->submit_lock); + spin_unlock_irqrestore(&ring->submit_lock, flags); } #ifdef CONFIG_DEV_COREDUMP @@ -443,15 +445,16 @@ static struct msm_gem_submit * find_submit(struct msm_ringbuffer *ring, uint32_t fence) { struct msm_gem_submit *submit; + unsigned long flags; - spin_lock(&ring->submit_lock); + spin_lock_irqsave(&ring->submit_lock, flags); list_for_each_entry(submit, &ring->submits, node) { if (submit->seqno == fence) { - spin_unlock(&ring->submit_lock); + spin_unlock_irqrestore(&ring->submit_lock, flags); return submit; } } - spin_unlock(&ring->submit_lock); + spin_unlock_irqrestore(&ring->submit_lock, flags); return NULL; } @@ -547,11 +550,12 @@ static void recover_worker(struct kthread_work *work) */ for (i = 0; i < gpu->nr_rings; i++) { struct msm_ringbuffer *ring = gpu->rb[i]; + unsigned long flags; - spin_lock(&ring->submit_lock); + spin_lock_irqsave(&ring->submit_lock, flags); list_for_each_entry(submit, &ring->submits, node) gpu->funcs->submit(gpu, submit); - 
spin_unlock(&ring->submit_lock); + spin_unlock_irqrestore(&ring->submit_lock, flags); } } @@ -641,7 +645,7 @@ static void hangcheck_handler(struct timer_list *t) hangcheck_timer_reset(gpu); /* workaround for missing irq: */ - kthread_queue_work(gpu->worker, &gpu->retire_work); + msm_gpu_retire(gpu); } /* @@ -752,6 +756,7 @@ static void retire_submit(struct msm_gpu *gpu, struct msm_ringbuffer *ring, int index = submit->seqno % MSM_GPU_SUBMIT_STATS_COUNT; volatile struct msm_gpu_submit_stats *stats; u64 elapsed, clock = 0; + unsigned long flags; int i; stats = &ring->memptrs->stats[index]; @@ -781,9 +786,9 @@ static void retire_submit(struct msm_gpu *gpu, struct msm_ringbuffer *ring, pm_runtime_mark_last_busy(&gpu->pdev->dev); pm_runtime_put_autosuspend(&gpu->pdev->dev); - spin_lock(&ring->submit_lock); + spin_lock_irqsave(&ring->submit_lock, flags); list_del(&submit->node); - spin_unlock(&ring->submit_lock); + spin_unlock_irqrestore(&ring->submit_lock, flags); msm_gem_submit_put(submit); } @@ -798,11 +803,12 @@ static void retire_submits(struct msm_gpu *gpu) while (true) { struct msm_gem_submit *submit = NULL; + unsigned long flags; - spin_lock(&ring->submit_lock); + spin_lock_irqsave(&ring->submit_lock, flags); submit = list_first_entry_or_null(&ring->submits, struct msm_gem_submit, node); - spin_unlock(&ring->submit_lock); + spin_unlock_irqrestore(&ring->submit_lock, flags); /* * If no submit, we are done. If submit->fence hasn't @@ -821,10 +827,6 @@ static void retire_submits(struct msm_gpu *gpu) static void retire_worker(struct kthread_work *work) { struct msm_gpu *gpu = container_of(work, struct msm_gpu, re
[PATCH v2 0/3] drm/msm: Improved devfreq tuning
From: Rob Clark This is the outcome of trying to fix some bad gpu freq behavior seen in some use-cases, in particular mobile games that throttle themselves to 30fps. With the existing tuning, we'd end up spending most of the time that we should be running fast at a low freq, and most of the idle time at a high freq. First two patches are prep, 3/3 is the interesting bit. See the patch description in 3/3 for more details. v2: struct_mutex serializes the submit path, but not the retire path, so add a dedicated lock to serialize active<->idle transitions. Rob Clark (3): drm/msm: Split out devfreq handling drm/msm: Split out get_freq() helper drm/msm: Devfreq tuning drivers/gpu/drm/msm/Makefile | 1 + drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 +- drivers/gpu/drm/msm/msm_gpu.c | 132 +++-- drivers/gpu/drm/msm/msm_gpu.h | 61 +++- drivers/gpu/drm/msm/msm_gpu_devfreq.c | 203 ++ 5 files changed, 280 insertions(+), 121 deletions(-) create mode 100644 drivers/gpu/drm/msm/msm_gpu_devfreq.c -- 2.31.1
[PATCH v2 1/3] drm/msm: Split out devfreq handling
From: Rob Clark Before we start adding more cleverness, split it into it's own file. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/Makefile | 1 + drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 +- drivers/gpu/drm/msm/msm_gpu.c | 116 +- drivers/gpu/drm/msm/msm_gpu.h | 32 +-- drivers/gpu/drm/msm/msm_gpu_devfreq.c | 133 ++ 5 files changed, 165 insertions(+), 121 deletions(-) create mode 100644 drivers/gpu/drm/msm/msm_gpu_devfreq.c diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile index 2c00aa70b708..904535eda0c4 100644 --- a/drivers/gpu/drm/msm/Makefile +++ b/drivers/gpu/drm/msm/Makefile @@ -90,6 +90,7 @@ msm-y := \ msm_gem_submit.o \ msm_gem_vma.o \ msm_gpu.o \ + msm_gpu_devfreq.o \ msm_iommu.o \ msm_perf.o \ msm_rd.o \ diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 9db9f7847ea8..91f637b908f4 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1477,7 +1477,7 @@ static int a6xx_pm_resume(struct msm_gpu *gpu) if (ret) return ret; - msm_gpu_resume_devfreq(gpu); + msm_devfreq_resume(gpu); a6xx_llc_activate(a6xx_gpu); @@ -1494,7 +1494,7 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu) a6xx_llc_deactivate(a6xx_gpu); - devfreq_suspend_device(gpu->devfreq.devfreq); + msm_devfreq_suspend(gpu); ret = a6xx_gmu_stop(a6xx_gpu); if (ret) diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 647af45cf892..fedbd785e42f 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -13,8 +13,6 @@ #include #include -#include -#include #include #include @@ -22,106 +20,6 @@ * Power Management: */ -static int msm_devfreq_target(struct device *dev, unsigned long *freq, - u32 flags) -{ - struct msm_gpu *gpu = dev_to_gpu(dev); - struct dev_pm_opp *opp; - - opp = devfreq_recommended_opp(dev, freq, flags); - - if (IS_ERR(opp)) - return PTR_ERR(opp); - - trace_msm_gpu_freq_change(dev_pm_opp_get_freq(opp)); - - if 
(gpu->funcs->gpu_set_freq) - gpu->funcs->gpu_set_freq(gpu, opp); - else - clk_set_rate(gpu->core_clk, *freq); - - dev_pm_opp_put(opp); - - return 0; -} - -static int msm_devfreq_get_dev_status(struct device *dev, - struct devfreq_dev_status *status) -{ - struct msm_gpu *gpu = dev_to_gpu(dev); - ktime_t time; - - if (gpu->funcs->gpu_get_freq) - status->current_frequency = gpu->funcs->gpu_get_freq(gpu); - else - status->current_frequency = clk_get_rate(gpu->core_clk); - - status->busy_time = gpu->funcs->gpu_busy(gpu); - - time = ktime_get(); - status->total_time = ktime_us_delta(time, gpu->devfreq.time); - gpu->devfreq.time = time; - - return 0; -} - -static int msm_devfreq_get_cur_freq(struct device *dev, unsigned long *freq) -{ - struct msm_gpu *gpu = dev_to_gpu(dev); - - if (gpu->funcs->gpu_get_freq) - *freq = gpu->funcs->gpu_get_freq(gpu); - else - *freq = clk_get_rate(gpu->core_clk); - - return 0; -} - -static struct devfreq_dev_profile msm_devfreq_profile = { - .polling_ms = 10, - .target = msm_devfreq_target, - .get_dev_status = msm_devfreq_get_dev_status, - .get_cur_freq = msm_devfreq_get_cur_freq, -}; - -static void msm_devfreq_init(struct msm_gpu *gpu) -{ - /* We need target support to do devfreq */ - if (!gpu->funcs->gpu_busy) - return; - - msm_devfreq_profile.initial_freq = gpu->fast_rate; - - /* -* Don't set the freq_table or max_state and let devfreq build the table -* from OPP -* After a deferred probe, these may have be left to non-zero values, -* so set them back to zero before creating the devfreq device -*/ - msm_devfreq_profile.freq_table = NULL; - msm_devfreq_profile.max_state = 0; - - gpu->devfreq.devfreq = devm_devfreq_add_device(&gpu->pdev->dev, - &msm_devfreq_profile, DEVFREQ_GOV_SIMPLE_ONDEMAND, - NULL); - - if (IS_ERR(gpu->devfreq.devfreq)) { - DRM_DEV_ERROR(&gpu->pdev->dev, "Couldn't initialize GPU devfreq\n"); - gpu->devfreq.devfreq = NULL; - return; - } - - devfreq_suspend_device(gpu->devfreq.devfreq); - - gpu->cooling = 
of_devfreq_cooling_register(gpu->pdev->dev.of_node, - gpu->devfreq.devfreq); - if (IS_ERR(gpu->cooli
[PATCH v2 2/3] drm/msm: Split out get_freq() helper
From: Rob Clark In the next patch, it grows a bit more, so lets not duplicate the logic in multiple places. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gpu_devfreq.c | 21 ++--- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gpu_devfreq.c b/drivers/gpu/drm/msm/msm_gpu_devfreq.c index 3bcea0baddab..2e24a97be624 100644 --- a/drivers/gpu/drm/msm/msm_gpu_devfreq.c +++ b/drivers/gpu/drm/msm/msm_gpu_devfreq.c @@ -37,17 +37,21 @@ static int msm_devfreq_target(struct device *dev, unsigned long *freq, return 0; } +static unsigned long get_freq(struct msm_gpu *gpu) +{ + if (gpu->funcs->gpu_get_freq) + return gpu->funcs->gpu_get_freq(gpu); + + return clk_get_rate(gpu->core_clk); +} + static int msm_devfreq_get_dev_status(struct device *dev, struct devfreq_dev_status *status) { struct msm_gpu *gpu = dev_to_gpu(dev); ktime_t time; - if (gpu->funcs->gpu_get_freq) - status->current_frequency = gpu->funcs->gpu_get_freq(gpu); - else - status->current_frequency = clk_get_rate(gpu->core_clk); - + status->current_frequency = get_freq(gpu); status->busy_time = gpu->funcs->gpu_busy(gpu); time = ktime_get(); @@ -59,12 +63,7 @@ static int msm_devfreq_get_dev_status(struct device *dev, static int msm_devfreq_get_cur_freq(struct device *dev, unsigned long *freq) { - struct msm_gpu *gpu = dev_to_gpu(dev); - - if (gpu->funcs->gpu_get_freq) - *freq = gpu->funcs->gpu_get_freq(gpu); - else - *freq = clk_get_rate(gpu->core_clk); + *freq = get_freq(dev_to_gpu(dev)); return 0; } -- 2.31.1
[PATCH v2 3/3] drm/msm: Devfreq tuning
From: Rob Clark This adds a few things to try and make frequency scaling better match the workload: 1) Longer polling interval to avoid whip-lashing between too-high and too-low frequencies in certain workloads, like mobile games which throttle themselves to 30fps. Previously our polling interval was short enough to let things ramp down to minimum freq in the "off" frame, but long enough to not react quickly enough when rendering started on the next frame, leading to uneven frame times. (Ie. rather than a consistent 33ms it would alternate between 16/33/48ms.) 2) Awareness of when the GPU is active vs idle. Since we know when the GPU is active vs idle, we can clamp the frequency down to the minimum while it is idle. (If it is idle for long enough, then the autosuspend delay will eventually kick in and power down the GPU.) Since devfreq has no knowledge of powered-but-idle, this takes a small bit of trickery to maintain a "fake" frequency while idle. This, combined with the longer polling period allows devfreq to arrive at a reasonable "active" frequency, while still clamping to minimum freq when idle to reduce power draw. 3) Boost. Because simple_ondemand needs to see a certain threshold of busyness to ramp up, we could end up needing multiple polling cycles before it reacts appropriately on interactive workloads (ex. scrolling a web page after reading for some time), on top of the already lengthened polling interval, when we see an idle to active transition after a period of idle time we boost the frequency that we return to. 
Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gpu.c | 16 ++ drivers/gpu/drm/msm/msm_gpu.h | 29 +++ drivers/gpu/drm/msm/msm_gpu_devfreq.c | 73 ++- 3 files changed, 117 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index fedbd785e42f..c4ed8694f721 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -680,6 +680,14 @@ static void retire_submit(struct msm_gpu *gpu, struct msm_ringbuffer *ring, list_del(&submit->node); spin_unlock_irqrestore(&ring->submit_lock, flags); + /* Update devfreq on transition from active->idle: */ + mutex_lock(&gpu->active_lock); + gpu->active_submits--; + WARN_ON(gpu->active_submits < 0); + if (!gpu->active_submits) + msm_devfreq_idle(gpu); + mutex_unlock(&gpu->active_lock); + msm_gem_submit_put(submit); } @@ -781,6 +789,13 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) list_add_tail(&submit->node, &ring->submits); spin_unlock_irqrestore(&ring->submit_lock, flags); + /* Update devfreq on transition from idle->active: */ + mutex_lock(&gpu->active_lock); + if (!gpu->active_submits) + msm_devfreq_active(gpu); + gpu->active_submits++; + mutex_unlock(&gpu->active_lock); + gpu->funcs->submit(gpu, submit); priv->lastctx = submit->queue->ctx; @@ -866,6 +881,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, sched_set_fifo_low(gpu->worker->task); INIT_LIST_HEAD(&gpu->active_list); + mutex_init(&gpu->active_lock); kthread_init_work(&gpu->retire_work, retire_worker); kthread_init_work(&gpu->recover_work, recover_worker); kthread_init_work(&gpu->fault_work, fault_worker); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 2e61d05293e6..710c3fedfbf3 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -98,6 +98,20 @@ struct msm_gpu_devfreq { /** time: Time of last sampling period. 
*/ ktime_t time; + + /** idle_time: Time of last transition to idle: */ + ktime_t idle_time; + + /** +* idle_freq: +* +* Shadow frequency used while the GPU is idle. From the PoV of +* the devfreq governor, we are continuing to sample busyness and +* adjust frequency while the GPU is idle, but we use this shadow +* value as the GPU is actually clamped to minimum frequency while +* it is inactive. +*/ + unsigned long idle_freq; }; struct msm_gpu { @@ -129,6 +143,19 @@ struct msm_gpu { */ struct list_head active_list; + /** +* active_submits: +* +* The number of submitted but not yet retired submits, used to +* determine transitions between active and idle. +* +* Protected by lock +*/ + int active_submits; + + /** lock: protects active_submits and idle/active transitions */
[PATCH v2 00/12] drm/msm: drm scheduler conversion and cleanups
From: Rob Clark Conversion to gpu_scheduler, and bonus removal of drm_gem_object_put_locked() v2: Fix priority mixup (msm UAPI has lower numeric priority value as higher priority, inverse of drm/scheduler) and add some comments in the UAPI header to clarify. Now that we move active refcnt get into msm_gem_submit, add a patch to mark all bos busy before pinning, to avoid evicting bos used in same batch. Fix bo locking for cmdstream dumping ($debugfs/n/{rd,hangrd}) Rob Clark (12): drm/msm: Docs and misc cleanup drm/msm: Small submitqueue creation cleanup drm/msm: drop drm_gem_object_put_locked() drm: Drop drm_gem_object_put_locked() drm/msm/submit: Simplify out-fence-fd handling drm/msm: Consolidate submit bo state drm/msm: Track "seqno" fences by idr drm/msm: Return ERR_PTR() from submit_create() drm/msm: Conversion to drm scheduler drm/msm: Drop struct_mutex in submit path drm/msm: Utilize gpu scheduler priorities drm/msm/gem: Mark active before pinning drivers/gpu/drm/drm_gem.c | 22 -- drivers/gpu/drm/msm/Kconfig | 1 + drivers/gpu/drm/msm/adreno/a5xx_debugfs.c | 4 +- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 6 +- drivers/gpu/drm/msm/adreno/a5xx_power.c | 2 +- drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 7 +- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 12 +- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 2 +- drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 4 +- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 6 +- drivers/gpu/drm/msm/msm_drv.c | 30 +- drivers/gpu/drm/msm/msm_fence.c | 39 --- drivers/gpu/drm/msm/msm_fence.h | 2 - drivers/gpu/drm/msm/msm_gem.c | 93 +- drivers/gpu/drm/msm/msm_gem.h | 39 ++- drivers/gpu/drm/msm/msm_gem_submit.c| 316 drivers/gpu/drm/msm/msm_gpu.c | 46 +-- drivers/gpu/drm/msm/msm_gpu.h | 78 - drivers/gpu/drm/msm/msm_rd.c| 6 +- drivers/gpu/drm/msm/msm_ringbuffer.c| 70 - drivers/gpu/drm/msm/msm_ringbuffer.h| 12 + drivers/gpu/drm/msm/msm_submitqueue.c | 53 +++- include/drm/drm_gem.h | 2 - include/uapi/drm/msm_drm.h | 14 +- 24 files changed, 503 insertions(+), 363 
deletions(-) -- 2.31.1
[PATCH v2 01/12] drm/msm: Docs and misc cleanup
From: Rob Clark Fix a couple incorrect or misspelt comments, and add submitqueue doc comment. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_gem.h | 3 +-- drivers/gpu/drm/msm/msm_gem_submit.c | 1 + drivers/gpu/drm/msm/msm_gpu.h | 15 +++ drivers/gpu/drm/msm/msm_ringbuffer.c | 2 +- drivers/gpu/drm/msm/msm_submitqueue.c | 9 + 5 files changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 405f8411e395..d69fcb37ce17 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -313,8 +313,7 @@ void msm_gem_vunmap(struct drm_gem_object *obj); /* Created per submit-ioctl, to track bo's and cmdstream bufs, etc, * associated with the cmdstream submission for synchronization (and - * make it easier to unwind when things go wrong, etc). This only - * lasts for the duration of the submit-ioctl. + * make it easier to unwind when things go wrong, etc). */ struct msm_gem_submit { struct kref ref; diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 44f84bfd0c0e..6d46f9275a40 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -655,6 +655,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, bool has_ww_ticket = false; unsigned i; int ret, submitid; + if (!gpu) return -ENXIO; diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 710c3fedfbf3..96efcb31e502 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -250,6 +250,21 @@ struct msm_gpu_perfcntr { const char *name; }; +/** + * A submitqueue is associated with a gl context or vk queue (or equiv) + * in userspace. 
+ * + * @id:userspace id for the submitqueue, unique within the drm_file + * @flags: userspace flags for the submitqueue, specified at creation + * (currently unusued) + * @prio: the submitqueue priority + * @faults:the number of GPU hangs associated with this submitqueue + * @ctx: the per-drm_file context associated with the submitqueue (ie. + * which set of pgtables do submits jobs associated with the + * submitqueue use) + * @node: node in the context's list of submitqueues + * @ref: reference count + */ struct msm_gpu_submitqueue { int id; u32 flags; diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 7e92d9532454..054461662af5 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -32,7 +32,7 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id, if (IS_ERR(ring->start)) { ret = PTR_ERR(ring->start); - ring->start = 0; + ring->start = NULL; goto fail; } diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c index c3d206105d28..e5eef11ed014 100644 --- a/drivers/gpu/drm/msm/msm_submitqueue.c +++ b/drivers/gpu/drm/msm/msm_submitqueue.c @@ -98,17 +98,18 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, return 0; } +/* + * Create the default submit-queue (id==0), used for backwards compatibility + * for userspace that pre-dates the introduction of submitqueues. + */ int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx) { struct msm_drm_private *priv = drm->dev_private; int default_prio; - if (!ctx) - return 0; - /* * Select priority 2 as the "default priority" unless nr_rings is less -* than 2 and then pick the lowest pirority +* than 2 and then pick the lowest priority */ default_prio = priv->gpu ? clamp_t(uint32_t, 2, 0, priv->gpu->nr_rings - 1) : 0; -- 2.31.1
[PATCH v2 02/12] drm/msm: Small submitqueue creation cleanup
From: Rob Clark If we don't have a gpu, there is no need to create a submitqueue, which lets us simplify the error handling and submitqueue creation. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_submitqueue.c | 22 +++--- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c index e5eef11ed014..9e9fec61d629 100644 --- a/drivers/gpu/drm/msm/msm_submitqueue.c +++ b/drivers/gpu/drm/msm/msm_submitqueue.c @@ -66,6 +66,12 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, if (!ctx) return -ENODEV; + if (!priv->gpu) + return -ENODEV; + + if (prio >= priv->gpu->nr_rings) + return -EINVAL; + queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) @@ -73,15 +79,7 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, kref_init(&queue->ref); queue->flags = flags; - - if (priv->gpu) { - if (prio >= priv->gpu->nr_rings) { - kfree(queue); - return -EINVAL; - } - - queue->prio = prio; - } + queue->prio = prio; write_lock(&ctx->queuelock); @@ -107,12 +105,14 @@ int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx) struct msm_drm_private *priv = drm->dev_private; int default_prio; + if (!priv->gpu) + return -ENODEV; + /* * Select priority 2 as the "default priority" unless nr_rings is less * than 2 and then pick the lowest priority */ - default_prio = priv->gpu ? - clamp_t(uint32_t, 2, 0, priv->gpu->nr_rings - 1) : 0; + default_prio = clamp_t(uint32_t, 2, 0, priv->gpu->nr_rings - 1); INIT_LIST_HEAD(&ctx->submitqueues); -- 2.31.1
[PATCH v2 03/12] drm/msm: drop drm_gem_object_put_locked()
From: Rob Clark No idea why we were still using this. It certainly hasn't been needed for some time. So drop the pointless twin codepaths. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/adreno/a5xx_debugfs.c | 4 +- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 6 +-- drivers/gpu/drm/msm/adreno/a5xx_power.c | 2 +- drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 7 ++- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 12 ++--- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 2 +- drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 4 +- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/gpu/drm/msm/msm_gem.c | 56 - drivers/gpu/drm/msm/msm_gem.h | 7 +-- drivers/gpu/drm/msm/msm_gem_submit.c| 2 +- drivers/gpu/drm/msm/msm_gpu.c | 4 +- drivers/gpu/drm/msm/msm_ringbuffer.c| 2 +- 13 files changed, 33 insertions(+), 77 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c b/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c index fc2c905b6c9e..c9d11d57aed6 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c @@ -117,13 +117,13 @@ reset_set(void *data, u64 val) if (a5xx_gpu->pm4_bo) { msm_gem_unpin_iova(a5xx_gpu->pm4_bo, gpu->aspace); - drm_gem_object_put_locked(a5xx_gpu->pm4_bo); + drm_gem_object_put(a5xx_gpu->pm4_bo); a5xx_gpu->pm4_bo = NULL; } if (a5xx_gpu->pfp_bo) { msm_gem_unpin_iova(a5xx_gpu->pfp_bo, gpu->aspace); - drm_gem_object_put_locked(a5xx_gpu->pfp_bo); + drm_gem_object_put(a5xx_gpu->pfp_bo); a5xx_gpu->pfp_bo = NULL; } diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 7a271de9a212..0a93ed1d6b06 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -1415,7 +1415,7 @@ struct a5xx_gpu_state { static int a5xx_crashdumper_init(struct msm_gpu *gpu, struct a5xx_crashdumper *dumper) { - dumper->ptr = msm_gem_kernel_new_locked(gpu->dev, + dumper->ptr = msm_gem_kernel_new(gpu->dev, SZ_1M, MSM_BO_WC, gpu->aspace, &dumper->bo, 
&dumper->iova); @@ -1517,7 +1517,7 @@ static void a5xx_gpu_state_get_hlsq_regs(struct msm_gpu *gpu, if (a5xx_crashdumper_run(gpu, &dumper)) { kfree(a5xx_state->hlsqregs); - msm_gem_kernel_put(dumper.bo, gpu->aspace, true); + msm_gem_kernel_put(dumper.bo, gpu->aspace); return; } @@ -1525,7 +1525,7 @@ static void a5xx_gpu_state_get_hlsq_regs(struct msm_gpu *gpu, memcpy(a5xx_state->hlsqregs, dumper.ptr + (256 * SZ_1K), count * sizeof(u32)); - msm_gem_kernel_put(dumper.bo, gpu->aspace, true); + msm_gem_kernel_put(dumper.bo, gpu->aspace); } static struct msm_gpu_state *a5xx_gpu_state_get(struct msm_gpu *gpu) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_power.c b/drivers/gpu/drm/msm/adreno/a5xx_power.c index cdb165236a88..0e63a1429189 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_power.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_power.c @@ -362,7 +362,7 @@ void a5xx_gpmu_ucode_init(struct msm_gpu *gpu) */ bosize = (cmds_size + (cmds_size / TYPE4_MAX_PAYLOAD) + 1) << 2; - ptr = msm_gem_kernel_new_locked(drm, bosize, + ptr = msm_gem_kernel_new(drm, bosize, MSM_BO_WC | MSM_BO_GPU_READONLY, gpu->aspace, &a5xx_gpu->gpmu_bo, &a5xx_gpu->gpmu_iova); if (IS_ERR(ptr)) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c index ee72510ff8ce..8abc9a2b114a 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c @@ -240,7 +240,7 @@ static int preempt_init_ring(struct a5xx_gpu *a5xx_gpu, A5XX_PREEMPT_COUNTER_SIZE, MSM_BO_WC, gpu->aspace, &counters_bo, &counters_iova); if (IS_ERR(counters)) { - msm_gem_kernel_put(bo, gpu->aspace, true); + msm_gem_kernel_put(bo, gpu->aspace); return PTR_ERR(counters); } @@ -272,9 +272,8 @@ void a5xx_preempt_fini(struct msm_gpu *gpu) int i; for (i = 0; i < gpu->nr_rings; i++) { - msm_gem_kernel_put(a5xx_gpu->preempt_bo[i], gpu->aspace, true); - msm_gem_kernel_put(a5xx_gpu->preempt_counters_bo[i], - gpu->aspace, true); + msm_gem_kernel_put(a5xx_gpu->preempt_bo[i], 
gpu->aspace); + msm_gem_kernel_put(a5xx_gpu->preempt_counters_bo[i], gpu->aspace); } } di
[PATCH v2 04/12] drm: Drop drm_gem_object_put_locked()
From: Rob Clark Now that no one is using it, remove it. Signed-off-by: Rob Clark Acked-by: Christian König Reviewed-by: Daniel Vetter --- drivers/gpu/drm/drm_gem.c | 22 -- include/drm/drm_gem.h | 2 -- 2 files changed, 24 deletions(-) diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 9989425e9875..c8866788b761 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -974,28 +974,6 @@ drm_gem_object_free(struct kref *kref) } EXPORT_SYMBOL(drm_gem_object_free); -/** - * drm_gem_object_put_locked - release a GEM buffer object reference - * @obj: GEM buffer object - * - * This releases a reference to @obj. Callers must hold the - * &drm_device.struct_mutex lock when calling this function, even when the - * driver doesn't use &drm_device.struct_mutex for anything. - * - * For drivers not encumbered with legacy locking use - * drm_gem_object_put() instead. - */ -void -drm_gem_object_put_locked(struct drm_gem_object *obj) -{ - if (obj) { - WARN_ON(!mutex_is_locked(&obj->dev->struct_mutex)); - - kref_put(&obj->refcount, drm_gem_object_free); - } -} -EXPORT_SYMBOL(drm_gem_object_put_locked); - /** * drm_gem_vm_open - vma->ops->open implementation for GEM * @vma: VM area structure diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h index 240049566592..35e7f44c2a75 100644 --- a/include/drm/drm_gem.h +++ b/include/drm/drm_gem.h @@ -384,8 +384,6 @@ drm_gem_object_put(struct drm_gem_object *obj) __drm_gem_object_put(obj); } -void drm_gem_object_put_locked(struct drm_gem_object *obj); - int drm_gem_handle_create(struct drm_file *file_priv, struct drm_gem_object *obj, u32 *handlep); -- 2.31.1
[PATCH v2 06/12] drm/msm: Consolidate submit bo state
From: Rob Clark Move all the locked/active/pinned state handling to msm_gem_submit.c. In particular, for drm/scheduler, we'll need to do all this before pushing the submit job to the scheduler. But while we're at it we can get rid of the dupicate pin and refcnt. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_gem.h| 2 + drivers/gpu/drm/msm/msm_gem_submit.c | 92 ++-- drivers/gpu/drm/msm/msm_gpu.c| 29 + 3 files changed, 75 insertions(+), 48 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 71ccf87a646b..da3af702a6c8 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -361,6 +361,8 @@ static inline void msm_gem_submit_put(struct msm_gem_submit *submit) kref_put(&submit->ref, __msm_gem_submit_destroy); } +void msm_submit_retire(struct msm_gem_submit *submit); + /* helper to determine of a buffer in submit should be dumped, used for both * devcoredump and debugfs cmdstream dumping: */ diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 8abd743adfb0..4f02fa3c78f9 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -23,8 +23,8 @@ /* make sure these don't conflict w/ MSM_SUBMIT_BO_x */ #define BO_VALID0x8000 /* is current addr in cmdstream correct/valid? */ -#define BO_LOCKED 0x4000 -#define BO_PINNED 0x2000 +#define BO_LOCKED 0x4000 /* obj lock is held */ +#define BO_PINNED 0x2000 /* obj is pinned and on active list */ static struct msm_gem_submit *submit_create(struct drm_device *dev, struct msm_gpu *gpu, @@ -220,21 +220,33 @@ static int submit_lookup_cmds(struct msm_gem_submit *submit, return ret; } -static void submit_unlock_unpin_bo(struct msm_gem_submit *submit, - int i, bool backoff) +/* Unwind bo state, according to cleanup_flags. In the success case, only + * the lock is dropped at the end of the submit (and active/pin ref is dropped + * later when the submit is retired). 
+ */ +static void submit_cleanup_bo(struct msm_gem_submit *submit, int i, + unsigned cleanup_flags) { - struct msm_gem_object *msm_obj = submit->bos[i].obj; + struct drm_gem_object *obj = &submit->bos[i].obj->base; + unsigned flags = submit->bos[i].flags & cleanup_flags; - if (submit->bos[i].flags & BO_PINNED) - msm_gem_unpin_iova_locked(&msm_obj->base, submit->aspace); + if (flags & BO_PINNED) { + msm_gem_unpin_iova_locked(obj, submit->aspace); + msm_gem_active_put(obj); + } - if (submit->bos[i].flags & BO_LOCKED) - dma_resv_unlock(msm_obj->base.resv); + if (flags & BO_LOCKED) + dma_resv_unlock(obj->resv); - if (backoff && !(submit->bos[i].flags & BO_VALID)) - submit->bos[i].iova = 0; + submit->bos[i].flags &= ~cleanup_flags; +} - submit->bos[i].flags &= ~(BO_LOCKED | BO_PINNED); +static void submit_unlock_unpin_bo(struct msm_gem_submit *submit, int i) +{ + submit_cleanup_bo(submit, i, BO_PINNED | BO_LOCKED); + + if (!(submit->bos[i].flags & BO_VALID)) + submit->bos[i].iova = 0; } /* This is where we make sure all the bo's are reserved and pin'd: */ @@ -266,10 +278,10 @@ static int submit_lock_objects(struct msm_gem_submit *submit) fail: for (; i >= 0; i--) - submit_unlock_unpin_bo(submit, i, true); + submit_unlock_unpin_bo(submit, i); if (slow_locked > 0) - submit_unlock_unpin_bo(submit, slow_locked, true); + submit_unlock_unpin_bo(submit, slow_locked); if (ret == -EDEADLK) { struct msm_gem_object *msm_obj = submit->bos[contended].obj; @@ -325,16 +337,18 @@ static int submit_pin_objects(struct msm_gem_submit *submit) submit->valid = true; for (i = 0; i < submit->nr_bos; i++) { - struct msm_gem_object *msm_obj = submit->bos[i].obj; + struct drm_gem_object *obj = &submit->bos[i].obj->base; uint64_t iova; /* if locking succeeded, pin bo: */ - ret = msm_gem_get_and_pin_iova_locked(&msm_obj->base, + ret = msm_gem_get_and_pin_iova_locked(obj, submit->aspace, &iova); if (ret) break; + msm_gem_active_get(obj, submit->gpu); + submit->bos[i].flags |= BO_PINNED; if (iova == 
submit->bos[i].iova) { @@ -350,6 +364,20 @@ static int submit_pin_objects(struct msm_gem_submit *submit) return re
[PATCH v2 08/12] drm/msm: Return ERR_PTR() from submit_create()
From: Rob Clark In the next patch, we start having more than a single potential failure reason. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_gem_submit.c | 21 + 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index f6f595aae2c5..f570155bc086 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -32,30 +32,27 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, uint32_t nr_cmds) { struct msm_gem_submit *submit; - uint64_t sz = struct_size(submit, bos, nr_bos) + - ((u64)nr_cmds * sizeof(submit->cmd[0])); + uint64_t sz; + + sz = struct_size(submit, bos, nr_bos) + + ((u64)nr_cmds * sizeof(submit->cmd[0])); if (sz > SIZE_MAX) - return NULL; + return ERR_PTR(-ENOMEM); - submit = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + submit = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (!submit) - return NULL; + return ERR_PTR(-ENOMEM); kref_init(&submit->ref); submit->dev = dev; submit->aspace = queue->ctx->aspace; submit->gpu = gpu; - submit->fence = NULL; submit->cmd = (void *)&submit->bos[nr_bos]; submit->queue = queue; submit->ring = gpu->rb[queue->prio]; submit->fault_dumped = false; - /* initially, until copy_from_user() and bo lookup succeeds: */ - submit->nr_bos = 0; - submit->nr_cmds = 0; - INIT_LIST_HEAD(&submit->node); INIT_LIST_HEAD(&submit->bo_list); @@ -799,8 +796,8 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, submit = submit_create(dev, gpu, queue, args->nr_bos, args->nr_cmds); - if (!submit) { - ret = -ENOMEM; + if (IS_ERR(submit)) { + ret = PTR_ERR(submit); goto out_unlock; } -- 2.31.1
[PATCH v2 05/12] drm/msm/submit: Simplify out-fence-fd handling
From: Rob Clark No need for this to be split in two parts. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_gem_submit.c | 10 +++--- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index e789f68d5be1..8abd743adfb0 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -645,7 +645,6 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, struct msm_file_private *ctx = file->driver_priv; struct msm_gem_submit *submit; struct msm_gpu *gpu = priv->gpu; - struct sync_file *sync_file = NULL; struct msm_gpu_submitqueue *queue; struct msm_ringbuffer *ring; struct msm_submit_post_dep *post_deps = NULL; @@ -824,22 +823,19 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, } if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { - sync_file = sync_file_create(submit->fence); + struct sync_file *sync_file = sync_file_create(submit->fence); if (!sync_file) { ret = -ENOMEM; goto out; } + fd_install(out_fence_fd, sync_file->file); + args->fence_fd = out_fence_fd; } msm_gpu_submit(gpu, submit); args->fence = submit->fence->seqno; - if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { - fd_install(out_fence_fd, sync_file->file); - args->fence_fd = out_fence_fd; - } - msm_reset_syncobjs(syncobjs_to_reset, args->nr_in_syncobjs); msm_process_post_deps(post_deps, args->nr_out_syncobjs, submit->fence); -- 2.31.1
[PATCH v2 09/12] drm/msm: Conversion to drm scheduler
From: Rob Clark For existing adrenos, there is one or more ringbuffer, depending on whether preemption is supported. When preemption is supported, each ringbuffer has it's own priority. A submitqueue (which maps to a gl context or vk queue in userspace) is mapped to a specific ring- buffer at creation time, based on the submitqueue's priority. Each ringbuffer has it's own drm_gpu_scheduler. Each submitqueue maps to a drm_sched_entity. And each submit maps to a drm_sched_job. Closes: https://gitlab.freedesktop.org/drm/msm/-/issues/4 Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/Kconfig | 1 + drivers/gpu/drm/msm/msm_gem.c | 35 -- drivers/gpu/drm/msm/msm_gem.h | 26 - drivers/gpu/drm/msm/msm_gem_submit.c | 161 +- drivers/gpu/drm/msm/msm_gpu.c | 13 +-- drivers/gpu/drm/msm/msm_gpu.h | 2 + drivers/gpu/drm/msm/msm_rd.c | 6 +- drivers/gpu/drm/msm/msm_ringbuffer.c | 66 +++ drivers/gpu/drm/msm/msm_ringbuffer.h | 12 ++ drivers/gpu/drm/msm/msm_submitqueue.c | 26 + 10 files changed, 217 insertions(+), 131 deletions(-) diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig index 10f693ea89d3..896266267ad7 100644 --- a/drivers/gpu/drm/msm/Kconfig +++ b/drivers/gpu/drm/msm/Kconfig @@ -12,6 +12,7 @@ config DRM_MSM select REGULATOR select DRM_KMS_HELPER select DRM_PANEL + select DRM_SCHED select SHMEM select TMPFS select QCOM_SCM if ARCH_QCOM diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 00a6289678dd..41a111c49cc7 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -806,41 +806,6 @@ void msm_gem_vunmap(struct drm_gem_object *obj) msm_obj->vaddr = NULL; } -/* must be called before _move_to_active().. 
*/ -int msm_gem_sync_object(struct drm_gem_object *obj, - struct msm_fence_context *fctx, bool exclusive) -{ - struct dma_resv_list *fobj; - struct dma_fence *fence; - int i, ret; - - fobj = dma_resv_get_list(obj->resv); - if (!fobj || (fobj->shared_count == 0)) { - fence = dma_resv_get_excl(obj->resv); - /* don't need to wait on our own fences, since ring is fifo */ - if (fence && (fence->context != fctx->context)) { - ret = dma_fence_wait(fence, true); - if (ret) - return ret; - } - } - - if (!exclusive || !fobj) - return 0; - - for (i = 0; i < fobj->shared_count; i++) { - fence = rcu_dereference_protected(fobj->shared[i], - dma_resv_held(obj->resv)); - if (fence->context != fctx->context) { - ret = dma_fence_wait(fence, true); - if (ret) - return ret; - } - } - - return 0; -} - void msm_gem_active_get(struct drm_gem_object *obj, struct msm_gpu *gpu) { struct msm_gem_object *msm_obj = to_msm_bo(obj); diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index e0579abda5b9..a48114058ff9 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -9,6 +9,7 @@ #include #include +#include "drm/gpu_scheduler.h" #include "msm_drv.h" /* Make all GEM related WARN_ON()s ratelimited.. when things go wrong they @@ -143,8 +144,6 @@ void *msm_gem_get_vaddr_active(struct drm_gem_object *obj); void msm_gem_put_vaddr_locked(struct drm_gem_object *obj); void msm_gem_put_vaddr(struct drm_gem_object *obj); int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv); -int msm_gem_sync_object(struct drm_gem_object *obj, - struct msm_fence_context *fctx, bool exclusive); void msm_gem_active_get(struct drm_gem_object *obj, struct msm_gpu *gpu); void msm_gem_active_put(struct drm_gem_object *obj); int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout); @@ -311,6 +310,7 @@ void msm_gem_vunmap(struct drm_gem_object *obj); * make it easier to unwind when things go wrong, etc). 
*/ struct msm_gem_submit { + struct drm_sched_job base; struct kref ref; struct drm_device *dev; struct msm_gpu *gpu; @@ -319,7 +319,22 @@ struct msm_gem_submit { struct list_head bo_list; struct ww_acquire_ctx ticket; uint32_t seqno; /* Sequence number of the submit on the ring */ - struct dma_fence *fence; + + /* Array of struct dma_fence * to block on before submitting this job. +*/ + struct xarray deps; + unsigned long last_dep; + + /* Hw fence, which is created when the scheduler executes the
[PATCH v2 10/12] drm/msm: Drop struct_mutex in submit path
From: Rob Clark It is sufficient to serialize on the submit queue now. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_gem_submit.c | 12 +--- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 2b158433a6e5..affceccf145d 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -711,7 +711,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, struct msm_drm_private *priv = dev->dev_private; struct drm_msm_gem_submit *args = data; struct msm_file_private *ctx = file->driver_priv; - struct msm_gem_submit *submit; + struct msm_gem_submit *submit = NULL; struct msm_gpu *gpu = priv->gpu; struct msm_gpu_submitqueue *queue; struct msm_ringbuffer *ring; @@ -755,7 +755,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, trace_msm_gpu_submit(pid_nr(pid), ring->id, submitid, args->nr_bos, args->nr_cmds); - ret = mutex_lock_interruptible(&dev->struct_mutex); + ret = mutex_lock_interruptible(&queue->lock); if (ret) goto out_post_unlock; @@ -876,10 +876,8 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, * Allocate an id which can be used by WAIT_FENCE ioctl to map back * to the underlying fence. 
*/ - mutex_lock(&queue->lock); submit->fence_id = idr_alloc_cyclic(&queue->fence_idr, submit->user_fence, 0, INT_MAX, GFP_KERNEL); - mutex_unlock(&queue->lock); if (submit->fence_id < 0) { ret = submit->fence_id = 0; submit->fence_id = 0; @@ -914,12 +912,12 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, submit_cleanup(submit, !!ret); if (has_ww_ticket) ww_acquire_fini(&submit->ticket); - msm_gem_submit_put(submit); out_unlock: if (ret && (out_fence_fd >= 0)) put_unused_fd(out_fence_fd); - mutex_unlock(&dev->struct_mutex); - + mutex_unlock(&queue->lock); + if (submit) + msm_gem_submit_put(submit); out_post_unlock: if (!IS_ERR_OR_NULL(post_deps)) { for (i = 0; i < args->nr_out_syncobjs; ++i) { -- 2.31.1
[PATCH v2 07/12] drm/msm: Track "seqno" fences by idr
From: Rob Clark Previously the (non-fd) fence returned from submit ioctl was a raw seqno, which is scoped to the ring. But from UABI standpoint, the ioctls related to seqno fences all specify a submitqueue. We can take advantage of that to replace the seqno fences with a cyclic idr handle. This is in preperation for moving to drm scheduler, at which point the submit ioctl will return after queuing the submit job to the scheduler, but before the submit is written into the ring (and therefore before a ring seqno has been assigned). Which means we need to replace the dma_fence that userspace may need to wait on with a scheduler fence. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_drv.c | 30 +++-- drivers/gpu/drm/msm/msm_fence.c | 39 --- drivers/gpu/drm/msm/msm_fence.h | 2 -- drivers/gpu/drm/msm/msm_gem.h | 1 + drivers/gpu/drm/msm/msm_gem_submit.c | 23 +++- drivers/gpu/drm/msm/msm_gpu.h | 5 drivers/gpu/drm/msm/msm_submitqueue.c | 5 7 files changed, 61 insertions(+), 44 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 9b8fa2ad0d84..1594ae39d54f 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -911,6 +911,7 @@ static int msm_ioctl_wait_fence(struct drm_device *dev, void *data, ktime_t timeout = to_ktime(args->timeout); struct msm_gpu_submitqueue *queue; struct msm_gpu *gpu = priv->gpu; + struct dma_fence *fence; int ret; if (args->pad) { @@ -925,10 +926,35 @@ static int msm_ioctl_wait_fence(struct drm_device *dev, void *data, if (!queue) return -ENOENT; - ret = msm_wait_fence(gpu->rb[queue->prio]->fctx, args->fence, &timeout, - true); + /* +* Map submitqueue scoped "seqno" (which is actually an idr key) +* back to underlying dma-fence +* +* The fence is removed from the fence_idr when the submit is +* retired, so if the fence is not found it means there is nothing +* to wait for +*/ + ret = mutex_lock_interruptible(&queue->lock); + if (ret) + return ret; + fence 
= idr_find(&queue->fence_idr, args->fence); + if (fence) + fence = dma_fence_get_rcu(fence); + mutex_unlock(&queue->lock); + + if (!fence) + return 0; + ret = dma_fence_wait_timeout(fence, true, timeout_to_jiffies(&timeout)); + if (ret == 0) { + ret = -ETIMEDOUT; + } else if (ret != -ERESTARTSYS) { + ret = 0; + } + + dma_fence_put(fence); msm_submitqueue_put(queue); + return ret; } diff --git a/drivers/gpu/drm/msm/msm_fence.c b/drivers/gpu/drm/msm/msm_fence.c index b92a9091a1e2..d8228029708e 100644 --- a/drivers/gpu/drm/msm/msm_fence.c +++ b/drivers/gpu/drm/msm/msm_fence.c @@ -45,45 +45,6 @@ static inline bool fence_completed(struct msm_fence_context *fctx, uint32_t fenc (int32_t)(*fctx->fenceptr - fence) >= 0; } -/* legacy path for WAIT_FENCE ioctl: */ -int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence, - ktime_t *timeout, bool interruptible) -{ - int ret; - - if (fence > fctx->last_fence) { - DRM_ERROR_RATELIMITED("%s: waiting on invalid fence: %u (of %u)\n", - fctx->name, fence, fctx->last_fence); - return -EINVAL; - } - - if (!timeout) { - /* no-wait: */ - ret = fence_completed(fctx, fence) ? 0 : -EBUSY; - } else { - unsigned long remaining_jiffies = timeout_to_jiffies(timeout); - - if (interruptible) - ret = wait_event_interruptible_timeout(fctx->event, - fence_completed(fctx, fence), - remaining_jiffies); - else - ret = wait_event_timeout(fctx->event, - fence_completed(fctx, fence), - remaining_jiffies); - - if (ret == 0) { - DBG("timeout waiting for fence: %u (completed: %u)", - fence, fctx->completed_fence); - ret = -ETIMEDOUT; - } else if (ret != -ERESTARTSYS) { - ret = 0; - } - } - - return ret; -} - /* called from workqueue */ void msm_update_fence(struct msm_fence_context *fctx, uint32_t fence) { diff --git a/drivers/gpu/drm/msm/msm_fence.h b/drivers/gpu/drm/msm/msm_fence.h index 6ab97062ff1a..6de97d0f5153 100644 --- a/drivers/gpu/drm/msm/msm_fence.h +++ b/drivers/gpu/dr
[PATCH v2 11/12] drm/msm: Utilize gpu scheduler priorities
From: Rob Clark The drm/scheduler provides additional prioritization on top of that provided by however many number of ringbuffers (each with their own priority level) is supported on a given generation. Expose the additional levels of priority to userspace and map the userspace priority back to ring (first level of priority) and schedular priority (additional priority levels within the ring). Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 4 +- drivers/gpu/drm/msm/msm_gem_submit.c| 4 +- drivers/gpu/drm/msm/msm_gpu.h | 58 - drivers/gpu/drm/msm/msm_submitqueue.c | 35 +++ include/uapi/drm/msm_drm.h | 14 +- 5 files changed, 88 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index bad4809b68ef..748665232d29 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -261,8 +261,8 @@ int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value) return ret; } return -EINVAL; - case MSM_PARAM_NR_RINGS: - *value = gpu->nr_rings; + case MSM_PARAM_PRIORITIES: + *value = gpu->nr_rings * NR_SCHED_PRIORITIES; return 0; case MSM_PARAM_PP_PGTABLE: *value = 0; diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index affceccf145d..b60c3f7ed551 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -59,7 +59,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, submit->gpu = gpu; submit->cmd = (void *)&submit->bos[nr_bos]; submit->queue = queue; - submit->ring = gpu->rb[queue->prio]; + submit->ring = gpu->rb[queue->ring_nr]; submit->fault_dumped = false; INIT_LIST_HEAD(&submit->node); @@ -751,7 +751,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, /* Get a unique identifier for the submission for logging purposes */ submitid = atomic_inc_return(&ident) - 1; - ring = gpu->rb[queue->prio]; + 
ring = gpu->rb[queue->ring_nr]; trace_msm_gpu_submit(pid_nr(pid), ring->id, submitid, args->nr_bos, args->nr_cmds); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index b912cacaecc0..0e4b45bff2e6 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -250,6 +250,59 @@ struct msm_gpu_perfcntr { const char *name; }; +/* + * The number of priority levels provided by drm gpu scheduler. The + * DRM_SCHED_PRIORITY_KERNEL priority level is treated specially in some + * cases, so we don't use it (no need for kernel generated jobs). + */ +#define NR_SCHED_PRIORITIES (1 + DRM_SCHED_PRIORITY_HIGH - DRM_SCHED_PRIORITY_MIN) + +/** + * msm_gpu_convert_priority - Map userspace priority to ring # and sched priority + * + * @gpu:the gpu instance + * @prio: the userspace priority level + * @ring_nr:[out] the ringbuffer the userspace priority maps to + * @sched_prio: [out] the gpu scheduler priority level which the userspace + * priority maps to + * + * With drm/scheduler providing it's own level of prioritization, our total + * number of available priority levels is (nr_rings * NR_SCHED_PRIORITIES). + * Each ring is associated with it's own scheduler instance. However, our + * UABI is that lower numerical values are higher priority. So mapping the + * single userspace priority level into ring_nr and sched_prio takes some + * care. The userspace provided priority (when a submitqueue is created) + * is mapped to ring nr and scheduler priority as such: + * + * ring_nr= userspace_prio / NR_SCHED_PRIORITIES + * sched_prio = NR_SCHED_PRIORITIES - + *(userspace_prio % NR_SCHED_PRIORITIES) - 1 + * + * This allows generations without preemption (nr_rings==1) to have some + * amount of prioritization, and provides more priority levels for gens + * that do have preemption. 
+ */ +static inline int msm_gpu_convert_priority(struct msm_gpu *gpu, int prio, + unsigned *ring_nr, enum drm_sched_priority *sched_prio) +{ + unsigned rn, sp; + + rn = div_u64_rem(prio, NR_SCHED_PRIORITIES, &sp); + + /* invert sched priority to map to higher-numeric-is-higher- +* priority convention +*/ + sp = NR_SCHED_PRIORITIES - sp - 1; + + if (rn >= gpu->nr_rings) + return -EINVAL; + + *ring_nr = rn; + *sched_prio = sp; + + return 0; +} + /** * A submitqueue is associated with a gl context or vk queue (or equiv) * in userspace. @@ -257,7 +310,8 @@ st
[PATCH v2 12/12] drm/msm/gem: Mark active before pinning
From: Rob Clark Mark all the bos in the submit as active, before pinning, to prevent evicting a buffer in the same submit to make room for a buffer earlier in the table. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem.c| 2 -- drivers/gpu/drm/msm/msm_gem_submit.c | 28 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 41a111c49cc7..71a589fd4ba8 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -131,7 +131,6 @@ static struct page **get_pages(struct drm_gem_object *obj) if (msm_obj->flags & (MSM_BO_WC|MSM_BO_UNCACHED)) sync_for_device(msm_obj); - GEM_WARN_ON(msm_obj->active_count); update_inactive(msm_obj); } @@ -815,7 +814,6 @@ void msm_gem_active_get(struct drm_gem_object *obj, struct msm_gpu *gpu) GEM_WARN_ON(!msm_gem_is_locked(obj)); GEM_WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED); GEM_WARN_ON(msm_obj->dontneed); - GEM_WARN_ON(!msm_obj->sgt); if (msm_obj->active_count++ == 0) { mutex_lock(&priv->mm_lock); diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index b60c3f7ed551..2615a4b3a2e9 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -24,7 +24,8 @@ /* make sure these don't conflict w/ MSM_SUBMIT_BO_x */ #define BO_VALID0x8000 /* is current addr in cmdstream correct/valid? 
*/ #define BO_LOCKED 0x4000 /* obj lock is held */ -#define BO_PINNED 0x2000 /* obj is pinned and on active list */ +#define BO_ACTIVE 0x2000 /* active refcnt is held */ +#define BO_PINNED 0x1000 /* obj is pinned and on active list */ static struct msm_gem_submit *submit_create(struct drm_device *dev, struct msm_gpu *gpu, @@ -252,10 +253,11 @@ static void submit_cleanup_bo(struct msm_gem_submit *submit, int i, struct drm_gem_object *obj = &submit->bos[i].obj->base; unsigned flags = submit->bos[i].flags & cleanup_flags; - if (flags & BO_PINNED) { + if (flags & BO_PINNED) msm_gem_unpin_iova_locked(obj, submit->aspace); + + if (flags & BO_ACTIVE) msm_gem_active_put(obj); - } if (flags & BO_LOCKED) dma_resv_unlock(obj->resv); @@ -265,7 +267,7 @@ static void submit_cleanup_bo(struct msm_gem_submit *submit, int i, static void submit_unlock_unpin_bo(struct msm_gem_submit *submit, int i) { - submit_cleanup_bo(submit, i, BO_PINNED | BO_LOCKED); + submit_cleanup_bo(submit, i, BO_PINNED | BO_ACTIVE | BO_LOCKED); if (!(submit->bos[i].flags & BO_VALID)) submit->bos[i].iova = 0; @@ -357,6 +359,18 @@ static int submit_pin_objects(struct msm_gem_submit *submit) submit->valid = true; + /* +* Increment active_count first, so if under memory pressure, we +* don't inadvertently evict a bo needed by the submit in order +* to pin an earlier bo in the same submit. 
+*/ + for (i = 0; i < submit->nr_bos; i++) { + struct drm_gem_object *obj = &submit->bos[i].obj->base; + + msm_gem_active_get(obj, submit->gpu); + submit->bos[i].flags |= BO_ACTIVE; + } + for (i = 0; i < submit->nr_bos; i++) { struct drm_gem_object *obj = &submit->bos[i].obj->base; uint64_t iova; @@ -368,8 +382,6 @@ static int submit_pin_objects(struct msm_gem_submit *submit) if (ret) break; - msm_gem_active_get(obj, submit->gpu); - submit->bos[i].flags |= BO_PINNED; if (iova == submit->bos[i].iova) { @@ -503,7 +515,7 @@ static void submit_cleanup(struct msm_gem_submit *submit, bool error) unsigned i; if (error) - cleanup_flags |= BO_PINNED; + cleanup_flags |= BO_PINNED | BO_ACTIVE; for (i = 0; i < submit->nr_bos; i++) { struct msm_gem_object *msm_obj = submit->bos[i].obj; @@ -522,7 +534,7 @@ void msm_submit_retire(struct msm_gem_submit *submit) struct drm_gem_object *obj = &submit->bos[i].obj->base; msm_gem_lock(obj); - submit_cleanup_bo(submit, i, BO_PINNED); + submit_cleanup_bo(submit, i, BO_PINNED | BO_ACTIVE); msm_gem_unlock(obj); drm_gem_object_put(obj); } -- 2.31.1
[RFC 0/4] dma-fence: Deadline awareness
From: Rob Clark Based on discussion from a previous series[1] to add a "boost" mechanism when, for example, vblank deadlines are missed. Instead of a boost callback, this approach adds a way to set a deadline on the fence, by which the waiter would like to see the fence signalled. I've not yet had a chance to re-work the drm/msm part of this, but wanted to send this out as an RFC in case I don't have a chance to finish the drm/msm part this week. Original description: In some cases, like double-buffered rendering, missing vblanks can trick the GPU into running at a lower frequency, when really we want to be running at a higher frequency to not miss the vblanks in the first place. This is partially inspired by a trick i915 does, but implemented via dma-fence for a couple of reasons: 1) To continue to be able to use the atomic helpers 2) To support cases where display and gpu are different drivers [1] https://patchwork.freedesktop.org/series/90331/ Rob Clark (4): dma-fence: Add deadline awareness drm/vblank: Add helper to get next vblank time drm/atomic-helper: Set fence deadline for vblank drm/scheduler: Add fence deadline support drivers/dma-buf/dma-fence.c | 39 + drivers/gpu/drm/drm_atomic_helper.c | 36 +++ drivers/gpu/drm/drm_vblank.c| 31 drivers/gpu/drm/scheduler/sched_fence.c | 10 +++ drivers/gpu/drm/scheduler/sched_main.c | 3 ++ include/drm/drm_vblank.h| 1 + include/linux/dma-fence.h | 17 +++ 7 files changed, 137 insertions(+) -- 2.31.1
[RFC 1/4] dma-fence: Add deadline awareness
From: Rob Clark Add a way to hint to the fence signaler of an upcoming deadline, such as vblank, which the fence waiter would prefer not to miss. This is to aid the fence signaler in making power management decisions, like boosting frequency as the deadline approaches and awareness of missing deadlines so that can be factored in to the frequency scaling. Signed-off-by: Rob Clark --- drivers/dma-buf/dma-fence.c | 39 + include/linux/dma-fence.h | 17 2 files changed, 56 insertions(+) diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index ce0f5eff575d..2e0d25ab457e 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -910,6 +910,45 @@ dma_fence_wait_any_timeout(struct dma_fence **fences, uint32_t count, } EXPORT_SYMBOL(dma_fence_wait_any_timeout); + +/** + * dma_fence_set_deadline - set desired fence-wait deadline + * @fence:the fence that is to be waited on + * @deadline: the time by which the waiter hopes for the fence to be + *signaled + * + * Inform the fence signaler of an upcoming deadline, such as vblank, by + * which point the waiter would prefer the fence to be signaled by. This + * is intended to give feedback to the fence signaler to aid in power + * management decisions, such as boosting GPU frequency if a periodic + * vblank deadline is approaching. 
+ */ +void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline) +{ + unsigned long flags; + + if (dma_fence_is_signaled(fence)) + return; + + spin_lock_irqsave(fence->lock, flags); + + /* If we already have an earlier deadline, keep it: */ + if (test_bit(DMA_FENCE_FLAG_HAS_DEADLINE_BIT, &fence->flags) && + ktime_before(fence->deadline, deadline)) { + spin_unlock_irqrestore(fence->lock, flags); + return; + } + + fence->deadline = deadline; + set_bit(DMA_FENCE_FLAG_HAS_DEADLINE_BIT, &fence->flags); + + spin_unlock_irqrestore(fence->lock, flags); + + if (fence->ops->set_deadline) + fence->ops->set_deadline(fence, deadline); +} +EXPORT_SYMBOL(dma_fence_set_deadline); + /** * dma_fence_init - Initialize a custom fence. * @fence: the fence to initialize diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 6ffb4b2c6371..4e6cfe4e6fbc 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -88,6 +88,7 @@ struct dma_fence { /* @timestamp replaced by @rcu on dma_fence_release() */ struct rcu_head rcu; }; + ktime_t deadline; u64 context; u64 seqno; unsigned long flags; @@ -99,6 +100,7 @@ enum dma_fence_flag_bits { DMA_FENCE_FLAG_SIGNALED_BIT, DMA_FENCE_FLAG_TIMESTAMP_BIT, DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, + DMA_FENCE_FLAG_HAS_DEADLINE_BIT, DMA_FENCE_FLAG_USER_BITS, /* must always be last member */ }; @@ -261,6 +263,19 @@ struct dma_fence_ops { */ void (*timeline_value_str)(struct dma_fence *fence, char *str, int size); + + /** +* @set_deadline: +* +* Callback to allow a fence waiter to inform the fence signaler of an +* upcoming deadline, such as vblank, by which point the waiter would +* prefer the fence to be signaled by. This is intended to give feedback +* to the fence signaler to aid in power management decisions, such as +* boosting GPU frequency. +* +* This callback is optional. 
+*/ + void (*set_deadline)(struct dma_fence *fence, ktime_t deadline); }; void dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, @@ -586,6 +601,8 @@ static inline signed long dma_fence_wait(struct dma_fence *fence, bool intr) return ret < 0 ? ret : 0; } +void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline); + struct dma_fence *dma_fence_get_stub(void); struct dma_fence *dma_fence_allocate_private_stub(void); u64 dma_fence_context_alloc(unsigned num); -- 2.31.1
[RFC 2/4] drm/vblank: Add helper to get next vblank time
From: Rob Clark Signed-off-by: Rob Clark --- drivers/gpu/drm/drm_vblank.c | 31 +++ include/drm/drm_vblank.h | 1 + 2 files changed, 32 insertions(+) diff --git a/drivers/gpu/drm/drm_vblank.c b/drivers/gpu/drm/drm_vblank.c index 3417e1ac7918..88c824c294dc 100644 --- a/drivers/gpu/drm/drm_vblank.c +++ b/drivers/gpu/drm/drm_vblank.c @@ -980,6 +980,37 @@ u64 drm_crtc_vblank_count_and_time(struct drm_crtc *crtc, } EXPORT_SYMBOL(drm_crtc_vblank_count_and_time); +/** + * drm_crtc_next_vblank_time - calculate the time of the next vblank + * @crtc: the crtc for which to calculate next vblank time + * @vblanktime: pointer to time to receive the next vblank timestamp. + * + * Calculate the expected time of the next vblank based on time of previous + * vblank and frame duration + */ +int drm_crtc_next_vblank_time(struct drm_crtc *crtc, ktime_t *vblanktime) +{ + unsigned int pipe = drm_crtc_index(crtc); + struct drm_vblank_crtc *vblank = &crtc->dev->vblank[pipe]; + u64 count; + + if (!vblank->framedur_ns) + return -EINVAL; + + count = drm_vblank_count_and_time(crtc->dev, pipe, vblanktime); + + /* +* If we don't get a valid count, then we probably also don't +* have a valid time: +*/ + if (!count) + return -EINVAL; + + *vblanktime = ktime_add(*vblanktime, ns_to_ktime(vblank->framedur_ns)); + + return 0; +} + static void send_vblank_event(struct drm_device *dev, struct drm_pending_vblank_event *e, u64 seq, ktime_t now) diff --git a/include/drm/drm_vblank.h b/include/drm/drm_vblank.h index 733a3e2d1d10..a63bc2c92f3c 100644 --- a/include/drm/drm_vblank.h +++ b/include/drm/drm_vblank.h @@ -230,6 +230,7 @@ bool drm_dev_has_vblank(const struct drm_device *dev); u64 drm_crtc_vblank_count(struct drm_crtc *crtc); u64 drm_crtc_vblank_count_and_time(struct drm_crtc *crtc, ktime_t *vblanktime); +int drm_crtc_next_vblank_time(struct drm_crtc *crtc, ktime_t *vblanktime); void drm_crtc_send_vblank_event(struct drm_crtc *crtc, struct drm_pending_vblank_event *e); void 
drm_crtc_arm_vblank_event(struct drm_crtc *crtc, -- 2.31.1
[RFC 3/4] drm/atomic-helper: Set fence deadline for vblank
From: Rob Clark For an atomic commit updating a single CRTC (ie. a pageflip) calculate the next vblank time, and inform the fence(s) of that deadline. Signed-off-by: Rob Clark --- drivers/gpu/drm/drm_atomic_helper.c | 36 + 1 file changed, 36 insertions(+) diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index bc3487964fb5..f81b20775b15 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -1406,6 +1406,40 @@ void drm_atomic_helper_commit_modeset_enables(struct drm_device *dev, } EXPORT_SYMBOL(drm_atomic_helper_commit_modeset_enables); +/* + * For atomic updates which touch just a single CRTC, calculate the time of the + * next vblank, and inform all the fences of the deadline. + */ +static void set_fence_deadline(struct drm_device *dev, + struct drm_atomic_state *state) +{ + struct drm_crtc *crtc, *wait_crtc = NULL; + struct drm_crtc_state *new_crtc_state; + struct drm_plane *plane; + struct drm_plane_state *new_plane_state; + ktime_t vbltime; + int i; + + for_each_new_crtc_in_state (state, crtc, new_crtc_state, i) { + if (!wait_crtc) + return; + wait_crtc = crtc; + } + + /* If no CRTCs updated, then nothing to do: */ + if (!wait_crtc) + return; + + if (drm_crtc_next_vblank_time(wait_crtc, &vbltime)) + return; + + for_each_new_plane_in_state (state, plane, new_plane_state, i) { + if (!new_plane_state->fence) + continue; + dma_fence_set_deadline(new_plane_state->fence, vbltime); + } +} + /** * drm_atomic_helper_wait_for_fences - wait for fences stashed in plane state * @dev: DRM device @@ -1435,6 +1469,8 @@ int drm_atomic_helper_wait_for_fences(struct drm_device *dev, struct drm_plane_state *new_plane_state; int i, ret; + set_fence_deadline(dev, state); + for_each_new_plane_in_state(state, plane, new_plane_state, i) { if (!new_plane_state->fence) continue; -- 2.31.1
[RFC 4/4] drm/scheduler: Add fence deadline support
From: Rob Clark As the finished fence is the one that is exposed to userspace, and therefore the one that other operations, like atomic update, would block on, we need to propagate the deadline from from the finished fence to the actual hw fence. Signed-off-by: Rob Clark --- drivers/gpu/drm/scheduler/sched_fence.c | 10 ++ drivers/gpu/drm/scheduler/sched_main.c | 3 +++ 2 files changed, 13 insertions(+) diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c index 69de2c76731f..3aa6351d2101 100644 --- a/drivers/gpu/drm/scheduler/sched_fence.c +++ b/drivers/gpu/drm/scheduler/sched_fence.c @@ -128,6 +128,15 @@ static void drm_sched_fence_release_finished(struct dma_fence *f) dma_fence_put(&fence->scheduled); } +static void drm_sched_fence_set_deadline_finished(struct dma_fence *f, + ktime_t deadline) +{ + struct drm_sched_fence *fence = to_drm_sched_fence(f); + + if (fence->parent) + dma_fence_set_deadline(fence->parent, deadline); +} + static const struct dma_fence_ops drm_sched_fence_ops_scheduled = { .get_driver_name = drm_sched_fence_get_driver_name, .get_timeline_name = drm_sched_fence_get_timeline_name, @@ -138,6 +147,7 @@ static const struct dma_fence_ops drm_sched_fence_ops_finished = { .get_driver_name = drm_sched_fence_get_driver_name, .get_timeline_name = drm_sched_fence_get_timeline_name, .release = drm_sched_fence_release_finished, + .set_deadline = drm_sched_fence_set_deadline_finished, }; struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f) diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index a2a953693b45..fcc601962e92 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -818,6 +818,9 @@ static int drm_sched_main(void *param) if (!IS_ERR_OR_NULL(fence)) { s_fence->parent = dma_fence_get(fence); + if (test_bit(DMA_FENCE_FLAG_HAS_DEADLINE_BIT, +&s_fence->finished.flags)) + dma_fence_set_deadline(fence, 
s_fence->finished.deadline); r = dma_fence_add_callback(fence, &sched_job->cb, drm_sched_job_done_cb); if (r == -ENOENT) -- 2.31.1
Re: [RFC 0/4] dma-fence: Deadline awareness
On Mon, Jul 26, 2021 at 4:34 PM Rob Clark wrote: > > From: Rob Clark > > Based on discussion from a previous series[1] to add a "boost" mechanism > when, for example, vblank deadlines are missed. Instead of a boost > callback, this approach adds a way to set a deadline on the fence, by > which the waiter would like to see the fence signalled. > > I've not yet had a chance to re-work the drm/msm part of this, but > wanted to send this out as an RFC in case I don't have a chance to > finish the drm/msm part this week. Fwiw, what I'm thinking for the drm/msm part is a timer set to expire a bit (couple ms?) before the deadline, which boosts if the timer expires before the fence is signaled. Assuming this is roughly in line with what other drivers would do, possibly there is some room to build this timer into dma-fence itself? BR, -R > > Original description: > > In some cases, like double-buffered rendering, missing vblanks can > trick the GPU into running at a lower frequence, when really we > want to be running at a higher frequency to not miss the vblanks > in the first place. > > This is partially inspired by a trick i915 does, but implemented > via dma-fence for a couple of reasons: > > 1) To continue to be able to use the atomic helpers > 2) To support cases where display and gpu are different drivers > > [1] https://patchwork.freedesktop.org/series/90331/ > > Rob Clark (4): > dma-fence: Add deadline awareness > drm/vblank: Add helper to get next vblank time > drm/atomic-helper: Set fence deadline for vblank > drm/scheduler: Add fence deadline support > > drivers/dma-buf/dma-fence.c | 39 + > drivers/gpu/drm/drm_atomic_helper.c | 36 +++ > drivers/gpu/drm/drm_vblank.c| 31 > drivers/gpu/drm/scheduler/sched_fence.c | 10 +++ > drivers/gpu/drm/scheduler/sched_main.c | 3 ++ > include/drm/drm_vblank.h| 1 + > include/linux/dma-fence.h | 17 +++ > 7 files changed, 137 insertions(+) > > -- > 2.31.1 >
Re: [RFC 1/4] dma-fence: Add deadline awareness
On Tue, Jul 27, 2021 at 12:11 AM Christian König wrote: > > Am 27.07.21 um 01:38 schrieb Rob Clark: > > From: Rob Clark > > > > Add a way to hint to the fence signaler of an upcoming deadline, such as > > vblank, which the fence waiter would prefer not to miss. This is to aid > > the fence signaler in making power management decisions, like boosting > > frequency as the deadline approaches and awareness of missing deadlines > > so that can be factored in to the frequency scaling. > > > > Signed-off-by: Rob Clark > > --- > > drivers/dma-buf/dma-fence.c | 39 + > > include/linux/dma-fence.h | 17 > > 2 files changed, 56 insertions(+) > > > > diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c > > index ce0f5eff575d..2e0d25ab457e 100644 > > --- a/drivers/dma-buf/dma-fence.c > > +++ b/drivers/dma-buf/dma-fence.c > > @@ -910,6 +910,45 @@ dma_fence_wait_any_timeout(struct dma_fence **fences, > > uint32_t count, > > } > > EXPORT_SYMBOL(dma_fence_wait_any_timeout); > > > > + > > +/** > > + * dma_fence_set_deadline - set desired fence-wait deadline > > + * @fence:the fence that is to be waited on > > + * @deadline: the time by which the waiter hopes for the fence to be > > + *signaled > > + * > > + * Inform the fence signaler of an upcoming deadline, such as vblank, by > > + * which point the waiter would prefer the fence to be signaled by. This > > + * is intended to give feedback to the fence signaler to aid in power > > + * management decisions, such as boosting GPU frequency if a periodic > > + * vblank deadline is approaching. 
> > + */ > > +void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline) > > +{ > > + unsigned long flags; > > + > > + if (dma_fence_is_signaled(fence)) > > + return; > > + > > + spin_lock_irqsave(fence->lock, flags); > > + > > + /* If we already have an earlier deadline, keep it: */ > > + if (test_bit(DMA_FENCE_FLAG_HAS_DEADLINE_BIT, &fence->flags) && > > + ktime_before(fence->deadline, deadline)) { > > + spin_unlock_irqrestore(fence->lock, flags); > > + return; > > + } > > + > > + fence->deadline = deadline; > > + set_bit(DMA_FENCE_FLAG_HAS_DEADLINE_BIT, &fence->flags); > > + > > + spin_unlock_irqrestore(fence->lock, flags); > > + > > + if (fence->ops->set_deadline) > > + fence->ops->set_deadline(fence, deadline); > > +} > > +EXPORT_SYMBOL(dma_fence_set_deadline); > > + > > /** > >* dma_fence_init - Initialize a custom fence. > >* @fence: the fence to initialize > > diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h > > index 6ffb4b2c6371..4e6cfe4e6fbc 100644 > > --- a/include/linux/dma-fence.h > > +++ b/include/linux/dma-fence.h > > @@ -88,6 +88,7 @@ struct dma_fence { > > /* @timestamp replaced by @rcu on dma_fence_release() */ > > struct rcu_head rcu; > > }; > > + ktime_t deadline; > > Mhm, adding the flag sounds ok to me but I'm a bit hesitating adding the > deadline as extra field here. > > We tuned the dma_fence structure intentionally so that it is only 64 bytes. Hmm, then I guess you wouldn't be a fan of also adding an hrtimer? We could push the ktime_t (and timer) down into the derived fence class, but I think there is going to need to be some extra storage *somewhere*.. maybe the fence signaler could get away with just storing the nearest upcoming deadline per fence-context instead? BR, -R > Regards, > Christian. 
> > > u64 context; > > u64 seqno; > > unsigned long flags; > > @@ -99,6 +100,7 @@ enum dma_fence_flag_bits { > > DMA_FENCE_FLAG_SIGNALED_BIT, > > DMA_FENCE_FLAG_TIMESTAMP_BIT, > > DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, > > + DMA_FENCE_FLAG_HAS_DEADLINE_BIT, > > DMA_FENCE_FLAG_USER_BITS, /* must always be last member */ > > }; > > > > @@ -261,6 +263,19 @@ struct dma_fence_ops { > >*/ > > void (*timeline_value_str)(struct dma_fence *fence, > > char *str, int size); > > + > > + /** > > + * @set_deadline: > > + * > > + * Callback to allow a fence waiter to inform the fence signaler of an > > + * upcoming deadline, suc
Re: [RFC 3/4] drm/atomic-helper: Set fence deadline for vblank
On Tue, Jul 27, 2021 at 3:44 AM Michel Dänzer wrote: > > On 2021-07-27 1:38 a.m., Rob Clark wrote: > > From: Rob Clark > > > > For an atomic commit updating a single CRTC (ie. a pageflip) calculate > > the next vblank time, and inform the fence(s) of that deadline. > > > > Signed-off-by: Rob Clark > > --- > > drivers/gpu/drm/drm_atomic_helper.c | 36 + > > 1 file changed, 36 insertions(+) > > > > diff --git a/drivers/gpu/drm/drm_atomic_helper.c > > b/drivers/gpu/drm/drm_atomic_helper.c > > index bc3487964fb5..f81b20775b15 100644 > > --- a/drivers/gpu/drm/drm_atomic_helper.c > > +++ b/drivers/gpu/drm/drm_atomic_helper.c > > @@ -1406,6 +1406,40 @@ void drm_atomic_helper_commit_modeset_enables(struct > > drm_device *dev, > > } > > EXPORT_SYMBOL(drm_atomic_helper_commit_modeset_enables); > > > > +/* > > + * For atomic updates which touch just a single CRTC, calculate the time > > of the > > + * next vblank, and inform all the fences of the of the deadline. > > + */ > > +static void set_fence_deadline(struct drm_device *dev, > > +struct drm_atomic_state *state) > > +{ > > + struct drm_crtc *crtc, *wait_crtc = NULL; > > + struct drm_crtc_state *new_crtc_state; > > + struct drm_plane *plane; > > + struct drm_plane_state *new_plane_state; > > + ktime_t vbltime; > > + int i; > > + > > + for_each_new_crtc_in_state (state, crtc, new_crtc_state, i) { > > + if (!wait_crtc) > > + return; > > Either this return or the next one below would always be taken, I doubt this > was intended. 
oops, the condition here is mistakenly inverted, it was meant to bail if there is more than a single CRTC > > > + wait_crtc = crtc; > > + } > > + > > + /* If no CRTCs updated, then nothing to do: */ > > + if (!wait_crtc) > > + return; > > + > > + if (drm_crtc_next_vblank_time(wait_crtc, &vbltime)) > > + return; > > + > > + for_each_new_plane_in_state (state, plane, new_plane_state, i) { > > + if (!new_plane_state->fence) > > + continue; > > + dma_fence_set_deadline(new_plane_state->fence, vbltime); > > + } > > vblank timestamps correspond to the end of vertical blank, the deadline > should be the start of vertical blank though. > hmm, I suppose this depends on whether the hw actually has separate irq's for frame-done and vblank (and whether the driver differentiates).. and if the display controller is doing some buffering, the point at which it wants to flip could be a bit earlier still. Maybe we just want a kms driver provided offset for how early it wants the deadline relative to vblank? BR, -R > > -- > Earthling Michel Dänzer | https://redhat.com > Libre software enthusiast | Mesa and X developer
Re: [RFC 0/4] dma-fence: Deadline awareness
On Tue, Jul 27, 2021 at 7:50 AM Michel Dänzer wrote: > > On 2021-07-27 1:38 a.m., Rob Clark wrote: > > From: Rob Clark > > > > Based on discussion from a previous series[1] to add a "boost" mechanism > > when, for example, vblank deadlines are missed. Instead of a boost > > callback, this approach adds a way to set a deadline on the fence, by > > which the waiter would like to see the fence signalled. > > > > I've not yet had a chance to re-work the drm/msm part of this, but > > wanted to send this out as an RFC in case I don't have a chance to > > finish the drm/msm part this week. > > > > Original description: > > > > In some cases, like double-buffered rendering, missing vblanks can > > trick the GPU into running at a lower frequence, when really we > > want to be running at a higher frequency to not miss the vblanks > > in the first place. > > > > This is partially inspired by a trick i915 does, but implemented > > via dma-fence for a couple of reasons: > > > > 1) To continue to be able to use the atomic helpers > > 2) To support cases where display and gpu are different drivers > > > > [1] https://patchwork.freedesktop.org/series/90331/ > > Unfortunately, none of these approaches will have the full intended effect > once Wayland compositors start waiting for client buffers to become idle > before using them for an output frame (to prevent output frames from getting > delayed by client work). See > https://gitlab.gnome.org/GNOME/mutter/-/merge_requests/1880 (shameless plug > :) for a proof of concept of this for mutter. The boost will only affect the > compositor's own GPU work, not the client work (which means no effect at all > for fullscreen apps where the compositor can scan out the client buffers > directly). > I guess you mean "no effect at all *except* for fullscreen..."? 
Games are usually running fullscreen, which is a case I care about a lot ;-) I'd perhaps recommend that wayland compositors, in cases where only a single layer is changing, not try to be clever and just push the update down to the kernel. BR, -R > > -- > Earthling Michel Dänzer | https://redhat.com > Libre software enthusiast | Mesa and X developer
Re: [RFC 0/4] dma-fence: Deadline awareness
On Tue, Jul 27, 2021 at 8:19 AM Michel Dänzer wrote: > > On 2021-07-27 5:12 p.m., Rob Clark wrote: > > On Tue, Jul 27, 2021 at 7:50 AM Michel Dänzer wrote: > >> > >> On 2021-07-27 1:38 a.m., Rob Clark wrote: > >>> From: Rob Clark > >>> > >>> Based on discussion from a previous series[1] to add a "boost" mechanism > >>> when, for example, vblank deadlines are missed. Instead of a boost > >>> callback, this approach adds a way to set a deadline on the fence, by > >>> which the waiter would like to see the fence signalled. > >>> > >>> I've not yet had a chance to re-work the drm/msm part of this, but > >>> wanted to send this out as an RFC in case I don't have a chance to > >>> finish the drm/msm part this week. > >>> > >>> Original description: > >>> > >>> In some cases, like double-buffered rendering, missing vblanks can > >>> trick the GPU into running at a lower frequence, when really we > >>> want to be running at a higher frequency to not miss the vblanks > >>> in the first place. > >>> > >>> This is partially inspired by a trick i915 does, but implemented > >>> via dma-fence for a couple of reasons: > >>> > >>> 1) To continue to be able to use the atomic helpers > >>> 2) To support cases where display and gpu are different drivers > >>> > >>> [1] https://patchwork.freedesktop.org/series/90331/ > >> > >> Unfortunately, none of these approaches will have the full intended effect > >> once Wayland compositors start waiting for client buffers to become idle > >> before using them for an output frame (to prevent output frames from > >> getting delayed by client work). See > >> https://gitlab.gnome.org/GNOME/mutter/-/merge_requests/1880 (shameless > >> plug :) for a proof of concept of this for mutter. The boost will only > >> affect the compositor's own GPU work, not the client work (which means no > >> effect at all for fullscreen apps where the compositor can scan out the > >> client buffers directly). 
> >> > > > > I guess you mean "no effect at all *except* for fullscreen..."? > > I meant what I wrote: The compositor will wait for the next buffer to become > idle, so there's no boost from this mechanism for the client drawing to that > buffer. And since the compositor does no drawing of its own in this case, > there's no boost from that either. > > > > I'd perhaps recommend that wayland compositors, in cases where only a > > single layer is changing, not try to be clever and just push the > > update down to the kernel. > > Even just for the fullscreen direct scanout case, that would require some > kind of atomic KMS API extension to allow queuing multiple page flips for the > same CRTC. > > For other cases, this would also require a mechanism to cancel a pending > atomic commit, for when another surface update comes in before the > compositor's deadline, which affects the previously single updating surface > as well. > Well, in the end, there is more than one compositor out there.. and if some wayland compositors are going this route, they can also implement the same mechanism in userspace using the sysfs that devfreq exports. But it sounds simpler to me for the compositor to have a sort of "game mode" for fullscreen games.. I'm less worried about UI interactive workloads, boosting the GPU freq upon sudden activity after a period of inactivity seems to work reasonably well there. BR, -R > > -- > Earthling Michel Dänzer | https://redhat.com > Libre software enthusiast | Mesa and X developer
[pull] drm/msm: drm-msm-fixes-2021-07-27 for v5.14-rc4
Hi Dave & Daniel, A few fixes for v5.14, including a fix for a crash if display triggers an iommu fault (which tends to happen at probe time on devices with bootloader fw that leaves display enabled as kernel starts) The following changes since commit ff1176468d368232b684f75e82563369208bc371: Linux 5.14-rc3 (2021-07-25 15:35:14 -0700) are available in the Git repository at: https://gitlab.freedesktop.org/drm/msm.git drm-msm-fixes-2021-07-27 for you to fetch changes up to fc71c9e6f41f9912d22a75dfa76bc10811af7e22: drm/msm/dp: Initialize dp->aux->drm_dev before registration (2021-07-27 08:14:58 -0700) Bjorn Andersson (1): drm/msm/dp: Initialize the INTF_CONFIG register Kuogee Hsieh (2): drm/msm/dp: use dp_ctrl_off_link_stream during PHY compliance test run drm/msm/dp: signal audio plugged change at dp_pm_resume Rob Clark (1): drm/msm: Fix display fault handling Robert Foss (1): drm/msm/dpu: Fix sm8250_mdp register length Sean Paul (1): drm/msm/dp: Initialize dp->aux->drm_dev before registration drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 2 +- drivers/gpu/drm/msm/dp/dp_catalog.c| 1 + drivers/gpu/drm/msm/dp/dp_ctrl.c | 2 +- drivers/gpu/drm/msm/dp/dp_display.c| 5 + drivers/gpu/drm/msm/msm_iommu.c| 11 ++- 5 files changed, 18 insertions(+), 3 deletions(-)
[PATCH v3 00/13] drm/msm: drm scheduler conversion and cleanups
From: Rob Clark Conversion to gpu_scheduler, and bonus removal of drm_gem_object_put_locked() v2: Fix priority mixup (msm UAPI has lower numeric priority value as higher priority, inverse of drm/scheduler) and add some comments in the UAPI header to clarify. Now that we move active refcnt get into msm_gem_submit, add a patch to mark all bos busy before pinning, to avoid evicting bos used in same batch. Fix bo locking for cmdstream dumping ($debugfs/n/{rd,hangrd}) v3: Add a patch to drop submit bo_list and instead use -EALREADY to detect errors with same obj appearing multiple times in the submit ioctl bos table. Otherwise, with struct_mutex locking dropped, we'd need to move insertion into and removal from bo_list under the obj lock. Rob Clark (13): drm/msm: Docs and misc cleanup drm/msm: Small submitqueue creation cleanup drm/msm: drop drm_gem_object_put_locked() drm: Drop drm_gem_object_put_locked() drm/msm/submit: Simplify out-fence-fd handling drm/msm: Consolidate submit bo state drm/msm: Track "seqno" fences by idr drm/msm: Return ERR_PTR() from submit_create() drm/msm: Conversion to drm scheduler drm/msm: Drop submit bo_list drm/msm: Drop struct_mutex in submit path drm/msm: Utilize gpu scheduler priorities drm/msm/gem: Mark active before pinning drivers/gpu/drm/drm_gem.c | 22 -- drivers/gpu/drm/msm/Kconfig | 1 + drivers/gpu/drm/msm/adreno/a5xx_debugfs.c | 4 +- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 6 +- drivers/gpu/drm/msm/adreno/a5xx_power.c | 2 +- drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 7 +- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 12 +- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 2 +- drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 4 +- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 6 +- drivers/gpu/drm/msm/msm_drv.c | 30 +- drivers/gpu/drm/msm/msm_fence.c | 39 --- drivers/gpu/drm/msm/msm_fence.h | 2 - drivers/gpu/drm/msm/msm_gem.c | 94 +- drivers/gpu/drm/msm/msm_gem.h | 47 +-- drivers/gpu/drm/msm/msm_gem_submit.c| 344 drivers/gpu/drm/msm/msm_gpu.c | 46 +-- 
drivers/gpu/drm/msm/msm_gpu.h | 78 - drivers/gpu/drm/msm/msm_rd.c| 6 +- drivers/gpu/drm/msm/msm_ringbuffer.c| 70 +++- drivers/gpu/drm/msm/msm_ringbuffer.h| 12 + drivers/gpu/drm/msm/msm_submitqueue.c | 53 ++- include/drm/drm_gem.h | 2 - include/uapi/drm/msm_drm.h | 14 +- 24 files changed, 516 insertions(+), 387 deletions(-) -- 2.31.1
[PATCH v3 01/13] drm/msm: Docs and misc cleanup
From: Rob Clark Fix a couple incorrect or misspelt comments, and add submitqueue doc comment. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_gem.h | 3 +-- drivers/gpu/drm/msm/msm_gem_submit.c | 1 + drivers/gpu/drm/msm/msm_gpu.h | 15 +++ drivers/gpu/drm/msm/msm_ringbuffer.c | 2 +- drivers/gpu/drm/msm/msm_submitqueue.c | 9 + 5 files changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 405f8411e395..d69fcb37ce17 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -313,8 +313,7 @@ void msm_gem_vunmap(struct drm_gem_object *obj); /* Created per submit-ioctl, to track bo's and cmdstream bufs, etc, * associated with the cmdstream submission for synchronization (and - * make it easier to unwind when things go wrong, etc). This only - * lasts for the duration of the submit-ioctl. + * make it easier to unwind when things go wrong, etc). */ struct msm_gem_submit { struct kref ref; diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 44f84bfd0c0e..6d46f9275a40 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -655,6 +655,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, bool has_ww_ticket = false; unsigned i; int ret, submitid; + if (!gpu) return -ENXIO; diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 710c3fedfbf3..96efcb31e502 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -250,6 +250,21 @@ struct msm_gpu_perfcntr { const char *name; }; +/** + * A submitqueue is associated with a gl context or vk queue (or equiv) + * in userspace. 
+ * + * @id:userspace id for the submitqueue, unique within the drm_file + * @flags: userspace flags for the submitqueue, specified at creation + * (currently unusued) + * @prio: the submitqueue priority + * @faults:the number of GPU hangs associated with this submitqueue + * @ctx: the per-drm_file context associated with the submitqueue (ie. + * which set of pgtables do submits jobs associated with the + * submitqueue use) + * @node: node in the context's list of submitqueues + * @ref: reference count + */ struct msm_gpu_submitqueue { int id; u32 flags; diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 7e92d9532454..054461662af5 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -32,7 +32,7 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id, if (IS_ERR(ring->start)) { ret = PTR_ERR(ring->start); - ring->start = 0; + ring->start = NULL; goto fail; } diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c index c3d206105d28..e5eef11ed014 100644 --- a/drivers/gpu/drm/msm/msm_submitqueue.c +++ b/drivers/gpu/drm/msm/msm_submitqueue.c @@ -98,17 +98,18 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, return 0; } +/* + * Create the default submit-queue (id==0), used for backwards compatibility + * for userspace that pre-dates the introduction of submitqueues. + */ int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx) { struct msm_drm_private *priv = drm->dev_private; int default_prio; - if (!ctx) - return 0; - /* * Select priority 2 as the "default priority" unless nr_rings is less -* than 2 and then pick the lowest pirority +* than 2 and then pick the lowest priority */ default_prio = priv->gpu ? clamp_t(uint32_t, 2, 0, priv->gpu->nr_rings - 1) : 0; -- 2.31.1
[PATCH v3 02/13] drm/msm: Small submitqueue creation cleanup
From: Rob Clark If we don't have a gpu, there is no need to create a submitqueue, which lets us simplify the error handling and submitqueue creation. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_submitqueue.c | 22 +++--- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c index e5eef11ed014..9e9fec61d629 100644 --- a/drivers/gpu/drm/msm/msm_submitqueue.c +++ b/drivers/gpu/drm/msm/msm_submitqueue.c @@ -66,6 +66,12 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, if (!ctx) return -ENODEV; + if (!priv->gpu) + return -ENODEV; + + if (prio >= priv->gpu->nr_rings) + return -EINVAL; + queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) @@ -73,15 +79,7 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, kref_init(&queue->ref); queue->flags = flags; - - if (priv->gpu) { - if (prio >= priv->gpu->nr_rings) { - kfree(queue); - return -EINVAL; - } - - queue->prio = prio; - } + queue->prio = prio; write_lock(&ctx->queuelock); @@ -107,12 +105,14 @@ int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx) struct msm_drm_private *priv = drm->dev_private; int default_prio; + if (!priv->gpu) + return -ENODEV; + /* * Select priority 2 as the "default priority" unless nr_rings is less * than 2 and then pick the lowest priority */ - default_prio = priv->gpu ? - clamp_t(uint32_t, 2, 0, priv->gpu->nr_rings - 1) : 0; + default_prio = clamp_t(uint32_t, 2, 0, priv->gpu->nr_rings - 1); INIT_LIST_HEAD(&ctx->submitqueues); -- 2.31.1
[PATCH v3 03/13] drm/msm: drop drm_gem_object_put_locked()
From: Rob Clark No idea why we were still using this. It certainly hasn't been needed for some time. So drop the pointless twin codepaths. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/adreno/a5xx_debugfs.c | 4 +- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 6 +-- drivers/gpu/drm/msm/adreno/a5xx_power.c | 2 +- drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 7 ++- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 12 ++--- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 2 +- drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 4 +- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/gpu/drm/msm/msm_gem.c | 56 - drivers/gpu/drm/msm/msm_gem.h | 7 +-- drivers/gpu/drm/msm/msm_gem_submit.c| 2 +- drivers/gpu/drm/msm/msm_gpu.c | 4 +- drivers/gpu/drm/msm/msm_ringbuffer.c| 2 +- 13 files changed, 33 insertions(+), 77 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c b/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c index fc2c905b6c9e..c9d11d57aed6 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c @@ -117,13 +117,13 @@ reset_set(void *data, u64 val) if (a5xx_gpu->pm4_bo) { msm_gem_unpin_iova(a5xx_gpu->pm4_bo, gpu->aspace); - drm_gem_object_put_locked(a5xx_gpu->pm4_bo); + drm_gem_object_put(a5xx_gpu->pm4_bo); a5xx_gpu->pm4_bo = NULL; } if (a5xx_gpu->pfp_bo) { msm_gem_unpin_iova(a5xx_gpu->pfp_bo, gpu->aspace); - drm_gem_object_put_locked(a5xx_gpu->pfp_bo); + drm_gem_object_put(a5xx_gpu->pfp_bo); a5xx_gpu->pfp_bo = NULL; } diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 7a271de9a212..0a93ed1d6b06 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -1415,7 +1415,7 @@ struct a5xx_gpu_state { static int a5xx_crashdumper_init(struct msm_gpu *gpu, struct a5xx_crashdumper *dumper) { - dumper->ptr = msm_gem_kernel_new_locked(gpu->dev, + dumper->ptr = msm_gem_kernel_new(gpu->dev, SZ_1M, MSM_BO_WC, gpu->aspace, &dumper->bo, 
&dumper->iova); @@ -1517,7 +1517,7 @@ static void a5xx_gpu_state_get_hlsq_regs(struct msm_gpu *gpu, if (a5xx_crashdumper_run(gpu, &dumper)) { kfree(a5xx_state->hlsqregs); - msm_gem_kernel_put(dumper.bo, gpu->aspace, true); + msm_gem_kernel_put(dumper.bo, gpu->aspace); return; } @@ -1525,7 +1525,7 @@ static void a5xx_gpu_state_get_hlsq_regs(struct msm_gpu *gpu, memcpy(a5xx_state->hlsqregs, dumper.ptr + (256 * SZ_1K), count * sizeof(u32)); - msm_gem_kernel_put(dumper.bo, gpu->aspace, true); + msm_gem_kernel_put(dumper.bo, gpu->aspace); } static struct msm_gpu_state *a5xx_gpu_state_get(struct msm_gpu *gpu) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_power.c b/drivers/gpu/drm/msm/adreno/a5xx_power.c index cdb165236a88..0e63a1429189 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_power.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_power.c @@ -362,7 +362,7 @@ void a5xx_gpmu_ucode_init(struct msm_gpu *gpu) */ bosize = (cmds_size + (cmds_size / TYPE4_MAX_PAYLOAD) + 1) << 2; - ptr = msm_gem_kernel_new_locked(drm, bosize, + ptr = msm_gem_kernel_new(drm, bosize, MSM_BO_WC | MSM_BO_GPU_READONLY, gpu->aspace, &a5xx_gpu->gpmu_bo, &a5xx_gpu->gpmu_iova); if (IS_ERR(ptr)) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c index ee72510ff8ce..8abc9a2b114a 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c @@ -240,7 +240,7 @@ static int preempt_init_ring(struct a5xx_gpu *a5xx_gpu, A5XX_PREEMPT_COUNTER_SIZE, MSM_BO_WC, gpu->aspace, &counters_bo, &counters_iova); if (IS_ERR(counters)) { - msm_gem_kernel_put(bo, gpu->aspace, true); + msm_gem_kernel_put(bo, gpu->aspace); return PTR_ERR(counters); } @@ -272,9 +272,8 @@ void a5xx_preempt_fini(struct msm_gpu *gpu) int i; for (i = 0; i < gpu->nr_rings; i++) { - msm_gem_kernel_put(a5xx_gpu->preempt_bo[i], gpu->aspace, true); - msm_gem_kernel_put(a5xx_gpu->preempt_counters_bo[i], - gpu->aspace, true); + msm_gem_kernel_put(a5xx_gpu->preempt_bo[i], 
gpu->aspace); + msm_gem_kernel_put(a5xx_gpu->preempt_counters_bo[i], gpu->aspace); } } di
[PATCH v3 05/13] drm/msm/submit: Simplify out-fence-fd handling
From: Rob Clark No need for this to be split in two parts. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_gem_submit.c | 10 +++--- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index e789f68d5be1..8abd743adfb0 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -645,7 +645,6 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, struct msm_file_private *ctx = file->driver_priv; struct msm_gem_submit *submit; struct msm_gpu *gpu = priv->gpu; - struct sync_file *sync_file = NULL; struct msm_gpu_submitqueue *queue; struct msm_ringbuffer *ring; struct msm_submit_post_dep *post_deps = NULL; @@ -824,22 +823,19 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, } if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { - sync_file = sync_file_create(submit->fence); + struct sync_file *sync_file = sync_file_create(submit->fence); if (!sync_file) { ret = -ENOMEM; goto out; } + fd_install(out_fence_fd, sync_file->file); + args->fence_fd = out_fence_fd; } msm_gpu_submit(gpu, submit); args->fence = submit->fence->seqno; - if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { - fd_install(out_fence_fd, sync_file->file); - args->fence_fd = out_fence_fd; - } - msm_reset_syncobjs(syncobjs_to_reset, args->nr_in_syncobjs); msm_process_post_deps(post_deps, args->nr_out_syncobjs, submit->fence); -- 2.31.1
[PATCH v3 04/13] drm: Drop drm_gem_object_put_locked()
From: Rob Clark Now that no one is using it, remove it. Signed-off-by: Rob Clark Acked-by: Christian König Reviewed-by: Daniel Vetter --- drivers/gpu/drm/drm_gem.c | 22 -- include/drm/drm_gem.h | 2 -- 2 files changed, 24 deletions(-) diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index d62fb1a3c916..a34525332bef 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -973,28 +973,6 @@ drm_gem_object_free(struct kref *kref) } EXPORT_SYMBOL(drm_gem_object_free); -/** - * drm_gem_object_put_locked - release a GEM buffer object reference - * @obj: GEM buffer object - * - * This releases a reference to @obj. Callers must hold the - * &drm_device.struct_mutex lock when calling this function, even when the - * driver doesn't use &drm_device.struct_mutex for anything. - * - * For drivers not encumbered with legacy locking use - * drm_gem_object_put() instead. - */ -void -drm_gem_object_put_locked(struct drm_gem_object *obj) -{ - if (obj) { - WARN_ON(!mutex_is_locked(&obj->dev->struct_mutex)); - - kref_put(&obj->refcount, drm_gem_object_free); - } -} -EXPORT_SYMBOL(drm_gem_object_put_locked); - /** * drm_gem_vm_open - vma->ops->open implementation for GEM * @vma: VM area structure diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h index 240049566592..35e7f44c2a75 100644 --- a/include/drm/drm_gem.h +++ b/include/drm/drm_gem.h @@ -384,8 +384,6 @@ drm_gem_object_put(struct drm_gem_object *obj) __drm_gem_object_put(obj); } -void drm_gem_object_put_locked(struct drm_gem_object *obj); - int drm_gem_handle_create(struct drm_file *file_priv, struct drm_gem_object *obj, u32 *handlep); -- 2.31.1
[PATCH v3 06/13] drm/msm: Consolidate submit bo state
From: Rob Clark Move all the locked/active/pinned state handling to msm_gem_submit.c. In particular, for drm/scheduler, we'll need to do all this before pushing the submit job to the scheduler. But while we're at it we can get rid of the duplicate pin and refcnt. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_gem.h| 2 + drivers/gpu/drm/msm/msm_gem_submit.c | 92 ++-- drivers/gpu/drm/msm/msm_gpu.c| 29 + 3 files changed, 75 insertions(+), 48 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 71ccf87a646b..da3af702a6c8 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -361,6 +361,8 @@ static inline void msm_gem_submit_put(struct msm_gem_submit *submit) kref_put(&submit->ref, __msm_gem_submit_destroy); } +void msm_submit_retire(struct msm_gem_submit *submit); + /* helper to determine of a buffer in submit should be dumped, used for both * devcoredump and debugfs cmdstream dumping: */ diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 8abd743adfb0..4f02fa3c78f9 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -23,8 +23,8 @@ /* make sure these don't conflict w/ MSM_SUBMIT_BO_x */ #define BO_VALID0x8000 /* is current addr in cmdstream correct/valid? */ -#define BO_LOCKED 0x4000 -#define BO_PINNED 0x2000 +#define BO_LOCKED 0x4000 /* obj lock is held */ +#define BO_PINNED 0x2000 /* obj is pinned and on active list */ static struct msm_gem_submit *submit_create(struct drm_device *dev, struct msm_gpu *gpu, @@ -220,21 +220,33 @@ static int submit_lookup_cmds(struct msm_gem_submit *submit, return ret; } -static void submit_unlock_unpin_bo(struct msm_gem_submit *submit, - int i, bool backoff) +/* Unwind bo state, according to cleanup_flags. In the success case, only + * the lock is dropped at the end of the submit (and active/pin ref is dropped + * later when the submit is retired). 
+ */ +static void submit_cleanup_bo(struct msm_gem_submit *submit, int i, + unsigned cleanup_flags) { - struct msm_gem_object *msm_obj = submit->bos[i].obj; + struct drm_gem_object *obj = &submit->bos[i].obj->base; + unsigned flags = submit->bos[i].flags & cleanup_flags; - if (submit->bos[i].flags & BO_PINNED) - msm_gem_unpin_iova_locked(&msm_obj->base, submit->aspace); + if (flags & BO_PINNED) { + msm_gem_unpin_iova_locked(obj, submit->aspace); + msm_gem_active_put(obj); + } - if (submit->bos[i].flags & BO_LOCKED) - dma_resv_unlock(msm_obj->base.resv); + if (flags & BO_LOCKED) + dma_resv_unlock(obj->resv); - if (backoff && !(submit->bos[i].flags & BO_VALID)) - submit->bos[i].iova = 0; + submit->bos[i].flags &= ~cleanup_flags; +} - submit->bos[i].flags &= ~(BO_LOCKED | BO_PINNED); +static void submit_unlock_unpin_bo(struct msm_gem_submit *submit, int i) +{ + submit_cleanup_bo(submit, i, BO_PINNED | BO_LOCKED); + + if (!(submit->bos[i].flags & BO_VALID)) + submit->bos[i].iova = 0; } /* This is where we make sure all the bo's are reserved and pin'd: */ @@ -266,10 +278,10 @@ static int submit_lock_objects(struct msm_gem_submit *submit) fail: for (; i >= 0; i--) - submit_unlock_unpin_bo(submit, i, true); + submit_unlock_unpin_bo(submit, i); if (slow_locked > 0) - submit_unlock_unpin_bo(submit, slow_locked, true); + submit_unlock_unpin_bo(submit, slow_locked); if (ret == -EDEADLK) { struct msm_gem_object *msm_obj = submit->bos[contended].obj; @@ -325,16 +337,18 @@ static int submit_pin_objects(struct msm_gem_submit *submit) submit->valid = true; for (i = 0; i < submit->nr_bos; i++) { - struct msm_gem_object *msm_obj = submit->bos[i].obj; + struct drm_gem_object *obj = &submit->bos[i].obj->base; uint64_t iova; /* if locking succeeded, pin bo: */ - ret = msm_gem_get_and_pin_iova_locked(&msm_obj->base, + ret = msm_gem_get_and_pin_iova_locked(obj, submit->aspace, &iova); if (ret) break; + msm_gem_active_get(obj, submit->gpu); + submit->bos[i].flags |= BO_PINNED; if (iova == 
submit->bos[i].iova) { @@ -350,6 +364,20 @@ static int submit_pin_objects(struct msm_gem_submit *submit) return re
[PATCH v3 07/13] drm/msm: Track "seqno" fences by idr
From: Rob Clark Previously the (non-fd) fence returned from submit ioctl was a raw seqno, which is scoped to the ring. But from UABI standpoint, the ioctls related to seqno fences all specify a submitqueue. We can take advantage of that to replace the seqno fences with a cyclic idr handle. This is in preparation for moving to drm scheduler, at which point the submit ioctl will return after queuing the submit job to the scheduler, but before the submit is written into the ring (and therefore before a ring seqno has been assigned). Which means we need to replace the dma_fence that userspace may need to wait on with a scheduler fence. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_drv.c | 30 +++-- drivers/gpu/drm/msm/msm_fence.c | 39 --- drivers/gpu/drm/msm/msm_fence.h | 2 -- drivers/gpu/drm/msm/msm_gem.h | 1 + drivers/gpu/drm/msm/msm_gem_submit.c | 23 +++- drivers/gpu/drm/msm/msm_gpu.h | 5 drivers/gpu/drm/msm/msm_submitqueue.c | 5 7 files changed, 61 insertions(+), 44 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 9b8fa2ad0d84..1594ae39d54f 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -911,6 +911,7 @@ static int msm_ioctl_wait_fence(struct drm_device *dev, void *data, ktime_t timeout = to_ktime(args->timeout); struct msm_gpu_submitqueue *queue; struct msm_gpu *gpu = priv->gpu; + struct dma_fence *fence; int ret; if (args->pad) { @@ -925,10 +926,35 @@ static int msm_ioctl_wait_fence(struct drm_device *dev, void *data, if (!queue) return -ENOENT; - ret = msm_wait_fence(gpu->rb[queue->prio]->fctx, args->fence, &timeout, - true); + /* +* Map submitqueue scoped "seqno" (which is actually an idr key) +* back to underlying dma-fence +* +* The fence is removed from the fence_idr when the submit is +* retired, so if the fence is not found it means there is nothing +* to wait for +*/ + ret = mutex_lock_interruptible(&queue->lock); + if (ret) + return ret; + fence 
= idr_find(&queue->fence_idr, args->fence); + if (fence) + fence = dma_fence_get_rcu(fence); + mutex_unlock(&queue->lock); + + if (!fence) + return 0; + ret = dma_fence_wait_timeout(fence, true, timeout_to_jiffies(&timeout)); + if (ret == 0) { + ret = -ETIMEDOUT; + } else if (ret != -ERESTARTSYS) { + ret = 0; + } + + dma_fence_put(fence); msm_submitqueue_put(queue); + return ret; } diff --git a/drivers/gpu/drm/msm/msm_fence.c b/drivers/gpu/drm/msm/msm_fence.c index b92a9091a1e2..d8228029708e 100644 --- a/drivers/gpu/drm/msm/msm_fence.c +++ b/drivers/gpu/drm/msm/msm_fence.c @@ -45,45 +45,6 @@ static inline bool fence_completed(struct msm_fence_context *fctx, uint32_t fenc (int32_t)(*fctx->fenceptr - fence) >= 0; } -/* legacy path for WAIT_FENCE ioctl: */ -int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence, - ktime_t *timeout, bool interruptible) -{ - int ret; - - if (fence > fctx->last_fence) { - DRM_ERROR_RATELIMITED("%s: waiting on invalid fence: %u (of %u)\n", - fctx->name, fence, fctx->last_fence); - return -EINVAL; - } - - if (!timeout) { - /* no-wait: */ - ret = fence_completed(fctx, fence) ? 0 : -EBUSY; - } else { - unsigned long remaining_jiffies = timeout_to_jiffies(timeout); - - if (interruptible) - ret = wait_event_interruptible_timeout(fctx->event, - fence_completed(fctx, fence), - remaining_jiffies); - else - ret = wait_event_timeout(fctx->event, - fence_completed(fctx, fence), - remaining_jiffies); - - if (ret == 0) { - DBG("timeout waiting for fence: %u (completed: %u)", - fence, fctx->completed_fence); - ret = -ETIMEDOUT; - } else if (ret != -ERESTARTSYS) { - ret = 0; - } - } - - return ret; -} - /* called from workqueue */ void msm_update_fence(struct msm_fence_context *fctx, uint32_t fence) { diff --git a/drivers/gpu/drm/msm/msm_fence.h b/drivers/gpu/drm/msm/msm_fence.h index 6ab97062ff1a..6de97d0f5153 100644 --- a/drivers/gpu/drm/msm/msm_fence.h +++ b/drivers/gpu/dr
[PATCH v3 08/13] drm/msm: Return ERR_PTR() from submit_create()
From: Rob Clark In the next patch, we start having more than a single potential failure reason. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_gem_submit.c | 21 + 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index f6f595aae2c5..f570155bc086 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -32,30 +32,27 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, uint32_t nr_cmds) { struct msm_gem_submit *submit; - uint64_t sz = struct_size(submit, bos, nr_bos) + - ((u64)nr_cmds * sizeof(submit->cmd[0])); + uint64_t sz; + + sz = struct_size(submit, bos, nr_bos) + + ((u64)nr_cmds * sizeof(submit->cmd[0])); if (sz > SIZE_MAX) - return NULL; + return ERR_PTR(-ENOMEM); - submit = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + submit = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (!submit) - return NULL; + return ERR_PTR(-ENOMEM); kref_init(&submit->ref); submit->dev = dev; submit->aspace = queue->ctx->aspace; submit->gpu = gpu; - submit->fence = NULL; submit->cmd = (void *)&submit->bos[nr_bos]; submit->queue = queue; submit->ring = gpu->rb[queue->prio]; submit->fault_dumped = false; - /* initially, until copy_from_user() and bo lookup succeeds: */ - submit->nr_bos = 0; - submit->nr_cmds = 0; - INIT_LIST_HEAD(&submit->node); INIT_LIST_HEAD(&submit->bo_list); @@ -799,8 +796,8 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, submit = submit_create(dev, gpu, queue, args->nr_bos, args->nr_cmds); - if (!submit) { - ret = -ENOMEM; + if (IS_ERR(submit)) { + ret = PTR_ERR(submit); goto out_unlock; } -- 2.31.1
[PATCH v3 10/13] drm/msm: Drop submit bo_list
From: Rob Clark This was only used to detect userspace including the same bo multiple times in a submit. But ww_mutex can already tell us this. When we drop struct_mutex around the submit ioctl, we'd otherwise need to lock the bo before adding it to the bo_list. But since ww_mutex can already tell us this, it is simpler just to remove the bo_list. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem.c| 1 - drivers/gpu/drm/msm/msm_gem.h| 8 drivers/gpu/drm/msm/msm_gem_submit.c | 28 +--- 3 files changed, 13 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index a527a6b1d6ba..af199ef53d2f 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -1151,7 +1151,6 @@ static int msm_gem_new_impl(struct drm_device *dev, msm_obj->flags = flags; msm_obj->madv = MSM_MADV_WILLNEED; - INIT_LIST_HEAD(&msm_obj->submit_entry); INIT_LIST_HEAD(&msm_obj->vmas); *obj = &msm_obj->base; diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index a48114058ff9..f9e3ffb2309a 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -88,13 +88,6 @@ struct msm_gem_object { */ struct list_head mm_list; - /* Transiently in the process of submit ioctl, objects associated -* with the submit are on submit->bo_list.. this only lasts for -* the duration of the ioctl, so one bo can never be on multiple -* submit lists. 
-*/ - struct list_head submit_entry; - struct page **pages; struct sg_table *sgt; void *vaddr; @@ -316,7 +309,6 @@ struct msm_gem_submit { struct msm_gpu *gpu; struct msm_gem_address_space *aspace; struct list_head node; /* node in ring submit list */ - struct list_head bo_list; struct ww_acquire_ctx ticket; uint32_t seqno; /* Sequence number of the submit on the ring */ diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 2b158433a6e5..e11e4bb63695 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -63,7 +63,6 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, submit->fault_dumped = false; INIT_LIST_HEAD(&submit->node); - INIT_LIST_HEAD(&submit->bo_list); return submit; } @@ -143,7 +142,6 @@ static int submit_lookup_objects(struct msm_gem_submit *submit, for (i = 0; i < args->nr_bos; i++) { struct drm_gem_object *obj; - struct msm_gem_object *msm_obj; /* normally use drm_gem_object_lookup(), but for bulk lookup * all under single table_lock just hit object_idr directly: @@ -155,20 +153,9 @@ static int submit_lookup_objects(struct msm_gem_submit *submit, goto out_unlock; } - msm_obj = to_msm_bo(obj); - - if (!list_empty(&msm_obj->submit_entry)) { - DRM_ERROR("handle %u at index %u already on submit list\n", - submit->bos[i].handle, i); - ret = -EINVAL; - goto out_unlock; - } - drm_gem_object_get(obj); - submit->bos[i].obj = msm_obj; - - list_add_tail(&msm_obj->submit_entry, &submit->bo_list); + submit->bos[i].obj = to_msm_bo(obj); } out_unlock: @@ -299,6 +286,12 @@ static int submit_lock_objects(struct msm_gem_submit *submit) return 0; fail: + if (ret == -EALREADY) { + DRM_ERROR("handle %u at index %u already on submit list\n", + submit->bos[i].handle, i); + ret = -EINVAL; + } + for (; i >= 0; i--) submit_unlock_unpin_bo(submit, i); @@ -315,6 +308,12 @@ static int submit_lock_objects(struct msm_gem_submit *submit) slow_locked = contended; goto retry; } + + /* 
Not expecting -EALREADY here, if the bo was already +* locked, we should have gotten -EALREADY already from +* the dma_resv_lock_interruptable() call. +*/ + WARN_ON_ONCE(ret == -EALREADY); } return ret; @@ -508,7 +507,6 @@ static void submit_cleanup(struct msm_gem_submit *submit, bool error) for (i = 0; i < submit->nr_bos; i++) { struct msm_gem_object *msm_obj = submit->bos[i].obj; submit_cleanup_bo(submit, i, cleanup_flags); - list_del_init(&msm_obj->submit_entry); if (error) drm_gem_object_put(&msm_obj->base); } -- 2.31.1
[PATCH v3 11/13] drm/msm: Drop struct_mutex in submit path
From: Rob Clark It is sufficient to serialize on the submit queue now. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_gem_submit.c | 12 +--- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index e11e4bb63695..450efe59abb5 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -709,7 +709,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, struct msm_drm_private *priv = dev->dev_private; struct drm_msm_gem_submit *args = data; struct msm_file_private *ctx = file->driver_priv; - struct msm_gem_submit *submit; + struct msm_gem_submit *submit = NULL; struct msm_gpu *gpu = priv->gpu; struct msm_gpu_submitqueue *queue; struct msm_ringbuffer *ring; @@ -753,7 +753,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, trace_msm_gpu_submit(pid_nr(pid), ring->id, submitid, args->nr_bos, args->nr_cmds); - ret = mutex_lock_interruptible(&dev->struct_mutex); + ret = mutex_lock_interruptible(&queue->lock); if (ret) goto out_post_unlock; @@ -874,10 +874,8 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, * Allocate an id which can be used by WAIT_FENCE ioctl to map back * to the underlying fence. 
*/ - mutex_lock(&queue->lock); submit->fence_id = idr_alloc_cyclic(&queue->fence_idr, submit->user_fence, 0, INT_MAX, GFP_KERNEL); - mutex_unlock(&queue->lock); if (submit->fence_id < 0) { ret = submit->fence_id = 0; submit->fence_id = 0; @@ -912,12 +910,12 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, submit_cleanup(submit, !!ret); if (has_ww_ticket) ww_acquire_fini(&submit->ticket); - msm_gem_submit_put(submit); out_unlock: if (ret && (out_fence_fd >= 0)) put_unused_fd(out_fence_fd); - mutex_unlock(&dev->struct_mutex); - + mutex_unlock(&queue->lock); + if (submit) + msm_gem_submit_put(submit); out_post_unlock: if (!IS_ERR_OR_NULL(post_deps)) { for (i = 0; i < args->nr_out_syncobjs; ++i) { -- 2.31.1
[PATCH v3 09/13] drm/msm: Conversion to drm scheduler
From: Rob Clark For existing adrenos, there is one or more ringbuffer, depending on whether preemption is supported. When preemption is supported, each ringbuffer has it's own priority. A submitqueue (which maps to a gl context or vk queue in userspace) is mapped to a specific ring- buffer at creation time, based on the submitqueue's priority. Each ringbuffer has it's own drm_gpu_scheduler. Each submitqueue maps to a drm_sched_entity. And each submit maps to a drm_sched_job. Closes: https://gitlab.freedesktop.org/drm/msm/-/issues/4 Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/Kconfig | 1 + drivers/gpu/drm/msm/msm_gem.c | 35 -- drivers/gpu/drm/msm/msm_gem.h | 26 - drivers/gpu/drm/msm/msm_gem_submit.c | 161 +- drivers/gpu/drm/msm/msm_gpu.c | 13 +-- drivers/gpu/drm/msm/msm_gpu.h | 2 + drivers/gpu/drm/msm/msm_rd.c | 6 +- drivers/gpu/drm/msm/msm_ringbuffer.c | 66 +++ drivers/gpu/drm/msm/msm_ringbuffer.h | 12 ++ drivers/gpu/drm/msm/msm_submitqueue.c | 26 + 10 files changed, 217 insertions(+), 131 deletions(-) diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig index 52536e7adb95..dc7f3e40850b 100644 --- a/drivers/gpu/drm/msm/Kconfig +++ b/drivers/gpu/drm/msm/Kconfig @@ -14,6 +14,7 @@ config DRM_MSM select REGULATOR select DRM_KMS_HELPER select DRM_PANEL + select DRM_SCHED select SHMEM select TMPFS select QCOM_SCM if ARCH_QCOM diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 4e99c448b83a..a527a6b1d6ba 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -806,41 +806,6 @@ void msm_gem_vunmap(struct drm_gem_object *obj) msm_obj->vaddr = NULL; } -/* must be called before _move_to_active().. 
*/ -int msm_gem_sync_object(struct drm_gem_object *obj, - struct msm_fence_context *fctx, bool exclusive) -{ - struct dma_resv_list *fobj; - struct dma_fence *fence; - int i, ret; - - fobj = dma_resv_shared_list(obj->resv); - if (!fobj || (fobj->shared_count == 0)) { - fence = dma_resv_excl_fence(obj->resv); - /* don't need to wait on our own fences, since ring is fifo */ - if (fence && (fence->context != fctx->context)) { - ret = dma_fence_wait(fence, true); - if (ret) - return ret; - } - } - - if (!exclusive || !fobj) - return 0; - - for (i = 0; i < fobj->shared_count; i++) { - fence = rcu_dereference_protected(fobj->shared[i], - dma_resv_held(obj->resv)); - if (fence->context != fctx->context) { - ret = dma_fence_wait(fence, true); - if (ret) - return ret; - } - } - - return 0; -} - void msm_gem_active_get(struct drm_gem_object *obj, struct msm_gpu *gpu) { struct msm_gem_object *msm_obj = to_msm_bo(obj); diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index e0579abda5b9..a48114058ff9 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -9,6 +9,7 @@ #include #include +#include "drm/gpu_scheduler.h" #include "msm_drv.h" /* Make all GEM related WARN_ON()s ratelimited.. when things go wrong they @@ -143,8 +144,6 @@ void *msm_gem_get_vaddr_active(struct drm_gem_object *obj); void msm_gem_put_vaddr_locked(struct drm_gem_object *obj); void msm_gem_put_vaddr(struct drm_gem_object *obj); int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv); -int msm_gem_sync_object(struct drm_gem_object *obj, - struct msm_fence_context *fctx, bool exclusive); void msm_gem_active_get(struct drm_gem_object *obj, struct msm_gpu *gpu); void msm_gem_active_put(struct drm_gem_object *obj); int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout); @@ -311,6 +310,7 @@ void msm_gem_vunmap(struct drm_gem_object *obj); * make it easier to unwind when things go wrong, etc). 
*/ struct msm_gem_submit { + struct drm_sched_job base; struct kref ref; struct drm_device *dev; struct msm_gpu *gpu; @@ -319,7 +319,22 @@ struct msm_gem_submit { struct list_head bo_list; struct ww_acquire_ctx ticket; uint32_t seqno; /* Sequence number of the submit on the ring */ - struct dma_fence *fence; + + /* Array of struct dma_fence * to block on before submitting this job. +*/ + struct xarray deps; + unsigned long last_dep; + + /* Hw fence, which is created when the scheduler executes the
[PATCH v3 12/13] drm/msm: Utilize gpu scheduler priorities
From: Rob Clark The drm/scheduler provides additional prioritization on top of that provided by however many number of ringbuffers (each with their own priority level) is supported on a given generation. Expose the additional levels of priority to userspace and map the userspace priority back to ring (first level of priority) and scheduler priority (additional priority levels within the ring). Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 4 +- drivers/gpu/drm/msm/msm_gem_submit.c| 4 +- drivers/gpu/drm/msm/msm_gpu.h | 58 - drivers/gpu/drm/msm/msm_submitqueue.c | 35 +++ include/uapi/drm/msm_drm.h | 14 +- 5 files changed, 88 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index bad4809b68ef..748665232d29 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -261,8 +261,8 @@ int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value) return ret; } return -EINVAL; - case MSM_PARAM_NR_RINGS: - *value = gpu->nr_rings; + case MSM_PARAM_PRIORITIES: + *value = gpu->nr_rings * NR_SCHED_PRIORITIES; return 0; case MSM_PARAM_PP_PGTABLE: *value = 0; diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 450efe59abb5..c2ecec5b11c4 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -59,7 +59,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, submit->gpu = gpu; submit->cmd = (void *)&submit->bos[nr_bos]; submit->queue = queue; - submit->ring = gpu->rb[queue->prio]; + submit->ring = gpu->rb[queue->ring_nr]; submit->fault_dumped = false; INIT_LIST_HEAD(&submit->node); @@ -749,7 +749,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, /* Get a unique identifier for the submission for logging purposes */ submitid = atomic_inc_return(&ident) - 1; - ring = gpu->rb[queue->prio]; + 
ring = gpu->rb[queue->ring_nr]; trace_msm_gpu_submit(pid_nr(pid), ring->id, submitid, args->nr_bos, args->nr_cmds); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index b912cacaecc0..0e4b45bff2e6 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -250,6 +250,59 @@ struct msm_gpu_perfcntr { const char *name; }; +/* + * The number of priority levels provided by drm gpu scheduler. The + * DRM_SCHED_PRIORITY_KERNEL priority level is treated specially in some + * cases, so we don't use it (no need for kernel generated jobs). + */ +#define NR_SCHED_PRIORITIES (1 + DRM_SCHED_PRIORITY_HIGH - DRM_SCHED_PRIORITY_MIN) + +/** + * msm_gpu_convert_priority - Map userspace priority to ring # and sched priority + * + * @gpu:the gpu instance + * @prio: the userspace priority level + * @ring_nr:[out] the ringbuffer the userspace priority maps to + * @sched_prio: [out] the gpu scheduler priority level which the userspace + * priority maps to + * + * With drm/scheduler providing it's own level of prioritization, our total + * number of available priority levels is (nr_rings * NR_SCHED_PRIORITIES). + * Each ring is associated with it's own scheduler instance. However, our + * UABI is that lower numerical values are higher priority. So mapping the + * single userspace priority level into ring_nr and sched_prio takes some + * care. The userspace provided priority (when a submitqueue is created) + * is mapped to ring nr and scheduler priority as such: + * + * ring_nr= userspace_prio / NR_SCHED_PRIORITIES + * sched_prio = NR_SCHED_PRIORITIES - + *(userspace_prio % NR_SCHED_PRIORITIES) - 1 + * + * This allows generations without preemption (nr_rings==1) to have some + * amount of prioritization, and provides more priority levels for gens + * that do have preemption. 
+ */ +static inline int msm_gpu_convert_priority(struct msm_gpu *gpu, int prio, + unsigned *ring_nr, enum drm_sched_priority *sched_prio) +{ + unsigned rn, sp; + + rn = div_u64_rem(prio, NR_SCHED_PRIORITIES, &sp); + + /* invert sched priority to map to higher-numeric-is-higher- +* priority convention +*/ + sp = NR_SCHED_PRIORITIES - sp - 1; + + if (rn >= gpu->nr_rings) + return -EINVAL; + + *ring_nr = rn; + *sched_prio = sp; + + return 0; +} + /** * A submitqueue is associated with a gl context or vk queue (or equiv) * in userspace. @@ -257,7 +310,8 @@ st
[PATCH v3 13/13] drm/msm/gem: Mark active before pinning
From: Rob Clark Mark all the bos in the submit as active, before pinning, to prevent evicting a buffer in the same submit to make room for a buffer earlier in the table. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem.c| 2 -- drivers/gpu/drm/msm/msm_gem_submit.c | 28 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index af199ef53d2f..15b1804fa64e 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -131,7 +131,6 @@ static struct page **get_pages(struct drm_gem_object *obj) if (msm_obj->flags & (MSM_BO_WC|MSM_BO_UNCACHED)) sync_for_device(msm_obj); - GEM_WARN_ON(msm_obj->active_count); update_inactive(msm_obj); } @@ -815,7 +814,6 @@ void msm_gem_active_get(struct drm_gem_object *obj, struct msm_gpu *gpu) GEM_WARN_ON(!msm_gem_is_locked(obj)); GEM_WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED); GEM_WARN_ON(msm_obj->dontneed); - GEM_WARN_ON(!msm_obj->sgt); if (msm_obj->active_count++ == 0) { mutex_lock(&priv->mm_lock); diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index c2ecec5b11c4..fc25a85eb1ca 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -24,7 +24,8 @@ /* make sure these don't conflict w/ MSM_SUBMIT_BO_x */ #define BO_VALID0x8000 /* is current addr in cmdstream correct/valid? 
*/ #define BO_LOCKED 0x4000 /* obj lock is held */ -#define BO_PINNED 0x2000 /* obj is pinned and on active list */ +#define BO_ACTIVE 0x2000 /* active refcnt is held */ +#define BO_PINNED 0x1000 /* obj is pinned and on active list */ static struct msm_gem_submit *submit_create(struct drm_device *dev, struct msm_gpu *gpu, @@ -239,10 +240,11 @@ static void submit_cleanup_bo(struct msm_gem_submit *submit, int i, struct drm_gem_object *obj = &submit->bos[i].obj->base; unsigned flags = submit->bos[i].flags & cleanup_flags; - if (flags & BO_PINNED) { + if (flags & BO_PINNED) msm_gem_unpin_iova_locked(obj, submit->aspace); + + if (flags & BO_ACTIVE) msm_gem_active_put(obj); - } if (flags & BO_LOCKED) dma_resv_unlock(obj->resv); @@ -252,7 +254,7 @@ static void submit_cleanup_bo(struct msm_gem_submit *submit, int i, static void submit_unlock_unpin_bo(struct msm_gem_submit *submit, int i) { - submit_cleanup_bo(submit, i, BO_PINNED | BO_LOCKED); + submit_cleanup_bo(submit, i, BO_PINNED | BO_ACTIVE | BO_LOCKED); if (!(submit->bos[i].flags & BO_VALID)) submit->bos[i].iova = 0; @@ -356,6 +358,18 @@ static int submit_pin_objects(struct msm_gem_submit *submit) submit->valid = true; + /* +* Increment active_count first, so if under memory pressure, we +* don't inadvertently evict a bo needed by the submit in order +* to pin an earlier bo in the same submit. 
+*/ + for (i = 0; i < submit->nr_bos; i++) { + struct drm_gem_object *obj = &submit->bos[i].obj->base; + + msm_gem_active_get(obj, submit->gpu); + submit->bos[i].flags |= BO_ACTIVE; + } + for (i = 0; i < submit->nr_bos; i++) { struct drm_gem_object *obj = &submit->bos[i].obj->base; uint64_t iova; @@ -367,8 +381,6 @@ static int submit_pin_objects(struct msm_gem_submit *submit) if (ret) break; - msm_gem_active_get(obj, submit->gpu); - submit->bos[i].flags |= BO_PINNED; if (iova == submit->bos[i].iova) { @@ -502,7 +514,7 @@ static void submit_cleanup(struct msm_gem_submit *submit, bool error) unsigned i; if (error) - cleanup_flags |= BO_PINNED; + cleanup_flags |= BO_PINNED | BO_ACTIVE; for (i = 0; i < submit->nr_bos; i++) { struct msm_gem_object *msm_obj = submit->bos[i].obj; @@ -520,7 +532,7 @@ void msm_submit_retire(struct msm_gem_submit *submit) struct drm_gem_object *obj = &submit->bos[i].obj->base; msm_gem_lock(obj); - submit_cleanup_bo(submit, i, BO_PINNED); + submit_cleanup_bo(submit, i, BO_PINNED | BO_ACTIVE); msm_gem_unlock(obj); drm_gem_object_put(obj); } -- 2.31.1
[RFC 5/4] drm/msm: Add deadline based boost support
From: Rob Clark Signed-off-by: Rob Clark --- This is a quick implementation of what I had in mind for driver side of deadline boost. For a couple games with bad gpu devfreq behavior this boosts "Render quality" from ~35% to ~95%. (The "Render quality" metric in chrome://arc-overview-tracing is basically a measure of the deviation in frame/commit time, so 100% would be a consistent fps with no variation.) Not quite 100%, this is still a bit of a reactive mechanism. A similar result can be had by tuning devfreq to boost to max OPP at a much lower threshold of busyness. With the obvious downside that you spend a lot of time running the GPU much faster than needed. drivers/gpu/drm/msm/msm_fence.c | 76 +++ drivers/gpu/drm/msm/msm_fence.h | 20 +++ drivers/gpu/drm/msm/msm_gpu.h | 1 + drivers/gpu/drm/msm/msm_gpu_devfreq.c | 20 +++ 4 files changed, 117 insertions(+) diff --git a/drivers/gpu/drm/msm/msm_fence.c b/drivers/gpu/drm/msm/msm_fence.c index f2cece542c3f..67c2a96e1c85 100644 --- a/drivers/gpu/drm/msm/msm_fence.c +++ b/drivers/gpu/drm/msm/msm_fence.c @@ -8,6 +8,37 @@ #include "msm_drv.h" #include "msm_fence.h" +#include "msm_gpu.h" + +static inline bool fence_completed(struct msm_fence_context *fctx, uint32_t fence); + +static struct msm_gpu *fctx2gpu(struct msm_fence_context *fctx) +{ + struct msm_drm_private *priv = fctx->dev->dev_private; + return priv->gpu; +} + +static enum hrtimer_restart deadline_timer(struct hrtimer *t) +{ + struct msm_fence_context *fctx = container_of(t, + struct msm_fence_context, deadline_timer); + + kthread_queue_work(fctx2gpu(fctx)->worker, &fctx->deadline_work); + + return HRTIMER_NORESTART; +} + +static void deadline_work(struct kthread_work *work) +{ + struct msm_fence_context *fctx = container_of(work, + struct msm_fence_context, deadline_work); + + /* If deadline fence has already passed, nothing to do: */ + if (fence_completed(fctx, fctx->next_deadline_fence)) + return; + + msm_devfreq_boost(fctx2gpu(fctx), 2); +} struct 
msm_fence_context * @@ -26,6 +57,13 @@ msm_fence_context_alloc(struct drm_device *dev, volatile uint32_t *fenceptr, fctx->fenceptr = fenceptr; spin_lock_init(&fctx->spinlock); + hrtimer_init(&fctx->deadline_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + fctx->deadline_timer.function = deadline_timer; + + kthread_init_work(&fctx->deadline_work, deadline_work); + + fctx->next_deadline = ktime_get(); + return fctx; } @@ -49,6 +87,8 @@ void msm_update_fence(struct msm_fence_context *fctx, uint32_t fence) { spin_lock(&fctx->spinlock); fctx->completed_fence = max(fence, fctx->completed_fence); + if (fence_completed(fctx, fctx->next_deadline_fence)) + hrtimer_cancel(&fctx->deadline_timer); spin_unlock(&fctx->spinlock); } @@ -79,10 +119,46 @@ static bool msm_fence_signaled(struct dma_fence *fence) return fence_completed(f->fctx, f->base.seqno); } +static void msm_fence_set_deadline(struct dma_fence *fence, ktime_t deadline) +{ + struct msm_fence *f = to_msm_fence(fence); + struct msm_fence_context *fctx = f->fctx; + unsigned long flags; + ktime_t now; + + spin_lock_irqsave(&fctx->spinlock, flags); + now = ktime_get(); + + if (ktime_after(now, fctx->next_deadline) || + ktime_before(deadline, fctx->next_deadline)) { + fctx->next_deadline = deadline; + fctx->next_deadline_fence = + max(fctx->next_deadline_fence, (uint32_t)fence->seqno); + + /* +* Set timer to trigger boost 3ms before deadline, or +* if we are already less than 3ms before the deadline +* schedule boost work immediately. 
+*/ + deadline = ktime_sub(deadline, ms_to_ktime(3)); + + if (ktime_after(now, deadline)) { + kthread_queue_work(fctx2gpu(fctx)->worker, + &fctx->deadline_work); + } else { + hrtimer_start(&fctx->deadline_timer, deadline, + HRTIMER_MODE_ABS); + } + } + + spin_unlock_irqrestore(&fctx->spinlock, flags); +} + static const struct dma_fence_ops msm_fence_ops = { .get_driver_name = msm_fence_get_driver_name, .get_timeline_name = msm_fence_get_timeline_name, .signaled = msm_fence_signaled, + .set_deadline = msm_fence_set_deadline, }; struct dma_fence * diff --git a/drivers/gpu/drm/m
[PATCH v4 00/13] drm/msm: drm scheduler conversion and cleanups
From: Rob Clark Conversion to gpu_scheduler, and bonus removal of drm_gem_object_put_locked() v2: Fix priority mixup (msm UAPI has lower numeric priority value as higher priority, inverse of drm/scheduler) and add some comments in the UAPI header to clarify. Now that we move active refcnt get into msm_gem_submit, add a patch to mark all bos busy before pinning, to avoid evicting bos used in same batch. Fix bo locking for cmdstream dumping ($debugfs/n/{rd,hangrd}) v3: Add a patch to drop submit bo_list and instead use -EALREADY to detect errors with same obj appearing multiple times in the submit ioctl bos table. Otherwise, with struct_mutex locking dropped, we'd need to move insertion into and removal from bo_list under the obj lock. v4: One last small tweak, drop unused wait_queue_head_t in msm_fence_context Rob Clark (13): drm/msm: Docs and misc cleanup drm/msm: Small submitqueue creation cleanup drm/msm: drop drm_gem_object_put_locked() drm: Drop drm_gem_object_put_locked() drm/msm/submit: Simplify out-fence-fd handling drm/msm: Consolidate submit bo state drm/msm: Track "seqno" fences by idr drm/msm: Return ERR_PTR() from submit_create() drm/msm: Conversion to drm scheduler drm/msm: Drop submit bo_list drm/msm: Drop struct_mutex in submit path drm/msm: Utilize gpu scheduler priorities drm/msm/gem: Mark active before pinning drivers/gpu/drm/drm_gem.c | 22 -- drivers/gpu/drm/msm/Kconfig | 1 + drivers/gpu/drm/msm/adreno/a5xx_debugfs.c | 4 +- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 6 +- drivers/gpu/drm/msm/adreno/a5xx_power.c | 2 +- drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 7 +- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 12 +- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 2 +- drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 4 +- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 6 +- drivers/gpu/drm/msm/msm_drv.c | 30 +- drivers/gpu/drm/msm/msm_fence.c | 42 --- drivers/gpu/drm/msm/msm_fence.h | 3 - drivers/gpu/drm/msm/msm_gem.c | 94 +- drivers/gpu/drm/msm/msm_gem.h | 47 +-- 
drivers/gpu/drm/msm/msm_gem_submit.c| 344 drivers/gpu/drm/msm/msm_gpu.c | 46 +-- drivers/gpu/drm/msm/msm_gpu.h | 78 - drivers/gpu/drm/msm/msm_rd.c| 6 +- drivers/gpu/drm/msm/msm_ringbuffer.c| 70 +++- drivers/gpu/drm/msm/msm_ringbuffer.h| 12 + drivers/gpu/drm/msm/msm_submitqueue.c | 53 ++- include/drm/drm_gem.h | 2 - include/uapi/drm/msm_drm.h | 14 +- 24 files changed, 516 insertions(+), 391 deletions(-) -- 2.31.1
[PATCH v4 02/13] drm/msm: Small submitqueue creation cleanup
From: Rob Clark If we don't have a gpu, there is no need to create a submitqueue, which lets us simplify the error handling and submitqueue creation. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_submitqueue.c | 22 +++--- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c index e5eef11ed014..9e9fec61d629 100644 --- a/drivers/gpu/drm/msm/msm_submitqueue.c +++ b/drivers/gpu/drm/msm/msm_submitqueue.c @@ -66,6 +66,12 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, if (!ctx) return -ENODEV; + if (!priv->gpu) + return -ENODEV; + + if (prio >= priv->gpu->nr_rings) + return -EINVAL; + queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) @@ -73,15 +79,7 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, kref_init(&queue->ref); queue->flags = flags; - - if (priv->gpu) { - if (prio >= priv->gpu->nr_rings) { - kfree(queue); - return -EINVAL; - } - - queue->prio = prio; - } + queue->prio = prio; write_lock(&ctx->queuelock); @@ -107,12 +105,14 @@ int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx) struct msm_drm_private *priv = drm->dev_private; int default_prio; + if (!priv->gpu) + return -ENODEV; + /* * Select priority 2 as the "default priority" unless nr_rings is less * than 2 and then pick the lowest priority */ - default_prio = priv->gpu ? - clamp_t(uint32_t, 2, 0, priv->gpu->nr_rings - 1) : 0; + default_prio = clamp_t(uint32_t, 2, 0, priv->gpu->nr_rings - 1); INIT_LIST_HEAD(&ctx->submitqueues); -- 2.31.1
[PATCH v4 01/13] drm/msm: Docs and misc cleanup
From: Rob Clark Fix a couple incorrect or misspelt comments, and add submitqueue doc comment. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_gem.h | 3 +-- drivers/gpu/drm/msm/msm_gem_submit.c | 1 + drivers/gpu/drm/msm/msm_gpu.h | 15 +++ drivers/gpu/drm/msm/msm_ringbuffer.c | 2 +- drivers/gpu/drm/msm/msm_submitqueue.c | 9 + 5 files changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 405f8411e395..d69fcb37ce17 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -313,8 +313,7 @@ void msm_gem_vunmap(struct drm_gem_object *obj); /* Created per submit-ioctl, to track bo's and cmdstream bufs, etc, * associated with the cmdstream submission for synchronization (and - * make it easier to unwind when things go wrong, etc). This only - * lasts for the duration of the submit-ioctl. + * make it easier to unwind when things go wrong, etc). */ struct msm_gem_submit { struct kref ref; diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 44f84bfd0c0e..6d46f9275a40 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -655,6 +655,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, bool has_ww_ticket = false; unsigned i; int ret, submitid; + if (!gpu) return -ENXIO; diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 710c3fedfbf3..96efcb31e502 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -250,6 +250,21 @@ struct msm_gpu_perfcntr { const char *name; }; +/** + * A submitqueue is associated with a gl context or vk queue (or equiv) + * in userspace. 
+ * + * @id:userspace id for the submitqueue, unique within the drm_file + * @flags: userspace flags for the submitqueue, specified at creation + * (currently unused) + * @prio: the submitqueue priority + * @faults:the number of GPU hangs associated with this submitqueue + * @ctx: the per-drm_file context associated with the submitqueue (ie. + * which set of pgtables do submits jobs associated with the + * submitqueue use) + * @node: node in the context's list of submitqueues + * @ref: reference count + */ struct msm_gpu_submitqueue { int id; u32 flags; diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 7e92d9532454..054461662af5 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -32,7 +32,7 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id, if (IS_ERR(ring->start)) { ret = PTR_ERR(ring->start); - ring->start = 0; + ring->start = NULL; goto fail; } diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c index c3d206105d28..e5eef11ed014 100644 --- a/drivers/gpu/drm/msm/msm_submitqueue.c +++ b/drivers/gpu/drm/msm/msm_submitqueue.c @@ -98,17 +98,18 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, return 0; } +/* + * Create the default submit-queue (id==0), used for backwards compatibility + * for userspace that pre-dates the introduction of submitqueues. + */ int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx) { struct msm_drm_private *priv = drm->dev_private; int default_prio; - if (!ctx) - return 0; - /* * Select priority 2 as the "default priority" unless nr_rings is less -* than 2 and then pick the lowest pirority +* than 2 and then pick the lowest priority */ default_prio = priv->gpu ? clamp_t(uint32_t, 2, 0, priv->gpu->nr_rings - 1) : 0; -- 2.31.1
[PATCH v4 03/13] drm/msm: drop drm_gem_object_put_locked()
From: Rob Clark No idea why we were still using this. It certainly hasn't been needed for some time. So drop the pointless twin codepaths. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/adreno/a5xx_debugfs.c | 4 +- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 6 +-- drivers/gpu/drm/msm/adreno/a5xx_power.c | 2 +- drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 7 ++- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 12 ++--- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 2 +- drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 4 +- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/gpu/drm/msm/msm_gem.c | 56 - drivers/gpu/drm/msm/msm_gem.h | 7 +-- drivers/gpu/drm/msm/msm_gem_submit.c| 2 +- drivers/gpu/drm/msm/msm_gpu.c | 4 +- drivers/gpu/drm/msm/msm_ringbuffer.c| 2 +- 13 files changed, 33 insertions(+), 77 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c b/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c index fc2c905b6c9e..c9d11d57aed6 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_debugfs.c @@ -117,13 +117,13 @@ reset_set(void *data, u64 val) if (a5xx_gpu->pm4_bo) { msm_gem_unpin_iova(a5xx_gpu->pm4_bo, gpu->aspace); - drm_gem_object_put_locked(a5xx_gpu->pm4_bo); + drm_gem_object_put(a5xx_gpu->pm4_bo); a5xx_gpu->pm4_bo = NULL; } if (a5xx_gpu->pfp_bo) { msm_gem_unpin_iova(a5xx_gpu->pfp_bo, gpu->aspace); - drm_gem_object_put_locked(a5xx_gpu->pfp_bo); + drm_gem_object_put(a5xx_gpu->pfp_bo); a5xx_gpu->pfp_bo = NULL; } diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 7a271de9a212..0a93ed1d6b06 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -1415,7 +1415,7 @@ struct a5xx_gpu_state { static int a5xx_crashdumper_init(struct msm_gpu *gpu, struct a5xx_crashdumper *dumper) { - dumper->ptr = msm_gem_kernel_new_locked(gpu->dev, + dumper->ptr = msm_gem_kernel_new(gpu->dev, SZ_1M, MSM_BO_WC, gpu->aspace, &dumper->bo, 
&dumper->iova); @@ -1517,7 +1517,7 @@ static void a5xx_gpu_state_get_hlsq_regs(struct msm_gpu *gpu, if (a5xx_crashdumper_run(gpu, &dumper)) { kfree(a5xx_state->hlsqregs); - msm_gem_kernel_put(dumper.bo, gpu->aspace, true); + msm_gem_kernel_put(dumper.bo, gpu->aspace); return; } @@ -1525,7 +1525,7 @@ static void a5xx_gpu_state_get_hlsq_regs(struct msm_gpu *gpu, memcpy(a5xx_state->hlsqregs, dumper.ptr + (256 * SZ_1K), count * sizeof(u32)); - msm_gem_kernel_put(dumper.bo, gpu->aspace, true); + msm_gem_kernel_put(dumper.bo, gpu->aspace); } static struct msm_gpu_state *a5xx_gpu_state_get(struct msm_gpu *gpu) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_power.c b/drivers/gpu/drm/msm/adreno/a5xx_power.c index cdb165236a88..0e63a1429189 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_power.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_power.c @@ -362,7 +362,7 @@ void a5xx_gpmu_ucode_init(struct msm_gpu *gpu) */ bosize = (cmds_size + (cmds_size / TYPE4_MAX_PAYLOAD) + 1) << 2; - ptr = msm_gem_kernel_new_locked(drm, bosize, + ptr = msm_gem_kernel_new(drm, bosize, MSM_BO_WC | MSM_BO_GPU_READONLY, gpu->aspace, &a5xx_gpu->gpmu_bo, &a5xx_gpu->gpmu_iova); if (IS_ERR(ptr)) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c index ee72510ff8ce..8abc9a2b114a 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c @@ -240,7 +240,7 @@ static int preempt_init_ring(struct a5xx_gpu *a5xx_gpu, A5XX_PREEMPT_COUNTER_SIZE, MSM_BO_WC, gpu->aspace, &counters_bo, &counters_iova); if (IS_ERR(counters)) { - msm_gem_kernel_put(bo, gpu->aspace, true); + msm_gem_kernel_put(bo, gpu->aspace); return PTR_ERR(counters); } @@ -272,9 +272,8 @@ void a5xx_preempt_fini(struct msm_gpu *gpu) int i; for (i = 0; i < gpu->nr_rings; i++) { - msm_gem_kernel_put(a5xx_gpu->preempt_bo[i], gpu->aspace, true); - msm_gem_kernel_put(a5xx_gpu->preempt_counters_bo[i], - gpu->aspace, true); + msm_gem_kernel_put(a5xx_gpu->preempt_bo[i], 
gpu->aspace); + msm_gem_kernel_put(a5xx_gpu->preempt_counters_bo[i], gpu->aspace); } } di
[PATCH v4 04/13] drm: Drop drm_gem_object_put_locked()
From: Rob Clark Now that no one is using it, remove it. Signed-off-by: Rob Clark Acked-by: Christian König Reviewed-by: Daniel Vetter --- drivers/gpu/drm/drm_gem.c | 22 -- include/drm/drm_gem.h | 2 -- 2 files changed, 24 deletions(-) diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index d62fb1a3c916..a34525332bef 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -973,28 +973,6 @@ drm_gem_object_free(struct kref *kref) } EXPORT_SYMBOL(drm_gem_object_free); -/** - * drm_gem_object_put_locked - release a GEM buffer object reference - * @obj: GEM buffer object - * - * This releases a reference to @obj. Callers must hold the - * &drm_device.struct_mutex lock when calling this function, even when the - * driver doesn't use &drm_device.struct_mutex for anything. - * - * For drivers not encumbered with legacy locking use - * drm_gem_object_put() instead. - */ -void -drm_gem_object_put_locked(struct drm_gem_object *obj) -{ - if (obj) { - WARN_ON(!mutex_is_locked(&obj->dev->struct_mutex)); - - kref_put(&obj->refcount, drm_gem_object_free); - } -} -EXPORT_SYMBOL(drm_gem_object_put_locked); - /** * drm_gem_vm_open - vma->ops->open implementation for GEM * @vma: VM area structure diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h index 240049566592..35e7f44c2a75 100644 --- a/include/drm/drm_gem.h +++ b/include/drm/drm_gem.h @@ -384,8 +384,6 @@ drm_gem_object_put(struct drm_gem_object *obj) __drm_gem_object_put(obj); } -void drm_gem_object_put_locked(struct drm_gem_object *obj); - int drm_gem_handle_create(struct drm_file *file_priv, struct drm_gem_object *obj, u32 *handlep); -- 2.31.1
[PATCH v4 05/13] drm/msm/submit: Simplify out-fence-fd handling
From: Rob Clark No need for this to be split in two parts. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_gem_submit.c | 10 +++--- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index e789f68d5be1..8abd743adfb0 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -645,7 +645,6 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, struct msm_file_private *ctx = file->driver_priv; struct msm_gem_submit *submit; struct msm_gpu *gpu = priv->gpu; - struct sync_file *sync_file = NULL; struct msm_gpu_submitqueue *queue; struct msm_ringbuffer *ring; struct msm_submit_post_dep *post_deps = NULL; @@ -824,22 +823,19 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, } if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { - sync_file = sync_file_create(submit->fence); + struct sync_file *sync_file = sync_file_create(submit->fence); if (!sync_file) { ret = -ENOMEM; goto out; } + fd_install(out_fence_fd, sync_file->file); + args->fence_fd = out_fence_fd; } msm_gpu_submit(gpu, submit); args->fence = submit->fence->seqno; - if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { - fd_install(out_fence_fd, sync_file->file); - args->fence_fd = out_fence_fd; - } - msm_reset_syncobjs(syncobjs_to_reset, args->nr_in_syncobjs); msm_process_post_deps(post_deps, args->nr_out_syncobjs, submit->fence); -- 2.31.1
[PATCH v4 06/13] drm/msm: Consolidate submit bo state
From: Rob Clark Move all the locked/active/pinned state handling to msm_gem_submit.c. In particular, for drm/scheduler, we'll need to do all this before pushing the submit job to the scheduler. But while we're at it we can get rid of the duplicate pin and refcnt. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_gem.h| 2 + drivers/gpu/drm/msm/msm_gem_submit.c | 92 ++-- drivers/gpu/drm/msm/msm_gpu.c| 29 + 3 files changed, 75 insertions(+), 48 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 71ccf87a646b..da3af702a6c8 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -361,6 +361,8 @@ static inline void msm_gem_submit_put(struct msm_gem_submit *submit) kref_put(&submit->ref, __msm_gem_submit_destroy); } +void msm_submit_retire(struct msm_gem_submit *submit); + /* helper to determine of a buffer in submit should be dumped, used for both * devcoredump and debugfs cmdstream dumping: */ diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 8abd743adfb0..4f02fa3c78f9 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -23,8 +23,8 @@ /* make sure these don't conflict w/ MSM_SUBMIT_BO_x */ #define BO_VALID0x8000 /* is current addr in cmdstream correct/valid? */ -#define BO_LOCKED 0x4000 -#define BO_PINNED 0x2000 +#define BO_LOCKED 0x4000 /* obj lock is held */ +#define BO_PINNED 0x2000 /* obj is pinned and on active list */ static struct msm_gem_submit *submit_create(struct drm_device *dev, struct msm_gpu *gpu, @@ -220,21 +220,33 @@ static int submit_lookup_cmds(struct msm_gem_submit *submit, return ret; } -static void submit_unlock_unpin_bo(struct msm_gem_submit *submit, - int i, bool backoff) +/* Unwind bo state, according to cleanup_flags. In the success case, only + * the lock is dropped at the end of the submit (and active/pin ref is dropped + * later when the submit is retired). 
+ */ +static void submit_cleanup_bo(struct msm_gem_submit *submit, int i, + unsigned cleanup_flags) { - struct msm_gem_object *msm_obj = submit->bos[i].obj; + struct drm_gem_object *obj = &submit->bos[i].obj->base; + unsigned flags = submit->bos[i].flags & cleanup_flags; - if (submit->bos[i].flags & BO_PINNED) - msm_gem_unpin_iova_locked(&msm_obj->base, submit->aspace); + if (flags & BO_PINNED) { + msm_gem_unpin_iova_locked(obj, submit->aspace); + msm_gem_active_put(obj); + } - if (submit->bos[i].flags & BO_LOCKED) - dma_resv_unlock(msm_obj->base.resv); + if (flags & BO_LOCKED) + dma_resv_unlock(obj->resv); - if (backoff && !(submit->bos[i].flags & BO_VALID)) - submit->bos[i].iova = 0; + submit->bos[i].flags &= ~cleanup_flags; +} - submit->bos[i].flags &= ~(BO_LOCKED | BO_PINNED); +static void submit_unlock_unpin_bo(struct msm_gem_submit *submit, int i) +{ + submit_cleanup_bo(submit, i, BO_PINNED | BO_LOCKED); + + if (!(submit->bos[i].flags & BO_VALID)) + submit->bos[i].iova = 0; } /* This is where we make sure all the bo's are reserved and pin'd: */ @@ -266,10 +278,10 @@ static int submit_lock_objects(struct msm_gem_submit *submit) fail: for (; i >= 0; i--) - submit_unlock_unpin_bo(submit, i, true); + submit_unlock_unpin_bo(submit, i); if (slow_locked > 0) - submit_unlock_unpin_bo(submit, slow_locked, true); + submit_unlock_unpin_bo(submit, slow_locked); if (ret == -EDEADLK) { struct msm_gem_object *msm_obj = submit->bos[contended].obj; @@ -325,16 +337,18 @@ static int submit_pin_objects(struct msm_gem_submit *submit) submit->valid = true; for (i = 0; i < submit->nr_bos; i++) { - struct msm_gem_object *msm_obj = submit->bos[i].obj; + struct drm_gem_object *obj = &submit->bos[i].obj->base; uint64_t iova; /* if locking succeeded, pin bo: */ - ret = msm_gem_get_and_pin_iova_locked(&msm_obj->base, + ret = msm_gem_get_and_pin_iova_locked(obj, submit->aspace, &iova); if (ret) break; + msm_gem_active_get(obj, submit->gpu); + submit->bos[i].flags |= BO_PINNED; if (iova == 
submit->bos[i].iova) { @@ -350,6 +364,20 @@ static int submit_pin_objects(struct msm_gem_submit *submit) return re
[PATCH v4 07/13] drm/msm: Track "seqno" fences by idr
From: Rob Clark Previously the (non-fd) fence returned from submit ioctl was a raw seqno, which is scoped to the ring. But from UABI standpoint, the ioctls related to seqno fences all specify a submitqueue. We can take advantage of that to replace the seqno fences with a cyclic idr handle. This is in preparation for moving to drm scheduler, at which point the submit ioctl will return after queuing the submit job to the scheduler, but before the submit is written into the ring (and therefore before a ring seqno has been assigned). Which means we need to replace the dma_fence that userspace may need to wait on with a scheduler fence. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_drv.c | 30 +-- drivers/gpu/drm/msm/msm_fence.c | 42 --- drivers/gpu/drm/msm/msm_fence.h | 3 -- drivers/gpu/drm/msm/msm_gem.h | 1 + drivers/gpu/drm/msm/msm_gem_submit.c | 23 ++- drivers/gpu/drm/msm/msm_gpu.h | 5 drivers/gpu/drm/msm/msm_submitqueue.c | 5 7 files changed, 61 insertions(+), 48 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 9b8fa2ad0d84..1594ae39d54f 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -911,6 +911,7 @@ static int msm_ioctl_wait_fence(struct drm_device *dev, void *data, ktime_t timeout = to_ktime(args->timeout); struct msm_gpu_submitqueue *queue; struct msm_gpu *gpu = priv->gpu; + struct dma_fence *fence; int ret; if (args->pad) { @@ -925,10 +926,35 @@ static int msm_ioctl_wait_fence(struct drm_device *dev, void *data, if (!queue) return -ENOENT; - ret = msm_wait_fence(gpu->rb[queue->prio]->fctx, args->fence, &timeout, - true); + /* +* Map submitqueue scoped "seqno" (which is actually an idr key) +* back to underlying dma-fence +* +* The fence is removed from the fence_idr when the submit is +* retired, so if the fence is not found it means there is nothing +* to wait for +*/ + ret = mutex_lock_interruptible(&queue->lock); + if (ret) + return ret; + fence = 
idr_find(&queue->fence_idr, args->fence); + if (fence) + fence = dma_fence_get_rcu(fence); + mutex_unlock(&queue->lock); + + if (!fence) + return 0; + ret = dma_fence_wait_timeout(fence, true, timeout_to_jiffies(&timeout)); + if (ret == 0) { + ret = -ETIMEDOUT; + } else if (ret != -ERESTARTSYS) { + ret = 0; + } + + dma_fence_put(fence); msm_submitqueue_put(queue); + return ret; } diff --git a/drivers/gpu/drm/msm/msm_fence.c b/drivers/gpu/drm/msm/msm_fence.c index b92a9091a1e2..f2cece542c3f 100644 --- a/drivers/gpu/drm/msm/msm_fence.c +++ b/drivers/gpu/drm/msm/msm_fence.c @@ -24,7 +24,6 @@ msm_fence_context_alloc(struct drm_device *dev, volatile uint32_t *fenceptr, strncpy(fctx->name, name, sizeof(fctx->name)); fctx->context = dma_fence_context_alloc(1); fctx->fenceptr = fenceptr; - init_waitqueue_head(&fctx->event); spin_lock_init(&fctx->spinlock); return fctx; @@ -45,53 +44,12 @@ static inline bool fence_completed(struct msm_fence_context *fctx, uint32_t fenc (int32_t)(*fctx->fenceptr - fence) >= 0; } -/* legacy path for WAIT_FENCE ioctl: */ -int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence, - ktime_t *timeout, bool interruptible) -{ - int ret; - - if (fence > fctx->last_fence) { - DRM_ERROR_RATELIMITED("%s: waiting on invalid fence: %u (of %u)\n", - fctx->name, fence, fctx->last_fence); - return -EINVAL; - } - - if (!timeout) { - /* no-wait: */ - ret = fence_completed(fctx, fence) ? 0 : -EBUSY; - } else { - unsigned long remaining_jiffies = timeout_to_jiffies(timeout); - - if (interruptible) - ret = wait_event_interruptible_timeout(fctx->event, - fence_completed(fctx, fence), - remaining_jiffies); - else - ret = wait_event_timeout(fctx->event, - fence_completed(fctx, fence), - remaining_jiffies); - - if (ret == 0) { - DBG("timeout waiting for fence: %u (completed: %u)", - fence, fctx->completed_fence); - ret = -ETIMEDOUT; - } else if (ret != -E
[PATCH v4 10/13] drm/msm: Drop submit bo_list
From: Rob Clark This was only used to detect userspace including the same bo multiple times in a submit. But ww_mutex can already tell us this. When we drop struct_mutex around the submit ioctl, we'd otherwise need to lock the bo before adding it to the bo_list. But since ww_mutex can already tell us this, it is simpler just to remove the bo_list. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem.c| 1 - drivers/gpu/drm/msm/msm_gem.h| 8 drivers/gpu/drm/msm/msm_gem_submit.c | 28 +--- 3 files changed, 13 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index a527a6b1d6ba..af199ef53d2f 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -1151,7 +1151,6 @@ static int msm_gem_new_impl(struct drm_device *dev, msm_obj->flags = flags; msm_obj->madv = MSM_MADV_WILLNEED; - INIT_LIST_HEAD(&msm_obj->submit_entry); INIT_LIST_HEAD(&msm_obj->vmas); *obj = &msm_obj->base; diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index a48114058ff9..f9e3ffb2309a 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -88,13 +88,6 @@ struct msm_gem_object { */ struct list_head mm_list; - /* Transiently in the process of submit ioctl, objects associated -* with the submit are on submit->bo_list.. this only lasts for -* the duration of the ioctl, so one bo can never be on multiple -* submit lists. 
-*/ - struct list_head submit_entry; - struct page **pages; struct sg_table *sgt; void *vaddr; @@ -316,7 +309,6 @@ struct msm_gem_submit { struct msm_gpu *gpu; struct msm_gem_address_space *aspace; struct list_head node; /* node in ring submit list */ - struct list_head bo_list; struct ww_acquire_ctx ticket; uint32_t seqno; /* Sequence number of the submit on the ring */ diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 2b158433a6e5..e11e4bb63695 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -63,7 +63,6 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, submit->fault_dumped = false; INIT_LIST_HEAD(&submit->node); - INIT_LIST_HEAD(&submit->bo_list); return submit; } @@ -143,7 +142,6 @@ static int submit_lookup_objects(struct msm_gem_submit *submit, for (i = 0; i < args->nr_bos; i++) { struct drm_gem_object *obj; - struct msm_gem_object *msm_obj; /* normally use drm_gem_object_lookup(), but for bulk lookup * all under single table_lock just hit object_idr directly: @@ -155,20 +153,9 @@ static int submit_lookup_objects(struct msm_gem_submit *submit, goto out_unlock; } - msm_obj = to_msm_bo(obj); - - if (!list_empty(&msm_obj->submit_entry)) { - DRM_ERROR("handle %u at index %u already on submit list\n", - submit->bos[i].handle, i); - ret = -EINVAL; - goto out_unlock; - } - drm_gem_object_get(obj); - submit->bos[i].obj = msm_obj; - - list_add_tail(&msm_obj->submit_entry, &submit->bo_list); + submit->bos[i].obj = to_msm_bo(obj); } out_unlock: @@ -299,6 +286,12 @@ static int submit_lock_objects(struct msm_gem_submit *submit) return 0; fail: + if (ret == -EALREADY) { + DRM_ERROR("handle %u at index %u already on submit list\n", + submit->bos[i].handle, i); + ret = -EINVAL; + } + for (; i >= 0; i--) submit_unlock_unpin_bo(submit, i); @@ -315,6 +308,12 @@ static int submit_lock_objects(struct msm_gem_submit *submit) slow_locked = contended; goto retry; } + + /* 
Not expecting -EALREADY here, if the bo was already +* locked, we should have gotten -EALREADY already from +* the dma_resv_lock_interruptable() call. +*/ + WARN_ON_ONCE(ret == -EALREADY); } return ret; @@ -508,7 +507,6 @@ static void submit_cleanup(struct msm_gem_submit *submit, bool error) for (i = 0; i < submit->nr_bos; i++) { struct msm_gem_object *msm_obj = submit->bos[i].obj; submit_cleanup_bo(submit, i, cleanup_flags); - list_del_init(&msm_obj->submit_entry); if (error) drm_gem_object_put(&msm_obj->base); } -- 2.31.1
[PATCH v4 09/13] drm/msm: Conversion to drm scheduler
From: Rob Clark For existing adrenos, there is one or more ringbuffer, depending on whether preemption is supported. When preemption is supported, each ringbuffer has it's own priority. A submitqueue (which maps to a gl context or vk queue in userspace) is mapped to a specific ring- buffer at creation time, based on the submitqueue's priority. Each ringbuffer has it's own drm_gpu_scheduler. Each submitqueue maps to a drm_sched_entity. And each submit maps to a drm_sched_job. Closes: https://gitlab.freedesktop.org/drm/msm/-/issues/4 Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/Kconfig | 1 + drivers/gpu/drm/msm/msm_gem.c | 35 -- drivers/gpu/drm/msm/msm_gem.h | 26 - drivers/gpu/drm/msm/msm_gem_submit.c | 161 +- drivers/gpu/drm/msm/msm_gpu.c | 13 +-- drivers/gpu/drm/msm/msm_gpu.h | 2 + drivers/gpu/drm/msm/msm_rd.c | 6 +- drivers/gpu/drm/msm/msm_ringbuffer.c | 66 +++ drivers/gpu/drm/msm/msm_ringbuffer.h | 12 ++ drivers/gpu/drm/msm/msm_submitqueue.c | 26 + 10 files changed, 217 insertions(+), 131 deletions(-) diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig index 52536e7adb95..dc7f3e40850b 100644 --- a/drivers/gpu/drm/msm/Kconfig +++ b/drivers/gpu/drm/msm/Kconfig @@ -14,6 +14,7 @@ config DRM_MSM select REGULATOR select DRM_KMS_HELPER select DRM_PANEL + select DRM_SCHED select SHMEM select TMPFS select QCOM_SCM if ARCH_QCOM diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 4e99c448b83a..a527a6b1d6ba 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -806,41 +806,6 @@ void msm_gem_vunmap(struct drm_gem_object *obj) msm_obj->vaddr = NULL; } -/* must be called before _move_to_active().. 
*/ -int msm_gem_sync_object(struct drm_gem_object *obj, - struct msm_fence_context *fctx, bool exclusive) -{ - struct dma_resv_list *fobj; - struct dma_fence *fence; - int i, ret; - - fobj = dma_resv_shared_list(obj->resv); - if (!fobj || (fobj->shared_count == 0)) { - fence = dma_resv_excl_fence(obj->resv); - /* don't need to wait on our own fences, since ring is fifo */ - if (fence && (fence->context != fctx->context)) { - ret = dma_fence_wait(fence, true); - if (ret) - return ret; - } - } - - if (!exclusive || !fobj) - return 0; - - for (i = 0; i < fobj->shared_count; i++) { - fence = rcu_dereference_protected(fobj->shared[i], - dma_resv_held(obj->resv)); - if (fence->context != fctx->context) { - ret = dma_fence_wait(fence, true); - if (ret) - return ret; - } - } - - return 0; -} - void msm_gem_active_get(struct drm_gem_object *obj, struct msm_gpu *gpu) { struct msm_gem_object *msm_obj = to_msm_bo(obj); diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index e0579abda5b9..a48114058ff9 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -9,6 +9,7 @@ #include #include +#include "drm/gpu_scheduler.h" #include "msm_drv.h" /* Make all GEM related WARN_ON()s ratelimited.. when things go wrong they @@ -143,8 +144,6 @@ void *msm_gem_get_vaddr_active(struct drm_gem_object *obj); void msm_gem_put_vaddr_locked(struct drm_gem_object *obj); void msm_gem_put_vaddr(struct drm_gem_object *obj); int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv); -int msm_gem_sync_object(struct drm_gem_object *obj, - struct msm_fence_context *fctx, bool exclusive); void msm_gem_active_get(struct drm_gem_object *obj, struct msm_gpu *gpu); void msm_gem_active_put(struct drm_gem_object *obj); int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout); @@ -311,6 +310,7 @@ void msm_gem_vunmap(struct drm_gem_object *obj); * make it easier to unwind when things go wrong, etc). 
*/ struct msm_gem_submit { + struct drm_sched_job base; struct kref ref; struct drm_device *dev; struct msm_gpu *gpu; @@ -319,7 +319,22 @@ struct msm_gem_submit { struct list_head bo_list; struct ww_acquire_ctx ticket; uint32_t seqno; /* Sequence number of the submit on the ring */ - struct dma_fence *fence; + + /* Array of struct dma_fence * to block on before submitting this job. +*/ + struct xarray deps; + unsigned long last_dep; + + /* Hw fence, which is created when the scheduler executes the
[PATCH v4 11/13] drm/msm: Drop struct_mutex in submit path
From: Rob Clark It is sufficient to serialize on the submit queue now. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_gem_submit.c | 12 +--- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index e11e4bb63695..450efe59abb5 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -709,7 +709,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, struct msm_drm_private *priv = dev->dev_private; struct drm_msm_gem_submit *args = data; struct msm_file_private *ctx = file->driver_priv; - struct msm_gem_submit *submit; + struct msm_gem_submit *submit = NULL; struct msm_gpu *gpu = priv->gpu; struct msm_gpu_submitqueue *queue; struct msm_ringbuffer *ring; @@ -753,7 +753,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, trace_msm_gpu_submit(pid_nr(pid), ring->id, submitid, args->nr_bos, args->nr_cmds); - ret = mutex_lock_interruptible(&dev->struct_mutex); + ret = mutex_lock_interruptible(&queue->lock); if (ret) goto out_post_unlock; @@ -874,10 +874,8 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, * Allocate an id which can be used by WAIT_FENCE ioctl to map back * to the underlying fence. 
*/ - mutex_lock(&queue->lock); submit->fence_id = idr_alloc_cyclic(&queue->fence_idr, submit->user_fence, 0, INT_MAX, GFP_KERNEL); - mutex_unlock(&queue->lock); if (submit->fence_id < 0) { ret = submit->fence_id; submit->fence_id = 0; @@ -912,12 +910,12 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, submit_cleanup(submit, !!ret); if (has_ww_ticket) ww_acquire_fini(&submit->ticket); - msm_gem_submit_put(submit); out_unlock: if (ret && (out_fence_fd >= 0)) put_unused_fd(out_fence_fd); - mutex_unlock(&dev->struct_mutex); - + mutex_unlock(&queue->lock); + if (submit) + msm_gem_submit_put(submit); out_post_unlock: if (!IS_ERR_OR_NULL(post_deps)) { for (i = 0; i < args->nr_out_syncobjs; ++i) { -- 2.31.1
[PATCH v4 08/13] drm/msm: Return ERR_PTR() from submit_create()
From: Rob Clark In the next patch, we start having more than a single potential failure reason. Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/msm_gem_submit.c | 21 + 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index f6f595aae2c5..f570155bc086 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -32,30 +32,27 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, uint32_t nr_cmds) { struct msm_gem_submit *submit; - uint64_t sz = struct_size(submit, bos, nr_bos) + - ((u64)nr_cmds * sizeof(submit->cmd[0])); + uint64_t sz; + + sz = struct_size(submit, bos, nr_bos) + + ((u64)nr_cmds * sizeof(submit->cmd[0])); if (sz > SIZE_MAX) - return NULL; + return ERR_PTR(-ENOMEM); - submit = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + submit = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (!submit) - return NULL; + return ERR_PTR(-ENOMEM); kref_init(&submit->ref); submit->dev = dev; submit->aspace = queue->ctx->aspace; submit->gpu = gpu; - submit->fence = NULL; submit->cmd = (void *)&submit->bos[nr_bos]; submit->queue = queue; submit->ring = gpu->rb[queue->prio]; submit->fault_dumped = false; - /* initially, until copy_from_user() and bo lookup succeeds: */ - submit->nr_bos = 0; - submit->nr_cmds = 0; - INIT_LIST_HEAD(&submit->node); INIT_LIST_HEAD(&submit->bo_list); @@ -799,8 +796,8 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, submit = submit_create(dev, gpu, queue, args->nr_bos, args->nr_cmds); - if (!submit) { - ret = -ENOMEM; + if (IS_ERR(submit)) { + ret = PTR_ERR(submit); goto out_unlock; } -- 2.31.1
[PATCH v4 12/13] drm/msm: Utilize gpu scheduler priorities
From: Rob Clark The drm/scheduler provides additional prioritization on top of that provided by however many number of ringbuffers (each with their own priority level) is supported on a given generation. Expose the additional levels of priority to userspace and map the userspace priority back to ring (first level of priority) and schedular priority (additional priority levels within the ring). Signed-off-by: Rob Clark Acked-by: Christian König --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 4 +- drivers/gpu/drm/msm/msm_gem_submit.c| 4 +- drivers/gpu/drm/msm/msm_gpu.h | 58 - drivers/gpu/drm/msm/msm_submitqueue.c | 35 +++ include/uapi/drm/msm_drm.h | 14 +- 5 files changed, 88 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index bad4809b68ef..748665232d29 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -261,8 +261,8 @@ int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value) return ret; } return -EINVAL; - case MSM_PARAM_NR_RINGS: - *value = gpu->nr_rings; + case MSM_PARAM_PRIORITIES: + *value = gpu->nr_rings * NR_SCHED_PRIORITIES; return 0; case MSM_PARAM_PP_PGTABLE: *value = 0; diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 450efe59abb5..c2ecec5b11c4 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -59,7 +59,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, submit->gpu = gpu; submit->cmd = (void *)&submit->bos[nr_bos]; submit->queue = queue; - submit->ring = gpu->rb[queue->prio]; + submit->ring = gpu->rb[queue->ring_nr]; submit->fault_dumped = false; INIT_LIST_HEAD(&submit->node); @@ -749,7 +749,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, /* Get a unique identifier for the submission for logging purposes */ submitid = atomic_inc_return(&ident) - 1; - ring = gpu->rb[queue->prio]; + 
ring = gpu->rb[queue->ring_nr]; trace_msm_gpu_submit(pid_nr(pid), ring->id, submitid, args->nr_bos, args->nr_cmds); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index b912cacaecc0..0e4b45bff2e6 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -250,6 +250,59 @@ struct msm_gpu_perfcntr { const char *name; }; +/* + * The number of priority levels provided by drm gpu scheduler. The + * DRM_SCHED_PRIORITY_KERNEL priority level is treated specially in some + * cases, so we don't use it (no need for kernel generated jobs). + */ +#define NR_SCHED_PRIORITIES (1 + DRM_SCHED_PRIORITY_HIGH - DRM_SCHED_PRIORITY_MIN) + +/** + * msm_gpu_convert_priority - Map userspace priority to ring # and sched priority + * + * @gpu:the gpu instance + * @prio: the userspace priority level + * @ring_nr:[out] the ringbuffer the userspace priority maps to + * @sched_prio: [out] the gpu scheduler priority level which the userspace + * priority maps to + * + * With drm/scheduler providing it's own level of prioritization, our total + * number of available priority levels is (nr_rings * NR_SCHED_PRIORITIES). + * Each ring is associated with it's own scheduler instance. However, our + * UABI is that lower numerical values are higher priority. So mapping the + * single userspace priority level into ring_nr and sched_prio takes some + * care. The userspace provided priority (when a submitqueue is created) + * is mapped to ring nr and scheduler priority as such: + * + * ring_nr= userspace_prio / NR_SCHED_PRIORITIES + * sched_prio = NR_SCHED_PRIORITIES - + *(userspace_prio % NR_SCHED_PRIORITIES) - 1 + * + * This allows generations without preemption (nr_rings==1) to have some + * amount of prioritization, and provides more priority levels for gens + * that do have preemption. 
+ */ +static inline int msm_gpu_convert_priority(struct msm_gpu *gpu, int prio, + unsigned *ring_nr, enum drm_sched_priority *sched_prio) +{ + unsigned rn, sp; + + rn = div_u64_rem(prio, NR_SCHED_PRIORITIES, &sp); + + /* invert sched priority to map to higher-numeric-is-higher- +* priority convention +*/ + sp = NR_SCHED_PRIORITIES - sp - 1; + + if (rn >= gpu->nr_rings) + return -EINVAL; + + *ring_nr = rn; + *sched_prio = sp; + + return 0; +} + /** * A submitqueue is associated with a gl context or vk queue (or equiv) * in userspace. @@ -257,7 +310,8 @@ st
[PATCH v4 13/13] drm/msm/gem: Mark active before pinning
From: Rob Clark Mark all the bos in the submit as active, before pinning, to prevent evicting a buffer in the same submit to make room for a buffer earlier in the table. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem.c| 2 -- drivers/gpu/drm/msm/msm_gem_submit.c | 28 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index af199ef53d2f..15b1804fa64e 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -131,7 +131,6 @@ static struct page **get_pages(struct drm_gem_object *obj) if (msm_obj->flags & (MSM_BO_WC|MSM_BO_UNCACHED)) sync_for_device(msm_obj); - GEM_WARN_ON(msm_obj->active_count); update_inactive(msm_obj); } @@ -815,7 +814,6 @@ void msm_gem_active_get(struct drm_gem_object *obj, struct msm_gpu *gpu) GEM_WARN_ON(!msm_gem_is_locked(obj)); GEM_WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED); GEM_WARN_ON(msm_obj->dontneed); - GEM_WARN_ON(!msm_obj->sgt); if (msm_obj->active_count++ == 0) { mutex_lock(&priv->mm_lock); diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index c2ecec5b11c4..fc25a85eb1ca 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -24,7 +24,8 @@ /* make sure these don't conflict w/ MSM_SUBMIT_BO_x */ #define BO_VALID0x8000 /* is current addr in cmdstream correct/valid? 
*/ #define BO_LOCKED 0x4000 /* obj lock is held */ -#define BO_PINNED 0x2000 /* obj is pinned and on active list */ +#define BO_ACTIVE 0x2000 /* active refcnt is held */ +#define BO_PINNED 0x1000 /* obj is pinned and on active list */ static struct msm_gem_submit *submit_create(struct drm_device *dev, struct msm_gpu *gpu, @@ -239,10 +240,11 @@ static void submit_cleanup_bo(struct msm_gem_submit *submit, int i, struct drm_gem_object *obj = &submit->bos[i].obj->base; unsigned flags = submit->bos[i].flags & cleanup_flags; - if (flags & BO_PINNED) { + if (flags & BO_PINNED) msm_gem_unpin_iova_locked(obj, submit->aspace); + + if (flags & BO_ACTIVE) msm_gem_active_put(obj); - } if (flags & BO_LOCKED) dma_resv_unlock(obj->resv); @@ -252,7 +254,7 @@ static void submit_cleanup_bo(struct msm_gem_submit *submit, int i, static void submit_unlock_unpin_bo(struct msm_gem_submit *submit, int i) { - submit_cleanup_bo(submit, i, BO_PINNED | BO_LOCKED); + submit_cleanup_bo(submit, i, BO_PINNED | BO_ACTIVE | BO_LOCKED); if (!(submit->bos[i].flags & BO_VALID)) submit->bos[i].iova = 0; @@ -356,6 +358,18 @@ static int submit_pin_objects(struct msm_gem_submit *submit) submit->valid = true; + /* +* Increment active_count first, so if under memory pressure, we +* don't inadvertently evict a bo needed by the submit in order +* to pin an earlier bo in the same submit. 
+*/ + for (i = 0; i < submit->nr_bos; i++) { + struct drm_gem_object *obj = &submit->bos[i].obj->base; + + msm_gem_active_get(obj, submit->gpu); + submit->bos[i].flags |= BO_ACTIVE; + } + for (i = 0; i < submit->nr_bos; i++) { struct drm_gem_object *obj = &submit->bos[i].obj->base; uint64_t iova; @@ -367,8 +381,6 @@ static int submit_pin_objects(struct msm_gem_submit *submit) if (ret) break; - msm_gem_active_get(obj, submit->gpu); - submit->bos[i].flags |= BO_PINNED; if (iova == submit->bos[i].iova) { @@ -502,7 +514,7 @@ static void submit_cleanup(struct msm_gem_submit *submit, bool error) unsigned i; if (error) - cleanup_flags |= BO_PINNED; + cleanup_flags |= BO_PINNED | BO_ACTIVE; for (i = 0; i < submit->nr_bos; i++) { struct msm_gem_object *msm_obj = submit->bos[i].obj; @@ -520,7 +532,7 @@ void msm_submit_retire(struct msm_gem_submit *submit) struct drm_gem_object *obj = &submit->bos[i].obj->base; msm_gem_lock(obj); - submit_cleanup_bo(submit, i, BO_PINNED); + submit_cleanup_bo(submit, i, BO_PINNED | BO_ACTIVE); msm_gem_unlock(obj); drm_gem_object_put(obj); } -- 2.31.1
Re: [RFC 1/4] dma-fence: Add deadline awareness
On Wed, Jul 28, 2021 at 4:37 AM Christian König wrote: > > Am 28.07.21 um 09:03 schrieb Christian König: > > Am 27.07.21 um 16:25 schrieb Rob Clark: > >> On Tue, Jul 27, 2021 at 12:11 AM Christian König > >> wrote: > >>> Am 27.07.21 um 01:38 schrieb Rob Clark: > >>>> From: Rob Clark > >>>> > >>>> Add a way to hint to the fence signaler of an upcoming deadline, > >>>> such as > >>>> vblank, which the fence waiter would prefer not to miss. This is to > >>>> aid > >>>> the fence signaler in making power management decisions, like boosting > >>>> frequency as the deadline approaches and awareness of missing > >>>> deadlines > >>>> so that can be factored in to the frequency scaling. > >>>> > >>>> Signed-off-by: Rob Clark > >>>> --- > >>>>drivers/dma-buf/dma-fence.c | 39 > >>>> + > >>>>include/linux/dma-fence.h | 17 > >>>>2 files changed, 56 insertions(+) > >>>> > >>>> diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c > >>>> index ce0f5eff575d..2e0d25ab457e 100644 > >>>> --- a/drivers/dma-buf/dma-fence.c > >>>> +++ b/drivers/dma-buf/dma-fence.c > >>>> @@ -910,6 +910,45 @@ dma_fence_wait_any_timeout(struct dma_fence > >>>> **fences, uint32_t count, > >>>>} > >>>>EXPORT_SYMBOL(dma_fence_wait_any_timeout); > >>>> > >>>> + > >>>> +/** > >>>> + * dma_fence_set_deadline - set desired fence-wait deadline > >>>> + * @fence:the fence that is to be waited on > >>>> + * @deadline: the time by which the waiter hopes for the fence to be > >>>> + *signaled > >>>> + * > >>>> + * Inform the fence signaler of an upcoming deadline, such as > >>>> vblank, by > >>>> + * which point the waiter would prefer the fence to be signaled > >>>> by. This > >>>> + * is intended to give feedback to the fence signaler to aid in power > >>>> + * management decisions, such as boosting GPU frequency if a periodic > >>>> + * vblank deadline is approaching. 
> >>>> + */ > >>>> +void dma_fence_set_deadline(struct dma_fence *fence, ktime_t > >>>> deadline) > >>>> +{ > >>>> + unsigned long flags; > >>>> + > >>>> + if (dma_fence_is_signaled(fence)) > >>>> + return; > >>>> + > >>>> + spin_lock_irqsave(fence->lock, flags); > >>>> + > >>>> + /* If we already have an earlier deadline, keep it: */ > >>>> + if (test_bit(DMA_FENCE_FLAG_HAS_DEADLINE_BIT, &fence->flags) && > >>>> + ktime_before(fence->deadline, deadline)) { > >>>> + spin_unlock_irqrestore(fence->lock, flags); > >>>> + return; > >>>> + } > >>>> + > >>>> + fence->deadline = deadline; > >>>> + set_bit(DMA_FENCE_FLAG_HAS_DEADLINE_BIT, &fence->flags); > >>>> + > >>>> + spin_unlock_irqrestore(fence->lock, flags); > >>>> + > >>>> + if (fence->ops->set_deadline) > >>>> + fence->ops->set_deadline(fence, deadline); > >>>> +} > >>>> +EXPORT_SYMBOL(dma_fence_set_deadline); > >>>> + > >>>>/** > >>>> * dma_fence_init - Initialize a custom fence. > >>>> * @fence: the fence to initialize > >>>> diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h > >>>> index 6ffb4b2c6371..4e6cfe4e6fbc 100644 > >>>> --- a/include/linux/dma-fence.h > >>>> +++ b/include/linux/dma-fence.h > >>>> @@ -88,6 +88,7 @@ struct dma_fence { > >>>>/* @timestamp replaced by @rcu on > >>>> dma_fence_release() */ > >>>>struct rcu_head rcu; > >>>>}; > >>>> + ktime_t deadline; > >>> Mhm, adding the flag sounds ok to me but I'm a bit hesitating adding > >>> the > >>> deadline as extra field here. > >>> > >>
Re: [RFC 0/4] dma-fence: Deadline awareness
On Wed, Jul 28, 2021 at 6:57 AM Pekka Paalanen wrote: > > On Wed, 28 Jul 2021 15:31:41 +0200 > Christian König wrote: > > > Am 28.07.21 um 15:24 schrieb Michel Dänzer: > > > On 2021-07-28 3:13 p.m., Christian König wrote: > > >> Am 28.07.21 um 15:08 schrieb Michel Dänzer: > > >>> On 2021-07-28 1:36 p.m., Christian König wrote: > > > At least AMD hardware is already capable of flipping frames on GPU > > events like finishing rendering (or uploading etc). > > > > By waiting in userspace on the CPU before send the frame to the > > hardware you are completely killing of such features. > > > > For composing use cases that makes sense, but certainly not for full > > screen applications as far as I can see. > > >>> Even for fullscreen, the current KMS API only allows queuing a single > > >>> page flip per CRTC, with no way to cancel or otherwise modify it. > > >>> Therefore, a Wayland compositor has to set a deadline for the next > > >>> refresh cycle, and when the deadline passes, it has to select the best > > >>> buffer available for the fullscreen surface. To make sure the flip will > > >>> not miss the next refresh cycle, the compositor has to pick an idle > > >>> buffer. If it picks a non-idle buffer, and the pending rendering does > > >>> not finish in time for vertical blank, the flip will be delayed by at > > >>> least one refresh cycle, which results in visible stuttering. > > >>> > > >>> (Until the deadline passes, the Wayland compositor can't even know if a > > >>> previously fullscreen surface will still be fullscreen for the next > > >>> refresh cycle) > > >> Well then let's extend the KMS API instead of hacking together > > >> workarounds in userspace. > > > That's indeed a possible solution for the fullscreen / direct scanout > > > case. > > > > > > Not for the general compositing case though, since a compositor does not > > > want to composite multiple output frames per display refresh cycle, so it > > > has to make sure the one frame hits the target. 
> > > > Yeah, that's true as well. > > > > At least as long as nobody invents a mechanism to do this decision on > > the GPU instead. > > That would mean putting the whole window manager into the GPU. > Hmm, seems like we could come up with a way for a shader to figure out if a fence has signaled or not on the GPU, and then either sample from the current or previous window surface? BR, -R
Re: [RFC 0/4] dma-fence: Deadline awareness
On Wed, Jul 28, 2021 at 6:24 AM Michel Dänzer wrote: > > On 2021-07-28 3:13 p.m., Christian König wrote: > > Am 28.07.21 um 15:08 schrieb Michel Dänzer: > >> On 2021-07-28 1:36 p.m., Christian König wrote: > >>> Am 27.07.21 um 17:37 schrieb Rob Clark: > >>>> On Tue, Jul 27, 2021 at 8:19 AM Michel Dänzer wrote: > >>>>> On 2021-07-27 5:12 p.m., Rob Clark wrote: > >>>>>> On Tue, Jul 27, 2021 at 7:50 AM Michel Dänzer > >>>>>> wrote: > >>>>>>> On 2021-07-27 1:38 a.m., Rob Clark wrote: > >>>>>>>> From: Rob Clark > >>>>>>>> > >>>>>>>> Based on discussion from a previous series[1] to add a "boost" > >>>>>>>> mechanism > >>>>>>>> when, for example, vblank deadlines are missed. Instead of a boost > >>>>>>>> callback, this approach adds a way to set a deadline on the fence, by > >>>>>>>> which the waiter would like to see the fence signalled. > >>>>>>>> > >>>>>>>> I've not yet had a chance to re-work the drm/msm part of this, but > >>>>>>>> wanted to send this out as an RFC in case I don't have a chance to > >>>>>>>> finish the drm/msm part this week. > >>>>>>>> > >>>>>>>> Original description: > >>>>>>>> > >>>>>>>> In some cases, like double-buffered rendering, missing vblanks can > >>>>>>>> trick the GPU into running at a lower frequence, when really we > >>>>>>>> want to be running at a higher frequency to not miss the vblanks > >>>>>>>> in the first place. 
> >>>>>>>> > >>>>>>>> This is partially inspired by a trick i915 does, but implemented > >>>>>>>> via dma-fence for a couple of reasons: > >>>>>>>> > >>>>>>>> 1) To continue to be able to use the atomic helpers > >>>>>>>> 2) To support cases where display and gpu are different drivers > >>>>>>>> > >>>>>>>> [1] > >>>>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatchwork.freedesktop.org%2Fseries%2F90331%2F&data=04%7C01%7Cchristian.koenig%40amd.com%7C269b2df3e1dc4f0b856d08d951c8c768%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637630745091538563%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=eYaSOSS5wOngNAd9wufp5eWCx5GtAwo6GkultJgrjmA%3D&reserved=0 > >>>>>>> Unfortunately, none of these approaches will have the full intended > >>>>>>> effect once Wayland compositors start waiting for client buffers to > >>>>>>> become idle before using them for an output frame (to prevent output > >>>>>>> frames from getting delayed by client work). See > >>>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.gnome.org%2FGNOME%2Fmutter%2F-%2Fmerge_requests%2F1880&data=04%7C01%7Cchristian.koenig%40amd.com%7C269b2df3e1dc4f0b856d08d951c8c768%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637630745091538563%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=1ZkOzLqbiKSyCixGZ0u7Hd%2Fc1YnUZub%2F%2Fx7RuEclFKg%3D&reserved=0 > >>>>>>> (shameless plug :) for a proof of concept of this for mutter. The > >>>>>>> boost will only affect the compositor's own GPU work, not the client > >>>>>>> work (which means no effect at all for fullscreen apps where the > >>>>>>> compositor can scan out the client buffers directly). > >>>>>>> > >>>>>> I guess you mean "no effect at all *except* for fullscreen..."? 
> >>>>> I meant what I wrote: The compositor will wait for the next buffer to > >>>>> become idle, so there's no boost from this mechanism for the client > >>>>> drawing to that buffer. And since the compositor does no drawing of its > >>>>> own in this case, there's no boost from that either. > >>>>> > >>>>> > >>>>>> I'd perhaps recommend that wayland compositors, in cases where only a > >>>>>> sing
Re: [RFC 1/4] dma-fence: Add deadline awareness
On Wed, Jul 28, 2021 at 10:23 AM Christian König wrote: > > > > Am 28.07.21 um 17:15 schrieb Rob Clark: > > On Wed, Jul 28, 2021 at 4:37 AM Christian König > > wrote: > >> Am 28.07.21 um 09:03 schrieb Christian König: > >>> Am 27.07.21 um 16:25 schrieb Rob Clark: > >>>> On Tue, Jul 27, 2021 at 12:11 AM Christian König > >>>> wrote: > >>>>> Am 27.07.21 um 01:38 schrieb Rob Clark: > >>>>>> From: Rob Clark > >>>>>> > >>>>>> Add a way to hint to the fence signaler of an upcoming deadline, > >>>>>> such as > >>>>>> vblank, which the fence waiter would prefer not to miss. This is to > >>>>>> aid > >>>>>> the fence signaler in making power management decisions, like boosting > >>>>>> frequency as the deadline approaches and awareness of missing > >>>>>> deadlines > >>>>>> so that can be factored in to the frequency scaling. > >>>>>> > >>>>>> Signed-off-by: Rob Clark > >>>>>> --- > >>>>>> drivers/dma-buf/dma-fence.c | 39 > >>>>>> + > >>>>>> include/linux/dma-fence.h | 17 > >>>>>> 2 files changed, 56 insertions(+) > >>>>>> > >>>>>> diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c > >>>>>> index ce0f5eff575d..2e0d25ab457e 100644 > >>>>>> --- a/drivers/dma-buf/dma-fence.c > >>>>>> +++ b/drivers/dma-buf/dma-fence.c > >>>>>> @@ -910,6 +910,45 @@ dma_fence_wait_any_timeout(struct dma_fence > >>>>>> **fences, uint32_t count, > >>>>>> } > >>>>>> EXPORT_SYMBOL(dma_fence_wait_any_timeout); > >>>>>> > >>>>>> + > >>>>>> +/** > >>>>>> + * dma_fence_set_deadline - set desired fence-wait deadline > >>>>>> + * @fence:the fence that is to be waited on > >>>>>> + * @deadline: the time by which the waiter hopes for the fence to be > >>>>>> + *signaled > >>>>>> + * > >>>>>> + * Inform the fence signaler of an upcoming deadline, such as > >>>>>> vblank, by > >>>>>> + * which point the waiter would prefer the fence to be signaled > >>>>>> by. 
This > >>>>>> + * is intended to give feedback to the fence signaler to aid in power > >>>>>> + * management decisions, such as boosting GPU frequency if a periodic > >>>>>> + * vblank deadline is approaching. > >>>>>> + */ > >>>>>> +void dma_fence_set_deadline(struct dma_fence *fence, ktime_t > >>>>>> deadline) > >>>>>> +{ > >>>>>> + unsigned long flags; > >>>>>> + > >>>>>> + if (dma_fence_is_signaled(fence)) > >>>>>> + return; > >>>>>> + > >>>>>> + spin_lock_irqsave(fence->lock, flags); > >>>>>> + > >>>>>> + /* If we already have an earlier deadline, keep it: */ > >>>>>> + if (test_bit(DMA_FENCE_FLAG_HAS_DEADLINE_BIT, &fence->flags) && > >>>>>> + ktime_before(fence->deadline, deadline)) { > >>>>>> + spin_unlock_irqrestore(fence->lock, flags); > >>>>>> + return; > >>>>>> + } > >>>>>> + > >>>>>> + fence->deadline = deadline; > >>>>>> + set_bit(DMA_FENCE_FLAG_HAS_DEADLINE_BIT, &fence->flags); > >>>>>> + > >>>>>> + spin_unlock_irqrestore(fence->lock, flags); > >>>>>> + > >>>>>> + if (fence->ops->set_deadline) > >>>>>> + fence->ops->set_deadline(fence, deadline); > >>>>>> +} > >>>>>> +EXPORT_SYMBOL(dma_fence_set_deadline); > >>>>>> + > >>>>>> /** > >>>>>> * dma_fence_init - Initialize a custom fence. > >>>>>> * @fence: the fence to initialize >
[early pull] drm/msm: drm-msm-next-2021-07-28 for v5.15
Hi Dave & Daniel, An early pull for v5.15 (there'll be more coming in a week or two), consisting of the drm/scheduler conversion and a couple other small series that one was based one. Mostly sending this now because IIUC danvet wanted it in drm-next so he could rebase on it. (Daniel, if you disagree then speak up, and I'll instead include this in the main pull request once that is ready.) This also has a core patch to drop drm_gem_object_put_locked() now that the last use of it is removed. The following changes since commit ff1176468d368232b684f75e82563369208bc371: Linux 5.14-rc3 (2021-07-25 15:35:14 -0700) are available in the Git repository at: https://gitlab.freedesktop.org/drm/msm.git drm-msm-next-2021-07-28 for you to fetch changes up to 4541e4f2225c30b0e9442be9eb2fb8b7086cdd1f: drm/msm/gem: Mark active before pinning (2021-07-28 09:19:00 -0700) -------- Rob Clark (18): drm/msm: Let fences read directly from memptrs drm/msm: Signal fences sooner drm/msm: Split out devfreq handling drm/msm: Split out get_freq() helper drm/msm: Devfreq tuning drm/msm: Docs and misc cleanup drm/msm: Small submitqueue creation cleanup drm/msm: drop drm_gem_object_put_locked() drm: Drop drm_gem_object_put_locked() drm/msm/submit: Simplify out-fence-fd handling drm/msm: Consolidate submit bo state drm/msm: Track "seqno" fences by idr drm/msm: Return ERR_PTR() from submit_create() drm/msm: Conversion to drm scheduler drm/msm: Drop submit bo_list drm/msm: Drop struct_mutex in submit path drm/msm: Utilize gpu scheduler priorities drm/msm/gem: Mark active before pinning drivers/gpu/drm/drm_gem.c | 22 -- drivers/gpu/drm/msm/Kconfig | 1 + drivers/gpu/drm/msm/Makefile| 1 + drivers/gpu/drm/msm/adreno/a5xx_debugfs.c | 4 +- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 6 +- drivers/gpu/drm/msm/adreno/a5xx_power.c | 2 +- drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 7 +- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 12 +- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 6 +- 
drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 4 +- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 6 +- drivers/gpu/drm/msm/msm_drv.c | 30 ++- drivers/gpu/drm/msm/msm_fence.c | 53 + drivers/gpu/drm/msm/msm_fence.h | 44 +++- drivers/gpu/drm/msm/msm_gem.c | 94 +--- drivers/gpu/drm/msm/msm_gem.h | 47 ++-- drivers/gpu/drm/msm/msm_gem_submit.c| 344 +--- drivers/gpu/drm/msm/msm_gpu.c | 220 -- drivers/gpu/drm/msm/msm_gpu.h | 139 ++- drivers/gpu/drm/msm/msm_gpu_devfreq.c | 203 drivers/gpu/drm/msm/msm_rd.c| 6 +- drivers/gpu/drm/msm/msm_ringbuffer.c| 69 +- drivers/gpu/drm/msm/msm_ringbuffer.h| 12 + drivers/gpu/drm/msm/msm_submitqueue.c | 53 +++-- include/drm/drm_gem.h | 2 - include/uapi/drm/msm_drm.h | 14 +- 26 files changed, 865 insertions(+), 536 deletions(-) create mode 100644 drivers/gpu/drm/msm/msm_gpu_devfreq.c
Re: [early pull] drm/msm: drm-msm-next-2021-07-28 for v5.15
Jordan, any idea if more frequent frequency changes would for some reason make a630 grumpy? I was expecting it should be somewhat similar to a618 (same GMU fw, etc). The main result of that patch should be clamping to min freq when gpu goes idle, and the toggling back to devfreq provided freq on idle->active transition. So there might be more frequent freq transitions. Caleb, I don't suppose you could somehow delay starting UI and get some traces? Something along the lines of: localhost ~ # cd /sys/kernel/debug/tracing/ localhost /sys/kernel/debug/tracing # echo 1 > events/drm_msm_gpu/enable localhost /sys/kernel/debug/tracing # echo 1 > tracing_on localhost /sys/kernel/debug/tracing # cat trace_pipe Does adding an 'if (1) return' at the top of msm_devfreq_idle() help? That should bypass the clamping to min freq when the GPU isn't doing anything and reduce the # of freq transitions. I suppose we could opt-in to this behavior on a per-gpu basis.. BR, -R On Wed, Jul 28, 2021 at 5:35 PM Caleb Connolly wrote: > > Hi Rob, > > This series causes a fatal crash on my Oneplus 6, the device goes to > Qualcomm crashdump mode shortly after reaching UI with the following errors: > > https://paste.ubuntu.com/p/HvjmzZYtgw/ > > I did a git bisect and the patch ("drm/msm: Devfreq tuning") seems to be > the cause of the crash, reverting it resolves the issue. > > > On 28/07/2021 21:52, Rob Clark wrote: > > Hi Dave & Daniel, > > > > An early pull for v5.15 (there'll be more coming in a week or two), > > consisting of the drm/scheduler conversion and a couple other small > > series that one was based one. Mostly sending this now because IIUC > > danvet wanted it in drm-next so he could rebase on it. (Daniel, if > > you disagree then speak up, and I'll instead include this in the main > > pull request once that is ready.) > > > > This also has a core patch to drop drm_gem_object_put_locked() now > > that the last use of it is removed. 
> > > > The following changes since commit ff1176468d368232b684f75e82563369208bc371: > > > >Linux 5.14-rc3 (2021-07-25 15:35:14 -0700) > > > > are available in the Git repository at: > > > >https://gitlab.freedesktop.org/drm/msm.git drm-msm-next-2021-07-28 > > > > for you to fetch changes up to 4541e4f2225c30b0e9442be9eb2fb8b7086cdd1f: > > > >drm/msm/gem: Mark active before pinning (2021-07-28 09:19:00 -0700) > > > > > > Rob Clark (18): > >drm/msm: Let fences read directly from memptrs > >drm/msm: Signal fences sooner > >drm/msm: Split out devfreq handling > >drm/msm: Split out get_freq() helper > >drm/msm: Devfreq tuning > >drm/msm: Docs and misc cleanup > >drm/msm: Small submitqueue creation cleanup > >drm/msm: drop drm_gem_object_put_locked() > >drm: Drop drm_gem_object_put_locked() > >drm/msm/submit: Simplify out-fence-fd handling > >drm/msm: Consolidate submit bo state > >drm/msm: Track "seqno" fences by idr > >drm/msm: Return ERR_PTR() from submit_create() > >drm/msm: Conversion to drm scheduler > >drm/msm: Drop submit bo_list > >drm/msm: Drop struct_mutex in submit path > >drm/msm: Utilize gpu scheduler priorities > >drm/msm/gem: Mark active before pinning > > > > drivers/gpu/drm/drm_gem.c | 22 -- > > drivers/gpu/drm/msm/Kconfig | 1 + > > drivers/gpu/drm/msm/Makefile| 1 + > > drivers/gpu/drm/msm/adreno/a5xx_debugfs.c | 4 +- > > drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 6 +- > > drivers/gpu/drm/msm/adreno/a5xx_power.c | 2 +- > > drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 7 +- > > drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 12 +- > > drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 6 +- > > drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 4 +- > > drivers/gpu/drm/msm/adreno/adreno_gpu.c | 6 +- > > drivers/gpu/drm/msm/msm_drv.c | 30 ++- > > drivers/gpu/drm/msm/msm_fence.c | 53 + > > drivers/gpu/drm/msm/msm_fence.h | 44 +++- > > drivers/gpu/drm/msm/msm_gem.c | 94 +--- > > drivers/gpu/drm/msm/msm_gem.h | 47 ++-- > > drivers/gpu/drm/msm/msm_gem_submit.c| 344 > > +--- > > 
drivers/gpu/drm/msm/msm_gpu.c | 220 -- > >
Re: [early pull] drm/msm: drm-msm-next-2021-07-28 for v5.15
On Wed, Jul 28, 2021 at 7:18 PM Caleb Connolly wrote: > > > > On 29/07/2021 02:02, Rob Clark wrote: > > Jordan, any idea if more frequent frequency changes would for some > > reason make a630 grumpy? I was expecting it should be somewhat > > similar to a618 (same GMU fw, etc). The main result of that patch > > should be clamping to min freq when gpu goes idle, and the toggling > > back to devfreq provided freq on idle->active transition. So there > > might be more frequent freq transitions. > > > > Caleb, I don't suppose you could somehow delay starting UI and get > > some traces? Something along the lines of: > > > >localhost ~ # cd /sys/kernel/debug/tracing/ > >localhost /sys/kernel/debug/tracing # echo 1 > events/drm_msm_gpu/enable > >localhost /sys/kernel/debug/tracing # echo 1 > tracing_on > >localhost /sys/kernel/debug/tracing # cat trace_pipe > Sure, here's the last ~1k lines of the trace logs: > https://paste.ubuntu.com/p/XMKjKDWxYg/ > And what I managed to get from dmesg before the crash (mostly the same > as before): https://paste.ubuntu.com/p/kGVtRHDWKH/ > > > > Does adding an 'if (1) return' at the top of msm_devfreq_idle() help? > > That should bypass the clamping to min freq when the GPU isn't doing > > anything and reduce the # of freq transitions. I suppose we could > > opt-in to this behavior on a per-gpu basis.. > Yeah, that seems to resolve the issue, although I got the following > probably unrelated (?) error on rebooting the device: > [ 134.994449] [drm:dpu_encoder_vsync_event_handler:1749] [dpu > error]invalid parameters I think that should probably be unrelated.. Based on the traces, I'm seeing rapid toggling between idle freq and non-idle freq.. but no invalid freq's (assuming the dts opp table is correct) so I *guess* there is maybe some sort of race condition communicating with GMU or some other issue with rapid freq transition? Maybe Jordan has some ideas. 
The earlier dmesg you posted look like gpu getting cranky about what looks like a valid opcode, and then it goes off into the weeds.. when you start seeing things like "0xDEAFBEAF" I think that means the GPU has lost context (ie. power collapse and back, and now it is reading bogus power-on default values). I think I can put together a patch to make the "clamp to min freq when gpu is idle" opt-in so we can enable it per-gpu once someone has confirmed that it doesn't cause problems. I guess that would at least work as a short term solution. But not sure if that is just papering over some gpu/gmu bug (or maybe gdsc/clk bug), or if it is a legit workaround for some limitation.. BR, -R > > I wonder if the PocoPhone F1 has the same problem... > > > > BR, > > -R > > > > On Wed, Jul 28, 2021 at 5:35 PM Caleb Connolly > > wrote: > >> > >> Hi Rob, > >> > >> This series causes a fatal crash on my Oneplus 6, the device goes to > >> Qualcomm crashdump mode shortly after reaching UI with the following > >> errors: > >> > >> https://paste.ubuntu.com/p/HvjmzZYtgw/ > >> > >> I did a git bisect and the patch ("drm/msm: Devfreq tuning") seems to be > >> the cause of the crash, reverting it resolves the issue. > >> > >> > >> On 28/07/2021 21:52, Rob Clark wrote: > >>> Hi Dave & Daniel, > >>> > >>> An early pull for v5.15 (there'll be more coming in a week or two), > >>> consisting of the drm/scheduler conversion and a couple other small > >>> series that one was based one. Mostly sending this now because IIUC > >>> danvet wanted it in drm-next so he could rebase on it. (Daniel, if > >>> you disagree then speak up, and I'll instead include this in the main > >>> pull request once that is ready.) > >>> > >>> This also has a core patch to drop drm_gem_object_put_locked() now > >>> that the last use of it is removed. 
> >>> > >>> The following changes since commit > >>> ff1176468d368232b684f75e82563369208bc371: > >>> > >>> Linux 5.14-rc3 (2021-07-25 15:35:14 -0700) > >>> > >>> are available in the Git repository at: > >>> > >>> https://gitlab.freedesktop.org/drm/msm.git drm-msm-next-2021-07-28 > >>> > >>> for you to fetch changes up to 4541e4f2225c30b0e9442be9eb2fb8b7086cdd1f: > >>> > >>> drm/msm/gem: Mark active before pinning (2021-07-28 09:19:00 -0700) > >>> > >>> --
Re: [RFC 1/4] dma-fence: Add deadline awareness
On Thu, Jul 29, 2021 at 12:03 AM Daniel Vetter wrote: > > On Wed, Jul 28, 2021 at 10:58:51AM -0700, Rob Clark wrote: > > On Wed, Jul 28, 2021 at 10:23 AM Christian König > > wrote: > > > > > > > > > > > > Am 28.07.21 um 17:15 schrieb Rob Clark: > > > > On Wed, Jul 28, 2021 at 4:37 AM Christian König > > > > wrote: > > > >> Am 28.07.21 um 09:03 schrieb Christian König: > > > >>> Am 27.07.21 um 16:25 schrieb Rob Clark: > > > >>>> On Tue, Jul 27, 2021 at 12:11 AM Christian König > > > >>>> wrote: > > > >>>>> Am 27.07.21 um 01:38 schrieb Rob Clark: > > > >>>>>> From: Rob Clark > > > >>>>>> > > > >>>>>> Add a way to hint to the fence signaler of an upcoming deadline, > > > >>>>>> such as > > > >>>>>> vblank, which the fence waiter would prefer not to miss. This is to > > > >>>>>> aid > > > >>>>>> the fence signaler in making power management decisions, like > > > >>>>>> boosting > > > >>>>>> frequency as the deadline approaches and awareness of missing > > > >>>>>> deadlines > > > >>>>>> so that can be factored in to the frequency scaling. 
> > > >>>>>> > > > >>>>>> Signed-off-by: Rob Clark > > > >>>>>> --- > > > >>>>>> drivers/dma-buf/dma-fence.c | 39 > > > >>>>>> + > > > >>>>>> include/linux/dma-fence.h | 17 > > > >>>>>> 2 files changed, 56 insertions(+) > > > >>>>>> > > > >>>>>> diff --git a/drivers/dma-buf/dma-fence.c > > > >>>>>> b/drivers/dma-buf/dma-fence.c > > > >>>>>> index ce0f5eff575d..2e0d25ab457e 100644 > > > >>>>>> --- a/drivers/dma-buf/dma-fence.c > > > >>>>>> +++ b/drivers/dma-buf/dma-fence.c > > > >>>>>> @@ -910,6 +910,45 @@ dma_fence_wait_any_timeout(struct dma_fence > > > >>>>>> **fences, uint32_t count, > > > >>>>>> } > > > >>>>>> EXPORT_SYMBOL(dma_fence_wait_any_timeout); > > > >>>>>> > > > >>>>>> + > > > >>>>>> +/** > > > >>>>>> + * dma_fence_set_deadline - set desired fence-wait deadline > > > >>>>>> + * @fence:the fence that is to be waited on > > > >>>>>> + * @deadline: the time by which the waiter hopes for the fence to > > > >>>>>> be > > > >>>>>> + *signaled > > > >>>>>> + * > > > >>>>>> + * Inform the fence signaler of an upcoming deadline, such as > > > >>>>>> vblank, by > > > >>>>>> + * which point the waiter would prefer the fence to be signaled > > > >>>>>> by. This > > > >>>>>> + * is intended to give feedback to the fence signaler to aid in > > > >>>>>> power > > > >>>>>> + * management decisions, such as boosting GPU frequency if a > > > >>>>>> periodic > > > >>>>>> + * vblank deadline is approaching. > > > >>>>>> + */ > > > >>>>>> +void dma_fence_set_deadline(struct dma_fence *fence, ktime_t > > > >>>>>> deadline) > > > >>>>>> +{ > > > >>>>>> + unsigned long flags; > > > >>>>>> + > > > >>>>>> + if (dma_fence_is_signaled(fence)) > > > >>>>>> + return; > > > >>>>>> + > > > >>>>>> + spin_lock_irqsave(fence->lock, flags); > > > >>>>>> + > > > >>>>>> + /* If we already have an earlier deadline, keep it: */ > > > >>>>>> + if (test_bit(DMA_FENCE_FLAG_HAS_DEADLINE_BIT, &fence->flags) > > > >>>>>> && > > > >>>>>> + ktime_before(fence
Re: [PATCH v2 2/3] drm/msm/a6xx: Use rev to identify SKU
On Thu, Jul 29, 2021 at 7:33 AM Akhil P Oommen wrote: > > Use rev instead of revn to identify the SKU. This is in > preparation to the introduction of 7c3 gpu which won't have a > revn. > > Signed-off-by: Akhil P Oommen > --- > > (no changes since v1) > > drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 11 +-- > 1 file changed, 5 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > index 183b9f9..0da1a66 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > @@ -1675,11 +1675,11 @@ static u32 a618_get_speed_bin(u32 fuse) > return UINT_MAX; > } > > -static u32 fuse_to_supp_hw(struct device *dev, u32 revn, u32 fuse) > +static u32 fuse_to_supp_hw(struct device *dev, struct adreno_rev rev, u32 > fuse) > { > u32 val = UINT_MAX; > > - if (revn == 618) > + if (adreno_cmp_rev(ADRENO_REV(6, 1, 8, ANY_ID), rev)) Looks like adreno_cmp_rev() ended up in patch 3/3 when it should have been in this patch.. 
But I guess we could also move this into adreno_is_a618() and use that here BR, -R > val = a618_get_speed_bin(fuse); > > if (val == UINT_MAX) { > @@ -1692,8 +1692,7 @@ static u32 fuse_to_supp_hw(struct device *dev, u32 > revn, u32 fuse) > return (1 << val); > } > > -static int a6xx_set_supported_hw(struct device *dev, struct a6xx_gpu > *a6xx_gpu, > - u32 revn) > +static int a6xx_set_supported_hw(struct device *dev, struct adreno_rev rev) > { > u32 supp_hw = UINT_MAX; > u16 speedbin; > @@ -1714,7 +1713,7 @@ static int a6xx_set_supported_hw(struct device *dev, > struct a6xx_gpu *a6xx_gpu, > } > speedbin = le16_to_cpu(speedbin); > > - supp_hw = fuse_to_supp_hw(dev, revn, speedbin); > + supp_hw = fuse_to_supp_hw(dev, rev, speedbin); > > done: > ret = devm_pm_opp_set_supported_hw(dev, &supp_hw, 1); > @@ -1785,7 +1784,7 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev) > > a6xx_llc_slices_init(pdev, a6xx_gpu); > > - ret = a6xx_set_supported_hw(&pdev->dev, a6xx_gpu, info->revn); > + ret = a6xx_set_supported_hw(&pdev->dev, config->rev); > if (ret) { > a6xx_destroy(&(a6xx_gpu->base.base)); > return ERR_PTR(ret); > -- > QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member > of Code Aurora Forum, hosted by The Linux Foundation. >
Re: [PATCH v3 3/3] drm/msm/a6xx: Add support for Adreno 7c Gen 3 gpu
On Thu, Jul 29, 2021 at 8:21 AM Akhil P Oommen wrote: > > This patch adds support for the gpu found in the Snapdragon 7c Gen 3 > compute platform. This gpu is similar to the exisiting a660 gpu with > minor delta in the programing sequence. As the Adreno GPUs are moving > away from a numeric chipid based naming scheme to a string, it was > decided to use 0x06030500 as the gpu id of this gpu to communicate > to the userspace driver. s/gpu id/chip id/ (and in the cover letter too) > > Signed-off-by: Akhil P Oommen > --- > > (no changes since v2) > > Changes in v2: > - Introduce adreno_is_a660_family() (Rob) > - Remove revn for 7c3 (Rob) > - Remove CPR register programing since they are not required for 7c3 > > drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 8 ++-- > drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 1 + > drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 32 > -- > drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 32 > ++ > drivers/gpu/drm/msm/adreno/adreno_device.c | 27 +++-- > drivers/gpu/drm/msm/adreno/adreno_gpu.h| 18 - > 6 files changed, 98 insertions(+), 20 deletions(-) > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > index b349692..70ba3bf 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > @@ -519,7 +519,7 @@ static void a6xx_gmu_rpmh_init(struct a6xx_gmu *gmu) > if (!pdcptr) > goto err; > > - if (adreno_is_a650(adreno_gpu) || adreno_is_a660(adreno_gpu)) > + if (adreno_is_a650(adreno_gpu) || adreno_is_a660_family(adreno_gpu)) > pdc_in_aop = true; > else if (adreno_is_a618(adreno_gpu) || adreno_is_a640(adreno_gpu)) > pdc_address_offset = 0x30090; > @@ -933,6 +933,7 @@ int a6xx_gmu_resume(struct a6xx_gpu *a6xx_gpu) > > /* Use a known rate to bring up the GMU */ > clk_set_rate(gmu->core_clk, 2); > + clk_set_rate(gmu->hub_clk, 15000); > ret = clk_bulk_prepare_enable(gmu->nr_clocks, gmu->clocks); > if (ret) { > pm_runtime_put(gmu->gxpd); > @@ -1393,6 +1394,9 @@ static int 
a6xx_gmu_clocks_probe(struct a6xx_gmu *gmu) > gmu->core_clk = msm_clk_bulk_get_clock(gmu->clocks, > gmu->nr_clocks, "gmu"); > > + gmu->hub_clk = msm_clk_bulk_get_clock(gmu->clocks, > + gmu->nr_clocks, "hub"); > + > return 0; > } > > @@ -1504,7 +1508,7 @@ int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct > device_node *node) > * are otherwise unused by a660. > */ > gmu->dummy.size = SZ_4K; > - if (adreno_is_a660(adreno_gpu)) { > + if (adreno_is_a660_family(adreno_gpu)) { > ret = a6xx_gmu_memory_alloc(gmu, &gmu->debug, SZ_4K * 7, > 0x6040); > if (ret) > goto err_memory; > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h > b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h > index 71dfa600..3c74f64 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h > @@ -66,6 +66,7 @@ struct a6xx_gmu { > int nr_clocks; > struct clk_bulk_data *clocks; > struct clk *core_clk; > + struct clk *hub_clk; > > /* current performance index set externally */ > int current_perf_index; > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > index 0da1a66..1881e09 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > @@ -652,7 +652,7 @@ static void a6xx_set_cp_protect(struct msm_gpu *gpu) > regs = a650_protect; > count = ARRAY_SIZE(a650_protect); > count_max = 48; > - } else if (adreno_is_a660(adreno_gpu)) { > + } else if (adreno_is_a660_family(adreno_gpu)) { > regs = a660_protect; > count = ARRAY_SIZE(a660_protect); > count_max = 48; > @@ -694,6 +694,13 @@ static void a6xx_set_ubwc_config(struct msm_gpu *gpu) > uavflagprd_inv = 2; > } > > + if (adreno_is_7c3(adreno_gpu)) { > + lower_bit = 1; > + amsbc = 1; > + rgb565_predicator = 1; > + uavflagprd_inv = 2; > + } > + > gpu_write(gpu, REG_A6XX_RB_NC_MODE_CNTL, > rgb565_predicator << 11 | amsbc << 4 | lower_bit << 1); > gpu_write(gpu, REG_A6XX_TPL1_NC_MODE_CNTL, lower_bit << 1); > @@ -787,7 +794,7 @@ static bool 
a6xx_ucode_check_version(struct a6xx_gpu > *a6xx_gpu, > DRM_DEV_ERROR(&gpu->pdev->dev, > "a650 SQE ucode is too old. Have version %x need at > least %x\n", > buf[0] & 0xfff, 0x095); > - } else if (adreno_is_a660(adreno_gpu)) { > + } else if
Re: [PATCH v2 2/3] drm/msm/a6xx: Use rev to identify SKU
On Thu, Jul 29, 2021 at 8:36 AM Akhil P Oommen wrote: > > On 7/29/2021 8:57 PM, Rob Clark wrote: > > On Thu, Jul 29, 2021 at 7:33 AM Akhil P Oommen > > wrote: > >> > >> Use rev instead of revn to identify the SKU. This is in > >> preparation to the introduction of 7c3 gpu which won't have a > >> revn. > >> > >> Signed-off-by: Akhil P Oommen > >> --- > >> > >> (no changes since v1) > >> > >> drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 11 +-- > >> 1 file changed, 5 insertions(+), 6 deletions(-) > >> > >> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > >> b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > >> index 183b9f9..0da1a66 100644 > >> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > >> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > >> @@ -1675,11 +1675,11 @@ static u32 a618_get_speed_bin(u32 fuse) > >> return UINT_MAX; > >> } > >> > >> -static u32 fuse_to_supp_hw(struct device *dev, u32 revn, u32 fuse) > >> +static u32 fuse_to_supp_hw(struct device *dev, struct adreno_rev rev, u32 > >> fuse) > >> { > >> u32 val = UINT_MAX; > >> > >> - if (revn == 618) > >> + if (adreno_cmp_rev(ADRENO_REV(6, 1, 8, ANY_ID), rev)) > > > > Looks like adreno_cmp_rev() ended up in patch 3/3 when it should have > > been in this patch.. > > > > But I guess we could also move this into adreno_is_a618() and use that here > > > > BR, > > -R > Ahh! I reordered the patches. This is too early in the probe sequence to > call adreno_is_axxx(), right? ahh, right, I guess you do still need to open code adreno_cmp_rev() here.. but you can at least move adreno_cmp_rev() into this patch. BR, -R > > -Akhil. 
> > > >> val = a618_get_speed_bin(fuse); > >> > >> if (val == UINT_MAX) { > >> @@ -1692,8 +1692,7 @@ static u32 fuse_to_supp_hw(struct device *dev, u32 > >> revn, u32 fuse) > >> return (1 << val); > >> } > >> > >> -static int a6xx_set_supported_hw(struct device *dev, struct a6xx_gpu > >> *a6xx_gpu, > >> - u32 revn) > >> +static int a6xx_set_supported_hw(struct device *dev, struct adreno_rev > >> rev) > >> { > >> u32 supp_hw = UINT_MAX; > >> u16 speedbin; > >> @@ -1714,7 +1713,7 @@ static int a6xx_set_supported_hw(struct device *dev, > >> struct a6xx_gpu *a6xx_gpu, > >> } > >> speedbin = le16_to_cpu(speedbin); > >> > >> - supp_hw = fuse_to_supp_hw(dev, revn, speedbin); > >> + supp_hw = fuse_to_supp_hw(dev, rev, speedbin); > >> > >> done: > >> ret = devm_pm_opp_set_supported_hw(dev, &supp_hw, 1); > >> @@ -1785,7 +1784,7 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev) > >> > >> a6xx_llc_slices_init(pdev, a6xx_gpu); > >> > >> - ret = a6xx_set_supported_hw(&pdev->dev, a6xx_gpu, info->revn); > >> + ret = a6xx_set_supported_hw(&pdev->dev, config->rev); > >> if (ret) { > >> a6xx_destroy(&(a6xx_gpu->base.base)); > >> return ERR_PTR(ret); > >> -- > >> QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member > >> of Code Aurora Forum, hosted by The Linux Foundation. > >> >
Re: [RFC PATCH] drm/msm: Introduce Adreno Features
On Thu, Jul 29, 2021 at 8:31 AM Akhil P Oommen wrote: > > Introduce a feature flag in gpulist to easily identify the capabilities > of each gpu revision. This will help to avoid a lot of adreno_is_axxx() > check when we add new features. In the current patch, HW APRIV feature > is converted to a feature flag. > > Signed-off-by: Akhil P Oommen > --- > This patch is rebased on top of the below series: > https://patchwork.freedesktop.org/series/93192/ > > drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 12 > drivers/gpu/drm/msm/adreno/adreno_device.c | 3 +++ > drivers/gpu/drm/msm/adreno/adreno_gpu.c| 3 +++ > drivers/gpu/drm/msm/adreno/adreno_gpu.h| 9 + > 4 files changed, 15 insertions(+), 12 deletions(-) > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > index 1881e09..b28305b 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > @@ -1765,7 +1765,6 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev) > struct msm_drm_private *priv = dev->dev_private; > struct platform_device *pdev = priv->gpu_pdev; > struct adreno_platform_config *config = pdev->dev.platform_data; > - const struct adreno_info *info; > struct device_node *node; > struct a6xx_gpu *a6xx_gpu; > struct adreno_gpu *adreno_gpu; > @@ -1781,17 +1780,6 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev) > > adreno_gpu->registers = NULL; > > - /* > -* We need to know the platform type before calling into > adreno_gpu_init > -* so that the hw_apriv flag can be correctly set. 
Snoop into the info > -* and grab the revision number > -*/ > - info = adreno_info(config->rev); > - > - if (info && (info->revn == 650 || info->revn == 660 || > - adreno_cmp_rev(ADRENO_REV(6, 3, 5, ANY_ID), > info->rev))) > - adreno_gpu->base.hw_apriv = true; > - > a6xx_llc_slices_init(pdev, a6xx_gpu); > > ret = a6xx_set_supported_hw(&pdev->dev, config->rev); > diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c > b/drivers/gpu/drm/msm/adreno/adreno_device.c > index 7b9d605..44321ec 100644 > --- a/drivers/gpu/drm/msm/adreno/adreno_device.c > +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c > @@ -276,6 +276,7 @@ static const struct adreno_info gpulist[] = { > .rev = ADRENO_REV(6, 5, 0, ANY_ID), > .revn = 650, > .name = "A650", > + .features = ADRENO_APRIV, I guess this should be: .features = BIT(ADRENO_APRIV), > .fw = { > [ADRENO_FW_SQE] = "a650_sqe.fw", > [ADRENO_FW_GMU] = "a650_gmu.bin", > @@ -289,6 +290,7 @@ static const struct adreno_info gpulist[] = { > .rev = ADRENO_REV(6, 6, 0, ANY_ID), > .revn = 660, > .name = "A660", > + .features = ADRENO_APRIV, > .fw = { > [ADRENO_FW_SQE] = "a660_sqe.fw", > [ADRENO_FW_GMU] = "a660_gmu.bin", > @@ -301,6 +303,7 @@ static const struct adreno_info gpulist[] = { > }, { > .rev = ADRENO_REV(6, 3, 5, ANY_ID), > .name = "Adreno 7c Gen 3", > + .features = ADRENO_APRIV, > .fw = { > [ADRENO_FW_SQE] = "a660_sqe.fw", > [ADRENO_FW_GMU] = "a660_gmu.bin", > diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c > b/drivers/gpu/drm/msm/adreno/adreno_gpu.c > index 9f5a302..e8acadf5 100644 > --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c > +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c > @@ -945,6 +945,9 @@ int adreno_gpu_init(struct drm_device *drm, struct > platform_device *pdev, > pm_runtime_use_autosuspend(dev); > pm_runtime_enable(dev); > > + if (ADRENO_FEAT(adreno_gpu, ADRENO_APRIV)) > + adreno_gpu->base.hw_apriv = true; > + > return msm_gpu_init(drm, pdev, &adreno_gpu->base, &funcs->base, > adreno_gpu->info->name, 
&adreno_gpu_config); > } > diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h > b/drivers/gpu/drm/msm/adreno/adreno_gpu.h > index 50b4d53..61797c3 100644 > --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h > +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h > @@ -35,6 +35,11 @@ enum adreno_quirks { > ADRENO_QUIRK_LMLOADKILL_DISABLE = 3, > }; > > +enum adreno_features { > + /* ADRENO has HW APRIV feature */ > + ADRENO_APRIV, > +}; > + > struct adreno_rev { > uint8_t core; > uint8_t major; > @@ -63,6 +68,7 @@ struct adreno_info { > struct adreno_rev rev; > uint32_t revn; > const char *name; > + u32 features; > const char *fw[ADRENO_FW_MAX]; > uint32_t gmem; > enum adreno_quirks quirks; > @@ -388,6 +394,9 @@ stati
Re: [RFC 1/4] dma-fence: Add deadline awareness
On Thu, Jul 29, 2021 at 9:18 AM Daniel Vetter wrote: > > On Thu, Jul 29, 2021 at 5:19 PM Rob Clark wrote: > > > > On Thu, Jul 29, 2021 at 12:03 AM Daniel Vetter wrote: > > > > > > On Wed, Jul 28, 2021 at 10:58:51AM -0700, Rob Clark wrote: > > > > On Wed, Jul 28, 2021 at 10:23 AM Christian König > > > > wrote: > > > > > > > > > > > > > > > > > > > > Am 28.07.21 um 17:15 schrieb Rob Clark: > > > > > > On Wed, Jul 28, 2021 at 4:37 AM Christian König > > > > > > wrote: > > > > > >> Am 28.07.21 um 09:03 schrieb Christian König: > > > > > >>> Am 27.07.21 um 16:25 schrieb Rob Clark: > > > > > >>>> On Tue, Jul 27, 2021 at 12:11 AM Christian König > > > > > >>>> wrote: > > > > > >>>>> Am 27.07.21 um 01:38 schrieb Rob Clark: > > > > > >>>>>> From: Rob Clark > > > > > >>>>>> > > > > > >>>>>> Add a way to hint to the fence signaler of an upcoming > > > > > >>>>>> deadline, > > > > > >>>>>> such as > > > > > >>>>>> vblank, which the fence waiter would prefer not to miss. This > > > > > >>>>>> is to > > > > > >>>>>> aid > > > > > >>>>>> the fence signaler in making power management decisions, like > > > > > >>>>>> boosting > > > > > >>>>>> frequency as the deadline approaches and awareness of missing > > > > > >>>>>> deadlines > > > > > >>>>>> so that can be factored in to the frequency scaling. 
> > > > > >>>>>> > > > > > >>>>>> Signed-off-by: Rob Clark > > > > > >>>>>> --- > > > > > >>>>>> drivers/dma-buf/dma-fence.c | 39 > > > > > >>>>>> + > > > > > >>>>>> include/linux/dma-fence.h | 17 > > > > > >>>>>> 2 files changed, 56 insertions(+) > > > > > >>>>>> > > > > > >>>>>> diff --git a/drivers/dma-buf/dma-fence.c > > > > > >>>>>> b/drivers/dma-buf/dma-fence.c > > > > > >>>>>> index ce0f5eff575d..2e0d25ab457e 100644 > > > > > >>>>>> --- a/drivers/dma-buf/dma-fence.c > > > > > >>>>>> +++ b/drivers/dma-buf/dma-fence.c > > > > > >>>>>> @@ -910,6 +910,45 @@ dma_fence_wait_any_timeout(struct > > > > > >>>>>> dma_fence > > > > > >>>>>> **fences, uint32_t count, > > > > > >>>>>> } > > > > > >>>>>> EXPORT_SYMBOL(dma_fence_wait_any_timeout); > > > > > >>>>>> > > > > > >>>>>> + > > > > > >>>>>> +/** > > > > > >>>>>> + * dma_fence_set_deadline - set desired fence-wait deadline > > > > > >>>>>> + * @fence:the fence that is to be waited on > > > > > >>>>>> + * @deadline: the time by which the waiter hopes for the > > > > > >>>>>> fence to be > > > > > >>>>>> + *signaled > > > > > >>>>>> + * > > > > > >>>>>> + * Inform the fence signaler of an upcoming deadline, such as > > > > > >>>>>> vblank, by > > > > > >>>>>> + * which point the waiter would prefer the fence to be > > > > > >>>>>> signaled > > > > > >>>>>> by. This > > > > > >>>>>> + * is intended to give feedback to the fence signaler to aid > > > > > >>>>>> in power > > > > > >>>>>> + * management decisions, such as boosting GPU frequency if a > > > > > >>>>>> periodic > > > > > >>>>>> + * vblank deadline is approaching. > > > > > >>>>>> +
Re: [PATCH v3 1/2] arm64: dts: qcom: sc7280: Add gpu support
On Thu, Jul 29, 2021 at 10:19 AM Stephen Boyd wrote: > > Quoting Akhil P Oommen (2021-07-28 04:54:01) > > diff --git a/arch/arm64/boot/dts/qcom/sc7280.dtsi > > b/arch/arm64/boot/dts/qcom/sc7280.dtsi > > index 029723a..c88f366 100644 > > --- a/arch/arm64/boot/dts/qcom/sc7280.dtsi > > +++ b/arch/arm64/boot/dts/qcom/sc7280.dtsi > > @@ -592,6 +593,85 @@ > > qcom,bcm-voters = <&apps_bcm_voter>; > > }; > > > > + gpu@3d0 { > > + compatible = "qcom,adreno-635.0", "qcom,adreno"; > > + #stream-id-cells = <16>; > > + reg = <0 0x03d0 0 0x4>, > > + <0 0x03d9e000 0 0x1000>, > > + <0 0x03d61000 0 0x800>; > > + reg-names = "kgsl_3d0_reg_memory", > > + "cx_mem", > > + "cx_dbgc"; > > + interrupts = ; > > + iommus = <&adreno_smmu 0 0x401>; > > + operating-points-v2 = <&gpu_opp_table>; > > + qcom,gmu = <&gmu>; > > + interconnects = <&gem_noc MASTER_GFX3D 0 &mc_virt > > SLAVE_EBI1 0>; > > + interconnect-names = "gfx-mem"; > > + > > + gpu_opp_table: opp-table { > > + compatible = "operating-points-v2"; > > + > > + opp-55000 { > > + opp-hz = /bits/ 64 <55000>; > > + opp-level = > > ; > > + opp-peak-kBps = <6832000>; > > + }; > > + > > + opp-45000 { > > Why is 45000 after 55000? Is it on purpose? If not intended > please sort by frequency. We've used descending order, at least for gpu opp table, on other gens, fwiw.. not sure if that just means we were doing it wrong previously BR, -R > > > + opp-hz = /bits/ 64 <45000>; > > + opp-level = > > ; > > + opp-peak-kBps = <4068000>; > > + }; > > + > > + opp-31500 { > > + opp-hz = /bits/ 64 <31500>; > > + opp-level = > > ; > > + opp-peak-kBps = <1804000>; > > + }; > > + }; > > + }; > > +
[PATCH] drm/msm: Disable frequency clamping on a630
From: Rob Clark The more frequent frequency transitions resulting from clamping freq to minimum when the GPU is idle seems to be causing some issue with the bus getting voted off when it should be on. (An enable racing with an async disable?) This might be a problem outside of the GPU, as I can't reproduce this on a618 which uses the same GMU fw and same mechanism to communicate with GMU to set opp. For now, just revert to previous devfreq behavior on a630 until the issue is understood. Reported-by: Caleb Connolly Fixes: 9bc95570175a ("drm/msm: Devfreq tuning") Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 3 +++ drivers/gpu/drm/msm/msm_gpu.h | 2 ++ drivers/gpu/drm/msm/msm_gpu_devfreq.c | 12 3 files changed, 17 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 748665232d29..9fd08b413010 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -945,6 +945,9 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, pm_runtime_use_autosuspend(dev); pm_runtime_enable(dev); + if (adreno_is_a630(adreno_gpu)) + gpu->devfreq.disable_freq_clamping = true; + return msm_gpu_init(drm, pdev, &adreno_gpu->base, &funcs->base, adreno_gpu->info->name, &adreno_gpu_config); } diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 0e4b45bff2e6..7e11b667f939 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -112,6 +112,8 @@ struct msm_gpu_devfreq { * it is inactive. 
*/ unsigned long idle_freq; + + bool disable_freq_clamping; }; struct msm_gpu { diff --git a/drivers/gpu/drm/msm/msm_gpu_devfreq.c b/drivers/gpu/drm/msm/msm_gpu_devfreq.c index 0a1ee20296a2..a832af436251 100644 --- a/drivers/gpu/drm/msm/msm_gpu_devfreq.c +++ b/drivers/gpu/drm/msm/msm_gpu_devfreq.c @@ -94,6 +94,12 @@ void msm_devfreq_init(struct msm_gpu *gpu) if (!gpu->funcs->gpu_busy) return; + /* Revert to previous polling interval if we aren't using freq clamping +* to preserve previous behavior +*/ + if (gpu->devfreq.disable_freq_clamping) + msm_devfreq_profile.polling_ms = 10; + msm_devfreq_profile.initial_freq = gpu->fast_rate; /* @@ -151,6 +157,9 @@ void msm_devfreq_active(struct msm_gpu *gpu) unsigned int idle_time; unsigned long target_freq = df->idle_freq; + if (gpu->devfreq.disable_freq_clamping) + return; + /* * Hold devfreq lock to synchronize with get_dev_status()/ * target() callbacks @@ -186,6 +195,9 @@ void msm_devfreq_idle(struct msm_gpu *gpu) struct msm_gpu_devfreq *df = &gpu->devfreq; unsigned long idle_freq, target_freq = 0; + if (gpu->devfreq.disable_freq_clamping) + return; + /* * Hold devfreq lock to synchronize with get_dev_status()/ * target() callbacks -- 2.31.1
Re: [PATCH] drm/msm: Disable frequency clamping on a630
On Thu, Jul 29, 2021 at 1:06 PM Caleb Connolly wrote: > > Hi Rob, > > I've done some more testing! It looks like before that patch ("drm/msm: > Devfreq tuning") the GPU would never get above > the second frequency in the OPP table (342MHz) (at least, not in glxgears). > With the patch applied it would more > aggressively jump up to the max frequency which seems to be unstable at the > default regulator voltages. *ohh*, yeah, ok, that would explain it > Hacking the pm8005 s1 regulator (which provides VDD_GFX) up to 0.988v > (instead of the stock 0.516v) makes the GPU stable > at the higher frequencies. > > Applying this patch reverts the behaviour, and the GPU never goes above > 342MHz in glxgears, losing ~30% performance in > glxgear. > > I think (?) that enabling CPR support would be the proper solution to this - > that would ensure that the regulators run > at the voltage the hardware needs to be stable. > > Is hacking the voltage higher (although ideally not quite that high) an > acceptable short term solution until we have > CPR? Or would it be safer to just not make use of the higher frequencies on > a630 for now? > tbh, I'm not sure about the regulator stuff and CPR.. Bjorn is already on CC and I added sboyd, maybe one of them knows better. In the short term, removing the higher problematic OPPs from dts might be a better option than this patch (which I'm dropping), since there is nothing stopping other workloads from hitting higher OPPs. I'm slightly curious why I didn't have problems at higher OPPs on my c630 laptop (sdm850) BR, -R > > On 29/07/2021 19:39, Rob Clark wrote: > > From: Rob Clark > > > > The more frequent frequency transitions resulting from clamping freq to > > minimum when the GPU is idle seems to be causing some issue with the bus > > getting voted off when it should be on. (An enable racing with an async > > disable?) 
This might be a problem outside of the GPU, as I can't > > reproduce this on a618 which uses the same GMU fw and same mechanism to > > communicate with GMU to set opp. For now, just revert to previous > > devfreq behavior on a630 until the issue is understood. > > > > Reported-by: Caleb Connolly > > Fixes: 9bc95570175a ("drm/msm: Devfreq tuning") > > Signed-off-by: Rob Clark > > --- > > drivers/gpu/drm/msm/adreno/adreno_gpu.c | 3 +++ > > drivers/gpu/drm/msm/msm_gpu.h | 2 ++ > > drivers/gpu/drm/msm/msm_gpu_devfreq.c | 12 > > 3 files changed, 17 insertions(+) > > > > diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c > > b/drivers/gpu/drm/msm/adreno/adreno_gpu.c > > index 748665232d29..9fd08b413010 100644 > > --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c > > +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c > > @@ -945,6 +945,9 @@ int adreno_gpu_init(struct drm_device *drm, struct > > platform_device *pdev, > > pm_runtime_use_autosuspend(dev); > > pm_runtime_enable(dev); > > > > + if (adreno_is_a630(adreno_gpu)) > > + gpu->devfreq.disable_freq_clamping = true; > > + > > return msm_gpu_init(drm, pdev, &adreno_gpu->base, &funcs->base, > > adreno_gpu->info->name, &adreno_gpu_config); > > } > > diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h > > index 0e4b45bff2e6..7e11b667f939 100644 > > --- a/drivers/gpu/drm/msm/msm_gpu.h > > +++ b/drivers/gpu/drm/msm/msm_gpu.h > > @@ -112,6 +112,8 @@ struct msm_gpu_devfreq { > >* it is inactive. 
> >*/ > > unsigned long idle_freq; > > + > > + bool disable_freq_clamping; > > }; > > > > struct msm_gpu { > > diff --git a/drivers/gpu/drm/msm/msm_gpu_devfreq.c > > b/drivers/gpu/drm/msm/msm_gpu_devfreq.c > > index 0a1ee20296a2..a832af436251 100644 > > --- a/drivers/gpu/drm/msm/msm_gpu_devfreq.c > > +++ b/drivers/gpu/drm/msm/msm_gpu_devfreq.c > > @@ -94,6 +94,12 @@ void msm_devfreq_init(struct msm_gpu *gpu) > > if (!gpu->funcs->gpu_busy) > > return; > > > > + /* Revert to previous polling interval if we aren't using freq > > clamping > > + * to preserve previous behavior > > + */ > > + if (gpu->devfreq.disable_freq_clamping) > > + msm_devfreq_profile.polling_ms = 10; > > + > > msm_devfreq_profile.initial_freq = gpu->fast_rate; > > > > /* > > @@ -151,6 +157,9 @@ void msm_devfreq_active(struct msm_gpu *gpu) > > unsigne
Re: [PATCH] drm/msm: Disable frequency clamping on a630
On Thu, Jul 29, 2021 at 1:28 PM Caleb Connolly wrote: > > > > On 29/07/2021 21:24, Rob Clark wrote: > > On Thu, Jul 29, 2021 at 1:06 PM Caleb Connolly > > wrote: > >> > >> Hi Rob, > >> > >> I've done some more testing! It looks like before that patch ("drm/msm: > >> Devfreq tuning") the GPU would never get above > >> the second frequency in the OPP table (342MHz) (at least, not in > >> glxgears). With the patch applied it would more > >> aggressively jump up to the max frequency which seems to be unstable at > >> the default regulator voltages. > > > > *ohh*, yeah, ok, that would explain it > > > >> Hacking the pm8005 s1 regulator (which provides VDD_GFX) up to 0.988v > >> (instead of the stock 0.516v) makes the GPU stable > >> at the higher frequencies. > >> > >> Applying this patch reverts the behaviour, and the GPU never goes above > >> 342MHz in glxgears, losing ~30% performance in > >> glxgear. > >> > >> I think (?) that enabling CPR support would be the proper solution to this > >> - that would ensure that the regulators run > >> at the voltage the hardware needs to be stable. > >> > >> Is hacking the voltage higher (although ideally not quite that high) an > >> acceptable short term solution until we have > >> CPR? Or would it be safer to just not make use of the higher frequencies > >> on a630 for now? > >> > > > > tbh, I'm not sure about the regulator stuff and CPR.. Bjorn is already > > on CC and I added sboyd, maybe one of them knows better. > > > > In the short term, removing the higher problematic OPPs from dts might > > be a better option than this patch (which I'm dropping), since there > > is nothing stopping other workloads from hitting higher OPPs. > Oh yeah that sounds like a more sensible workaround than mine . > > > > I'm slightly curious why I didn't have problems at higher OPPs on my > > c630 laptop (sdm850) > Perhaps you won the sillicon lottery - iirc sdm850 is binned for higher > clocks as is out of the factory. 
> > Would it be best to drop the OPPs for all devices? Or just those affected? I > guess it's possible another c630 might > crash where yours doesn't? I've not heard any reports of similar issues from the handful of other folks with c630's on #aarch64-laptops.. but I can't really say if that is luck or not. Maybe just remove it for affected devices? But I'll defer to Bjorn. BR, -R
Re: [PATCH] drm: msm: Add 680 gpu to the adreno gpu list
On Sat, Jul 24, 2021 at 8:21 PM Bjorn Andersson wrote: > > This patch adds a Adreno 680 entry to the gpulist. Looks reasonable, but I wonder if we should just go ahead and add adreno_is_a640_family() in a similar vein to adreno_is_a650_familiy()/adreno_is_a660_family().. I think most of the 'if (a640) ...' should also apply to a680? BR, -R > Signed-off-by: Bjorn Andersson > --- > drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 5 +++-- > drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 12 +++- > drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 2 +- > drivers/gpu/drm/msm/adreno/adreno_device.c | 13 + > drivers/gpu/drm/msm/adreno/adreno_gpu.h| 5 + > 5 files changed, 29 insertions(+), 8 deletions(-) > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > index b349692219b7..1c0d75e1189f 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > @@ -521,7 +521,8 @@ static void a6xx_gmu_rpmh_init(struct a6xx_gmu *gmu) > > if (adreno_is_a650(adreno_gpu) || adreno_is_a660(adreno_gpu)) > pdc_in_aop = true; > - else if (adreno_is_a618(adreno_gpu) || adreno_is_a640(adreno_gpu)) > + else if (adreno_is_a618(adreno_gpu) || adreno_is_a640(adreno_gpu) || > +adreno_is_a680(adreno_gpu)) > pdc_address_offset = 0x30090; > else > pdc_address_offset = 0x30080; > @@ -1522,7 +1523,7 @@ int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct > device_node *node) > SZ_16M - SZ_16K, 0x04000); > if (ret) > goto err_memory; > - } else if (adreno_is_a640(adreno_gpu)) { > + } else if (adreno_is_a640(adreno_gpu) || adreno_is_a680(adreno_gpu)) { > ret = a6xx_gmu_memory_alloc(gmu, &gmu->icache, > SZ_256K - SZ_16K, 0x04000); > if (ret) > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > index 9c5e4618aa0a..5cdafc6c8bb0 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > @@ -683,7 +683,7 @@ static void a6xx_set_ubwc_config(struct msm_gpu *gpu) > if 
(adreno_is_a618(adreno_gpu)) > return; > > - if (adreno_is_a640(adreno_gpu)) > + if (adreno_is_a640(adreno_gpu) || adreno_is_a680(adreno_gpu)) > amsbc = 1; > > if (adreno_is_a650(adreno_gpu) || adreno_is_a660(adreno_gpu)) { > @@ -757,7 +757,7 @@ static bool a6xx_ucode_check_version(struct a6xx_gpu > *a6xx_gpu, > * a660 targets have all the critical security fixes from the start > */ > if (adreno_is_a618(adreno_gpu) || adreno_is_a630(adreno_gpu) || > - adreno_is_a640(adreno_gpu)) { > + adreno_is_a640(adreno_gpu) || adreno_is_a680(adreno_gpu)) { > /* > * If the lowest nibble is 0xa that is an indication that this > * microcode has been patched. The actual version is in dword > @@ -897,7 +897,8 @@ static int a6xx_hw_init(struct msm_gpu *gpu) > a6xx_set_hwcg(gpu, true); > > /* VBIF/GBIF start*/ > - if (adreno_is_a640(adreno_gpu) || adreno_is_a650_family(adreno_gpu)) { > + if (adreno_is_a640(adreno_gpu) || adreno_is_a650_family(adreno_gpu) || > + adreno_is_a680(adreno_gpu)) { > gpu_write(gpu, REG_A6XX_GBIF_QSB_SIDE0, 0x00071620); > gpu_write(gpu, REG_A6XX_GBIF_QSB_SIDE1, 0x00071620); > gpu_write(gpu, REG_A6XX_GBIF_QSB_SIDE2, 0x00071620); > @@ -935,7 +936,8 @@ static int a6xx_hw_init(struct msm_gpu *gpu) > gpu_write(gpu, REG_A6XX_UCHE_FILTER_CNTL, 0x804); > gpu_write(gpu, REG_A6XX_UCHE_CACHE_WAYS, 0x4); > > - if (adreno_is_a640(adreno_gpu) || adreno_is_a650_family(adreno_gpu)) > + if (adreno_is_a640(adreno_gpu) || adreno_is_a650_family(adreno_gpu) || > + adreno_is_a680(adreno_gpu)) > gpu_write(gpu, REG_A6XX_CP_ROQ_THRESHOLDS_2, 0x02000140); > else > gpu_write(gpu, REG_A6XX_CP_ROQ_THRESHOLDS_2, 0x01c0); > @@ -952,7 +954,7 @@ static int a6xx_hw_init(struct msm_gpu *gpu) > */ > if (adreno_is_a650(adreno_gpu) || adreno_is_a660(adreno_gpu)) > gpu_write(gpu, REG_A6XX_PC_DBG_ECO_CNTL, 0x00300200); > - else if (adreno_is_a640(adreno_gpu)) > + else if (adreno_is_a640(adreno_gpu) || adreno_is_a680(adreno_gpu)) > gpu_write(gpu, REG_A6XX_PC_DBG_ECO_CNTL, 0x00200200); > else > 
gpu_write(gpu, REG_A6XX_PC_DBG_ECO_CNTL, 0x0018); > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c > b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c > index 919433732b43..df8af237cf6a 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c > +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c > @@ -428,7 +428,7 @@ static in
Re: [Freedreno] [PATCH] drm: msm: Add 680 gpu to the adreno gpu list
On Thu, Jul 29, 2021 at 8:53 PM Akhil P Oommen wrote: > > On 7/30/2021 5:38 AM, Rob Clark wrote: > > On Sat, Jul 24, 2021 at 8:21 PM Bjorn Andersson > > wrote: > >> > >> This patch adds a Adreno 680 entry to the gpulist. > > > > Looks reasonable, but I wonder if we should just go ahead and add > > adreno_is_a640_family() in a similar vein to > > adreno_is_a650_familiy()/adreno_is_a660_family().. I think most of the > > 'if (a640) ...' should also apply to a680? > > If there is no delta, wouldn't it be better to simply add a680 to > adreno_is_a640? Until we move to features flags (and if needed, other config table params.. I know UMD needs more than just booleans but that may not be the case on kernel side), I'd kinda prefer "_family()" to make it clear that it applies to more than just a single gpu but a sub-generation of a6xx.. I'd kinda assume the differences in memory configuration (iirc dual vs quad ddr interfaces) have some small differences between a640 and a680 that matters somewhere on the kernel side? BR, -R > -Akhil. 
> > > > > BR, > > -R > > > >> Signed-off-by: Bjorn Andersson > >> --- > >> drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 5 +++-- > >> drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 12 +++- > >> drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 2 +- > >> drivers/gpu/drm/msm/adreno/adreno_device.c | 13 + > >> drivers/gpu/drm/msm/adreno/adreno_gpu.h| 5 + > >> 5 files changed, 29 insertions(+), 8 deletions(-) > >> > >> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > >> b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > >> index b349692219b7..1c0d75e1189f 100644 > >> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > >> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > >> @@ -521,7 +521,8 @@ static void a6xx_gmu_rpmh_init(struct a6xx_gmu *gmu) > >> > >> if (adreno_is_a650(adreno_gpu) || adreno_is_a660(adreno_gpu)) > >> pdc_in_aop = true; > >> - else if (adreno_is_a618(adreno_gpu) || adreno_is_a640(adreno_gpu)) > >> + else if (adreno_is_a618(adreno_gpu) || adreno_is_a640(adreno_gpu) > >> || > >> +adreno_is_a680(adreno_gpu)) > >> pdc_address_offset = 0x30090; > >> else > >> pdc_address_offset = 0x30080; > >> @@ -1522,7 +1523,7 @@ int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct > >> device_node *node) > >> SZ_16M - SZ_16K, 0x04000); > >> if (ret) > >> goto err_memory; > >> - } else if (adreno_is_a640(adreno_gpu)) { > >> + } else if (adreno_is_a640(adreno_gpu) || > >> adreno_is_a680(adreno_gpu)) { > >> ret = a6xx_gmu_memory_alloc(gmu, &gmu->icache, > >> SZ_256K - SZ_16K, 0x04000); > >> if (ret) > >> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > >> b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > >> index 9c5e4618aa0a..5cdafc6c8bb0 100644 > >> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > >> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > >> @@ -683,7 +683,7 @@ static void a6xx_set_ubwc_config(struct msm_gpu *gpu) > >> if (adreno_is_a618(adreno_gpu)) > >> return; > >> > >> - if (adreno_is_a640(adreno_gpu)) > >> + if (adreno_is_a640(adreno_gpu) || adreno_is_a680(adreno_gpu)) > >> amsbc = 1; > >> > >> if 
(adreno_is_a650(adreno_gpu) || adreno_is_a660(adreno_gpu)) { > >> @@ -757,7 +757,7 @@ static bool a6xx_ucode_check_version(struct a6xx_gpu > >> *a6xx_gpu, > >> * a660 targets have all the critical security fixes from the > >> start > >> */ > >> if (adreno_is_a618(adreno_gpu) || adreno_is_a630(adreno_gpu) || > >> - adreno_is_a640(adreno_gpu)) { > >> + adreno_is_a640(adreno_gpu) || adreno_is_a680(adreno_gpu)) { > >> /* > >> * If the lowest nibble is 0xa that is an indication that > >> this > >> * microcode has been patched. The actual version is in > >> dword > >> @@ -897,7 +897,8 @@ static int a6xx_hw_init(struct msm_gpu *gpu) > >> a6xx_set_hwcg(gpu, true); > >> > >> /* VBIF/GBIF start*/ >
Re: [Freedreno] [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
On Mon, Aug 2, 2021 at 3:55 AM Will Deacon wrote: > > On Thu, Jul 29, 2021 at 10:08:22AM +0530, Sai Prakash Ranjan wrote: > > On 2021-07-28 19:30, Georgi Djakov wrote: > > > On Mon, Jan 11, 2021 at 07:45:02PM +0530, Sai Prakash Ranjan wrote: > > > > commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag") > > > > removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went > > > > the memory type setting required for the non-coherent masters to use > > > > system cache. Now that system cache support for GPU is added, we will > > > > need to set the right PTE attribute for GPU buffers to be sys cached. > > > > Without this, the system cache lines are not allocated for GPU. > > > > > > > > So the patches in this series introduces a new prot flag IOMMU_LLC, > > > > renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC > > > > and makes GPU the user of this protection flag. > > > > > > Thank you for the patchset! Are you planning to refresh it, as it does > > > not apply anymore? > > > > > > > I was waiting on Will's reply [1]. If there are no changes needed, then > > I can repost the patch. > > I still think you need to handle the mismatched alias, no? You're adding > a new memory type to the SMMU which doesn't exist on the CPU side. That > can't be right. > Just curious, and maybe this is a dumb question, but what is your concern about mismatched aliases? I mean the cache hierarchy on the GPU device side (anything beyond the LLC) is pretty different and doesn't really care about the smmu pgtable attributes.. BR, -R
Re: [Freedreno] [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache
On Mon, Aug 2, 2021 at 8:14 AM Will Deacon wrote: > > On Mon, Aug 02, 2021 at 08:08:07AM -0700, Rob Clark wrote: > > On Mon, Aug 2, 2021 at 3:55 AM Will Deacon wrote: > > > > > > On Thu, Jul 29, 2021 at 10:08:22AM +0530, Sai Prakash Ranjan wrote: > > > > On 2021-07-28 19:30, Georgi Djakov wrote: > > > > > On Mon, Jan 11, 2021 at 07:45:02PM +0530, Sai Prakash Ranjan wrote: > > > > > > commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY > > > > > > flag") > > > > > > removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went > > > > > > the memory type setting required for the non-coherent masters to use > > > > > > system cache. Now that system cache support for GPU is added, we > > > > > > will > > > > > > need to set the right PTE attribute for GPU buffers to be sys > > > > > > cached. > > > > > > Without this, the system cache lines are not allocated for GPU. > > > > > > > > > > > > So the patches in this series introduces a new prot flag IOMMU_LLC, > > > > > > renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC > > > > > > and makes GPU the user of this protection flag. > > > > > > > > > > Thank you for the patchset! Are you planning to refresh it, as it does > > > > > not apply anymore? > > > > > > > > > > > > > I was waiting on Will's reply [1]. If there are no changes needed, then > > > > I can repost the patch. > > > > > > I still think you need to handle the mismatched alias, no? You're adding > > > a new memory type to the SMMU which doesn't exist on the CPU side. That > > > can't be right. > > > > > > > Just curious, and maybe this is a dumb question, but what is your > > concern about mismatched aliases? I mean the cache hierarchy on the > > GPU device side (anything beyond the LLC) is pretty different and > > doesn't really care about the smmu pgtable attributes.. 
> > If the CPU accesses a shared buffer with different attributes to those which > the device is using then you fall into the "mismatched memory attributes" > part of the Arm architecture. It's reasonably unforgiving (you should go and > read it) and in some cases can apply to speculative accesses as well, but > the end result is typically loss of coherency. Ok, I might have a few other sections to read first to decipher the terminology.. But my understanding of LLC is that it looks just like system memory to the CPU and GPU (I think that would make it "the point of coherence" between the GPU and CPU?) If that is true, shouldn't it be invisible from the point of view of different CPU mapping options? BR, -R
Re: [PATCH v2 07/14] drm/msm: Convert to Linux IRQ interfaces
On Tue, Aug 3, 2021 at 2:37 AM Dmitry Baryshkov wrote: > > On 03/08/2021 12:06, Thomas Zimmermann wrote: > > Drop the DRM IRQ midlayer in favor of Linux IRQ interfaces. DRM's > > IRQ helpers are mostly useful for UMS drivers. Modern KMS drivers > > don't benefit from using it. > > > > DRM IRQ callbacks are now being called directly or inlined. > > > > Signed-off-by: Thomas Zimmermann > > Reviewed-by: Dmitry Baryshkov > > Rob should probably also give his blessing on this patch though. It looks ok.. I can't really test it this week, but it should be pretty obvious if it wasn't working BR, -R > > > --- > > drivers/gpu/drm/msm/msm_drv.c | 113 -- > > drivers/gpu/drm/msm/msm_kms.h | 2 +- > > 2 files changed, 69 insertions(+), 46 deletions(-) > > > > diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c > > index 1594ae39d54f..a332b09a5a11 100644 > > --- a/drivers/gpu/drm/msm/msm_drv.c > > +++ b/drivers/gpu/drm/msm/msm_drv.c > > @@ -14,7 +14,6 @@ > > #include > > #include > > #include > > -#include > > #include > > #include > > #include > > @@ -201,6 +200,71 @@ void msm_rmw(void __iomem *addr, u32 mask, u32 or) > > msm_writel(val | or, addr); > > } > > > > +static irqreturn_t msm_irq(int irq, void *arg) > > +{ > > + struct drm_device *dev = arg; > > + struct msm_drm_private *priv = dev->dev_private; > > + struct msm_kms *kms = priv->kms; > > + > > + BUG_ON(!kms); > > + > > + return kms->funcs->irq(kms); > > +} > > + > > +static void msm_irq_preinstall(struct drm_device *dev) > > +{ > > + struct msm_drm_private *priv = dev->dev_private; > > + struct msm_kms *kms = priv->kms; > > + > > + BUG_ON(!kms); > > + > > + kms->funcs->irq_preinstall(kms); > > +} > > + > > +static int msm_irq_postinstall(struct drm_device *dev) > > +{ > > + struct msm_drm_private *priv = dev->dev_private; > > + struct msm_kms *kms = priv->kms; > > + > > + BUG_ON(!kms); > > + > > + if (kms->funcs->irq_postinstall) > > + return kms->funcs->irq_postinstall(kms); > > + > > + return 
0; > > +} > > + > > +static int msm_irq_install(struct drm_device *dev, unsigned int irq) > > +{ > > + int ret; > > + > > + if (irq == IRQ_NOTCONNECTED) > > + return -ENOTCONN; > > + > > + msm_irq_preinstall(dev); > > + > > + ret = request_irq(irq, msm_irq, 0, dev->driver->name, dev); > > + if (ret) > > + return ret; > > + > > + ret = msm_irq_postinstall(dev); > > + if (ret) { > > + free_irq(irq, dev); > > + return ret; > > + } > > + > > + return 0; > > +} > > + > > +static void msm_irq_uninstall(struct drm_device *dev) > > +{ > > + struct msm_drm_private *priv = dev->dev_private; > > + struct msm_kms *kms = priv->kms; > > + > > + kms->funcs->irq_uninstall(kms); > > + free_irq(kms->irq, dev); > > +} > > + > > struct msm_vblank_work { > > struct work_struct work; > > int crtc_id; > > @@ -265,7 +329,7 @@ static int msm_drm_uninit(struct device *dev) > > } > > > > /* We must cancel and cleanup any pending vblank enable/disable > > - * work before drm_irq_uninstall() to avoid work re-enabling an > > + * work before msm_irq_uninstall() to avoid work re-enabling an > >* irq after uninstall has disabled it. 
> >*/ > > > > @@ -294,7 +358,7 @@ static int msm_drm_uninit(struct device *dev) > > drm_mode_config_cleanup(ddev); > > > > pm_runtime_get_sync(dev); > > - drm_irq_uninstall(ddev); > > + msm_irq_uninstall(ddev); > > pm_runtime_put_sync(dev); > > > > if (kms && kms->funcs) > > @@ -553,7 +617,7 @@ static int msm_drm_init(struct device *dev, const > > struct drm_driver *drv) > > > > if (kms) { > > pm_runtime_get_sync(dev); > > - ret = drm_irq_install(ddev, kms->irq); > > + ret = msm_irq_install(ddev, kms->irq); > > pm_runtime_put_sync(dev); > > if (ret < 0) { > > DRM_DEV_ERROR(dev, "failed to install IRQ handler\n"); > > @@ -662,43 +726,6 @@ static void msm_postclose(struct drm_device *dev, > > struct drm_file *file) > > context_close(ctx); > > } > > > > -static irqreturn_t msm_irq(int irq, void *arg) > > -{ > > - struct drm_device *dev = arg; > > - struct msm_drm_private *priv = dev->dev_private; > > - struct msm_kms *kms = priv->kms; > > - BUG_ON(!kms); > > - return kms->funcs->irq(kms); > > -} > > - > > -static void msm_irq_preinstall(struct drm_device *dev) > > -{ > > - struct msm_drm_private *priv = dev->dev_private; > > - struct msm_kms *kms = priv->kms; > > - BUG_ON(!kms); > > - kms->funcs->irq_preinstall(kms); > > -} > > - > > -static int msm_irq_postinstall(struct drm_device *dev) > > -{ > > - struct msm_drm_private *priv = dev->dev_private; > > - struct msm_kms *kms = priv->kms; >
Re: [PATCH 0/8] drm/msm: Swappable GEM objects
On Thu, Apr 8, 2021 at 4:15 AM Daniel Vetter wrote: > > On Mon, Apr 05, 2021 at 10:45:23AM -0700, Rob Clark wrote: > > From: Rob Clark > > > > One would normally hope not to be under enough memory pressure to need > > to swap GEM objects to disk backed swap. But memory backed zram swap > > (as enabled on chromebooks, for example) can actually be quite fast > > and useful on devices with less RAM. On a 4GB device, opening up ~4 > > memory intensive web pages (in separate windows rather than tabs, to try > > and prevent tab discard), I see ~500MB worth of GEM objects, of which > > maybe only 10% are active at any time, and with unpin/evict enabled, > > only about half resident (which is a number that gets much lower if you > > simulate extreme memory pressure). Assuming a 2:1 compression ratio (I > > see a bit higher in practice, but cannot isolate swapped out GEM pages > > vs other), that is like having an extra 100+MB of RAM, or more under > > higher memory pressure. > > > > Rob Clark (8): > > drm/msm: ratelimit GEM related WARN_ON()s > > drm/msm: Reorganize msm_gem_shrinker_scan() > > drm/msm: Clear msm_obj->sgt in put_pages() > > drm/msm: Split iova purge and close > > drm/msm: Add $debugfs/gem stats on resident objects > > drm/msm: Track potentially evictable objects > > drm/msm: Small msm_gem_purge() fix > > drm/msm: Support evicting GEM objects to swap > > Given how much entertainement shrinkers are, should we aim for more common > code here? > > Christian has tons of fun with adding something like this for ttm (well > different shades of grey). i915 is going to adopt ttm, at least for > discrete. > > The locking is also an utter pain, and msm seems to still live a lot in > its own land here. I think as much as possible a standard approach here > would be really good, ideally maybe as building blocks shared between ttm > and gem-shmem drivers ... I don't disagree.. but also replacing the engines on an airplane mid-flight isn't a great option either.. 
;-) The hard part (esp. wrt to locking) is tracking the state of a given bo.. ie. is it active, active+purgable, inactive+purgable, inactive+unpinnable, etc. Currently the shmem helpers don't really provide anything here. If they did, I suppose they could provide some shrinker helpers as well. Unfortunately these days I barely have enough time for drm/msm, let alone bolting this onto the shmem helpers. I would recommend that if someone wanted to do this, that they look at recent drm/msm shrinker patches that I've sent (ie. make shrinker->count() lockless, and drop the locks in shrinker->scan() body.. when the system is under heavy memory pressure, you start getting shrinker called from all the threads so contention for mm_lock can be a really bad problem) (Well, the other potential problem is that drm/msm has a lot of different possible iommu pairings across the generations, so there is some potential here to uncover exciting new bugs.. the locking at least is the same for all the generations and pretty easy to test with and without lockdep with some tests that push essentially all memory into swap) BR, -R > -Daniel > > > > > drivers/gpu/drm/msm/msm_drv.c | 2 +- > > drivers/gpu/drm/msm/msm_drv.h | 13 ++- > > drivers/gpu/drm/msm/msm_gem.c | 155 + > > drivers/gpu/drm/msm/msm_gem.h | 68 +-- > > drivers/gpu/drm/msm/msm_gem_shrinker.c | 129 > > drivers/gpu/drm/msm/msm_gpu_trace.h| 13 +++ > > 6 files changed, 272 insertions(+), 108 deletions(-) > > > > -- > > 2.30.2 > > > > ___ > > dri-devel mailing list > > dri-devel@lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/dri-devel > > -- > Daniel Vetter > Software Engineer, Intel Corporation > http://blog.ffwll.ch ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [Freedreno] [PATCH 1/3] drm/msm/mdp5: Configure PP_SYNC_HEIGHT to double the vtotal
On Wed, Apr 7, 2021 at 12:11 PM AngeloGioacchino Del Regno wrote: > > Il 07/04/21 20:19, abhin...@codeaurora.org ha scritto: > > Hi Marijn > > > > On 2021-04-06 14:47, Marijn Suijten wrote: > >> Leaving this at a close-to-maximum register value 0xFFF0 means it takes > >> very long for the MDSS to generate a software vsync interrupt when the > >> hardware TE interrupt doesn't arrive. Configuring this to double the > >> vtotal (like some downstream kernels) leads to a frame to take at most > >> twice before the vsync signal, until hardware TE comes up. > >> > >> In this case the hardware interrupt responsible for providing this > >> signal - "disp-te" gpio - is not hooked up to the mdp5 vsync/pp logic at > >> all. This solves severe panel update issues observed on at least the > >> Xperia Loire and Tone series, until said gpio is properly hooked up to > >> an irq. > > > > The reason the CONFIG_HEIGHT was at such a high value is to make sure that > > we always get the TE only from the panel vsync and not false positives > > coming > > from the tear check logic itself. > > > > When you say that disp-te gpio is not hooked up, is it something > > incorrect with > > the schematic OR panel is not generating the TE correctly? > > > > Sometimes, some panels aren't getting correctly configured by the > OEM/ODM in the first place: especially when porting devices from > downstream to upstream, developers often get in a situation in which > their TE line is either misconfigured or the DriverIC is not configured > to raise V-Sync interrupts. > Please remember: some DDICs need a "commands sequence" to enable > generating the TE interrupts, sometimes this is not standard, and > sometimes OEMs/ODMs are not even doing that in their downstream code > (but instead they work around it in creative ways "for reasons", even > though their DDIC supports indeed sending TE events). 
> > This mostly happens when bringing up devices that have autorefresh > enabled from the bootloader (when the bootloader sets up the splash > screen) by using simple-panel as a (hopefully) temporary solution to get > through the initial stages of porting. > > We are not trying to cover cases related to incorrect schematics or > hardware mistakes here, as the fix for that - as you know - is to just > fix your hardware. > What we're trying to do here is to stop freezes and, in some cases, > lockups, other than false positives making the developer go offroad when > the platform shows that something is wrong during early porting. > > Also, sometimes, some DDICs will not generate TE interrupts when > expected... in these cases we get a PP timeout and a MDP5 recovery: this > is totally avoidable if we rely on the 2*vtotal, as we wouldn't get > through the very time consuming task of recovering the entire MDP. > > Of course, if something is wrong in the MDP and the block really needs > recovery, this "trick" won't save anyone and the recovery will anyway be > triggered, as the PP-done will anyway timeout. So, is this (mostly) a workaround due to TE not wired up? In which case I think it is ok, but maybe should have a comment about the interaction with TE? Currently I have this patch in msm-next-staging but I guess we need to decide in the next day or so whether to drop it or smash in a comment? 
BR, -R > >> > >> Suggested-by: AngeloGioacchino Del Regno > >> > >> Signed-off-by: Marijn Suijten > >> Reviewed-by: AngeloGioacchino Del Regno > >> > >> --- > >> drivers/gpu/drm/msm/disp/mdp5/mdp5_cmd_encoder.c | 2 +- > >> 1 file changed, 1 insertion(+), 1 deletion(-) > >> > >> diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_cmd_encoder.c > >> b/drivers/gpu/drm/msm/disp/mdp5/mdp5_cmd_encoder.c > >> index ff2c1d583c79..2d5ac03dbc17 100644 > >> --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_cmd_encoder.c > >> +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_cmd_encoder.c > >> @@ -51,7 +51,7 @@ static int pingpong_tearcheck_setup(struct > >> drm_encoder *encoder, > >> > >> mdp5_write(mdp5_kms, REG_MDP5_PP_SYNC_CONFIG_VSYNC(pp_id), cfg); > >> mdp5_write(mdp5_kms, > >> -REG_MDP5_PP_SYNC_CONFIG_HEIGHT(pp_id), 0xfff0); > >> +REG_MDP5_PP_SYNC_CONFIG_HEIGHT(pp_id), (2 * mode->vtotal)); > >> mdp5_write(mdp5_kms, > >> REG_MDP5_PP_VSYNC_INIT_VAL(pp_id), mode->vdisplay); > >> mdp5_write(mdp5_kms, REG_MDP5_PP_RD_PTR_IRQ(pp_id), > >> mode->vdisplay + 1); > ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [Freedreno] [PATCH 1/3] drm/msm/mdp5: Configure PP_SYNC_HEIGHT to double the vtotal
On Thu, Apr 8, 2021 at 4:16 PM AngeloGioacchino Del Regno wrote: > > > Il gio 8 apr 2021, 21:05 Rob Clark ha scritto: >> >> On Wed, Apr 7, 2021 at 12:11 PM AngeloGioacchino Del Regno >> wrote: >> > >> > Il 07/04/21 20:19, abhin...@codeaurora.org ha scritto: >> > > Hi Marijn >> > > >> > > On 2021-04-06 14:47, Marijn Suijten wrote: >> > >> Leaving this at a close-to-maximum register value 0xFFF0 means it takes >> > >> very long for the MDSS to generate a software vsync interrupt when the >> > >> hardware TE interrupt doesn't arrive. Configuring this to double the >> > >> vtotal (like some downstream kernels) leads to a frame to take at most >> > >> twice before the vsync signal, until hardware TE comes up. >> > >> >> > >> In this case the hardware interrupt responsible for providing this >> > >> signal - "disp-te" gpio - is not hooked up to the mdp5 vsync/pp logic at >> > >> all. This solves severe panel update issues observed on at least the >> > >> Xperia Loire and Tone series, until said gpio is properly hooked up to >> > >> an irq. >> > > >> > > The reason the CONFIG_HEIGHT was at such a high value is to make sure >> > > that >> > > we always get the TE only from the panel vsync and not false positives >> > > coming >> > > from the tear check logic itself. >> > > >> > > When you say that disp-te gpio is not hooked up, is it something >> > > incorrect with >> > > the schematic OR panel is not generating the TE correctly? >> > > >> > >> > Sometimes, some panels aren't getting correctly configured by the >> > OEM/ODM in the first place: especially when porting devices from >> > downstream to upstream, developers often get in a situation in which >> > their TE line is either misconfigured or the DriverIC is not configured >> > to raise V-Sync interrupts. 
>> > Please remember: some DDICs need a "commands sequence" to enable >> > generating the TE interrupts, sometimes this is not standard, and >> > sometimes OEMs/ODMs are not even doing that in their downstream code >> > (but instead they work around it in creative ways "for reasons", even >> > though their DDIC supports indeed sending TE events). >> > >> > This mostly happens when bringing up devices that have autorefresh >> > enabled from the bootloader (when the bootloader sets up the splash >> > screen) by using simple-panel as a (hopefully) temporary solution to get >> > through the initial stages of porting. >> > >> > We are not trying to cover cases related to incorrect schematics or >> > hardware mistakes here, as the fix for that - as you know - is to just >> > fix your hardware. >> > What we're trying to do here is to stop freezes and, in some cases, >> > lockups, other than false positives making the developer go offroad when >> > the platform shows that something is wrong during early porting. >> > >> > Also, sometimes, some DDICs will not generate TE interrupts when >> > expected... in these cases we get a PP timeout and a MDP5 recovery: this >> > is totally avoidable if we rely on the 2*vtotal, as we wouldn't get >> > through the very time consuming task of recovering the entire MDP. >> > >> > Of course, if something is wrong in the MDP and the block really needs >> > recovery, this "trick" won't save anyone and the recovery will anyway be >> > triggered, as the PP-done will anyway timeout. >> >> So, is this (mostly) a workaround due to TE not wired up? In which >> case I think it is ok, but maybe should have a comment about the >> interaction with TE? > > > Mostly, yes. > >> >> Currently I have this patch in msm-next-staging but I guess we need to >> decide in the next day or so whether to drop it or smash in a comment? 
>> >> BR, >> -R > > > Marijn, can you please urgently throw a comment in, reminding that these > timers are interacting with TE and send a fast V2? > Or just reply on list w/ a comment to smash in, if that is easier BR, -R ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v3 2/3] drm/msm/dpu: add support to dump dpu registers
On Thu, Apr 8, 2021 at 7:28 PM Abhinav Kumar wrote: > > Add the dpu_dbg module which adds support to dump dpu registers > which can be used in case of error conditions. > > changes in v3: > - Get rid of registration mechanism for sub-modules and instead get > this information from the dpu catalog itself > - Get rid of global dpu_dbg struct and instead store it in dpu_kms > - delegate the power management of the sub-modules to the resp drivers > - refactor and remove the linked list logic and simplify it to have > just an array > > Change-Id: Ide975ecf5d7952ae44daaa6eb611e27d09630be5 > Reported-by: kernel test robot > Signed-off-by: Abhinav Kumar > --- > drivers/gpu/drm/msm/Makefile | 2 + > drivers/gpu/drm/msm/disp/dpu1/dpu_dbg.c| 221 + > drivers/gpu/drm/msm/disp/dpu1/dpu_dbg.h| 200 +++ > drivers/gpu/drm/msm/disp/dpu1/dpu_dbg_util.c | 257 > + > drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h | 2 +- > drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c| 86 + > drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h| 5 + > drivers/gpu/drm/msm/dp/dp_catalog.c| 10 + > drivers/gpu/drm/msm/dp/dp_catalog.h| 5 + > drivers/gpu/drm/msm/dp/dp_display.c| 37 > drivers/gpu/drm/msm/dp/dp_display.h| 1 + > drivers/gpu/drm/msm/dsi/dsi.c | 5 + > drivers/gpu/drm/msm/dsi/dsi.h | 4 + > drivers/gpu/drm/msm/dsi/dsi_host.c | 25 +++ > drivers/gpu/drm/msm/msm_drv.c | 29 ++- > drivers/gpu/drm/msm/msm_drv.h | 2 + > 16 files changed, 889 insertions(+), 2 deletions(-) > create mode 100644 drivers/gpu/drm/msm/disp/dpu1/dpu_dbg.c > create mode 100644 drivers/gpu/drm/msm/disp/dpu1/dpu_dbg.h > create mode 100644 drivers/gpu/drm/msm/disp/dpu1/dpu_dbg_util.c > [snip] > diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_dbg.h > b/drivers/gpu/drm/msm/disp/dpu1/dpu_dbg.h > new file mode 100644 > index 000..302205a > --- /dev/null > +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_dbg.h > @@ -0,0 +1,200 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
> + */ > + > +#ifndef DPU_DBG_H_ > +#define DPU_DBG_H_ > + > +#include > +#include > +#include "../../../drm_crtc_internal.h" > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include "dpu_hw_catalog.h" > +#include "dpu_kms.h" > +#include "dsi.h" > + > +#define DPU_DBG_DUMP_DATA_LIMITER (NULL) > + > +enum dpu_dbg_dump_flag { > + DPU_DBG_DUMP_IN_LOG = BIT(0), > + DPU_DBG_DUMP_IN_MEM = BIT(1), > + DPU_DBG_DUMP_IN_COREDUMP = BIT(2), > +}; overall, I like this better, but.. I'm not completely convinced about the need for DUMP_IN_LOG/DUMP_IN_MEM.. we haven't really needed it on the GPU side of things, and the only case I can think of where it might be useful is if you can't boot far enough to get to some minimal userspace.. once you have at least some minimal userspace, you can just pull out and clear the devcore dump via sysfs. That said, if state snapshot and printing were better separated it would just take a few lines of code to use a different drm_printer to print the state snapshot to dmesg. [snip] > diff --git a/drivers/gpu/drm/msm/dsi/dsi_host.c > b/drivers/gpu/drm/msm/dsi/dsi_host.c > index ab281cb..d1675ee 100644 > --- a/drivers/gpu/drm/msm/dsi/dsi_host.c > +++ b/drivers/gpu/drm/msm/dsi/dsi_host.c > @@ -2489,3 +2489,28 @@ struct drm_bridge *msm_dsi_host_get_bridge(struct > mipi_dsi_host *host) > > return of_drm_find_bridge(msm_host->device_node); > } > + > +void msm_dsi_host_dump_regs(struct mipi_dsi_host *host) > +{ > + struct msm_dsi_host *msm_host = to_msm_dsi_host(host); > + struct drm_device *dev = msm_host->dev; > + struct dpu_dbg_base *dpu_dbg; > + > + dpu_dbg = dpu_dbg_get(dev); > + > + if (dpu_dbg_is_drm_printer_needed(dpu_dbg) && > + !dpu_dbg->dpu_dbg_printer) { > + pr_err("invalid drm printer\n"); > + return; > + } for example, here ^^^ why should the other blocks even care? 
All they should know is that they've been asked to snapshot their state.. > + if (dpu_dbg->reg_dump_method == DPU_DBG_DUMP_IN_MEM) > + pm_runtime_get_sync(&msm_host->pdev->dev); > + > + dpu_dbg_dump_regs(&dpu_dbg->dsi_ctrl_regs[msm_host->id], > + msm_iomap_size(msm_host->pdev, "dsi_ctrl"), > msm_host->ctrl_base, > + dpu_dbg->reg_dump_method, dpu_dbg->dpu_dbg_printer); > + > + if (dpu_dbg->reg_dump_method == DPU_DBG_DUMP_IN_MEM) > + pm_runtime_put_sync(&msm_host->pdev->dev); > +} I
[pull] drm/msm: drm-msm-next for 5.13
sm/disp/dpu1: fix display underruns during modeset. Konrad Dybcio (1): drm/msm/adreno: a5xx_power: Don't apply A540 lm_setup to other GPUs Krishna Manikandan (7): drm/msm/disp/dpu1: add support for display for SC7280 target drm/msm/disp/dpu1: add intf offsets for SC7280 target drm/msm/disp/dpu1: add support to program fetch active in ctl path drm/msm/disp/dpu1: enable DATA_HCTL_EN for sc7280 target drm/msm/disp/dpu1: increase the range of interrupts in dpu_irq_map drm/msm/disp/dpu1: add vsync and underrun irqs for INTF_5 drm/msm/disp/dpu1: add flags to indicate obsolete irqs Marijn Suijten (2): drm/msm/mdp5: Configure PP_SYNC_HEIGHT to double the vtotal drm/msm/mdp5: Do not multiply vclk line count by 100 Rob Clark (18): drm/msm: Ratelimit invalid-fence message drm/msm: Fix a5xx/a6xx timestamps Merge tag 'drm-msm-fixes-2021-04-02' into msm-next drm/msm: Remove unused freed llist node drm/msm: Avoid mutex in shrinker_count() drm/msm: Fix debugfs deadlock drm/msm: Improved debugfs gem stats drm/msm: Drop mm_lock in scan loop drm/msm: Fix spelling "purgable" -> "purgeable" drm/msm: Add param for userspace to query suspend count drm/msm: ratelimit GEM related WARN_ON()s drm/msm: Reorganize msm_gem_shrinker_scan() drm/msm: Clear msm_obj->sgt in put_pages() drm/msm: Split iova purge and close drm/msm: Add $debugfs/gem stats on resident objects drm/msm: Track potentially evictable objects drm/msm: Small msm_gem_purge() fix drm/msm: Support evicting GEM objects to swap Stephen Boyd (3): drm/msm/kms: Use nested locking for crtc lock instead of custom classes drm/msm/dp: Restore aux retry tuning logic drm/msm: Set drvdata to NULL when msm_drm_init() fails drivers/clk/clk-mux.c | 35 + drivers/gpu/drm/msm/Kconfig|9 +- drivers/gpu/drm/msm/Makefile |9 - drivers/gpu/drm/msm/adreno/a5xx_gpu.c |4 +- drivers/gpu/drm/msm/adreno/a5xx_power.c|2 +- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 14 +- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 108 +- drivers/gpu/drm/msm/adreno/adreno_gpu.c|3 + 
drivers/gpu/drm/msm/disp/dpu1/dpu_core_irq.c |4 +- drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c |1 - drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c | 88 +- drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c| 30 + drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.h| 11 + drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys.h |1 + .../gpu/drm/msm/disp/dpu1/dpu_encoder_phys_vid.c | 26 + drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 195 +++- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h | 10 +- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c | 31 +- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.h |3 + drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c | 793 -- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.h |5 +- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c| 12 +- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.h|1 + drivers/gpu/drm/msm/disp/dpu1/dpu_hw_top.h |4 +- drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c| 19 +- drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c | 54 +- drivers/gpu/drm/msm/disp/mdp5/mdp5_cmd_encoder.c | 19 +- drivers/gpu/drm/msm/dp/dp_aux.c|7 + drivers/gpu/drm/msm/dp/dp_debug.c | 33 +- drivers/gpu/drm/msm/dp/dp_hpd.c|4 +- drivers/gpu/drm/msm/dp/dp_power.c |2 +- drivers/gpu/drm/msm/dsi/dsi.h | 60 +- drivers/gpu/drm/msm/dsi/dsi_cfg.c |6 +- drivers/gpu/drm/msm/dsi/dsi_host.c |6 +- drivers/gpu/drm/msm/dsi/dsi_manager.c | 30 +- drivers/gpu/drm/msm/dsi/phy/dsi_phy.c | 161 +-- drivers/gpu/drm/msm/dsi/phy/dsi_phy.h | 41 +- drivers/gpu/drm/msm/dsi/phy/dsi_phy_10nm.c | 747 - drivers/gpu/drm/msm/dsi/phy/dsi_phy_14nm.c | 939 - drivers/gpu/drm/msm/dsi/phy/dsi_phy_20nm.c | 16 +- drivers/gpu/drm/msm/dsi/phy/dsi_phy_28nm.c | 654 +++- drivers/gpu/drm/msm/dsi/phy/dsi_phy_28nm_8960.c| 479 - drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c | 774 +- drivers/gpu/drm/msm/dsi/pll/dsi_pll.c | 184 drivers/gpu/drm/msm/dsi/pll/dsi_pll.h | 130 --- drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c | 881 drivers/gpu/drm/msm/dsi/pll/dsi_pll_14nm.c | 1096 drivers/gpu/drm/msm/dsi/pll/dsi_pll_28nm.c | 643 drivers/gp
Re: [PATCH 0/8] drm/msm: Swappable GEM objects
On Mon, Apr 12, 2021 at 7:28 AM Daniel Vetter wrote: > > On Thu, Apr 08, 2021 at 08:23:42AM -0700, Rob Clark wrote: > > On Thu, Apr 8, 2021 at 4:15 AM Daniel Vetter wrote: > > > > > > On Mon, Apr 05, 2021 at 10:45:23AM -0700, Rob Clark wrote: > > > > From: Rob Clark > > > > > > > > One would normally hope not to be under enough memory pressure to need > > > > to swap GEM objects to disk backed swap. But memory backed zram swap > > > > (as enabled on chromebooks, for example) can actually be quite fast > > > > and useful on devices with less RAM. On a 4GB device, opening up ~4 > > > > memory intensive web pages (in separate windows rather than tabs, to try > > > > and prevent tab discard), I see ~500MB worth of GEM objects, of which > > > > maybe only 10% are active at any time, and with unpin/evict enabled, > > > > only about half resident (which is a number that gets much lower if you > > > > simulate extreme memory pressure). Assuming a 2:1 compression ratio (I > > > > see a bit higher in practice, but cannot isolate swapped out GEM pages > > > > vs other), that is like having an extra 100+MB of RAM, or more under > > > > higher memory pressure. > > > > > > > > Rob Clark (8): > > > > drm/msm: ratelimit GEM related WARN_ON()s > > > > drm/msm: Reorganize msm_gem_shrinker_scan() > > > > drm/msm: Clear msm_obj->sgt in put_pages() > > > > drm/msm: Split iova purge and close > > > > drm/msm: Add $debugfs/gem stats on resident objects > > > > drm/msm: Track potentially evictable objects > > > > drm/msm: Small msm_gem_purge() fix > > > > drm/msm: Support evicting GEM objects to swap > > > > > > Given how much entertainment shrinkers are, should we aim for more common > > > code here? > > > > > > Christian has tons of fun with adding something like this for ttm (well > > > different shades of grey). i915 is going to adopt ttm, at least for > > > discrete. > > > > > > The locking is also an utter pain, and msm seems to still live a lot in > > > its own land here.
I think as much as possible a standard approach here > > > would be really good, ideally maybe as building blocks shared between ttm > > > and gem-shmem drivers ... > > > > I don't disagree.. but also replacing the engines on an airplane > > mid-flight isn't a great option either.. ;-) > > > > The hard part (esp. wrt to locking) is tracking the state of a given > > bo.. ie. is it active, active+purgable, inactive+purgable, > > inactive+unpinnable, etc. Currently the shmem helpers don't really > > provide anything here. If they did, I suppose they could provide some > > shrinker helpers as well. Unfortunately these days I barely have > > enough time for drm/msm, let alone bolting this onto the shmem > > helpers. I would recommend that if someone wanted to do this, that > > they look at recent drm/msm shrinker patches that I've sent (ie. make > > shrinker->count() lockless, and drop the locks in shrinker->scan() > > body.. when the system is under heavy memory pressure, you start > > getting shrinker called from all the threads so contention for mm_lock > > can be a really bad problem) > > > > (Well, the other potential problem is that drm/msm has a lot of > > different possible iommu pairings across the generations, so there is > > some potential here to uncover exciting new bugs.. the locking at > > least is the same for all the generations and pretty easy to test with > > and without lockdep with some tests that push essentially all memory > > into swap) > > So what we aimed for with i915 and discrete gpu is to first align on > locking with dma_resv_lock for all buffer state, plus a bunch of > lru/allocator locks for lists and stuff. > > And then with more aligned locking, figure out how to maybe share more > code. > > The trouble is that right now neither shmem helpers, nor drivers using > them, are really using dma_resv_lock to protect their per-buffer state. We are actually already using dma_resv_lock() since a few release cycles back.. 
msm_gem_lock() and friends are a wrapper around that from the migration away from using our own lock).. the mm_lock is symply protecting the lists, not the objects > So yeah it's a bit an awkward situation, and I don't know myself really > how to get out of it. Lack of people with tons of free time doesn't help > much. > > So best case I think is
Re: [PATCH v4 2/3] drm/msm: add support to take dpu snapshot
dp_read_aux(struct dp_catalog_private *catalog, u32 > > offset) > > { > > offset += MSM_DP_CONTROLLER_AUX_OFFSET; > > diff --git a/drivers/gpu/drm/msm/dp/dp_catalog.h > > b/drivers/gpu/drm/msm/dp/dp_catalog.h > > index 176a902..e7e8b13 100644 > > --- a/drivers/gpu/drm/msm/dp/dp_catalog.h > > +++ b/drivers/gpu/drm/msm/dp/dp_catalog.h > > @@ -9,6 +9,7 @@ > > #include > > > > #include "dp_parser.h" > > +#include "disp/msm_disp_snapshot.h" > > > > /* interrupts */ > > #define DP_INTR_HPD BIT(0) > > @@ -71,6 +72,9 @@ struct dp_catalog { > > u32 audio_data; > > }; > > > > +/* Debug module */ > > +void dp_catalog_snapshot(struct dp_catalog *dp_catalog, struct > > msm_disp_state *disp_state); > > + > > /* AUX APIs */ > > u32 dp_catalog_aux_read_data(struct dp_catalog *dp_catalog); > > int dp_catalog_aux_write_data(struct dp_catalog *dp_catalog); > > diff --git a/drivers/gpu/drm/msm/dp/dp_display.c > > b/drivers/gpu/drm/msm/dp/dp_display.c > > index 5a39da6..6670558 100644 > > --- a/drivers/gpu/drm/msm/dp/dp_display.c > > +++ b/drivers/gpu/drm/msm/dp/dp_display.c > > @@ -1009,6 +1009,35 @@ int dp_display_get_test_bpp(struct msm_dp *dp) > > dp_display->link->test_video.test_bit_depth); > > } > > > > +void msm_dp_snapshot(struct msm_dp *dp) > > +{ > > + struct dp_display_private *dp_display; > > + struct drm_device *drm; > > + struct msm_disp_state *disp_state; > > + > > + dp_display = container_of(dp, struct dp_display_private, dp_display); > > + drm = dp->drm_dev; > > + disp_state = msm_disp_state_get(drm); > > + > > + /* > > + * if we are reading registers we need the link clocks to be on > > + * however till DP cable is connected this will not happen as we > > + * do not know the resolution to power up with. 
Hence check the > > + * power_on status before dumping DP registers to avoid crash due > > + * to unclocked access > > + */ > > + mutex_lock(&dp_display->event_mutex); > > + > > + if (!dp->power_on) { > > + mutex_unlock(&dp_display->event_mutex); > > + return; > > + } > > + > > + dp_catalog_snapshot(dp_display->catalog, disp_state); > > + > > + mutex_unlock(&dp_display->event_mutex); > > +} > > + > > static void dp_display_config_hpd(struct dp_display_private *dp) > > { > > > > diff --git a/drivers/gpu/drm/msm/dp/dp_display.h > > b/drivers/gpu/drm/msm/dp/dp_display.h > > index 6092ba1..4d39373 100644 > > --- a/drivers/gpu/drm/msm/dp/dp_display.h > > +++ b/drivers/gpu/drm/msm/dp/dp_display.h > > @@ -8,6 +8,7 @@ > > > > #include "dp_panel.h" > > #include > > +#include "disp/msm_disp_snapshot.h" > > > > struct msm_dp { > > struct drm_device *drm_dev; > > diff --git a/drivers/gpu/drm/msm/dsi/dsi.c b/drivers/gpu/drm/msm/dsi/dsi.c > > index 62704885..bccc006 100644 > > --- a/drivers/gpu/drm/msm/dsi/dsi.c > > +++ b/drivers/gpu/drm/msm/dsi/dsi.c > > @@ -266,3 +266,8 @@ int msm_dsi_modeset_init(struct msm_dsi *msm_dsi, > > struct drm_device *dev, > > return ret; > > } > > > > +void msm_dsi_snapshot(struct msm_dsi *msm_dsi) > > +{ > > + msm_dsi_host_snapshot(msm_dsi->host); > > +} > > + > > diff --git a/drivers/gpu/drm/msm/dsi/dsi.h b/drivers/gpu/drm/msm/dsi/dsi.h > > index 7abfeab..bb39403 100644 > > --- a/drivers/gpu/drm/msm/dsi/dsi.h > > +++ b/drivers/gpu/drm/msm/dsi/dsi.h > > @@ -15,6 +15,7 @@ > > #include > > > > #include "msm_drv.h" > > +#include "disp/msm_disp_snapshot.h" > > > > #define DSI_0 0 > > #define DSI_1 1 > > @@ -90,6 +91,8 @@ static inline bool msm_dsi_device_connected(struct > > msm_dsi *msm_dsi) > > return msm_dsi->panel || msm_dsi->external_bridge; > > } > > > > +void msm_dsi_snapshot(struct msm_dsi *msm_dsi); > > + > > struct drm_encoder *msm_dsi_get_encoder(struct msm_dsi *msm_dsi); > > > > /* dsi host */ > > @@ -146,6 +149,7 @@ int 
dsi_clk_init_v2(struct msm_dsi_host *msm_host); > > int dsi_clk_init_6g_v2(struct msm_dsi_host *msm_host); > > int dsi_calc_clk_rate_v2(struct msm_dsi_host *msm_host, bool is_dual_dsi); > > int dsi_calc_clk_rate_6g(struct msm_dsi_host *msm_host, bool is_dual_dsi); > > +void msm_dsi_host_snapshot(struct mipi_dsi_host *host); > > > > /* dsi phy */ > > struct msm_dsi_phy; > > diff --git a/drivers/gpu/drm/msm/dsi/dsi_host.c > > b/drivers/gpu/drm/msm/dsi/dsi_host.c > > index 8a10e43..d9fdc07 100644 > > --- a/drivers/gpu/drm/msm/dsi/dsi_host.c > > +++ b/drivers/gpu/drm/msm/dsi/dsi_host.c > > @@ -2487,3 +2487,22 @@ struct drm_bridge *msm_dsi_host_get_bridge(struct > > mipi_dsi_host *host) > > > > return of_drm_find_bridge(msm_host->device_node); > > } > > + > > +void msm_dsi_host_snapshot(struct mipi_dsi_host *host) > > +{ > > + struct msm_dsi_host *msm_host = to_msm_dsi_host(host); > > + struct drm_device *dev = msm_host->dev; > > + struct msm_disp_state *disp_state; > > + char name[SZ_128]; > > + > > + disp_state = msm_disp_state_get(dev); > > + > > + pm_runtime_get_sync(&msm_host->pdev->dev); > > + > > + snprintf(name, SZ_128, "dsi%d_ctrl", msm_host->id); > > + > > + msm_disp_snapshot_add_block(disp_state, name, > > msm_iomap_size(msm_host->pdev, "dsi_ctrl"), > > You can store the size into the msm_host, rather than querrying it on > each dump. > > > + msm_host->ctrl_base); > > + > > + pm_runtime_put_sync(&msm_host->pdev->dev); > > +} > > diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c > > index e1104d2..ebf9283 100644 > > --- a/drivers/gpu/drm/msm/msm_drv.c > > +++ b/drivers/gpu/drm/msm/msm_drv.c > > @@ -1,6 +1,6 @@ > > // SPDX-License-Identifier: GPL-2.0-only > > /* > > - * Copyright (c) 2016-2018, The Linux Foundation. All rights reserved. > > + * Copyright (c) 2016-2018, 2020-2021 The Linux Foundation. All rights > > reserved. 
> >* Copyright (C) 2013 Red Hat > >* Author: Rob Clark > >*/ > > @@ -19,6 +19,7 @@ > > #include > > #include > > > > +#include "disp/msm_disp_snapshot.h" > > #include "msm_drv.h" > > #include "msm_debugfs.h" > > #include "msm_fence.h" > > @@ -167,6 +168,24 @@ void __iomem *msm_ioremap_quiet(struct platform_device > > *pdev, const char *name, > > return _msm_ioremap(pdev, name, dbgname, true); > > } > > > > +unsigned long msm_iomap_size(struct platform_device *pdev, const char > > *name) > > +{ > > + struct resource *res; > > + > > + if (name) > > + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, > > name); > > + else > > + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); > > + > > + if (!res) { > > + dev_dbg(&pdev->dev, "failed to get memory resource: %s\n", > > + name); > > + return 0; > > + } > > + > > + return resource_size(res); > > +} > > + > > void msm_writel(u32 data, void __iomem *addr) > > { > > if (reglog) > > @@ -278,6 +297,8 @@ static int msm_drm_uninit(struct device *dev) > > msm_fbdev_free(ddev); > > #endif > > > > + msm_disp_snapshot_destroy(ddev); > > + > > drm_mode_config_cleanup(ddev); > > > > pm_runtime_get_sync(dev); > > @@ -550,6 +571,12 @@ static int msm_drm_init(struct device *dev, const > > struct drm_driver *drv) > > if (ret) > > goto err_msm_uninit; > > > > + if (get_mdp_ver(pdev) == KMS_DPU) { > > + ret = msm_disp_snapshot_init(ddev); > > + if (ret) > > + DRM_DEV_ERROR(dev, "msm_disp_snapshot_init failed ret > > = %d\n", ret); > > + } > > + > > Let's get it out of if (KMS_DPU), even if we do not dump mdp4/mdp5 > registers for now. 
> > > drm_mode_config_reset(ddev); > > > > #ifdef CONFIG_DRM_FBDEV_EMULATION > > diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h > > index 2668941..9c40bac 100644 > > --- a/drivers/gpu/drm/msm/msm_drv.h > > +++ b/drivers/gpu/drm/msm/msm_drv.h > > @@ -367,6 +367,7 @@ void msm_dp_display_mode_set(struct msm_dp *dp, struct > > drm_encoder *encoder, > > struct drm_display_mode *mode, > > struct drm_display_mode *adjusted_mode); > > void msm_dp_irq_postinstall(struct msm_dp *dp_display); > > +void msm_dp_snapshot(struct msm_dp *dp_display); > > > > void msm_dp_debugfs_init(struct msm_dp *dp_display, struct drm_minor > > *minor); > > > > @@ -450,6 +451,7 @@ void __iomem *msm_ioremap(struct platform_device *pdev, > > const char *name, > > const char *dbgname); > > void __iomem *msm_ioremap_quiet(struct platform_device *pdev, const char > > *name, > > const char *dbgname); > > +unsigned long msm_iomap_size(struct platform_device *pdev, const char > > *name); > > void msm_writel(u32 data, void __iomem *addr); > > u32 msm_readl(const void __iomem *addr); > > void msm_rmw(void __iomem *addr, u32 mask, u32 or); > > > > > -- > With best wishes > Dmitry ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel