Re: [PATCH] dma-buf: fix and rework dma_buf_poll v6
On 2021-07-09 2:07 p.m., Christian König wrote:
> Daniel pointed me towards this function and there are multiple obvious
> problems in the implementation.
>
> First of all the retry loop is not working as intended. In general the
> retry only makes sense if you grab the reference first and then check
> the sequence values.
>
> Then we should always also wait for the exclusive fence.
>
> It's also good practice to keep the reference around when installing
> callbacks to fences you don't own.
>
> And last the whole implementation was unnecessarily complex and rather
> hard to understand, which could lead to unexpected behavior of the
> IOCTL.
>
> Fix all this by reworking the implementation from scratch. Dropping the
> whole RCU approach and taking the lock instead.
>
> Only mildly tested and needs a thoughtful review of the code.
>
> v2: fix the reference counting as well
> v3: keep the excl fence handling as is for stable
> v4: back to testing all fences, drop RCU
> v5: handle in and out separately
> v6: add missing clear of events
>
> Signed-off-by: Christian König
> CC: sta...@vger.kernel.org
> ---
>  drivers/dma-buf/dma-buf.c | 156 +-
>  include/linux/dma-buf.h   |   2 +-
>  2 files changed, 72 insertions(+), 86 deletions(-)
>
> diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
> index eadd1eaa2fb5..39e1ef872829 100644
> --- a/drivers/dma-buf/dma-buf.c
> +++ b/drivers/dma-buf/dma-buf.c

[...]

>  static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
>  {
>  	struct dma_buf *dmabuf;
>  	struct dma_resv *resv;
> -	struct dma_resv_list *fobj;
> -	struct dma_fence *fence_excl;
> +	unsigned shared_count;
>  	__poll_t events;
> -	unsigned shared_count, seq;
> +	int r, i;

shared_count, r & i are unused with this patch.

> +	if (events & EPOLLOUT && !dma_buf_poll_shared(resv, dcb) &&
> +	    !dma_buf_poll_excl(resv, dcb))
> +		/* No callback queued, wake up any other waiters */
> +		dma_buf_poll_cb(NULL, &dcb->cb);
> +	else
> +		events &= ~EPOLLOUT;

Something like this might be clearer:

	if (events & EPOLLOUT) {
		if (!dma_buf_poll_shared(resv, dcb) &&
		    !dma_buf_poll_excl(resv, dcb))
			/* No callback queued, wake up any other waiters */
			dma_buf_poll_cb(NULL, &dcb->cb);
		else
			events &= ~EPOLLOUT;
	}

> +	if (events & EPOLLIN && !dma_buf_poll_excl(resv, dcb))
> +		/* No callback queued, wake up any other waiters */
> 		dma_buf_poll_cb(NULL, &dcb->cb);
> +	else
> +		events &= ~EPOLLIN;

Similarly:

	if (events & EPOLLIN) {
		if (!dma_buf_poll_excl(resv, dcb))
			/* No callback queued, wake up any other waiters */
			dma_buf_poll_cb(NULL, &dcb->cb);
		else
			events &= ~EPOLLIN;
	}

Other than that, looks good to me, can't say anything about the locking
though.

Haven't been able to test this yet, hopefully later this week.

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and X developer
[PATCH] dma-buf/poll: Get a file reference for outstanding fence callbacks
From: Michel Dänzer

This makes sure we don't hit the

	BUG_ON(dmabuf->cb_in.active || dmabuf->cb_out.active);

in dma_buf_release, which could be triggered by user space closing the
dma-buf file description while there are outstanding fence callbacks
from dma_buf_poll.

Cc: sta...@vger.kernel.org
Signed-off-by: Michel Dänzer
---
 drivers/dma-buf/dma-buf.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 6c520c9bd93c..ec25498a971f 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -65,12 +65,9 @@ static void dma_buf_release(struct dentry *dentry)
 	BUG_ON(dmabuf->vmapping_counter);
 
 	/*
-	 * Any fences that a dma-buf poll can wait on should be signaled
-	 * before releasing dma-buf. This is the responsibility of each
-	 * driver that uses the reservation objects.
-	 *
-	 * If you hit this BUG() it means someone dropped their ref to the
-	 * dma-buf while still having pending operation to the buffer.
+	 * If you hit this BUG() it could mean:
+	 * * There's a file reference imbalance in dma_buf_poll / dma_buf_poll_cb or somewhere else
+	 * * dmabuf->cb_in/out.active are non-0 despite no pending fence callback
 	 */
 	BUG_ON(dmabuf->cb_in.active || dmabuf->cb_out.active);
 
@@ -196,6 +193,7 @@ static loff_t dma_buf_llseek(struct file *file, loff_t offset, int whence)
 static void dma_buf_poll_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
 {
 	struct dma_buf_poll_cb_t *dcb = (struct dma_buf_poll_cb_t *)cb;
+	struct dma_buf *dmabuf = container_of(dcb->poll, struct dma_buf, poll);
 	unsigned long flags;
 
 	spin_lock_irqsave(&dcb->poll->lock, flags);
@@ -203,6 +201,8 @@ static void dma_buf_poll_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
 	dcb->active = 0;
 	spin_unlock_irqrestore(&dcb->poll->lock, flags);
 	dma_fence_put(fence);
+	/* Paired with get_file in dma_buf_poll */
+	fput(dmabuf->file);
 }
 
 static bool dma_buf_poll_shared(struct dma_resv *resv,
@@ -278,6 +278,9 @@ static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
 		spin_unlock_irq(&dmabuf->poll.lock);
 
 		if (events & EPOLLOUT) {
+			/* Paired with fput in dma_buf_poll_cb */
+			get_file(dmabuf->file);
+
 			if (!dma_buf_poll_shared(resv, dcb) &&
 			    !dma_buf_poll_excl(resv, dcb))
 				/* No callback queued, wake up any other waiters */
@@ -299,6 +302,9 @@ static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
 		spin_unlock_irq(&dmabuf->poll.lock);
 
 		if (events & EPOLLIN) {
+			/* Paired with fput in dma_buf_poll_cb */
+			get_file(dmabuf->file);
+
 			if (!dma_buf_poll_excl(resv, dcb))
 				/* No callback queued, wake up any other waiters */
 				dma_buf_poll_cb(NULL, &dcb->cb);
-- 
2.32.0
Re: [PATCH] dma-buf: fix and rework dma_buf_poll v7
On 2021-07-20 3:11 p.m., Christian König wrote:
> Daniel pointed me towards this function and there are multiple obvious
> problems in the implementation.
>
> First of all the retry loop is not working as intended. In general the
> retry only makes sense if you grab the reference first and then check
> the sequence values.
>
> Then we should always also wait for the exclusive fence.
>
> It's also good practice to keep the reference around when installing
> callbacks to fences you don't own.
>
> And last the whole implementation was unnecessarily complex and rather
> hard to understand, which could lead to unexpected behavior of the
> IOCTL.
>
> Fix all this by reworking the implementation from scratch. Dropping the
> whole RCU approach and taking the lock instead.
>
> Only mildly tested and needs a thoughtful review of the code.
>
> v2: fix the reference counting as well
> v3: keep the excl fence handling as is for stable
> v4: back to testing all fences, drop RCU
> v5: handle in and out separately
> v6: add missing clear of events
> v7: change coding style as suggested by Michel, drop unused variables
>
> Signed-off-by: Christian König
> CC: sta...@vger.kernel.org

Working fine with
https://gitlab.gnome.org/GNOME/mutter/-/merge_requests/1880

Tested-by: Michel Dänzer

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and X developer
Re: [PATCH] dma-buf/poll: Get a file reference for outstanding fence callbacks
On 2021-07-23 10:04 a.m., Christian König wrote:
> Am 23.07.21 um 09:58 schrieb Michel Dänzer:
>> From: Michel Dänzer
>>
>> This makes sure we don't hit the
>>
>> BUG_ON(dmabuf->cb_in.active || dmabuf->cb_out.active);
>>
>> in dma_buf_release, which could be triggered by user space closing the
>> dma-buf file description while there are outstanding fence callbacks
>> from dma_buf_poll.
>
> I was also wondering the same thing while working on this, but then
> thought that the poll interface would take care of this.

I was able to hit the BUG_ON with
https://gitlab.gnome.org/GNOME/mutter/-/merge_requests/1880 .

>> Cc: sta...@vger.kernel.org
>> Signed-off-by: Michel Dänzer
>> ---
>>   drivers/dma-buf/dma-buf.c | 18 ++++++++++++------
>>   1 file changed, 12 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
>> index 6c520c9bd93c..ec25498a971f 100644
>> --- a/drivers/dma-buf/dma-buf.c
>> +++ b/drivers/dma-buf/dma-buf.c
>> @@ -65,12 +65,9 @@ static void dma_buf_release(struct dentry *dentry)
>>       BUG_ON(dmabuf->vmapping_counter);
>>       /*
>> -     * Any fences that a dma-buf poll can wait on should be signaled
>> -     * before releasing dma-buf. This is the responsibility of each
>> -     * driver that uses the reservation objects.
>> -     *
>> -     * If you hit this BUG() it means someone dropped their ref to the
>> -     * dma-buf while still having pending operation to the buffer.
>> +     * If you hit this BUG() it could mean:
>> +     * * There's a file reference imbalance in dma_buf_poll /
>> dma_buf_poll_cb or somewhere else
>> +     * * dmabuf->cb_in/out.active are non-0 despite no pending fence
>> callback
>>       */
>>       BUG_ON(dmabuf->cb_in.active || dmabuf->cb_out.active);
>> @@ -196,6 +193,7 @@ static loff_t dma_buf_llseek(struct file *file,
>> loff_t offset, int whence)
>>   static void dma_buf_poll_cb(struct dma_fence *fence, struct
>> dma_fence_cb *cb)
>>   {
>>       struct dma_buf_poll_cb_t *dcb = (struct dma_buf_poll_cb_t *)cb;
>> +    struct dma_buf *dmabuf = container_of(dcb->poll, struct dma_buf, poll);
>>       unsigned long flags;
>>       spin_lock_irqsave(&dcb->poll->lock, flags);
>> @@ -203,6 +201,8 @@ static void dma_buf_poll_cb(struct dma_fence *fence,
>> struct dma_fence_cb *cb)
>>       dcb->active = 0;
>>       spin_unlock_irqrestore(&dcb->poll->lock, flags);
>>       dma_fence_put(fence);
>> +    /* Paired with get_file in dma_buf_poll */
>> +    fput(dmabuf->file);
>
> Is calling fput() in interrupt context ok? IIRC that could potentially
> sleep.

Looks fine AFAICT: It has

	if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {

and as a fallback for that, it adds the file to a lock-less
delayed_fput_list which is processed by a workqueue.

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and X developer
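For reference, the interrupt-safe path Michel describes works because fput() only runs the final cleanup directly in process context; otherwise it defers it. Below is a simplified sketch of that pattern, modeled loosely on fs/file_table.c — the exact field and helper names are from memory and should be treated as assumptions, not verbatim kernel code:

	void fput(struct file *file)
	{
		if (atomic_long_dec_and_test(&file->f_count)) {
			struct task_struct *task = current;

			if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
				/* Process context: run the final cleanup as
				 * task work on return to user space. */
				init_task_work(&file->f_u.fu_rcuhead, ____fput);
				if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
					return;
			}

			/* Interrupt or kthread context: add the file to a
			 * lock-less list which a workqueue drains later, so
			 * nothing can sleep here. */
			if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
				schedule_delayed_work(&delayed_fput_work, 1);
		}
	}

This is why calling fput() from a fence callback in interrupt context is safe, as the thread concludes.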
Re: [PATCH] dma-buf/poll: Get a file reference for outstanding fence callbacks
On 2021-07-23 11:02 a.m., Daniel Vetter wrote:
> On Fri, Jul 23, 2021 at 10:19:49AM +0200, Michel Dänzer wrote:
>> On 2021-07-23 10:04 a.m., Christian König wrote:
>>> Am 23.07.21 um 09:58 schrieb Michel Dänzer:
>>>> From: Michel Dänzer
>>>>
>>>> This makes sure we don't hit the
>>>>
>>>> BUG_ON(dmabuf->cb_in.active || dmabuf->cb_out.active);
>>>>
>>>> in dma_buf_release, which could be triggered by user space closing
>>>> the dma-buf file description while there are outstanding fence
>>>> callbacks from dma_buf_poll.
>>>
>>> I was also wondering the same thing while working on this, but then
>>> thought that the poll interface would take care of this.
>>
>> I was able to hit the BUG_ON with
>> https://gitlab.gnome.org/GNOME/mutter/-/merge_requests/1880 .
>
> igt test would be really lovely. Maybe base something off the
> import/export igts from Jason?

I'll see what I can do, busy with other stuff right now though.

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and X developer
Re: [RFC 3/4] drm/atomic-helper: Set fence deadline for vblank
On 2021-07-27 1:38 a.m., Rob Clark wrote:
> From: Rob Clark
>
> For an atomic commit updating a single CRTC (ie. a pageflip) calculate
> the next vblank time, and inform the fence(s) of that deadline.
>
> Signed-off-by: Rob Clark
> ---
>  drivers/gpu/drm/drm_atomic_helper.c | 36 ++++++++++++++++++++++++++++
>  1 file changed, 36 insertions(+)
>
> diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
> index bc3487964fb5..f81b20775b15 100644
> --- a/drivers/gpu/drm/drm_atomic_helper.c
> +++ b/drivers/gpu/drm/drm_atomic_helper.c
> @@ -1406,6 +1406,40 @@ void drm_atomic_helper_commit_modeset_enables(struct drm_device *dev,
>  }
>  EXPORT_SYMBOL(drm_atomic_helper_commit_modeset_enables);
>  
> +/*
> + * For atomic updates which touch just a single CRTC, calculate the time of the
> + * next vblank, and inform all the fences of the deadline.
> + */
> +static void set_fence_deadline(struct drm_device *dev,
> +			       struct drm_atomic_state *state)
> +{
> +	struct drm_crtc *crtc, *wait_crtc = NULL;
> +	struct drm_crtc_state *new_crtc_state;
> +	struct drm_plane *plane;
> +	struct drm_plane_state *new_plane_state;
> +	ktime_t vbltime;
> +	int i;
> +
> +	for_each_new_crtc_in_state (state, crtc, new_crtc_state, i) {
> +		if (!wait_crtc)
> +			return;

Either this return or the next one below would always be taken, I doubt
this was intended.

> +		wait_crtc = crtc;
> +	}
> +
> +	/* If no CRTCs updated, then nothing to do: */
> +	if (!wait_crtc)
> +		return;
> +
> +	if (drm_crtc_next_vblank_time(wait_crtc, &vbltime))
> +		return;
> +
> +	for_each_new_plane_in_state (state, plane, new_plane_state, i) {
> +		if (!new_plane_state->fence)
> +			continue;
> +		dma_fence_set_deadline(new_plane_state->fence, vbltime);
> +	}

vblank timestamps correspond to the end of vertical blank, the deadline
should be the start of vertical blank though.

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and X developer
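The check Michel flags inverts the intended condition: with wait_crtc initialized to NULL, the loop returns on its first iteration, and if it somehow did not, the later !wait_crtc check would. A sketch of what was presumably intended — bail out only when *more* than one CRTC is updated (my reading of the patch, untested):

	for_each_new_crtc_in_state (state, crtc, new_crtc_state, i) {
		/* Bail out if this update touches more than one CRTC */
		if (wait_crtc)
			return;
		wait_crtc = crtc;
	}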
Re: [RFC 0/4] dma-fence: Deadline awareness
On 2021-07-27 1:38 a.m., Rob Clark wrote:
> From: Rob Clark
>
> Based on discussion from a previous series[1] to add a "boost" mechanism
> when, for example, vblank deadlines are missed. Instead of a boost
> callback, this approach adds a way to set a deadline on the fence, by
> which the waiter would like to see the fence signalled.
>
> I've not yet had a chance to re-work the drm/msm part of this, but
> wanted to send this out as an RFC in case I don't have a chance to
> finish the drm/msm part this week.
>
> Original description:
>
> In some cases, like double-buffered rendering, missing vblanks can
> trick the GPU into running at a lower frequency, when really we
> want to be running at a higher frequency to not miss the vblanks
> in the first place.
>
> This is partially inspired by a trick i915 does, but implemented
> via dma-fence for a couple of reasons:
>
> 1) To continue to be able to use the atomic helpers
> 2) To support cases where display and gpu are different drivers
>
> [1] https://patchwork.freedesktop.org/series/90331/

Unfortunately, none of these approaches will have the full intended
effect once Wayland compositors start waiting for client buffers to
become idle before using them for an output frame (to prevent output
frames from getting delayed by client work). See
https://gitlab.gnome.org/GNOME/mutter/-/merge_requests/1880 (shameless
plug :) for a proof of concept of this for mutter. The boost will only
affect the compositor's own GPU work, not the client work (which means
no effect at all for fullscreen apps where the compositor can scan out
the client buffers directly).

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and X developer
Re: [RFC 0/4] dma-fence: Deadline awareness
On 2021-07-27 5:12 p.m., Rob Clark wrote:
> On Tue, Jul 27, 2021 at 7:50 AM Michel Dänzer wrote:
>> On 2021-07-27 1:38 a.m., Rob Clark wrote:
>>> From: Rob Clark
>>>
>>> Based on discussion from a previous series[1] to add a "boost"
>>> mechanism when, for example, vblank deadlines are missed. Instead of
>>> a boost callback, this approach adds a way to set a deadline on the
>>> fence, by which the waiter would like to see the fence signalled.
>>>
>>> [...]
>>>
>>> [1] https://patchwork.freedesktop.org/series/90331/
>>
>> Unfortunately, none of these approaches will have the full intended
>> effect once Wayland compositors start waiting for client buffers to
>> become idle before using them for an output frame (to prevent output
>> frames from getting delayed by client work). See
>> https://gitlab.gnome.org/GNOME/mutter/-/merge_requests/1880 (shameless
>> plug :) for a proof of concept of this for mutter. The boost will only
>> affect the compositor's own GPU work, not the client work (which means
>> no effect at all for fullscreen apps where the compositor can scan out
>> the client buffers directly).
>
> I guess you mean "no effect at all *except* for fullscreen..."?

I meant what I wrote: The compositor will wait for the next buffer to
become idle, so there's no boost from this mechanism for the client
drawing to that buffer. And since the compositor does no drawing of its
own in this case, there's no boost from that either.

> I'd perhaps recommend that wayland compositors, in cases where only a
> single layer is changing, not try to be clever and just push the
> update down to the kernel.

Even just for the fullscreen direct scanout case, that would require
some kind of atomic KMS API extension to allow queuing multiple page
flips for the same CRTC.

For other cases, this would also require a mechanism to cancel a pending
atomic commit, for when another surface update comes in before the
compositor's deadline, which affects the previously single updating
surface as well.

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and X developer
Re: [RFC 0/4] dma-fence: Deadline awareness
On 2021-07-28 1:36 p.m., Christian König wrote:
> Am 27.07.21 um 17:37 schrieb Rob Clark:
>> On Tue, Jul 27, 2021 at 8:19 AM Michel Dänzer wrote:
>>> On 2021-07-27 5:12 p.m., Rob Clark wrote:
>>>> [...]
>>>> I'd perhaps recommend that wayland compositors, in cases where only
>>>> a single layer is changing, not try to be clever and just push the
>>>> update down to the kernel.
>>> Even just for the fullscreen direct scanout case, that would require
>>> some kind of atomic KMS API extension to allow queuing multiple page
>>> flips for the same CRTC.
>>>
>>> For other cases, this would also require a mechanism to cancel a
>>> pending atomic commit, for when another surface update comes in
>>> before the compositor's deadline, which affects the previously single
>>> updating surface as well.
>>
>> Well, in the end, there is more than one compositor out there.. and if
>> some wayland compositors are going this route, they can also implement
>> the same mechanism in userspace using the sysfs that devfreq exports.
>>
>> But it sounds simpler to me for the compositor to have a sort of "game
>> mode" for fullscreen games.. I'm less worried about UI interactive
>> workloads, boosting the GPU freq upon sudden activity after a period
>> of inactivity seems to work reasonably well there.
>
> At least AMD hardware is already capable of flipping frames on GPU
> events like finishing rendering (or uploading etc).
>
> By waiting in userspace on the CPU before sending the frame to the
> hardware you are completely killing off such features.
>
> For composing use cases that makes sense, but certainly not for full
> screen applications as far as I can see.

Even for fullscreen, the current KMS API only allows queuing a single
page flip per CRTC, with no way to cancel or otherwise modify it.
Therefore, a Wayland compositor has to set a deadline for the next
refresh cycle, and when the deadline passes, it has to select the best
buffer available for the fullscreen surface. To make sure the flip will
not miss the next refresh cycle, the compositor has to pick an idle
buffer. If it picks a non-idle buffer, and the pending rendering does
not finish in time for vertical blank, the flip will be delayed by at
least one refresh cycle, which results in visible stuttering.

(Until the deadline passes, the Wayland compositor can't even know if a
previously fullscreen surface will still be fullscreen for the next
refresh cycle)

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and X developer
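To illustrate the deadline logic described above, here is a hypothetical sketch of the buffer selection a compositor has to perform when its repaint deadline fires (all names are made up for illustration; this is not code from any actual compositor):

	/* Called when the compositor's repaint deadline for the next
	 * refresh cycle fires. */
	static struct buffer *pick_fullscreen_scanout_buffer(struct surface *s)
	{
		struct buffer *b;

		/* Walk committed buffers from newest to oldest and pick the
		 * most recent one whose fences have already signalled, so
		 * the flip cannot be held up past vertical blank. */
		for (b = s->newest_buffer; b; b = b->older) {
			if (buffer_is_idle(b)) /* e.g. poll() on the dma-buf fd */
				return b;
		}

		/* Nothing newer is idle; keep scanning out the current buffer. */
		return s->current_buffer;
	}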
Re: [RFC 0/4] dma-fence: Deadline awareness
On 2021-07-28 3:13 p.m., Christian König wrote:
> Am 28.07.21 um 15:08 schrieb Michel Dänzer:
>> On 2021-07-28 1:36 p.m., Christian König wrote:
>>> Am 27.07.21 um 17:37 schrieb Rob Clark:
>>>> On Tue, Jul 27, 2021 at 8:19 AM Michel Dänzer wrote:
>>>>> On 2021-07-27 5:12 p.m., Rob Clark wrote:
>>>>>> On Tue, Jul 27, 2021 at 7:50 AM Michel Dänzer wrote:
>>>>>>> On 2021-07-27 1:38 a.m., Rob Clark wrote:
>>>>>>>> [...]
>>>>>>>> [1] https://patchwork.freedesktop.org/series/90331/
>>>>>>> Unfortunately, none of these approaches will have the full
>>>>>>> intended effect once Wayland compositors start waiting for client
>>>>>>> buffers to become idle before using them for an output frame (to
>>>>>>> prevent output frames from getting delayed by client work). See
>>>>>>> https://gitlab.gnome.org/GNOME/mutter/-/merge_requests/1880
>>>>>>> (shameless plug :) for a proof of concept of this for mutter. The
>>>>>>> boost will only affect the compositor's own GPU work, not the
>>>>>>> client work (which means no effect at all for fullscreen apps
>>>>>>> where the compositor can scan out the client buffers directly).
>>>>>> I guess you mean "no effect at all *except* for fullscreen..."?
>>>>> I meant what I wrote: The compositor will wait for the next buffer
>>>>> to become idle, so there's no boost from this mechanism for the
>>>>> client drawing to that buffer. And since the compositor does no
>>>>> drawing of its own in this case, there's no boost from that either.
>>>>>
>>>>>> I'd perhaps recommend that wayland compositors, in cases where only
>>>>>> a single layer is changing, not try to be clever and just push the
>>>>>> update down to the kernel.
>>>>> Even just for the fullscreen direct scanout case, that would require
>>>>> some kind of atomic KMS API extension to allow queuing multiple page
>>>>> flips for the same CRTC.
>>>>> [...]
Re: [RFC 0/4] dma-fence: Deadline awareness
On 2021-07-28 4:30 p.m., Christian König wrote:
> Am 28.07.21 um 15:57 schrieb Pekka Paalanen:
>> On Wed, 28 Jul 2021 15:31:41 +0200
>> Christian König wrote:
>>
>>> Am 28.07.21 um 15:24 schrieb Michel Dänzer:
>>>> On 2021-07-28 3:13 p.m., Christian König wrote:
>>>>> Am 28.07.21 um 15:08 schrieb Michel Dänzer:
>>>>>> [...]
>>>>>> (Until the deadline passes, the Wayland compositor can't even know
>>>>>> if a previously fullscreen surface will still be fullscreen for
>>>>>> the next refresh cycle)
>>>>> Well then let's extend the KMS API instead of hacking together
>>>>> workarounds in userspace.
>>>> That's indeed a possible solution for the fullscreen / direct
>>>> scanout case.
>>>>
>>>> Not for the general compositing case though, since a compositor does
>>>> not want to composite multiple output frames per display refresh
>>>> cycle, so it has to make sure the one frame hits the target.
>>> Yeah, that's true as well.
>>>
>>> At least as long as nobody invents a mechanism to do this decision on
>>> the GPU instead.
>> That would mean putting the whole window manager into the GPU.
>
> Not really. You only need to decide if you want to use the new backing
> store or the old one based on if the new surface is ready or not.

While something like that might be a possible optimization for (probably
common) cases where the new buffer does not come with other state changes
which affect the output frame beyond the buffer contents, there will
always be cases (at least until a Wayland compositor can fully run on the
GPU, as Pekka noted somewhat jokingly :) where this needs to be handled
on the CPU. I'm currently focusing on that general case. Optimizations
for special cases may follow later.

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and X developer
Re: [RFC 0/4] dma-fence: Deadline awareness
On 2021-07-29 9:09 a.m., Daniel Vetter wrote:
> On Wed, Jul 28, 2021 at 08:34:13AM -0700, Rob Clark wrote:
>> On Wed, Jul 28, 2021 at 6:24 AM Michel Dänzer wrote:
>>> On 2021-07-28 3:13 p.m., Christian König wrote:
>>>> Am 28.07.21 um 15:08 schrieb Michel Dänzer:
>>>>> On 2021-07-28 1:36 p.m., Christian König wrote:
>>>>>> Am 27.07.21 um 17:37 schrieb Rob Clark:
>>>>>>> On Tue, Jul 27, 2021 at 8:19 AM Michel Dänzer wrote:
>>>>>>>> On 2021-07-27 5:12 p.m., Rob Clark wrote:
>>>>>>>>> On Tue, Jul 27, 2021 at 7:50 AM Michel Dänzer wrote:
>>>>>>>>>> On 2021-07-27 1:38 a.m., Rob Clark wrote:
>>>>>>>>>>> [...]
>>>>>>>>>>> [1] https://patchwork.freedesktop.org/series/90331/
>>>>>>>>>> Unfortunately, none of these approaches will have the full
>>>>>>>>>> intended effect once Wayland compositors start waiting for
>>>>>>>>>> client buffers to become idle before using them for an output
>>>>>>>>>> frame (to prevent output frames from getting delayed by client
>>>>>>>>>> work). See
>>>>>>>>>> https://gitlab.gnome.org/GNOME/mutter/-/merge_requests/1880
>>>>>>>>>> (shameless plug :) for a proof of concept of this for mutter.
>>>>>>>>>> The boost will only affect the compositor's own GPU work, not
>>>>>>>>>> the client work (which means no effect at all for fullscreen
>>>>>>>>>> apps where the compositor can scan out the client buffers
>>>>>>>>>> directly).
>>>>>>>>> I guess you mean "no effect at all *except* for fullscreen..."?
>>>>>>>> I meant what I wrote: [...]
Re: [RFC 0/4] dma-fence: Deadline awareness
On 2021-07-29 12:14 p.m., Christian König wrote:
> Am 29.07.21 um 11:15 schrieb Pekka Paalanen:
>> [SNIP]
>>> But how does it then help to wait on the CPU instead?
>> A compositor does not "wait" literally. It would only check which
>> state set is ready to be used, and uses the most recent set that is
>> ready. Any state sets that are not ready are ignored and reconsidered
>> the next time the compositor updates the screen.
>
> Mhm, then I'm not understanding what Michel's changes are actually
> doing.

In a nutshell, my mutter MR holds back all Wayland state changes which
were committed together with a new buffer (and dependent later ones)
until the dma-buf file descriptors for that buffer have become readable.

This is achieved by adding the fds to the main event loop (if they
aren't readable already when the buffer is committed), and when they
become readable, all corresponding state changes are propagated such
that they will be taken into account for drawing the next frame.

>> Depending on which state sets are selected for a screen update, the
>> global window manager state may be updated accordingly, before the
>> drawing commands for the composition can be created.
>>
>>> See what I'm proposing is to either render the next state of the
>>> window or compose from the old state (including all atomic
>>> properties).
>> Yes, that's exactly how it would work. It's just that state for a
>> window is not an independent thing, it can affect how unrelated
>> windows are managed.
>>
>> A simplified example would be two windows side by side where the
>> resizing of one causes the other to move. You can't resize the window
>> or move the other until the buffer with the new size is ready. Until
>> then the compositor uses the old state.
>>
>>> E.g. what do you do if you timeout and can't have the new window
>>> content on time? What's the fallback here?
>> As there is no wait, there is no timeout either.
>>
>> If the app happens to be frozen (e.g. some weird bug in fence handling
>> to make it never ready, or maybe it's just bugged itself and never
>> drawing again), then the app is frozen, and all the rest of the
>> desktop continues running normally without a glitch.
>
> But that is in contradiction to what you told me before.
>
> See when the window should move but fails to draw its new content, what
> happens?
>
> Are the other windows which would be affected by the move not drawn as
> well?

Basically, the compositor draws its output as if the new buffer and all
connected Wayland state changes had not been committed yet.

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and X developer
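A dma-buf file descriptor supports poll(): as the poll rework earlier in this digest shows, EPOLLIN signals once the exclusive fence — i.e. the client's pending rendering — has signalled. A minimal sketch of the readiness check described above, using plain poll() instead of mutter's actual GSource-based main loop integration (illustrative only):

	#include <poll.h>
	#include <stdbool.h>

	static bool dmabuf_is_readable(int dmabuf_fd)
	{
		struct pollfd pfd = {
			.fd = dmabuf_fd,
			/* POLLIN: the exclusive fence has signalled, i.e.
			 * the client's rendering into the buffer finished. */
			.events = POLLIN,
		};

		/* Zero timeout: just query the current state, never wait. */
		return poll(&pfd, 1, 0) > 0 && (pfd.revents & POLLIN);
	}

If this returns false at commit time, the state set is parked and the fd is added to the main event loop; when it becomes readable, the parked state is applied for the next frame.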
Re: [RFC v1 0/4] drm: Add support for DRM_CAP_DEFERRED_OUT_FENCE capability
On 2021-07-30 12:25 p.m., Daniel Vetter wrote:
> On Thu, Jul 29, 2021 at 01:16:55AM -0700, Vivek Kasireddy wrote:
>> By separating the OUT_FENCE signalling from pageflip completion allows
>> a Guest compositor to start a new repaint cycle with a new buffer
>> instead of waiting for the old buffer to be free.
>>
>> This work is based on the idea/suggestion from Simon and Pekka.
>>
>> This capability can be a solution for this issue:
>> https://gitlab.freedesktop.org/wayland/weston/-/issues/514
>>
>> Corresponding Weston MR:
>> https://gitlab.freedesktop.org/wayland/weston/-/merge_requests/668
>
> Uh I kinda wanted to discuss this a bit more before we jump into typing
> code, but well I guess not that much work yet.
>
> So maybe I'm not understanding the problem, but I think the fundamental
> underlying issue is that with KMS you can have at most 2 buffers
> in-flight, due to our queue depth limit of 1 pending flip.
>
> Unfortunately that means for virtual hw where it takes a few more
> steps/vblanks until the framebuffer actually shows up on screen and is
> scanned out, we suffer deeply. The usual fix for that is to drop the
> latency and increase throughput, and have more buffers in-flight. Which
> this patch tries to do.

Per https://gitlab.freedesktop.org/wayland/weston/-/issues/514#note_986797 ,
IMO the underlying issue is actually that the guest compositor repaint
cycle is not aligned with the host compositor one. If they were aligned,
the problem would not occur even without allowing multiple page flips in
flight, and latency would be lower.

> Now I think where we go wrong here is that we're trying to hack this up
> by defining different semantics for the out-fence and for the
> drm-event. Imo that's wrong, they're both meant to show exactly the
> same thing:
> - when is the new frame actually visible to the user (as in, eyeballs
>   in a human head, preferably, not the time when we've handed the
>   buffer off to the virtual hw)
> - when is the previous buffer no longer being used by the scanout hw
>
> We do cheat a bit right now in so far that we assume they're both the
> same, as in, panel-side latency is currently the compositor's problem
> to figure out.
>
> So for virtual hw I think the timestamp and event completion really
> need to happen only when the buffer has been pushed through the entire
> virtualization chain, i.e. ideally we get the timestamp from the kms
> driver from the host side. Currently that's not done, so this is most
> likely quite broken already (virtio relies on the no-vblank auto event
> sending, which definitely doesn't wait for anything, or I'm completely
> missing something).
>
> I think instead of hacking up some ill-defined 1.5 queue depth support,
> what we should do is support queue depth > 1 properly. So:
>
> - Change atomic to support queue depth > 1, this needs to be a
>   per-driver thing due to a bunch of issues in driver code. Essentially
>   drivers must never look at obj->state pointers, and only ever look up
>   state through the passed-in drm_atomic_state * update container.
>
> - Aside: virtio should lose all its empty hooks, there's no point in
>   that.
>
> - We fix virtio to send out the completion event at the end of this
>   entire pipeline, i.e. virtio code needs to take care of sending out
>   the crtc_state->event correctly.
>
> - We probably also want some kind of (maybe per-crtc) recommended queue
>   depth property so compositors know how many buffers to keep in
>   flight. Not sure about that.

I'd say there would definitely need to be some kind of signal for the
display server that it should queue multiple flips, since this is
normally not desirable for latency. In other words, this wouldn't really
be useful on bare metal (in contrast to the ability to replace a pending
flip with a newer one).

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and X developer
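If such a signal took the shape of a DRM capability, a compositor could probe it at startup along these lines. Note that DRM_CAP_DEFERRED_OUT_FENCE is the capability proposed by this series, not upstream UAPI, and drm_fd is a placeholder name; this is purely illustrative:

	#include <stdint.h>
	#include <xf86drm.h>

	uint64_t deferred_out_fence = 0;

	/* Probe the capability proposed in this series; on bare metal it
	 * would be absent/0 and the compositor keeps its usual
	 * one-flip-in-flight behaviour. */
	if (drmGetCap(drm_fd, DRM_CAP_DEFERRED_OUT_FENCE, &deferred_out_fence) == 0 &&
	    deferred_out_fence) {
		/* Virtual display: treat the out-fence, not the flip
		 * event, as the buffer release signal, and keep more
		 * buffers in flight. */
	}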
Re: [RFC v1 0/4] drm: Add support for DRM_CAP_DEFERRED_OUT_FENCE capability
On 2021-08-02 9:59 a.m., Daniel Vetter wrote:
> On Fri, Jul 30, 2021 at 02:50:10PM +0200, Michel Dänzer wrote:
>> On 2021-07-30 12:25 p.m., Daniel Vetter wrote:
>>> On Thu, Jul 29, 2021 at 01:16:55AM -0700, Vivek Kasireddy wrote:
>>>> [...]
>>>
>>> [...] Unfortunately that means for virtual hw where it takes a few
>>> more steps/vblanks until the framebuffer actually shows up on screen
>>> and is scanned out, we suffer deeply. The usual fix for that is to
>>> drop the latency and increase throughput, and have more buffers
>>> in-flight. Which this patch tries to do.
>>
>> Per https://gitlab.freedesktop.org/wayland/weston/-/issues/514#note_986797 ,
>> IMO the underlying issue is actually that the guest compositor repaint
>> cycle is not aligned with the host compositor one. If they were
>> aligned, the problem would not occur even without allowing multiple
>> page flips in flight, and latency would be lower.
>
> Yeah my proposal here is under the premise that we do actually need to
> fix this with a deeper queue depth.
>
>>> [...]
>>>
>>> - We probably also want some kind of (maybe per-crtc) recommended
>>>   queue depth property so compositors know how many buffers to keep
>>>   in flight. Not sure about that.
>>
>> I'd say there would definitely need to be some kind of signal for the
>> display server that it should queue multiple flips, since this is
>> normally not desirable for latency. In other words, this wouldn't
>> really be useful on bare metal (in contrast to the ability to replace
>> a pending flip with a newer one). [...]
Re: [RFC v1 0/4] drm: Add support for DRM_CAP_DEFERRED_OUT_FENCE capability
On 2021-08-02 11:06 a.m., Daniel Vetter wrote:
> On Mon, Aug 02, 2021 at 10:49:37AM +0200, Michel Dänzer wrote:
>> On 2021-08-02 9:59 a.m., Daniel Vetter wrote:
>>> On Fri, Jul 30, 2021 at 02:50:10PM +0200, Michel Dänzer wrote:
>>>> On 2021-07-30 12:25 p.m., Daniel Vetter wrote:
>>>>> On Thu, Jul 29, 2021 at 01:16:55AM -0700, Vivek Kasireddy wrote:
>>>>>> [...]
>>>>> [...]
>>>>> I think instead of hacking up some ill-defined 1.5 queue depth
>>>>> support, what we should do is support queue depth > 1 properly. So:
>>>>>
>>>>> - Change atomic to support queue depth > 1, this needs to be a
>>>>>   per-driver thing due to a bunch of issues in driver code.
>>>>>   Essentially drivers must never look at obj->state pointers, and
>>>>>   only ever look up state through the passed-in drm_atomic_state *
>>>>>   update container.
>>>>>
>>>>> - Aside: virtio should lose all its empty hooks, there's no point
>>>>>   in that.
>>>>>
>>>>> - We fix virtio to send out the completion event at the end of this
>>>>>   entire pipeline [...]
Re: [RFC v1 0/4] drm: Add support for DRM_CAP_DEFERRED_OUT_FENCE capability
On 2021-08-03 8:11 a.m., Kasireddy, Vivek wrote:
>>> The goal:
>>> - Maintain full framerate even when the Guest scanout FB is flipped
>>> onto a hardware plane on the Host -- regardless of either
>>> compositor's scheduling policy -- without making any copies and
>>> ensuring that both Host and Guest are not accessing the buffer at the
>>> same time.
>>>
>>> The problem:
>>> - If the Host compositor flips the client's buffer (in this case
>>> Guest compositor's buffer) onto a hardware plane, then it can send a
>>> wl_buffer.release event for the previous buffer only after it gets a
>>> pageflip completion. And, if the Guest compositor takes 10-12 ms to
>>> submit a new buffer and given the fact that the Host compositor waits
>>> only for 9 ms, the Guest compositor will miss the Host's repaint
>>> cycle resulting in halved frame-rate.
>>>
>>> The solution:
>>> - To ensure full framerate, the Guest compositor has to start its
>>> repaint cycle (including the 9 ms wait) when the Host compositor
>>> sends the frame callback event to its clients. In order for this to
>>> happen, the dma-fence that the Guest KMS waits on -- before sending
>>> pageflip completion -- cannot be tied to a wl_buffer.release event.
>>> This means that, the Guest compositor has to be forced to use a new
>>> buffer for its next repaint cycle when it gets a pageflip completion.
>>
>> Is that really the only solution?
>
> [Kasireddy, Vivek] There are a few others I mentioned here:
> https://gitlab.freedesktop.org/wayland/weston/-/issues/514#note_986572
> But I think none of them are as compelling as this one.
>
>> If we fix the event timestamps so that both guest and host use the
>> same timestamp, but then the guest starts 5ms (or something like that)
>> earlier, then things should work too? I.e.
>> - host compositor starts at (previous_frametime + 9ms)
>> - guest compositor starts at (previous_frametime + 4ms)
>>
>> Ofc this only works if the frametimes we hand out to both match
>> _exactly_ and are as high-precision as the ones on the host side.
>> Which for many gpu drivers at least is the case, and all the ones you
>> care about for sure :-)
>>
>> But if the frametimes the guest receives are the no_vblank fake ones,
>> then they'll be all over the place and this carefully tuned
>> low-latency redraw loop falls apart. Aside from the fact that without
>> tuning the guests to be earlier than the hosts, you're guaranteed to
>> miss every frame (except when the timing wobbliness in the guest is
>> big enough by chance to make the deadline on the oddball frame).
>
> [Kasireddy, Vivek] The Guest and Host use different event timestamps as
> we don't share these between the Guest and the Host. It does not seem
> to be causing any other problems so far but we did try the experiment
> you mentioned (i.e., adjusting the delays) and it works. However, this
> patch series is meant to fix the issue without having to tweak anything
> (delays) because we can't do this for every compositor out there.

Maybe there could be a mechanism which allows the compositor in the
guest to automatically adjust its repaint cycle as needed.

This might even be possible without requiring changes in each
compositor, by adjusting the vertical blank periods in the guest to be
aligned with the host compositor repaint cycles. Not sure about that
though.

Even if not, both this series or making it possible to queue multiple
flips require corresponding changes in each compositor as well to have
any effect.
-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and X developer
Re: [PATCH] drm/radeon: Update pitch for page flip
On 2021-08-02 4:51 p.m., Alex Deucher wrote:
> On Mon, Aug 2, 2021 at 4:31 AM Daniel Vetter wrote:
>> On Mon, Aug 02, 2021 at 10:12:47AM +0200, Christian König wrote:
>>> Am 02.08.21 um 09:43 schrieb Zhenneng Li:
>>>> When primary bo is updated, crtc's pitch may
>>>> have not been updated, this will lead to show
>>>> disorder content when user changes display mode,
>>>> we update crtc's pitch in page flip to avoid
>>>> this bug.
>>>> This refers to amdgpu's pageflip.
>>>
>>> Alex is the expert to ask about that code, but I'm not sure if that
>>> is really correct for the old hardware.
>>>
>>> As far as I know the crtc's pitch should not change during a page
>>> flip, but only during a full mode set.
>>>
>>> So could you elaborate a bit more how you trigger this?
>>
>> legacy page_flip ioctl only verifies that the fb->format stays the
>> same. It doesn't check anything else (afair never has), this is all up
>> to drivers to verify.
>>
>> Personally I'd say add a check to reject this, since testing this and
>> making sure it really works everywhere is probably a bit much on this
>> old hw.
>
> If just the pitch changed, that probably wouldn't be much of a problem,
> but if the pitch is changing, that probably implies other stuff has
> changed as well and we'll just be chasing changes. I agree it would be
> best to just reject anything other than updating the scanout address.

FWIW, that means page flipping cannot be used in some cases which work
fine by changing the pitch, which can result in tearing:
https://gitlab.freedesktop.org/xorg/xserver/-/issues/839 (which says the
i915 driver handles this as well).

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and X developer
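A sketch of the kind of check Alex suggests for the legacy page-flip path — reject anything beyond a scanout address update (illustrative and untested; field names per struct drm_framebuffer):

	/* In the driver's page_flip handler: only the scanout address may
	 * change on old hardware, everything else needs a full mode set. */
	struct drm_framebuffer *old_fb = crtc->primary->fb;

	if (fb->pitches[0] != old_fb->pitches[0] ||
	    fb->format != old_fb->format ||
	    fb->width != old_fb->width ||
	    fb->height != old_fb->height)
		return -EINVAL;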
Re: [Mesa-dev] [RFC] Linux Graphics Next: Explicit fences everywhere and no BO fences - initial proposal
On 2021-04-28 8:59 a.m., Christian König wrote:
> Hi Dave,
>
> Am 27.04.21 um 21:23 schrieb Marek Olšák:
>> Supporting interop with any device is always possible. It depends on
>> which drivers we need to interoperate with and update them. We've
>> already found the path forward for amdgpu. We just need to find out
>> how many other drivers need to be updated and evaluate the
>> cost/benefit aspect.
>>
>> Marek
>>
>> On Tue, Apr 27, 2021 at 2:38 PM Dave Airlie <airl...@gmail.com> wrote:
>>
>>     On Tue, 27 Apr 2021 at 22:06, Christian König
>>     <ckoenig.leichtzumer...@gmail.com> wrote:
>>     >
>>     > Correct, we wouldn't have synchronization between device with
>>     > and without user queues any more.
>>     >
>>     > That could only be a problem for A+I Laptops.
>>
>>     Since I think you mentioned you'd only be enabling this on newer
>>     chipsets, won't it be a problem for A+A where one A is a
>>     generation behind the other?
>>
> Crap, that is a good point as well.
>
>>     I'm not really liking where this is going btw, seems like an ill
>>     thought out concept, if AMD is really going down the road of
>>     designing hw that is currently Linux incompatible, you are going
>>     to have to accept a big part of the burden in bringing this
>>     support in to more than just amd drivers for upcoming generations
>>     of gpu.
>>
> Well we don't really like that either, but we have no other option as
> far as I can see.

I don't really understand what "future hw may remove support for kernel
queues" means exactly. While the per-context queues can be mapped to
userspace directly, they don't *have* to be, do they? I.e. the kernel
driver should be able to either intercept userspace access to the
queues, or in the worst case do it all itself, and provide the existing
synchronization semantics as needed?

Surely there are resource limits for the per-context queues, so the
kernel driver needs to do some kind of virtualization / multi-plexing
anyway, or we'll get sad user faces when there's no queue available
for <...>.

I'm probably missing something though, awaiting enlightenment. :)

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and X developer
Re: [PATCH] drm: log errors in drm_gem_fb_init_with_funcs
On 2021-04-30 4:40 p.m., Simon Ser wrote:
> Let the user know what went wrong in drm_gem_fb_init_with_funcs
> failure paths.
>
> Signed-off-by: Simon Ser
> Cc: Daniel Vetter
> Cc: Sam Ravnborg
> Cc: Noralf Trønnes
> Cc: Andrzej Pietrasiewicz
> ---
>  drivers/gpu/drm/drm_gem_framebuffer_helper.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/drm_gem_framebuffer_helper.c b/drivers/gpu/drm/drm_gem_framebuffer_helper.c
> index 109d11fb4cd4..e4a3c7eb43b2 100644
> --- a/drivers/gpu/drm/drm_gem_framebuffer_helper.c
> +++ b/drivers/gpu/drm/drm_gem_framebuffer_helper.c
> @@ -155,8 +155,10 @@ int drm_gem_fb_init_with_funcs(struct drm_device *dev,
>  	int ret, i;
>  
>  	info = drm_get_format_info(dev, mode_cmd);
> -	if (!info)
> +	if (!info) {
> +		drm_dbg_kms(dev, "Failed to get FB format info\n");
>  		return -EINVAL;
> +	}
>  
>  	for (i = 0; i < info->num_planes; i++) {
>  		unsigned int width = mode_cmd->width / (i ? info->hsub : 1);
> @@ -175,6 +177,9 @@ int drm_gem_fb_init_with_funcs(struct drm_device *dev,
>  			  + mode_cmd->offsets[i];
>  
>  		if (objs[i]->size < min_size) {
> +			drm_dbg_kms(dev,
> +				    "GEM object size (%u) smaller than minimum size (%u) for plane %d\n",
> +				    objs[i]->size, min_size, i);
>  			drm_gem_object_put(objs[i]);
>  			ret = -EINVAL;
>  			goto err_gem_object_put;
>

Reviewed-by: Michel Dänzer

I made almost the same change (except for the strings) for tracking down
the issue fixed by https://patchwork.freedesktop.org/patch/431677/ (note
my follow-up there!). :)

-- 
Earthling Michel Dänzer            |                  https://redhat.com
Libre software enthusiast          |         Mesa and X developer
Re: [RFC v1 0/4] drm: Add support for DRM_CAP_DEFERRED_OUT_FENCE capability
Compositor experts seem to think so, and it makes sense to me. >> [Kasireddy, Vivek] It is definitely possible that if the timestamp is messed >> up, then >> the Guest repaint cycle would be affected. However, I do not believe that is >> the case >> here given the debug and instrumentation data we collected and scrutinized. >> Hopefully, >> compositor experts could chime in to shed some light on this matter. >> >>> >>>>> >>>>> Only, and I really mean only, when that shows that it's simply impossible >>>>> to hit 60fps with zero-copy and the guest/host fully aligned should we >>>>> look into making the overall pipeline deeper. >>>> [Kasireddy, Vivek] From all the experiments conducted so far and given the >>>> discussion associated with >>>> https://gitlab.freedesktop.org/wayland/weston/-/issues/514 >>>> I think we have already established that in order for a zero-copy solution >>>> to work >>>> reliably, the Guest compositor needs to start its repaint cycle when the >>>> Host >>>> compositor sends a frame callback event to its clients. >>>> >>>>> >>>>>>> Only when that all shows that we just can't hit 60fps consistently and >>>>>>> really need 3 buffers in flight should we look at deeper kms queues. >>>>>>> And then we really need to implement them properly and not with a >>>>>>> mismatch between drm_event and out-fence signalling. These quick hacks >>>>>>> are good for experiments, but there's a pile of other things we need >>>>>>> to do first. At least that's how I understand the problem here right >>>>>>> now. >>>>>> [Kasireddy, Vivek] Experiments done so far indicate that we can hit 59 >>>>>> FPS >>> consistently >>>>>> -- in a zero-copy way independent of compositors' delays/deadlines -- >>>>>> with this >>>>>> patch series + the Weston MR I linked in the cover letter. The main >>>>>> reason why this >>>>>> works is because we relax the assumption that when the Guest compositor >>>>>> gets a >>>>>> pageflip completion event that it could reuse the old FB it submitted in >>>>>> the previous >>>>>> atomic flip and instead force it to use a new one. And, we send the >>>>>> pageflip >>> completion >>>>>> event to the Guest when the Host compositor sends a frame callback >>>>>> event. Lastly, >>>>>> we use the (deferred) out_fence as just a mechanism to tell the Guest >>>>>> compositor >>> when >>>>>> it can release references on old FBs so that they can be reused again. >>>>>> >>>>>> With that being said, the only question is how can we accomplish the >>>>>> above in an >>>>> upstream >>>>>> acceptable way without regressing anything particularly on bare-metal. >>>>>> Its not clear >>> if >>>>> just >>>>>> increasing the queue depth would work or not but I think the Guest >>>>>> compositor has to >>> be >>>>> told >>>>>> when it can start its repaint cycle and when it can assume the old FB is >>>>>> no longer in >>> use. >>>>>> On bare-metal -- and also with VKMS as of today -- a pageflip completion >>>>>> indicates >>>>> both. >>>>>> In other words, Vblank event is the same as Flip done, which makes sense >>>>>> on bare- >>> metal. >>>>>> But if we were to have two events at-least for VKMS: vblank to indicate >>>>>> to Guest to >>> start >>>>>> repaint and flip_done to indicate to drop references on old FBs, I think >>>>>> this problem >>> can >>>>>> be solved even without increasing the queue depth. Can this be >>>>>> acceptable? >>>>> >>>>> That's just another flavour of your "increase queue depth without >>>>> increasing the atomic queue depth" approach.
I still think the underlying >>>>> fundamental issue is a timing confusion, and the fact that adjusting the >>>>> timings fixes things too kinda proves that. So we need to fix that in a >>>>> clean way, not by shuffling things around semi-randomly until the specific >>>>> config we tests works. >>>> [Kasireddy, Vivek] This issue is not due to a timing or timestamp >>>> mismatch. We >>>> have carefully instrumented both the Host and Guest compositors and >>>> measured >>>> the latencies at each step. The relevant debug data only points to the >>>> scheduling >>>> policy -- of both Host and Guest compositors -- playing a role in Guest >>>> rendering >>>> at 30 FPS. >>> >>> Hm but that essentially means that the events your passing around have an >>> even more ad-hoc implementation specific meaning: Essentially it's the >>> kick-off for the guest's repaint loop? That sounds even worse for a kms >>> uapi extension. >> [Kasireddy, Vivek] The pageflip completion event/vblank event indeed serves >> as the >> kick-off for a compositor's (both Guest and Host) repaint loop. AFAICT, >> Weston >> works that way and even if we increase the queue depth to solve this >> problem, I don't >> think it'll help because the arrival of this event always indicates to a >> compositor to >> start its repaint cycle again and assume that the previous buffers are all >> free. > > I thought this is how simple compositors work, and weston has since a > while it's own timer, which is based on the timestamp it gets (at on > drivers with vblank support), so that it starts the repaint loop a few ms > before the next vblank. And not immediately when it receives the old page > flip completion event. As long as it's a fixed timer, there's still a risk that the guest compositor repaint cycle runs too late for the host one (unless the guest cycle happens to be scheduled significantly earlier than the host one). Note that current mutter Git main (to become the 41 release this autumn) uses dynamic scheduling of its repaint cycle based on how long the last 16 frames took to draw and present. In theory, this could automatically schedule the guest cycle early enough for the host one. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
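For illustration, roughly what that kind of dynamic scheduling amounts to (a hand-written sketch, not mutter's actual code):

#include <stdint.h>

#define FRAME_HISTORY 16

struct repaint_sched {
	int64_t frame_time_us[FRAME_HISTORY];	/* time the last frames took to draw & present */
	unsigned int next;
};

static void repaint_record_frame(struct repaint_sched *s, int64_t t_us)
{
	s->frame_time_us[s->next] = t_us;
	s->next = (s->next + 1) % FRAME_HISTORY;
}

/* Start the next repaint just early enough that even a worst-case recent
 * frame would finish before the predicted vblank, instead of kicking it
 * off immediately on page flip completion. */
static int64_t repaint_start_time_us(const struct repaint_sched *s,
				     int64_t next_vblank_us)
{
	int64_t worst = 0;
	unsigned int i;

	for (i = 0; i < FRAME_HISTORY; i++)
		if (s->frame_time_us[i] > worst)
			worst = s->frame_time_us[i];

	return next_vblank_us - worst;
}

In a guest, this kind of feedback loop would automatically push the repaint start earlier as soon as host-induced delays show up in the measured frame times.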
[PATCH 1/2] drm/amdgpu: Use mod_delayed_work in amdgpu_gfx_off_ctrl
From: Michel Dänzer In contrast to schedule_delayed_work, this pushes back the work if it was already scheduled before. Specific behaviour change: Before: amdgpu_device_delay_enable_gfx_off ran ~100 ms after the first time GFXOFF was disabled and re-enabled, even if GFXOFF was disabled and re-enabled again during those 100 ms. After: amdgpu_device_delay_enable_gfx_off runs ~100 ms after the last time GFXOFF is disabled and re-enabled. The former resulted in frame drops / stutter with the upcoming mutter 41 release on Navi 14, due to constantly enabling GFXOFF in the HW and disabling it again (for getting the GPU clock counter). Signed-off-by: Michel Dänzer --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index a0be0772c8b3..9cfef56b2aee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -569,7 +569,7 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) adev->gfx.gfx_off_req_count--; if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { - schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE); + mod_delayed_work(system_wq, &adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE); } else if (!enable && adev->gfx.gfx_off_state) { if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) { adev->gfx.gfx_off_state = false; -- 2.32.0
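To spell out the semantics the commit message relies on, here is a minimal self-contained module sketch (hypothetical demo_ names, not driver code):

#include <linux/module.h>
#include <linux/workqueue.h>

static void demo_fn(struct work_struct *work)
{
	pr_info("demo work ran\n");
}

static DECLARE_DELAYED_WORK(demo_work, demo_fn);

static int __init demo_init(void)
{
	/* Arms the timer for now + 100 ms. */
	schedule_delayed_work(&demo_work, msecs_to_jiffies(100));

	/* The work is already pending, so this returns false and does NOT
	 * push the expiry back; demo_fn still runs ~100 ms after the
	 * first call. */
	schedule_delayed_work(&demo_work, msecs_to_jiffies(100));

	/* This re-arms the pending timer: demo_fn now runs ~100 ms from
	 * here instead. */
	mod_delayed_work(system_wq, &demo_work, msecs_to_jiffies(100));

	return 0;
}
module_init(demo_init);

static void __exit demo_exit(void)
{
	cancel_delayed_work_sync(&demo_work);
}
module_exit(demo_exit);

MODULE_LICENSE("GPL");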
[PATCH 2/2] drm/amdgpu: Use mod_delayed_work in JPEG/UVD/VCE/VCN ring_end_use hooks
From: Michel Dänzer In contrast to schedule_delayed_work, this pushes back the work if it was already scheduled before. Specific behaviour change: Before: The scheduled work ran ~1 second after the first time ring_end_use was called, even if the ring was used again during that second. After: The scheduled work runs ~1 second after the last time ring_end_use is called. Inspired by the corresponding change in amdgpu_gfx_off_ctrl. While I haven't run into specific issues in this case, the new behaviour makes more sense to me. Signed-off-by: Michel Dänzer --- drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 2 +- drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c| 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c index 8996cb4ed57a..2c0040153f6c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c @@ -110,7 +110,7 @@ void amdgpu_jpeg_ring_begin_use(struct amdgpu_ring *ring) void amdgpu_jpeg_ring_end_use(struct amdgpu_ring *ring) { atomic_dec(&ring->adev->jpeg.total_submission_cnt); - schedule_delayed_work(&ring->adev->jpeg.idle_work, JPEG_IDLE_TIMEOUT); + mod_delayed_work(system_wq, &ring->adev->jpeg.idle_work, JPEG_IDLE_TIMEOUT); } int amdgpu_jpeg_dec_ring_test_ring(struct amdgpu_ring *ring) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index 0f576f294d8a..b6b1d7eeb8e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -1283,7 +1283,7 @@ void amdgpu_uvd_ring_begin_use(struct amdgpu_ring *ring) void amdgpu_uvd_ring_end_use(struct amdgpu_ring *ring) { if (!amdgpu_sriov_vf(ring->adev)) - schedule_delayed_work(&ring->adev->uvd.idle_work, UVD_IDLE_TIMEOUT); + mod_delayed_work(system_wq, &ring->adev->uvd.idle_work, UVD_IDLE_TIMEOUT); } /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index 1ae7f824adc7..2253c18a6688 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -401,7 +401,7 @@ void amdgpu_vce_ring_begin_use(struct amdgpu_ring *ring) void amdgpu_vce_ring_end_use(struct amdgpu_ring *ring) { if (!amdgpu_sriov_vf(ring->adev)) - schedule_delayed_work(&ring->adev->vce.idle_work, VCE_IDLE_TIMEOUT); + mod_delayed_work(system_wq, &ring->adev->vce.idle_work, VCE_IDLE_TIMEOUT); } /** diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c index 284bb42d6c86..d5937ab5ac80 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c @@ -1874,7 +1874,7 @@ void vcn_v1_0_set_pg_for_begin_use(struct amdgpu_ring *ring, bool set_clocks) void vcn_v1_0_ring_end_use(struct amdgpu_ring *ring) { - schedule_delayed_work(&ring->adev->vcn.idle_work, VCN_IDLE_TIMEOUT); + mod_delayed_work(system_wq, &ring->adev->vcn.idle_work, VCN_IDLE_TIMEOUT); mutex_unlock(&ring->adev->vcn.vcn1_jpeg1_workaround); } -- 2.32.0
Re: [PATCH 2/2] drm/amdgpu: Use mod_delayed_work in JPEG/UVD/VCE/VCN ring_end_use hooks
On 2021-08-12 7:55 a.m., Koenig, Christian wrote: > Hi James, > > Evan seems to have understood how this all works together. > > See while any begin/end use critical section is active the work should not be > active. > > When you handle only one ring you can just call cancel in begin use and > schedule in end use. But when you have more than one ring you need a lock or > counter to prevent concurrent work items to be started. > > Michelle's idea to use mod_delayed_work is a bad one because it assumes that > the delayed work is still running. It merely assumes that the work may already have been scheduled before. Admittedly, I missed the cancel_delayed_work_sync calls for patch 2. While I think it can still have some effect when there's a single work item for multiple rings, as described by James, it's probably negligible, since presumably the time intervals between ring_begin_use and ring_end_use are normally much shorter than a second. So, while patch 2 is at worst a no-op (since mod_delayed_work is the same as schedule_delayed_work if the work hasn't been scheduled yet), I'm fine with dropping it. > Something similar applies to the first patch I think, There are no cancel work calls in that case, so the commit log is accurate TTBOMK. I noticed this because current mutter Git main wasn't able to sustain 60 fps on Navi 14 with a simple glxgears -fullscreen. mutter was dropping frames because its CPU work for a frame update occasionally took up to 3 ms, instead of the normal 2-300 microseconds. sysprof showed a lot of cycles spent in the functions which enable/disable GFXOFF in the HW. > so when this makes a difference it is actually a bug. There was certainly a bug though, which patch 1 fixes. :) -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: [PATCH] drm: Copy drm_wait_vblank request and copy_to_user before return.
On 2021-08-11 7:55 p.m., Mark Yacoub wrote: > From: Mark Yacoub > > [Why] > Userspace should get back a copy of the request that's been modified > even when drm_wait_vblank_ioctl returns a failure. > > Rationale: > drm_wait_vblank_ioctl modifies the request and expects the user to read > back. When the type is RELATIVE, it modifies it to ABSOLUTE and updates > the sequence to become current_vblank_count + sequence (which was > relative), not it becomes absolute. > drmWaitVBlank (in libdrm), expects this to be the case as it modifies > the request to be Absolute as it expects the sequence to would have been > updated. > > The change is in compat_drm_wait_vblank, which is called by > drm_compat_ioctl. This change of copying the data back regardless of the > return number makes it en par with drm_ioctl, which always copies the > data before returning. > > [How] > Copy the drm_wait_vblank request. > Return from the function after everything has been copied to user. > > Fixes: IGT:kms_flip::modeset-vs-vblank-race-interruptible > Tested on ChromeOS Trogdor(msm) > > Signed-off-by: Mark Yacoub > --- > drivers/gpu/drm/drm_ioc32.c | 8 +--- > 1 file changed, 5 insertions(+), 3 deletions(-) > > diff --git a/drivers/gpu/drm/drm_ioc32.c b/drivers/gpu/drm/drm_ioc32.c > index d29907955ff79..275b860df8fbe 100644 > --- a/drivers/gpu/drm/drm_ioc32.c > +++ b/drivers/gpu/drm/drm_ioc32.c > @@ -855,17 +855,19 @@ static int compat_drm_wait_vblank(struct file *file, > unsigned int cmd, > req.request.sequence = req32.request.sequence; > req.request.signal = req32.request.signal; > err = drm_ioctl_kernel(file, drm_wait_vblank_ioctl, &req, DRM_UNLOCKED); > - if (err) > - return err; > > req32.reply.type = req.reply.type; > req32.reply.sequence = req.reply.sequence; > req32.reply.tval_sec = req.reply.tval_sec; > req32.reply.tval_usec = req.reply.tval_usec; > + /* drm_wait_vblank_ioctl modifies Request, update their values here as > well. */ > + req32.request.type = req.request.type; > + req32.request.sequence = req.request.sequence; > + req32.request.signal = req.request.signal; The added assignments are superfluous, since req32.reply and req32.request are members of the same union. > if (copy_to_user(argp, &req32, sizeof(req32))) > return -EFAULT; > > - return 0; > + return err; > } The other changes look correct. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
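For reference, the ioctl argument is a union (quoting include/uapi/drm/drm.h from memory; the 32-bit compat variant in drm_ioc32.c is laid out the same way, just with 32-bit types):

union drm_wait_vblank {
	struct drm_wait_vblank_request request;
	struct drm_wait_vblank_reply reply;
};

So req32.request and req32.reply alias the same bytes, and the added request assignments just rewrite what the reply assignments stored a few lines earlier.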
Re: [PATCH 2/2] drm/amdgpu: Use mod_delayed_work in JPEG/UVD/VCE/VCN ring_end_use hooks
On 2021-08-12 1:33 p.m., Lazar, Lijo wrote: > On 8/12/2021 1:41 PM, Michel Dänzer wrote: >> On 2021-08-12 7:55 a.m., Koenig, Christian wrote: >>> Hi James, >>> >>> Evan seems to have understood how this all works together. >>> >>> See while any begin/end use critical section is active the work should not >>> be active. >>> >>> When you handle only one ring you can just call cancel in begin use and >>> schedule in end use. But when you have more than one ring you need a lock >>> or counter to prevent concurrent work items to be started. >>> >>> Michelle's idea to use mod_delayed_work is a bad one because it assumes >>> that the delayed work is still running. >> >> It merely assumes that the work may already have been scheduled before. >> >> Admittedly, I missed the cancel_delayed_work_sync calls for patch 2. While I >> think it can still have some effect when there's a single work item for >> multiple rings, as described by James, it's probably negligible, since >> presumably the time intervals between ring_begin_use and ring_end_use are >> normally much shorter than a second. >> >> So, while patch 2 is at worst a no-op (since mod_delayed_work is the same as >> schedule_delayed_work if the work hasn't been scheduled yet), I'm fine with >> dropping it. >> >> >>> Something similar applies to the first patch I think, >> >> There are no cancel work calls in that case, so the commit log is accurate >> TTBOMK. > > Curious - > > For patch 1, does it make a difference if any delayed work scheduled is > cancelled in the else part before proceeding? > > } else if (!enable && adev->gfx.gfx_off_state) { > cancel_delayed_work(); I tried the patch below. While this does seem to fix the problem as well, I see a potential issue: 1. amdgpu_gfx_off_ctrl locks adev->gfx.gfx_off_mutex 2. amdgpu_device_delay_enable_gfx_off runs, blocks in mutex_lock 3. amdgpu_gfx_off_ctrl calls cancel_delayed_work_sync I'm afraid this would deadlock? (CONFIG_PROVE_LOCKING doesn't complain though) Maybe it's possible to fix it with cancel_delayed_work_sync somehow, but I'm not sure how offhand. (With cancel_delayed_work instead, I'm worried amdgpu_device_delay_enable_gfx_off might still enable GFXOFF in the HW immediately after amdgpu_gfx_off_ctrl unlocks the mutex. Then again, that might happen with mod_delayed_work as well...) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index a0be0772c8b3..3e4585ffb9af 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -570,8 +570,11 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE); - } else if (!enable && adev->gfx.gfx_off_state) { - if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) { + } else if (!enable) { + cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work); + + if (adev->gfx.gfx_off_state && + !amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) { adev->gfx.gfx_off_state = false; if (adev->gfx.funcs->init_spm_golden) { -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
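For the record, a self-contained sketch of that deadlock (hypothetical demo_ module — don't actually load this, it would hang):

#include <linux/delay.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(demo_mutex);

static void demo_fn(struct work_struct *work)
{
	mutex_lock(&demo_mutex);	/* (2) blocks, demo_init holds the mutex */
	mutex_unlock(&demo_mutex);
}

static DECLARE_DELAYED_WORK(demo_work, demo_fn);

static int __init demo_init(void)
{
	schedule_delayed_work(&demo_work, 0);
	msleep(10);			/* let demo_fn start and block */

	mutex_lock(&demo_mutex);	/* (1) */

	/* (3) waits for demo_fn to finish, but demo_fn waits for the
	 * mutex we hold -> deadlock */
	cancel_delayed_work_sync(&demo_work);

	mutex_unlock(&demo_mutex);
	return 0;
}
module_init(demo_init);

MODULE_LICENSE("GPL");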
Re: [PATCH v2] drm: Copy drm_wait_vblank to user before returning
On 2021-08-12 9:49 p.m., Mark Yacoub wrote: > From: Mark Yacoub > > [Why] > Userspace should get back a copy of drm_wait_vblank that's been modified > even when drm_wait_vblank_ioctl returns a failure. > > Rationale: > drm_wait_vblank_ioctl modifies the request and expects the user to read > it back. When the type is RELATIVE, it modifies it to ABSOLUTE and updates > the sequence to become current_vblank_count + sequence (which was > RELATIVE), but now it became ABSOLUTE. > drmWaitVBlank (in libdrm) expects this to be the case as it modifies > the request to be Absolute so it expects the sequence to would have been > updated. > > The change is in compat_drm_wait_vblank, which is called by > drm_compat_ioctl. This change of copying the data back regardless of the > return number makes it en par with drm_ioctl, which always copies the > data before returning. > > [How] > Return from the function after everything has been copied to user. > > Fixes: IGT:kms_flip::modeset-vs-vblank-race-interruptible > Tested on ChromeOS Trogdor(msm) > > Signed-off-by: Mark Yacoub > Change-Id: I98da279a5f1329c66a9d1e06b88d40b247b51313 With the Gerrit Change-Id removed, Reviewed-by: Michel Dänzer -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
[PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled
From: Michel Dänzer schedule_delayed_work does not push back the work if it was already scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms after the first time GFXOFF was disabled and re-enabled, even if GFXOFF was disabled and re-enabled again during those 100 ms. This resulted in frame drops / stutter with the upcoming mutter 41 release on Navi 14, due to constantly enabling GFXOFF in the HW and disabling it again (for getting the GPU clock counter). To fix this, call cancel_delayed_work_sync when GFXOFF transitions from enabled to disabled. This makes sure the delayed work will be scheduled as intended in the reverse case. In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs to use mutex_trylock instead of mutex_lock. v2: * Use cancel_delayed_work_sync & mutex_trylock instead of mod_delayed_work. Signed-off-by: Michel Dänzer --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 13 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h| 3 +++ 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f3fd5ec710b6..8b025f70706c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2777,7 +2777,16 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) struct amdgpu_device *adev = container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); - mutex_lock(&adev->gfx.gfx_off_mutex); + /* mutex_lock could deadlock with cancel_delayed_work_sync in amdgpu_gfx_off_ctrl. */ + if (!mutex_trylock(&adev->gfx.gfx_off_mutex)) { + /* If there's a bug which causes amdgpu_gfx_off_ctrl to be called with enable=true +* when adev->gfx.gfx_off_req_count is already 0, we might race with that. +* Re-schedule to make sure gfx off will be re-enabled in the HW eventually. +*/ + schedule_delayed_work(&adev->gfx.gfx_off_delay_work, AMDGPU_GFX_OFF_DELAY_ENABLE); + return; + } + if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) adev->gfx.gfx_off_state = true; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index a0be0772c8b3..da4c46db3093 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -28,9 +28,6 @@ #include "amdgpu_rlc.h" #include "amdgpu_ras.h" -/* delay 0.1 second to enable gfx off feature */ -#define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100) - /* * GPU GFX IP block helpers function. 
*/ @@ -569,9 +566,13 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) adev->gfx.gfx_off_req_count--; if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { - schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE); - } else if (!enable && adev->gfx.gfx_off_state) { - if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) { + schedule_delayed_work(&adev->gfx.gfx_off_delay_work, AMDGPU_GFX_OFF_DELAY_ENABLE); + } else if (!enable) { + if (adev->gfx.gfx_off_req_count == 1 && !adev->gfx.gfx_off_state) + cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work); + + if (adev->gfx.gfx_off_state && + !amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) { adev->gfx.gfx_off_state = false; if (adev->gfx.funcs->init_spm_golden) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index d43fe2ed8116..dcdb505bb7f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -32,6 +32,9 @@ #include "amdgpu_rlc.h" #include "soc15.h" +/* delay 0.1 second to enable gfx off feature */ +#define AMDGPU_GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100) + /* GFX current status */ #define AMDGPU_GFX_NORMAL_MODE 0xL #define AMDGPU_GFX_SAFE_MODE 0x0001L -- 2.32.0
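Boiled down, the retry logic in the work item looks like this (hypothetical demo_ names, not the actual driver code):

#include <linux/jiffies.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(demo_mutex);
static void demo_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(demo_work, demo_fn);

static void demo_fn(struct work_struct *work)
{
	/* Never block on the mutex here: the holder may be waiting in
	 * cancel_delayed_work_sync for this work item, which would
	 * deadlock. Back off and retry in 100 ms instead. */
	if (!mutex_trylock(&demo_mutex)) {
		schedule_delayed_work(&demo_work, msecs_to_jiffies(100));
		return;
	}

	/* ... the actual work ... */

	mutex_unlock(&demo_mutex);
}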
Re: [PATCH 2/2] drm/amdgpu: Use mod_delayed_work in JPEG/UVD/VCE/VCN ring_end_use hooks
On 2021-08-13 6:23 a.m., Lazar, Lijo wrote: > > > On 8/12/2021 10:24 PM, Michel Dänzer wrote: >> On 2021-08-12 1:33 p.m., Lazar, Lijo wrote: >>> On 8/12/2021 1:41 PM, Michel Dänzer wrote: >>>> On 2021-08-12 7:55 a.m., Koenig, Christian wrote: >>>>> Hi James, >>>>> >>>>> Evan seems to have understood how this all works together. >>>>> >>>>> See while any begin/end use critical section is active the work should >>>>> not be active. >>>>> >>>>> When you handle only one ring you can just call cancel in begin use and >>>>> schedule in end use. But when you have more than one ring you need a lock >>>>> or counter to prevent concurrent work items to be started. >>>>> >>>>> Michelle's idea to use mod_delayed_work is a bad one because it assumes >>>>> that the delayed work is still running. >>>> >>>> It merely assumes that the work may already have been scheduled before. >>>> >>>> Admittedly, I missed the cancel_delayed_work_sync calls for patch 2. While >>>> I think it can still have some effect when there's a single work item for >>>> multiple rings, as described by James, it's probably negligible, since >>>> presumably the time intervals between ring_begin_use and ring_end_use are >>>> normally much shorter than a second. >>>> >>>> So, while patch 2 is at worst a no-op (since mod_delayed_work is the same >>>> as schedule_delayed_work if the work hasn't been scheduled yet), I'm fine >>>> with dropping it. >>>> >>>> >>>>> Something similar applies to the first patch I think, >>>> >>>> There are no cancel work calls in that case, so the commit log is accurate >>>> TTBOMK. >>> >>> Curious - >>> >>> For patch 1, does it make a difference if any delayed work scheduled is >>> cancelled in the else part before proceeding? >>> >>> } else if (!enable && adev->gfx.gfx_off_state) { >>> cancel_delayed_work(); >> >> I tried the patch below. >> >> While this does seem to fix the problem as well, I see a potential issue: >> >> 1. amdgpu_gfx_off_ctrl locks adev->gfx.gfx_off_mutex >> 2. amdgpu_device_delay_enable_gfx_off runs, blocks in mutex_lock >> 3. amdgpu_gfx_off_ctrl calls cancel_delayed_work_sync >> >> I'm afraid this would deadlock? (CONFIG_PROVE_LOCKING doesn't complain >> though) > > Should use the cancel_delayed_work instead of the _sync version. The thing is, it's not clear to me from cancel_delayed_work's description that it's guaranteed not to wait for amdgpu_device_delay_enable_gfx_off to finish if it's already running. If that's not guaranteed, it's prone to the same deadlock. > As you mentioned - at best work is not scheduled yet and cancelled > successfully, or at worst it's waiting for the mutex. In the worst case, if > amdgpu_device_delay_enable_gfx_off gets the mutex after amdgpu_gfx_off_ctrl > unlocks it, there is an extra check as below. > > if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) > > The count wouldn't be 0 and hence it won't enable GFXOFF. I'm not sure, but it might also be possible for amdgpu_device_delay_enable_gfx_off to get the mutex only after amdgpu_gfx_off_ctrl was called again and set adev->gfx.gfx_off_req_count back to 0. >> Maybe it's possible to fix it with cancel_delayed_work_sync somehow, but I'm >> not sure how offhand. (With cancel_delayed_work instead, I'm worried >> amdgpu_device_delay_enable_gfx_off might still enable GFXOFF in the HW >> immediately after amdgpu_gfx_off_ctrl unlocks the mutex. Then again, that >> might happen with mod_delayed_work as well...) > > As mentioned earlier, cancel_delayed_work won't cause this issue. 
> > In the mod_delayed_ patch, mod_ version is called only when req_count is 0. > While that is a good thing, it keeps alive one more contender for the mutex. Not sure what you mean. It leaves the possibility of amdgpu_device_delay_enable_gfx_off running just after amdgpu_gfx_off_ctrl tried to postpone it. As discussed above, something similar might be possible with cancel_delayed_work as well. > The cancel_ version eliminates that contender if happens to be called at the > right time (more likely if there are multiple requests to disable gfxoff). On > the other hand, don't know how costly it is to call cancel_ every time on the > else part (or maybe call only once when count increments to 1?). Sure, why not, though I doubt it matters much — I expect adev->gfx.gfx_off_req_count transitioning between 0 <-> 1 to be the most common case by far. I sent out a v2 patch which should address all these issues. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled
On 2021-08-13 1:50 p.m., Lazar, Lijo wrote: > > > On 8/13/2021 3:59 PM, Michel Dänzer wrote: >> From: Michel Dänzer >> >> schedule_delayed_work does not push back the work if it was already >> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms >> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF >> was disabled and re-enabled again during those 100 ms. >> >> This resulted in frame drops / stutter with the upcoming mutter 41 >> release on Navi 14, due to constantly enabling GFXOFF in the HW and >> disabling it again (for getting the GPU clock counter). >> >> To fix this, call cancel_delayed_work_sync when GFXOFF transitions from >> enabled to disabled. This makes sure the delayed work will be scheduled >> as intended in the reverse case. >> >> In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs >> to use mutex_trylock instead of mutex_lock. >> >> v2: >> * Use cancel_delayed_work_sync & mutex_trylock instead of >> mod_delayed_work. >> >> Signed-off-by: Michel Dänzer >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++- >> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 13 +++-- >> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 3 +++ >> 3 files changed, 20 insertions(+), 7 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index f3fd5ec710b6..8b025f70706c 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -2777,7 +2777,16 @@ static void amdgpu_device_delay_enable_gfx_off(struct >> work_struct *work) >> struct amdgpu_device *adev = >> container_of(work, struct amdgpu_device, >> gfx.gfx_off_delay_work.work); >> - mutex_lock(&adev->gfx.gfx_off_mutex); >> + /* mutex_lock could deadlock with cancel_delayed_work_sync in >> amdgpu_gfx_off_ctrl. */ >> + if (!mutex_trylock(&adev->gfx.gfx_off_mutex)) { >> + /* If there's a bug which causes amdgpu_gfx_off_ctrl to be called >> with enable=true >> + * when adev->gfx.gfx_off_req_count is already 0, we might race >> with that. >> + * Re-schedule to make sure gfx off will be re-enabled in the HW >> eventually. >> + */ >> + schedule_delayed_work(&adev->gfx.gfx_off_delay_work, >> AMDGPU_GFX_OFF_DELAY_ENABLE); >> + return; > > This is not needed and is just creating another thread to contend for mutex. Still not sure what you mean by that. What other thread? > The checks below take care of enabling gfxoff correctly. If it's already in > gfx_off state, it doesn't do anything. So I don't see why this change is > needed. mutex_trylock is needed to prevent the deadlock discussed before and below. schedule_delayed_work is needed due to this scenario hinted at by the comment: 1. amdgpu_gfx_off_ctrl locks mutex, calls schedule_delayed_work 2. amdgpu_device_delay_enable_gfx_off runs, calls mutex_trylock, which fails GFXOFF would never get re-enabled in HW in this case (until amdgpu_gfx_off_ctrl calls schedule_delayed_work again). (cancel_delayed_work_sync guarantees there's no pending delayed work when it returns, even if amdgpu_device_delay_enable_gfx_off calls schedule_delayed_work) > The other problem is amdgpu_get_gfx_off_status() also uses the same mutex. Not sure what for TBH. AFAICT there's only one implementation of this for Renoir, which just reads a register. (It's only called from debugfs) > So it won't be knowing which thread it would be contending against and > blindly creates more work items. 
There is only ever at most one instance of the delayed work at any time. amdgpu_device_delay_enable_gfx_off doesn't care whether amdgpu_gfx_off_ctrl or amdgpu_get_gfx_off_status is holding the mutex, it just keeps re-scheduling itself 100 ms later until it succeeds. >> @@ -569,9 +566,13 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, >> bool enable) >> adev->gfx.gfx_off_req_count--; >> if (enable && !adev->gfx.gfx_off_state && >> !adev->gfx.gfx_off_req_count) { >> - schedule_delayed_work(&adev->gfx.gfx_off_delay_work, >> GFX_OFF_DELAY_ENABLE); >> - } else if (!enable && adev->gfx.gfx_off_state) { >> - if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, >> false)) { >> + schedule_delayed_work(&adev->gfx.gfx_off_delay_work, >> AMDGPU_GFX_OFF_DELA
Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled
On 2021-08-13 4:14 p.m., Lazar, Lijo wrote: > On 8/13/2021 7:04 PM, Michel Dänzer wrote: >> On 2021-08-13 1:50 p.m., Lazar, Lijo wrote: >>> On 8/13/2021 3:59 PM, Michel Dänzer wrote: >>>> From: Michel Dänzer >>>> >>>> schedule_delayed_work does not push back the work if it was already >>>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms >>>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF >>>> was disabled and re-enabled again during those 100 ms. >>>> >>>> This resulted in frame drops / stutter with the upcoming mutter 41 >>>> release on Navi 14, due to constantly enabling GFXOFF in the HW and >>>> disabling it again (for getting the GPU clock counter). >>>> >>>> To fix this, call cancel_delayed_work_sync when GFXOFF transitions from >>>> enabled to disabled. This makes sure the delayed work will be scheduled >>>> as intended in the reverse case. >>>> >>>> In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs >>>> to use mutex_trylock instead of mutex_lock. >>>> >>>> v2: >>>> * Use cancel_delayed_work_sync & mutex_trylock instead of >>>> mod_delayed_work. >>>> >>>> Signed-off-by: Michel Dänzer >>>> --- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 13 +++-- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 3 +++ >>>> 3 files changed, 20 insertions(+), 7 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> index f3fd5ec710b6..8b025f70706c 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> @@ -2777,7 +2777,16 @@ static void >>>> amdgpu_device_delay_enable_gfx_off(struct work_struct *work) >>>> struct amdgpu_device *adev = >>>> container_of(work, struct amdgpu_device, >>>> gfx.gfx_off_delay_work.work); >>>> - mutex_lock(&adev->gfx.gfx_off_mutex); >>>> + /* mutex_lock could deadlock with cancel_delayed_work_sync in >>>> amdgpu_gfx_off_ctrl. */ >>>> + if (!mutex_trylock(&adev->gfx.gfx_off_mutex)) { >>>> + /* If there's a bug which causes amdgpu_gfx_off_ctrl to be called >>>> with enable=true >>>> + * when adev->gfx.gfx_off_req_count is already 0, we might race >>>> with that. >>>> + * Re-schedule to make sure gfx off will be re-enabled in the HW >>>> eventually. >>>> + */ >>>> + schedule_delayed_work(&adev->gfx.gfx_off_delay_work, >>>> AMDGPU_GFX_OFF_DELAY_ENABLE); >>>> + return; >>> >>> This is not needed and is just creating another thread to contend for mutex. >> >> Still not sure what you mean by that. What other thread? > > Sorry, I meant it schedules another workitem and delays GFXOFF enablement > further. For ex: if it was another function like gfx_off_status holding the > lock at the time of check. > >> >>> The checks below take care of enabling gfxoff correctly. If it's already in >>> gfx_off state, it doesn't do anything. So I don't see why this change is >>> needed. >> >> mutex_trylock is needed to prevent the deadlock discussed before and below. >> >> schedule_delayed_work is needed due to this scenario hinted at by the >> comment: >> >> 1. amdgpu_gfx_off_ctrl locks mutex, calls schedule_delayed_work >> 2. amdgpu_device_delay_enable_gfx_off runs, calls mutex_trylock, which fails >> >> GFXOFF would never get re-enabled in HW in this case (until >> amdgpu_gfx_off_ctrl calls schedule_delayed_work again). 
>> >> (cancel_delayed_work_sync guarantees there's no pending delayed work when it >> returns, even if amdgpu_device_delay_enable_gfx_off calls >> schedule_delayed_work) >> > > I think we need to explain based on the original code before. There is an > asssumption here that the only other contention of this mutex is with the > gfx_off_ctrl function. Not really. > As far as I understand if the work has already started running when > schedule_delayed_work is called, it will insert another in the work queue > after
Re: [PATCH] drm: radeon: r600_dma: Replace cpu_to_le32() by lower_32_bits()
On 2021-08-13 10:54 a.m., zhaoxiao wrote: > This patch fixes the following sparse errors: > drivers/gpu/drm/radeon/r600_dma.c:247:30: warning: incorrect type in > assignment (different base types) > drivers/gpu/drm/radeon/r600_dma.c:247:30:expected unsigned int volatile > [usertype] > drivers/gpu/drm/radeon/r600_dma.c:247:30:got restricted __le32 [usertype] > > Signed-off-by: zhaoxiao > --- > drivers/gpu/drm/radeon/r600_dma.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/radeon/r600_dma.c > b/drivers/gpu/drm/radeon/r600_dma.c > index fb65e6fb5c4f..a2d0b1edcd22 100644 > --- a/drivers/gpu/drm/radeon/r600_dma.c > +++ b/drivers/gpu/drm/radeon/r600_dma.c > @@ -244,7 +244,7 @@ int r600_dma_ring_test(struct radeon_device *rdev, > gpu_addr = rdev->wb.gpu_addr + index; > > tmp = 0xCAFEDEAD; > - rdev->wb.wb[index/4] = cpu_to_le32(tmp); > + rdev->wb.wb[index/4] = lower_32_bits(tmp); > > r = radeon_ring_lock(rdev, ring, 4); > if (r) { > Seems better to mark rdev->wb.wb as little endian instead. It's read with le32_to_cpu (with some exceptions which look like bugs), which with this patch would result in 0xADDEFECA on big-endian hosts. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
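A quick userspace illustration of the mismatch (plain C, with a hand-rolled swap standing in for the byte swap le32_to_cpu performs on big-endian):

#include <stdint.h>
#include <stdio.h>

static uint32_t swap32(uint32_t v)
{
	return (v >> 24) | ((v >> 8) & 0x0000ff00) |
	       ((v << 8) & 0x00ff0000) | (v << 24);
}

int main(void)
{
	uint32_t tmp = 0xCAFEDEAD;

	/* The patch writes the raw CPU value; on big-endian the
	 * le32_to_cpu readback then byte-swaps it: */
	printf("0x%08X\n", swap32(tmp));		/* 0xADDEFECA */

	/* The cpu_to_le32 / le32_to_cpu pairing round-trips instead: */
	printf("0x%08X\n", swap32(swap32(tmp)));	/* 0xCAFEDEAD */

	return 0;
}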
Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled
On 2021-08-13 5:07 p.m., Lazar, Lijo wrote: > > > On 8/13/2021 8:10 PM, Michel Dänzer wrote: >> On 2021-08-13 4:14 p.m., Lazar, Lijo wrote: >>> On 8/13/2021 7:04 PM, Michel Dänzer wrote: >>>> On 2021-08-13 1:50 p.m., Lazar, Lijo wrote: >>>>> On 8/13/2021 3:59 PM, Michel Dänzer wrote: >>>>>> From: Michel Dänzer >>>>>> >>>>>> schedule_delayed_work does not push back the work if it was already >>>>>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms >>>>>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF >>>>>> was disabled and re-enabled again during those 100 ms. >>>>>> >>>>>> This resulted in frame drops / stutter with the upcoming mutter 41 >>>>>> release on Navi 14, due to constantly enabling GFXOFF in the HW and >>>>>> disabling it again (for getting the GPU clock counter). >>>>>> >>>>>> To fix this, call cancel_delayed_work_sync when GFXOFF transitions from >>>>>> enabled to disabled. This makes sure the delayed work will be scheduled >>>>>> as intended in the reverse case. >>>>>> >>>>>> In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs >>>>>> to use mutex_trylock instead of mutex_lock. >>>>>> >>>>>> v2: >>>>>> * Use cancel_delayed_work_sync & mutex_trylock instead of >>>>>> mod_delayed_work. >>>>>> >>>>>> Signed-off-by: Michel Dänzer >>>>>> --- >>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++- >>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 13 +++-- >>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 3 +++ >>>>>> 3 files changed, 20 insertions(+), 7 deletions(-) >>>>>> >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> index f3fd5ec710b6..8b025f70706c 100644 >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> @@ -2777,7 +2777,16 @@ static void >>>>>> amdgpu_device_delay_enable_gfx_off(struct work_struct *work) >>>>>> struct amdgpu_device *adev = >>>>>> container_of(work, struct amdgpu_device, >>>>>> gfx.gfx_off_delay_work.work); >>>>>> - mutex_lock(&adev->gfx.gfx_off_mutex); >>>>>> + /* mutex_lock could deadlock with cancel_delayed_work_sync in >>>>>> amdgpu_gfx_off_ctrl. */ >>>>>> + if (!mutex_trylock(&adev->gfx.gfx_off_mutex)) { >>>>>> + /* If there's a bug which causes amdgpu_gfx_off_ctrl to be >>>>>> called with enable=true >>>>>> + * when adev->gfx.gfx_off_req_count is already 0, we might race >>>>>> with that. >>>>>> + * Re-schedule to make sure gfx off will be re-enabled in the >>>>>> HW eventually. >>>>>> + */ >>>>>> + schedule_delayed_work(&adev->gfx.gfx_off_delay_work, >>>>>> AMDGPU_GFX_OFF_DELAY_ENABLE); >>>>>> + return; >>>>> >>>>> This is not needed and is just creating another thread to contend for >>>>> mutex. >>>> >>>> Still not sure what you mean by that. What other thread? >>> >>> Sorry, I meant it schedules another workitem and delays GFXOFF enablement >>> further. For ex: if it was another function like gfx_off_status holding the >>> lock at the time of check. >>> >>>> >>>>> The checks below take care of enabling gfxoff correctly. If it's already >>>>> in gfx_off state, it doesn't do anything. So I don't see why this change >>>>> is needed. >>>> >>>> mutex_trylock is needed to prevent the deadlock discussed before and below. >>>> >>>> schedule_delayed_work is needed due to this scenario hinted at by the >>>> comment: >>>> >>>> 1. amdgpu_gfx_off_ctrl locks mutex, calls schedule_delayed_work >>>> 2. amdgpu_device_delay_enable_gfx_off runs, calls m
[PATCH v3] drm/amdgpu: Cancel delayed work when GFXOFF is disabled
From: Michel Dänzer schedule_delayed_work does not push back the work if it was already scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms after the first time GFXOFF was disabled and re-enabled, even if GFXOFF was disabled and re-enabled again during those 100 ms. This resulted in frame drops / stutter with the upcoming mutter 41 release on Navi 14, due to constantly enabling GFXOFF in the HW and disabling it again (for getting the GPU clock counter). To fix this, call cancel_delayed_work_sync when the disable count transitions from 0 to 1, and only schedule the delayed work on the reverse transition, not if the disable count was already 0. This makes sure the delayed work doesn't run at unexpected times, and allows it to be lock-free. v2: * Use cancel_delayed_work_sync & mutex_trylock instead of mod_delayed_work. v3: * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König) Cc: sta...@vger.kernel.org Signed-off-by: Michel Dänzer --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 22 +- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f3fd5ec710b6..f944ed858f3e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) struct amdgpu_device *adev = container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); - mutex_lock(&adev->gfx.gfx_off_mutex); - if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { - if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) - adev->gfx.gfx_off_state = true; - } - mutex_unlock(&adev->gfx.gfx_off_mutex); + WARN_ON_ONCE(adev->gfx.gfx_off_state); + WARN_ON_ONCE(adev->gfx.gfx_off_req_count); + + if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) + adev->gfx.gfx_off_state = true; } /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index a0be0772c8b3..ca91aafcb32b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -563,15 +563,26 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) mutex_lock(&adev->gfx.gfx_off_mutex); - if (!enable) - adev->gfx.gfx_off_req_count++; - else if (adev->gfx.gfx_off_req_count > 0) + if (enable) { + /* If the count is already 0, it means there's an imbalance bug somewhere. +* Note that the bug may be in a different caller than the one which triggers the +* WARN_ON_ONCE. 
+*/ + if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0)) + goto unlock; + adev->gfx.gfx_off_req_count--; + } else { + adev->gfx.gfx_off_req_count++; + } if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE); - } else if (!enable && adev->gfx.gfx_off_state) { - if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) { + } else if (!enable && adev->gfx.gfx_off_req_count == 1) { + cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work); + + if (adev->gfx.gfx_off_state && + !amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) { adev->gfx.gfx_off_state = false; if (adev->gfx.funcs->init_spm_golden) { @@ -581,6 +592,7 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) } } +unlock: mutex_unlock(&adev->gfx.gfx_off_mutex); } -- 2.32.0
Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled
On 2021-08-16 9:38 a.m., Christian König wrote: > Am 13.08.21 um 12:29 schrieb Michel Dänzer: >> From: Michel Dänzer >> >> schedule_delayed_work does not push back the work if it was already >> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms >> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF >> was disabled and re-enabled again during those 100 ms. >> >> This resulted in frame drops / stutter with the upcoming mutter 41 >> release on Navi 14, due to constantly enabling GFXOFF in the HW and >> disabling it again (for getting the GPU clock counter). >> >> To fix this, call cancel_delayed_work_sync when GFXOFF transitions from >> enabled to disabled. This makes sure the delayed work will be scheduled >> as intended in the reverse case. >> >> In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs >> to use mutex_trylock instead of mutex_lock. >> >> v2: >> * Use cancel_delayed_work_sync & mutex_trylock instead of >> mod_delayed_work. > > While this may work it still smells a little bit fishy. > > In general you have two common locking orders around work items, either > lock->work or work->lock. If you mix this as lock->work->lock like here > trouble is usually imminent. > > I think what we should do instead is to double check if taking the lock > inside the work item is necessary and instead making sure that the work is > sync canceled when we don't want it to run. In other words fully switching to > the lock->work approach. Done in v3, thanks for the suggestion! -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled
On 2021-08-16 12:20 p.m., Quan, Evan wrote: > [AMD Official Use Only] > > Hi Michel, > > The patch seems reasonable to me(especially the cancel_delayed_work_sync() > part). > However, can you explain more about the code below? > What's the race issue here exactly? > > + /* mutex_lock could deadlock with cancel_delayed_work_sync in > amdgpu_gfx_off_ctrl. */ > + if (!mutex_trylock(&adev->gfx.gfx_off_mutex)) { > + /* If there's a bug which causes amdgpu_gfx_off_ctrl to be > called with enable=true > + * when adev->gfx.gfx_off_req_count is already 0, we might race > with that. > + * Re-schedule to make sure gfx off will be re-enabled in the > HW eventually. > + */ > + schedule_delayed_work(&adev->gfx.gfx_off_delay_work, > AMDGPU_GFX_OFF_DELAY_ENABLE); > + return; > + } If amdgpu_gfx_off_ctrl was called with enable=true when adev->gfx.gfx_off_req_count == 0 already, it could have prevented amdgpu_device_delay_enable_gfx_off from locking the mutex. v3 solves this by only scheduling the work when adev->gfx.gfx_off_req_count transitions from 1 to 0, which means it no longer needs to lock the mutex. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
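Condensed, the v3 pattern looks roughly like this (demo_ names are placeholders, error handling elided):

#include <linux/jiffies.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>

struct demo_dev {
	struct mutex lock;
	struct delayed_work enable_work;	/* re-enables the feature in HW */
	int req_count;				/* outstanding disable requests */
	bool off_state;				/* feature currently enabled in HW */
};

/* Stand-in for amdgpu_dpm_set_powergating_by_smu; returns 0 on success. */
int demo_hw_set_off(struct demo_dev *d, bool on);

void demo_off_ctrl(struct demo_dev *d, bool enable)
{
	mutex_lock(&d->lock);

	if (enable) {
		/* 1 -> 0 transition: the only place the work is scheduled. */
		if (--d->req_count == 0 && !d->off_state)
			schedule_delayed_work(&d->enable_work,
					      msecs_to_jiffies(100));
	} else {
		/* 0 -> 1 transition: afterwards the work is neither pending
		 * nor running, so it never needs to take d->lock itself and
		 * the sync cancel can't deadlock. */
		if (d->req_count++ == 0) {
			cancel_delayed_work_sync(&d->enable_work);

			if (d->off_state && !demo_hw_set_off(d, false))
				d->off_state = false;
		}
	}

	mutex_unlock(&d->lock);
}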
Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled
On 2021-08-16 6:13 a.m., Lazar, Lijo wrote: > On 8/13/2021 9:30 PM, Michel Dänzer wrote: >> On 2021-08-13 5:07 p.m., Lazar, Lijo wrote: >>> On 8/13/2021 8:10 PM, Michel Dänzer wrote: >>>> On 2021-08-13 4:14 p.m., Lazar, Lijo wrote: >>>>> On 8/13/2021 7:04 PM, Michel Dänzer wrote: >>>>>> On 2021-08-13 1:50 p.m., Lazar, Lijo wrote: >>>>>>> On 8/13/2021 3:59 PM, Michel Dänzer wrote: >>>>>>>> From: Michel Dänzer >>>>>>>> >>>>>>>> schedule_delayed_work does not push back the work if it was already >>>>>>>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms >>>>>>>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF >>>>>>>> was disabled and re-enabled again during those 100 ms. >>>>>>>> >>>>>>>> This resulted in frame drops / stutter with the upcoming mutter 41 >>>>>>>> release on Navi 14, due to constantly enabling GFXOFF in the HW and >>>>>>>> disabling it again (for getting the GPU clock counter). >>>>>>>> >>>>>>>> To fix this, call cancel_delayed_work_sync when GFXOFF transitions from >>>>>>>> enabled to disabled. This makes sure the delayed work will be scheduled >>>>>>>> as intended in the reverse case. >>>>>>>> >>>>>>>> In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs >>>>>>>> to use mutex_trylock instead of mutex_lock. >>>>>>>> >>>>>>>> v2: >>>>>>>> * Use cancel_delayed_work_sync & mutex_trylock instead of >>>>>>>> mod_delayed_work. >>>>>>>> >>>>>>>> Signed-off-by: Michel Dänzer >>>>>>>> --- >>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++- >>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 13 +++-- >>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 3 +++ >>>>>>>> 3 files changed, 20 insertions(+), 7 deletions(-) >>>>>>>> >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>> index f3fd5ec710b6..8b025f70706c 100644 >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>>> @@ -2777,7 +2777,16 @@ static void >>>>>>>> amdgpu_device_delay_enable_gfx_off(struct work_struct *work) >>>>>>>> struct amdgpu_device *adev = >>>>>>>> container_of(work, struct amdgpu_device, >>>>>>>> gfx.gfx_off_delay_work.work); >>>>>>>> - mutex_lock(&adev->gfx.gfx_off_mutex); >>>>>>>> + /* mutex_lock could deadlock with cancel_delayed_work_sync in >>>>>>>> amdgpu_gfx_off_ctrl. */ >>>>>>>> + if (!mutex_trylock(&adev->gfx.gfx_off_mutex)) { >>>>>>>> + /* If there's a bug which causes amdgpu_gfx_off_ctrl to be >>>>>>>> called with enable=true >>>>>>>> + * when adev->gfx.gfx_off_req_count is already 0, we might >>>>>>>> race with that. >>>>>>>> + * Re-schedule to make sure gfx off will be re-enabled in the >>>>>>>> HW eventually. >>>>>>>> + */ >>>>>>>> + schedule_delayed_work(&adev->gfx.gfx_off_delay_work, >>>>>>>> AMDGPU_GFX_OFF_DELAY_ENABLE); >>>>>>>> + return; >>>>>>> >>>>>>> This is not needed and is just creating another thread to contend for >>>>>>> mutex. >>>>>> >>>>>> Still not sure what you mean by that. What other thread? >>>>> >>>>> Sorry, I meant it schedules another workitem and delays GFXOFF enablement >>>>> further. For ex: if it was another function like gfx_off_status holding >>>>> the lock at the time of check. >>>>> >>>>>&
Re: [PATCH v3] drm/amdgpu: Cancel delayed work when GFXOFF is disabled
On 2021-08-16 2:06 p.m., Christian König wrote: > Am 16.08.21 um 13:33 schrieb Lazar, Lijo: >> On 8/16/2021 4:05 PM, Michel Dänzer wrote: >>> From: Michel Dänzer >>> >>> schedule_delayed_work does not push back the work if it was already >>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms >>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF >>> was disabled and re-enabled again during those 100 ms. >>> >>> This resulted in frame drops / stutter with the upcoming mutter 41 >>> release on Navi 14, due to constantly enabling GFXOFF in the HW and >>> disabling it again (for getting the GPU clock counter). >>> >>> To fix this, call cancel_delayed_work_sync when the disable count >>> transitions from 0 to 1, and only schedule the delayed work on the >>> reverse transition, not if the disable count was already 0. This makes >>> sure the delayed work doesn't run at unexpected times, and allows it to >>> be lock-free. >>> >>> v2: >>> * Use cancel_delayed_work_sync & mutex_trylock instead of >>> mod_delayed_work. >>> v3: >>> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König) >>> >>> Cc: sta...@vger.kernel.org >>> Signed-off-by: Michel Dänzer >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +-- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 22 +- >>> 2 files changed, 22 insertions(+), 11 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> index f3fd5ec710b6..f944ed858f3e 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> @@ -2777,12 +2777,11 @@ static void >>> amdgpu_device_delay_enable_gfx_off(struct work_struct *work) >>> struct amdgpu_device *adev = >>> container_of(work, struct amdgpu_device, >>> gfx.gfx_off_delay_work.work); >>> - mutex_lock(&adev->gfx.gfx_off_mutex); >>> - if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { >>> - if (!amdgpu_dpm_set_powergating_by_smu(adev, >>> AMD_IP_BLOCK_TYPE_GFX, true)) >>> - adev->gfx.gfx_off_state = true; >>> - } >>> - mutex_unlock(&adev->gfx.gfx_off_mutex); >>> + WARN_ON_ONCE(adev->gfx.gfx_off_state); >> >> Don't see any case for this. It's not expected to be scheduled in this case, >> right? >> >>> + WARN_ON_ONCE(adev->gfx.gfx_off_req_count); >>> + >> >> Thinking about ON_ONCE here - this may happen more than once if it's >> completed as part of cancel_ call. Is the warning needed? > > WARN_ON_ONCE() is usually used to prevent spamming the system log with > warnings. E.g. the warning is only printed once indicating a driver bug and > that's it. Right, these WARN_ONs are like assert()s in user-space code, documenting the pre-conditions and checking them at runtime. And I use _ONCE so that if a pre-condition is ever violated for some reason, dmesg isn't spammed with multiple warnings. >> Anyway, >> Reviewed-by: Lijo Lazar > > Acked-by: Christian König Thanks guys! -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
[PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled
From: Michel Dänzer schedule_delayed_work does not push back the work if it was already scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms after the first time GFXOFF was disabled and re-enabled, even if GFXOFF was disabled and re-enabled again during those 100 ms. This resulted in frame drops / stutter with the upcoming mutter 41 release on Navi 14, due to constantly enabling GFXOFF in the HW and disabling it again (for getting the GPU clock counter). To fix this, call cancel_delayed_work_sync when the disable count transitions from 0 to 1, and only schedule the delayed work on the reverse transition, not if the disable count was already 0. This makes sure the delayed work doesn't run at unexpected times, and allows it to be lock-free. v2: * Use cancel_delayed_work_sync & mutex_trylock instead of mod_delayed_work. v3: * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König) v4: * Fix race condition between amdgpu_gfx_off_ctrl incrementing adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off checking for it to be 0 (Evan Quan) Cc: sta...@vger.kernel.org Reviewed-by: Lijo Lazar # v3 Acked-by: Christian König # v3 Signed-off-by: Michel Dänzer --- Alex, probably best to wait a bit longer before picking this up. :) drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 36 +++--- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f3fd5ec710b6..f944ed858f3e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) struct amdgpu_device *adev = container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); - mutex_lock(&adev->gfx.gfx_off_mutex); - if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { - if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) - adev->gfx.gfx_off_state = true; - } - mutex_unlock(&adev->gfx.gfx_off_mutex); + WARN_ON_ONCE(adev->gfx.gfx_off_state); + WARN_ON_ONCE(adev->gfx.gfx_off_req_count); + + if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) + adev->gfx.gfx_off_state = true; } /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index a0be0772c8b3..b4ced45301be 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) mutex_lock(&adev->gfx.gfx_off_mutex); - if (!enable) - adev->gfx.gfx_off_req_count++; - else if (adev->gfx.gfx_off_req_count > 0) + if (enable) { + /* If the count is already 0, it means there's an imbalance bug somewhere. +* Note that the bug may be in a different caller than the one which triggers the +* WARN_ON_ONCE. 
+*/ + if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0)) + goto unlock; + adev->gfx.gfx_off_req_count--; - if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { - schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE); - } else if (!enable && adev->gfx.gfx_off_state) { - if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) { - adev->gfx.gfx_off_state = false; + if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state) + schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE); + } else { + if (adev->gfx.gfx_off_req_count == 0) { + cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work); + + if (adev->gfx.gfx_off_state && + !amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) { + adev->gfx.gfx_off_state = false; - if (adev->gfx.funcs->init_spm_golden) { - dev_dbg(adev->dev, "GFXOFF is disabled, re-init SPM golden settings\n"); - amdgpu_gfx_init_spm_golden(adev); + if (adev->gfx.funcs->init_spm_golden) { + d
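The hunk is cut off above; based on the removed lines quoted within it, the remainder of the disable path presumably continues along these lines (a sketch reconstructed from this thread, not the verbatim v4 patch):

	} else {
		if (adev->gfx.gfx_off_req_count == 0) {
			cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work);

			if (adev->gfx.gfx_off_state &&
			    !amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) {
				adev->gfx.gfx_off_state = false;

				if (adev->gfx.funcs->init_spm_golden) {
					dev_dbg(adev->dev, "GFXOFF is disabled, re-init SPM golden settings\n");
					amdgpu_gfx_init_spm_golden(adev);
				}
			}
		}

		adev->gfx.gfx_off_req_count++;
	}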
Re: [PATCH v3] drm/amdgpu: Cancel delayed work when GFXOFF is disabled
On 2021-08-17 10:17 a.m., Lazar, Lijo wrote: > On 8/17/2021 1:21 PM, Quan, Evan wrote: >>> -Original Message- >>> From: amd-gfx On Behalf Of >>> Michel Dänzer >>> Sent: Monday, August 16, 2021 6:35 PM >>> To: Deucher, Alexander ; Koenig, Christian >>> >>> Cc: Liu, Leo ; Zhu, James ; amd- >>> g...@lists.freedesktop.org; dri-devel@lists.freedesktop.org >>> Subject: [PATCH v3] drm/amdgpu: Cancel delayed work when GFXOFF is >>> disabled >>> >>> From: Michel Dänzer >>> >>> schedule_delayed_work does not push back the work if it was already >>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms >>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF >>> was disabled and re-enabled again during those 100 ms. >>> >>> This resulted in frame drops / stutter with the upcoming mutter 41 >>> release on Navi 14, due to constantly enabling GFXOFF in the HW and >>> disabling it again (for getting the GPU clock counter). >>> >>> To fix this, call cancel_delayed_work_sync when the disable count >>> transitions from 0 to 1, and only schedule the delayed work on the >>> reverse transition, not if the disable count was already 0. This makes >>> sure the delayed work doesn't run at unexpected times, and allows it to >>> be lock-free. >>> >>> v2: >>> * Use cancel_delayed_work_sync & mutex_trylock instead of >>> mod_delayed_work. >>> v3: >>> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König) >>> >>> Cc: sta...@vger.kernel.org >>> Signed-off-by: Michel Dänzer >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +-- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 22 +- >>> >>> 2 files changed, 22 insertions(+), 11 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> index f3fd5ec710b6..f944ed858f3e 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> @@ -2777,12 +2777,11 @@ static void >>> amdgpu_device_delay_enable_gfx_off(struct work_struct *work) >>> struct amdgpu_device *adev = >>> container_of(work, struct amdgpu_device, >>> gfx.gfx_off_delay_work.work); >>> >>> - mutex_lock(&adev->gfx.gfx_off_mutex); >>> - if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { >>> - if (!amdgpu_dpm_set_powergating_by_smu(adev, >>> AMD_IP_BLOCK_TYPE_GFX, true)) >>> - adev->gfx.gfx_off_state = true; >>> - } >>> - mutex_unlock(&adev->gfx.gfx_off_mutex); >>> + WARN_ON_ONCE(adev->gfx.gfx_off_state); >>> + WARN_ON_ONCE(adev->gfx.gfx_off_req_count); >>> + >>> + if (!amdgpu_dpm_set_powergating_by_smu(adev, >>> AMD_IP_BLOCK_TYPE_GFX, true)) >>> + adev->gfx.gfx_off_state = true; >>> } >>> >>> /** >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >>> index a0be0772c8b3..ca91aafcb32b 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >>> @@ -563,15 +563,26 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device >>> *adev, bool enable) >>> >>> mutex_lock(&adev->gfx.gfx_off_mutex); >>> >>> - if (!enable) >>> - adev->gfx.gfx_off_req_count++; >>> - else if (adev->gfx.gfx_off_req_count > 0) >>> + if (enable) { >>> + /* If the count is already 0, it means there's an imbalance bug >>> somewhere. >>> + * Note that the bug may be in a different caller than the one >>> which triggers the >>> + * WARN_ON_ONCE. 
>>> + */ >>> + if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0)) >>> + goto unlock; >>> + >>> adev->gfx.gfx_off_req_count--; >>> + } else { >>> + adev->gfx.gfx_off_req_count++; >>> + } >>> >>> if (enable && !adev->gfx.gfx_off_state && !adev- >>>> gfx.gfx_off_req_count) { >>> schedule_delayed_work(&adev->gfx.gfx_off_delay_work, >
Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled
On 2021-08-17 11:12 a.m., Lazar, Lijo wrote: > > > On 8/17/2021 1:53 PM, Michel Dänzer wrote: >> From: Michel Dänzer >> >> schedule_delayed_work does not push back the work if it was already >> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms >> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF >> was disabled and re-enabled again during those 100 ms. >> >> This resulted in frame drops / stutter with the upcoming mutter 41 >> release on Navi 14, due to constantly enabling GFXOFF in the HW and >> disabling it again (for getting the GPU clock counter). >> >> To fix this, call cancel_delayed_work_sync when the disable count >> transitions from 0 to 1, and only schedule the delayed work on the >> reverse transition, not if the disable count was already 0. This makes >> sure the delayed work doesn't run at unexpected times, and allows it to >> be lock-free. >> >> v2: >> * Use cancel_delayed_work_sync & mutex_trylock instead of >> mod_delayed_work. >> v3: >> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König) >> v4: >> * Fix race condition between amdgpu_gfx_off_ctrl incrementing >> adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off >> checking for it to be 0 (Evan Quan) >> >> Cc: sta...@vger.kernel.org >> Reviewed-by: Lijo Lazar # v3 >> Acked-by: Christian König # v3 >> Signed-off-by: Michel Dänzer >> --- >> >> Alex, probably best to wait a bit longer before picking this up. :) >> >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 36 +++--- >> 2 files changed, 30 insertions(+), 17 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index f3fd5ec710b6..f944ed858f3e 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -2777,12 +2777,11 @@ static void >> amdgpu_device_delay_enable_gfx_off(struct work_struct *work) >> struct amdgpu_device *adev = >> container_of(work, struct amdgpu_device, >> gfx.gfx_off_delay_work.work); >> - mutex_lock(&adev->gfx.gfx_off_mutex); >> - if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { >> - if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, >> true)) >> - adev->gfx.gfx_off_state = true; >> - } >> - mutex_unlock(&adev->gfx.gfx_off_mutex); >> + WARN_ON_ONCE(adev->gfx.gfx_off_state); >> + WARN_ON_ONCE(adev->gfx.gfx_off_req_count); >> + >> + if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, >> true)) >> + adev->gfx.gfx_off_state = true; >> } >> /** >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >> index a0be0772c8b3..b4ced45301be 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, >> bool enable) >> mutex_lock(&adev->gfx.gfx_off_mutex); >> - if (!enable) >> - adev->gfx.gfx_off_req_count++; >> - else if (adev->gfx.gfx_off_req_count > 0) >> + if (enable) { >> + /* If the count is already 0, it means there's an imbalance bug >> somewhere. >> + * Note that the bug may be in a different caller than the one >> which triggers the >> + * WARN_ON_ONCE. 
>> + */ >> + if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0)) >> + goto unlock; >> + >> adev->gfx.gfx_off_req_count--; >> - if (enable && !adev->gfx.gfx_off_state && >> !adev->gfx.gfx_off_req_count) { >> - schedule_delayed_work(&adev->gfx.gfx_off_delay_work, >> GFX_OFF_DELAY_ENABLE); >> - } else if (!enable && adev->gfx.gfx_off_state) { >> - if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, >> false)) { >> - adev->gfx.gfx_off_state = false; >> + if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state) >> + schedule_delayed_work(&adev->gfx.gfx_off_delay_work, >> GFX_OFF_DELAY_ENABLE); >> + } else { >> + if (adev->gfx.gfx_off_req_count == 0) { >> + cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work); >> + >> + if (adev->gfx.gfx_off_state && > > More of a question which I didn't check last time - Is this expected to be > true when the disable call comes in first? My assumption is that cancel_delayed_work_sync guarantees amdgpu_device_delay_enable_gfx_off's assignment is visible here. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
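A condensed sketch of the ordering that assumption relies on (simplified from the patch, not verbatim):

	/* Worker; after v3 it runs without taking gfx_off_mutex: */
	static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
	{
		struct amdgpu_device *adev =
			container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);

		if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
			adev->gfx.gfx_off_state = true;		/* (A) plain write, no lock held */
	}

	/* Disable path in amdgpu_gfx_off_ctrl, under gfx_off_mutex: */
	cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work);
	/* _sync waits for any concurrently running instance of the worker
	 * to finish, ordering the write (A) before the read below. */
	if (adev->gfx.gfx_off_state &&				/* (B) observes (A) */
	    !amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false))
		adev->gfx.gfx_off_state = false;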
Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled
On 2021-08-17 11:37 a.m., Lazar, Lijo wrote: > > > On 8/17/2021 2:56 PM, Michel Dänzer wrote: >> On 2021-08-17 11:12 a.m., Lazar, Lijo wrote: >>> >>> >>> On 8/17/2021 1:53 PM, Michel Dänzer wrote: >>>> From: Michel Dänzer >>>> >>>> schedule_delayed_work does not push back the work if it was already >>>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms >>>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF >>>> was disabled and re-enabled again during those 100 ms. >>>> >>>> This resulted in frame drops / stutter with the upcoming mutter 41 >>>> release on Navi 14, due to constantly enabling GFXOFF in the HW and >>>> disabling it again (for getting the GPU clock counter). >>>> >>>> To fix this, call cancel_delayed_work_sync when the disable count >>>> transitions from 0 to 1, and only schedule the delayed work on the >>>> reverse transition, not if the disable count was already 0. This makes >>>> sure the delayed work doesn't run at unexpected times, and allows it to >>>> be lock-free. >>>> >>>> v2: >>>> * Use cancel_delayed_work_sync & mutex_trylock instead of >>>> mod_delayed_work. >>>> v3: >>>> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König) >>>> v4: >>>> * Fix race condition between amdgpu_gfx_off_ctrl incrementing >>>> adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off >>>> checking for it to be 0 (Evan Quan) >>>> >>>> Cc: sta...@vger.kernel.org >>>> Reviewed-by: Lijo Lazar # v3 >>>> Acked-by: Christian König # v3 >>>> Signed-off-by: Michel Dänzer >>>> --- >>>> >>>> Alex, probably best to wait a bit longer before picking this up. :) >>>> >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++ >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 36 +++--- >>>> 2 files changed, 30 insertions(+), 17 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> index f3fd5ec710b6..f944ed858f3e 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> @@ -2777,12 +2777,11 @@ static void >>>> amdgpu_device_delay_enable_gfx_off(struct work_struct *work) >>>> struct amdgpu_device *adev = >>>> container_of(work, struct amdgpu_device, >>>> gfx.gfx_off_delay_work.work); >>>> - mutex_lock(&adev->gfx.gfx_off_mutex); >>>> - if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { >>>> - if (!amdgpu_dpm_set_powergating_by_smu(adev, >>>> AMD_IP_BLOCK_TYPE_GFX, true)) >>>> - adev->gfx.gfx_off_state = true; >>>> - } >>>> - mutex_unlock(&adev->gfx.gfx_off_mutex); >>>> + WARN_ON_ONCE(adev->gfx.gfx_off_state); >>>> + WARN_ON_ONCE(adev->gfx.gfx_off_req_count); >>>> + >>>> + if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, >>>> true)) >>>> + adev->gfx.gfx_off_state = true; >>>> } >>>> /** >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >>>> index a0be0772c8b3..b4ced45301be 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >>>> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, >>>> bool enable) >>>> mutex_lock(&adev->gfx.gfx_off_mutex); >>>> - if (!enable) >>>> - adev->gfx.gfx_off_req_count++; >>>> - else if (adev->gfx.gfx_off_req_count > 0) >>>> + if (enable) { >>>> + /* If the count is already 0, it means there's an imbalance bug >>>> somewhere. 
>>>> + * Note that the bug may be in a different caller than the one >>>> which triggers the >>>> + * WARN_ON_ONCE. >>>> + */ >>>> + if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0)) >>>> + goto
Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled
On 2021-08-17 12:37 p.m., Lazar, Lijo wrote: > > > On 8/17/2021 3:29 PM, Michel Dänzer wrote: >> On 2021-08-17 11:37 a.m., Lazar, Lijo wrote: >>> >>> >>> On 8/17/2021 2:56 PM, Michel Dänzer wrote: >>>> On 2021-08-17 11:12 a.m., Lazar, Lijo wrote: >>>>> >>>>> >>>>> On 8/17/2021 1:53 PM, Michel Dänzer wrote: >>>>>> From: Michel Dänzer >>>>>> >>>>>> schedule_delayed_work does not push back the work if it was already >>>>>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms >>>>>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF >>>>>> was disabled and re-enabled again during those 100 ms. >>>>>> >>>>>> This resulted in frame drops / stutter with the upcoming mutter 41 >>>>>> release on Navi 14, due to constantly enabling GFXOFF in the HW and >>>>>> disabling it again (for getting the GPU clock counter). >>>>>> >>>>>> To fix this, call cancel_delayed_work_sync when the disable count >>>>>> transitions from 0 to 1, and only schedule the delayed work on the >>>>>> reverse transition, not if the disable count was already 0. This makes >>>>>> sure the delayed work doesn't run at unexpected times, and allows it to >>>>>> be lock-free. >>>>>> >>>>>> v2: >>>>>> * Use cancel_delayed_work_sync & mutex_trylock instead of >>>>>> mod_delayed_work. >>>>>> v3: >>>>>> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König) >>>>>> v4: >>>>>> * Fix race condition between amdgpu_gfx_off_ctrl incrementing >>>>>> adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off >>>>>> checking for it to be 0 (Evan Quan) >>>>>> >>>>>> Cc: sta...@vger.kernel.org >>>>>> Reviewed-by: Lijo Lazar # v3 >>>>>> Acked-by: Christian König # v3 >>>>>> Signed-off-by: Michel Dänzer >>>>>> --- >>>>>> >>>>>> Alex, probably best to wait a bit longer before picking this up. :) >>>>>> >>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++ >>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 36 >>>>>> +++--- >>>>>> 2 files changed, 30 insertions(+), 17 deletions(-) >>>>>> >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> index f3fd5ec710b6..f944ed858f3e 100644 >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> @@ -2777,12 +2777,11 @@ static void >>>>>> amdgpu_device_delay_enable_gfx_off(struct work_struct *work) >>>>>> struct amdgpu_device *adev = >>>>>> container_of(work, struct amdgpu_device, >>>>>> gfx.gfx_off_delay_work.work); >>>>>> - mutex_lock(&adev->gfx.gfx_off_mutex); >>>>>> - if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { >>>>>> - if (!amdgpu_dpm_set_powergating_by_smu(adev, >>>>>> AMD_IP_BLOCK_TYPE_GFX, true)) >>>>>> - adev->gfx.gfx_off_state = true; >>>>>> - } >>>>>> - mutex_unlock(&adev->gfx.gfx_off_mutex); >>>>>> + WARN_ON_ONCE(adev->gfx.gfx_off_state); >>>>>> + WARN_ON_ONCE(adev->gfx.gfx_off_req_count); >>>>>> + >>>>>> + if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, >>>>>> true)) >>>>>> + adev->gfx.gfx_off_state = true; >>>>>> } >>>>>> /** >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >>>>>> index a0be0772c8b3..b4ced45301be 100644 >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >>>>>> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct a
Re: [PATCH 3/3] dma-buf: Add an API for exporting sync files (v6)
On 2021-03-16 12:10 a.m., Jason Ekstrand wrote: > On Mon, Mar 15, 2021 at 4:05 PM Jason Ekstrand wrote: >> >> Modern userspace APIs like Vulkan are built on an explicit >> synchronization model. This doesn't always play nicely with the >> implicit synchronization used in the kernel and assumed by X11 and >> Wayland. The client -> compositor half of the synchronization isn't too >> bad, at least on intel, because we can control whether or not i915 >> synchronizes on the buffer and whether or not it's considered written. >> >> The harder part is the compositor -> client synchronization when we get >> the buffer back from the compositor. We're required to be able to >> provide the client with a VkSemaphore and VkFence representing the point >> in time where the window system (compositor and/or display) finished >> using the buffer. With current APIs, it's very hard to do this in such >> a way that we don't get confused by the Vulkan driver's access of the >> buffer. In particular, once we tell the kernel that we're rendering to >> the buffer again, any CPU waits on the buffer or GPU dependencies will >> wait on some of the client rendering and not just the compositor. >> >> This new IOCTL solves this problem by allowing us to get a snapshot of >> the implicit synchronization state of a given dma-buf in the form of a >> sync file. It's effectively the same as a poll() or I915_GEM_WAIT only, >> instead of CPU waiting directly, it encapsulates the wait operation, at >> the current moment in time, in a sync_file so we can check/wait on it >> later. As long as the Vulkan driver does the sync_file export from the >> dma-buf before we re-introduce it for rendering, it will only contain >> fences from the compositor or display. This allows to accurately turn >> it into a VkFence or VkSemaphore without any over- synchronization. >> >> v2 (Jason Ekstrand): >> - Use a wrapper dma_fence_array of all fences including the new one >>when importing an exclusive fence. >> >> v3 (Jason Ekstrand): >> - Lock around setting shared fences as well as exclusive >> - Mark SIGNAL_SYNC_FILE as a read-write ioctl. >> - Initialize ret to 0 in dma_buf_wait_sync_file >> >> v4 (Jason Ekstrand): >> - Use the new dma_resv_get_singleton helper >> >> v5 (Jason Ekstrand): >> - Rename the IOCTLs to import/export rather than wait/signal >> - Drop the WRITE flag and always get/set the exclusive fence >> >> v6 (Jason Ekstrand): >> - Drop the sync_file import as it was all-around sketchy and not nearly >>as useful as import. >> - Re-introduce READ/WRITE flag support for export >> - Rework the commit message >> >> Signed-off-by: Jason Ekstrand >> --- >> drivers/dma-buf/dma-buf.c| 55 >> include/uapi/linux/dma-buf.h | 6 >> 2 files changed, 61 insertions(+) >> >> diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c >> index f264b70c383eb..e7f9dd62c19a9 100644 >> --- a/drivers/dma-buf/dma-buf.c >> +++ b/drivers/dma-buf/dma-buf.c [...] 
>> @@ -362,6 +363,57 @@ static long dma_buf_set_name(struct dma_buf *dmabuf, >> const char __user *buf) >> return ret; >> } >> >> +static long dma_buf_export_sync_file(struct dma_buf *dmabuf, >> +void __user *user_data) >> +{ >> + struct dma_buf_sync_file arg; >> + struct dma_fence *fence = NULL; >> + struct sync_file *sync_file; >> + int fd, ret; >> + >> + if (copy_from_user(&arg, user_data, sizeof(arg))) >> + return -EFAULT; >> + >> + if (arg.flags & ~DMA_BUF_SYNC_RW) >> + return -EINVAL; >> + >> + fd = get_unused_fd_flags(O_CLOEXEC); >> + if (fd < 0) >> + return fd; >> + >> + if (arg.flags & DMA_BUF_SYNC_WRITE) { >> + ret = dma_resv_get_singleton(dmabuf->resv, NULL, &fence); >> + if (ret) >> + goto err_put_fd; >> + } else if (arg.flags & DMA_BUF_SYNC_READ) { >> + fence = dma_resv_get_excl(dmabuf->resv); >> + } >> + >> + if (!fence) >> + fence = dma_fence_get_stub(); >> + >> + sync_file = sync_file_create(fence); >> + >> + dma_fence_put(fence); >> + >> + if (!sync_file) { >> + ret = -EINVAL; > > Should this be -EINVAL or -ENOMEM? The latter makes more sense to me, since sync_file_create returning NULL is not related to invalid ioctl parameters. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
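Concretely, the suggested change would amount to something like this (a sketch of the hunk, not a tested patch):

	sync_file = sync_file_create(fence);

	dma_fence_put(fence);

	if (!sync_file) {
		/* sync_file_create() only fails on allocation failure,
		 * which is unrelated to the ioctl arguments: */
		ret = -ENOMEM;
		goto err_put_fd;
	}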
Re: [PATCH] drm/amdgpu: Ensure that the modifier requested is supported by plane.
On 2021-03-23 4:32 p.m., Mark Yacoub wrote: > On Tue, Mar 23, 2021 at 11:02 AM Alex Deucher wrote: >> >> On Wed, Mar 10, 2021 at 11:15 AM Mark Yacoub wrote: >>> >>> From: Mark Yacoub >>> >>> On initializing the framebuffer, call drm_any_plane_has_format to do a >>> check if the modifier is supported. drm_any_plane_has_format calls >>> dm_plane_format_mod_supported which is extended to validate that the >>> modifier is on the list of the plane's supported modifiers. >>> >>> The bug was caught using igt-gpu-tools test: >>> kms_addfb_basic.addfb25-bad-modifier >>> >>> Tested on ChromeOS Zork by turning on the display, running an overlay >>> test, and running a YT video. >>> >>> Cc: Alex Deucher >>> Cc: Bas Nieuwenhuizen >>> Signed-off-by: Mark Yacoub >> >> I'm not an expert with modifiers yet. Will this break chips which >> don't currently support modifiers? > No it shouldn't. When you don't support modifiers yet, you will > default to Linear Modifier (DRM_FORMAT_MOD_LINEAR), > [...] No modifier support does not imply linear. It's generally signalled via DRM_FORMAT_MOD_INVALID, which roughly means "tiling is determined by driver specific mechanisms". -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
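For illustration, a validity check along the lines the patch describes might look as follows (a sketch assuming the plane's modifier list populated by drm_universal_plane_init, not the actual amdgpu code):

	static bool plane_format_mod_supported(struct drm_plane *plane,
					       uint32_t format, uint64_t modifier)
	{
		unsigned int i;

		/* DRM_FORMAT_MOD_INVALID means "tiling determined by driver
		 * specific mechanisms", so it is never a valid explicit
		 * modifier for addfb2. */
		if (modifier == DRM_FORMAT_MOD_INVALID)
			return false;

		for (i = 0; i < plane->modifier_count; i++) {
			if (plane->modifiers[i] == modifier)
				return true;
		}

		return false;
	}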
Re: [PATCH] drm/ttm: stop using GFP_TRANSHUGE_LIGHT
On 2021-01-13 2:13 p.m., Christian König wrote: The only flag we really need is __GFP_NOMEMALLOC, highmem depends on dma32 and moveable/compound should never be set in the first place. Signed-off-by: Christian König --- drivers/gpu/drm/ttm/ttm_pool.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index 8cd776adc592..11e0313db0ea 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -79,12 +79,13 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags, struct page *p; void *vaddr; - if (order) { - gfp_flags |= GFP_TRANSHUGE_LIGHT | __GFP_NORETRY | + /* Don't set the __GFP_COMP flag for higher order allocations. +* Mapping pages directly into an userspace process and calling +* put_page() on a TTM allocated page is illegal. +*/ + if (order) + gfp_flags |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM; - gfp_flags &= ~__GFP_MOVABLE; - gfp_flags &= ~__GFP_COMP; - } if (!pool->use_dma_alloc) { p = alloc_pages(gfp_flags, order); I picked up this change today, and got the attached splat while running piglit. scripts/faddr2line drivers/gpu/drm/ttm/ttm.ko ttm_pool_alloc+0x2e4/0x5e0 gives: ttm_pool_alloc+0x2e4/0x5e0: alloc_pages at /home/daenzer/src/linux-git/linux/./include/linux/gfp.h:547 (inlined by) ttm_pool_alloc_page at /home/daenzer/src/linux-git/linux/drivers/gpu/drm//ttm/ttm_pool.c:91 (inlined by) ttm_pool_alloc at /home/daenzer/src/linux-git/linux/drivers/gpu/drm//ttm/ttm_pool.c:398 I suspect we need __GFP_NOWARN as well to avoid these splats. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer [ 9556.710241] clinfo: page allocation failure: order:9, mode:0x194dc2(GFP_HIGHUSER|__GFP_RETRY_MAYFAIL|__GFP_NORETRY|__GFP_ZERO|__GFP_NOMEMALLOC), nodemask=(null),cpuset=user.slice,mems_allowed=0 [ 9556.710259] CPU: 1 PID: 470821 Comm: clinfo Tainted: GE 5.10.10+ #4 [ 9556.710264] Hardware name: Micro-Star International Co., Ltd. MS-7A34/B350 TOMAHAWK (MS-7A34), BIOS 1.OR 11/29/2019 [ 9556.710268] Call Trace: [ 9556.710281] dump_stack+0x6b/0x83 [ 9556.710288] warn_alloc.cold+0x7b/0xdf [ 9556.710297] ? __alloc_pages_direct_compact+0x137/0x150 [ 9556.710303] __alloc_pages_slowpath.constprop.0+0xc1b/0xc50 [ 9556.710312] __alloc_pages_nodemask+0x2ec/0x320 [ 9556.710325] ttm_pool_alloc+0x2e4/0x5e0 [ttm] [ 9556.710332] ? kvmalloc_node+0x46/0x80 [ 9556.710341] ttm_tt_populate+0x37/0xe0 [ttm] [ 9556.710350] ttm_bo_handle_move_mem+0x142/0x180 [ttm] [ 9556.710359] ttm_bo_validate+0x11d/0x190 [ttm] [ 9556.710391] ? drm_vma_offset_add+0x2f/0x60 [drm] [ 9556.710399] ttm_bo_init_reserved+0x2a7/0x320 [ttm] [ 9556.710529] amdgpu_bo_do_create+0x1b8/0x500 [amdgpu] [ 9556.710657] ? amdgpu_bo_subtract_pin_size+0x60/0x60 [amdgpu] [ 9556.710663] ? get_page_from_freelist+0x11f9/0x1450 [ 9556.710789] amdgpu_bo_create+0x40/0x270 [amdgpu] [ 9556.710797] ? _raw_spin_unlock+0x16/0x30 [ 9556.710927] amdgpu_gem_create_ioctl+0x123/0x310 [amdgpu] [ 9556.711062] ? amdgpu_gem_force_release+0x150/0x150 [amdgpu] [ 9556.711098] drm_ioctl_kernel+0xaa/0xf0 [drm] [ 9556.711133] drm_ioctl+0x20f/0x3a0 [drm] [ 9556.711267] ? amdgpu_gem_force_release+0x150/0x150 [amdgpu] [ 9556.711276] ? 
preempt_count_sub+0x9b/0xd0 [ 9556.711404] amdgpu_drm_ioctl+0x49/0x80 [amdgpu] [ 9556.711411] __x64_sys_ioctl+0x83/0xb0 [ 9556.711417] do_syscall_64+0x33/0x80 [ 9556.711421] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 9556.711426] RIP: 0033:0x7f14217bdcc7 [ 9556.711431] Code: 00 00 00 48 8b 05 c9 91 0c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 99 91 0c 00 f7 d8 64 89 01 48 [ 9556.711434] RSP: 002b:7ffd97bfdc68 EFLAGS: 0246 ORIG_RAX: 0010 [ 9556.711439] RAX: ffda RBX: 7ffd97bfdcc0 RCX: 7f14217bdcc7 [ 9556.711442] RDX: 7ffd97bfdcc0 RSI: c0206440 RDI: 0006 [ 9556.711445] RBP: c0206440 R08: 0008 R09: 7f1421887be0 [ 9556.711448] R10: 7ffd97c9e080 R11: 0246 R12: 564dab762d20 [ 9556.711450] R13: 0006 R14: 0020 R15: 0020 [ 9556.711489] Mem-Info: [ 9556.711499] active_anon:3253 inactive_anon:141494 isolated_anon:0 active_file:1878780 inactive_file:1558064 isolated_file:32 unevictable:0 dirty:6571 writeback:0 slab_reclaimable:123407 slab_unreclaimable:40992 mapped:62091 shmem:3821 pagetables:3837 bounce:0 free:293596 free_pcp:684 free_cma:0 [ 9556.7115
[PATCH] drm/ttm: Use __GFP_NOWARN for huge pages in ttm_pool_alloc_page
From: Michel Dänzer Without __GFP_NOWARN, attempts at allocating huge pages can trigger dmesg splats like below (which are essentially noise, since TTM falls back to normal pages if it can't get a huge one). [ 9556.710241] clinfo: page allocation failure: order:9, mode:0x194dc2(GFP_HIGHUSER|__GFP_RETRY_MAYFAIL|__GFP_NORETRY|__GFP_ZERO|__GFP_NOMEMALLOC), nodemask=(null),cpuset=user.slice,mems_allowed=0 [ 9556.710259] CPU: 1 PID: 470821 Comm: clinfo Tainted: GE 5.10.10+ #4 [ 9556.710264] Hardware name: Micro-Star International Co., Ltd. MS-7A34/B350 TOMAHAWK (MS-7A34), BIOS 1.OR 11/29/2019 [ 9556.710268] Call Trace: [ 9556.710281] dump_stack+0x6b/0x83 [ 9556.710288] warn_alloc.cold+0x7b/0xdf [ 9556.710297] ? __alloc_pages_direct_compact+0x137/0x150 [ 9556.710303] __alloc_pages_slowpath.constprop.0+0xc1b/0xc50 [ 9556.710312] __alloc_pages_nodemask+0x2ec/0x320 [ 9556.710325] ttm_pool_alloc+0x2e4/0x5e0 [ttm] [ 9556.710332] ? kvmalloc_node+0x46/0x80 [ 9556.710341] ttm_tt_populate+0x37/0xe0 [ttm] [ 9556.710350] ttm_bo_handle_move_mem+0x142/0x180 [ttm] [ 9556.710359] ttm_bo_validate+0x11d/0x190 [ttm] [ 9556.710391] ? drm_vma_offset_add+0x2f/0x60 [drm] [ 9556.710399] ttm_bo_init_reserved+0x2a7/0x320 [ttm] [ 9556.710529] amdgpu_bo_do_create+0x1b8/0x500 [amdgpu] [ 9556.710657] ? amdgpu_bo_subtract_pin_size+0x60/0x60 [amdgpu] [ 9556.710663] ? get_page_from_freelist+0x11f9/0x1450 [ 9556.710789] amdgpu_bo_create+0x40/0x270 [amdgpu] [ 9556.710797] ? _raw_spin_unlock+0x16/0x30 [ 9556.710927] amdgpu_gem_create_ioctl+0x123/0x310 [amdgpu] [ 9556.711062] ? amdgpu_gem_force_release+0x150/0x150 [amdgpu] [ 9556.711098] drm_ioctl_kernel+0xaa/0xf0 [drm] [ 9556.711133] drm_ioctl+0x20f/0x3a0 [drm] [ 9556.711267] ? amdgpu_gem_force_release+0x150/0x150 [amdgpu] [ 9556.711276] ? preempt_count_sub+0x9b/0xd0 [ 9556.711404] amdgpu_drm_ioctl+0x49/0x80 [amdgpu] [ 9556.711411] __x64_sys_ioctl+0x83/0xb0 [ 9556.711417] do_syscall_64+0x33/0x80 [ 9556.711421] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: bf9eee249ac2 ("drm/ttm: stop using GFP_TRANSHUGE_LIGHT") Signed-off-by: Michel Dänzer --- drivers/gpu/drm/ttm/ttm_pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index 8c762a03ad8a..a264760cb2cd 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -84,7 +84,7 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags, * put_page() on a TTM allocated page is illegal. */ if (order) - gfp_flags |= __GFP_NOMEMALLOC | __GFP_NORETRY | + gfp_flags |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; if (!pool->use_dma_alloc) { -- 2.30.0 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH] kernel: Expose SYS_kcmp by default
On 2021-02-05 9:53 p.m., Daniel Vetter wrote: On Fri, Feb 5, 2021 at 7:37 PM Kees Cook wrote: On Fri, Feb 05, 2021 at 04:37:52PM +, Chris Wilson wrote: Userspace has discovered the functionality offered by SYS_kcmp and has started to depend upon it. In particular, Mesa uses SYS_kcmp for os_same_file_description() in order to identify when two fd (e.g. device or dmabuf) point to the same struct file. Since they depend on it for core functionality, lift SYS_kcmp out of the non-default CONFIG_CHECKPOINT_RESTORE into the selectable syscall category. Signed-off-by: Chris Wilson Cc: Kees Cook Cc: Andy Lutomirski Cc: Will Drewry Cc: Andrew Morton Cc: Dave Airlie Cc: Daniel Vetter Cc: Lucas Stach --- init/Kconfig | 11 +++ kernel/Makefile | 2 +- tools/testing/selftests/seccomp/seccomp_bpf.c | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index b77c60f8b963..f62fca13ac5b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1194,6 +1194,7 @@ endif # NAMESPACES config CHECKPOINT_RESTORE bool "Checkpoint/restore support" select PROC_CHILDREN + select KCMP default n help Enables additional kernel features in a sake of checkpoint/restore. @@ -1737,6 +1738,16 @@ config ARCH_HAS_MEMBARRIER_CALLBACKS config ARCH_HAS_MEMBARRIER_SYNC_CORE bool +config KCMP + bool "Enable kcmp() system call" if EXPERT + default y I would expect this to be not default-y, especially if CHECKPOINT_RESTORE does a "select" on it. This is a really powerful syscall, but it is bounded by ptrace access controls, and uses pointer address obfuscation, so it may be okay to expose this. As it is, at least Ubuntu already has CONFIG_CHECKPOINT_RESTORE, so really, there's probably not much difference on exposure. So, if you drop the "default y", I'm fine with this. It was maybe stupid, but our userspace started relying on fd comaprison through sys_kcomp. So for better or worse, if you want to run the mesa3d gl/vk stacks, you need this. That's overstating things somewhat. The vast majority of applications will work fine regardless (as they did before Mesa started using this functionality). Only some special ones will run into issues, because the user-space drivers incorrectly assume two file descriptors reference different descriptions. Was maybe not the brighest ideas, but since enough distros had this enabled by defaults, Right, that (and the above) is why I considered it fair game to use. What should I have done instead? (TBH I was surprised that this functionality isn't generally available) it wasn't really discovered, and now we're shipping this everywhere. You're making it sound like this snuck in secretly somehow, which is not true of course. Ofc we can leave the default n, but the select if CONFIG_DRM is unfortunately needed I think. Per above, not sure this is really true. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
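For context, the check Mesa performs boils down to something like this sketch (assumed shape, not Mesa's exact code):

	#include <stdbool.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/kcmp.h>

	/* kcmp() returns 0 if the two fds reference the same struct file,
	 * 1 or 2 for an ordering, and a negative value on error. */
	static bool same_file_description(int fd1, int fd2)
	{
		pid_t pid = getpid();

		return syscall(SYS_kcmp, pid, pid, KCMP_FILE, fd1, fd2) == 0;
	}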
Re: [PATCH] kernel: Expose SYS_kcmp by default
On 2021-02-08 12:49 p.m., Michel Dänzer wrote: On 2021-02-05 9:53 p.m., Daniel Vetter wrote: On Fri, Feb 5, 2021 at 7:37 PM Kees Cook wrote: On Fri, Feb 05, 2021 at 04:37:52PM +, Chris Wilson wrote: Userspace has discovered the functionality offered by SYS_kcmp and has started to depend upon it. In particular, Mesa uses SYS_kcmp for os_same_file_description() in order to identify when two fd (e.g. device or dmabuf) point to the same struct file. Since they depend on it for core functionality, lift SYS_kcmp out of the non-default CONFIG_CHECKPOINT_RESTORE into the selectable syscall category. Signed-off-by: Chris Wilson Cc: Kees Cook Cc: Andy Lutomirski Cc: Will Drewry Cc: Andrew Morton Cc: Dave Airlie Cc: Daniel Vetter Cc: Lucas Stach --- init/Kconfig | 11 +++ kernel/Makefile | 2 +- tools/testing/selftests/seccomp/seccomp_bpf.c | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index b77c60f8b963..f62fca13ac5b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1194,6 +1194,7 @@ endif # NAMESPACES config CHECKPOINT_RESTORE bool "Checkpoint/restore support" select PROC_CHILDREN + select KCMP default n help Enables additional kernel features in a sake of checkpoint/restore. @@ -1737,6 +1738,16 @@ config ARCH_HAS_MEMBARRIER_CALLBACKS config ARCH_HAS_MEMBARRIER_SYNC_CORE bool +config KCMP + bool "Enable kcmp() system call" if EXPERT + default y I would expect this to be not default-y, especially if CHECKPOINT_RESTORE does a "select" on it. This is a really powerful syscall, but it is bounded by ptrace access controls, and uses pointer address obfuscation, so it may be okay to expose this. As it is, at least Ubuntu already has CONFIG_CHECKPOINT_RESTORE, so really, there's probably not much difference on exposure. So, if you drop the "default y", I'm fine with this. It was maybe stupid, but our userspace started relying on fd comaprison through sys_kcomp. So for better or worse, if you want to run the mesa3d gl/vk stacks, you need this. That's overstating things somewhat. The vast majority of applications will work fine regardless (as they did before Mesa started using this functionality). Only some special ones will run into issues, because the user-space drivers incorrectly assume two file descriptors reference different descriptions. Was maybe not the brighest ideas, but since enough distros had this enabled by defaults, Right, that (and the above) is why I considered it fair game to use. What should I have done instead? (TBH I was surprised that this functionality isn't generally available) In that spirit, an alternative might be to make KCMP_FILE available unconditionally, and the rest of SYS_kcmp only with CHECKPOINT_RESTORE as before. (Or maybe other parts of SYS_kcmp are generally useful as well?) -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
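One hypothetical way to express that in the syscall itself (an untested sketch, purely to illustrate the idea):

	SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
			unsigned long, idx1, unsigned long, idx2)
	{
		/* Hypothetical: KCMP_FILE is the part with general users
		 * (e.g. Mesa), so gate only the rest behind the old option. */
		if (type != KCMP_FILE && !IS_ENABLED(CONFIG_CHECKPOINT_RESTORE))
			return -EOPNOTSUPP;

		/* ... existing implementation ... */
	}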
Re: [PATCH] kernel: Expose SYS_kcmp by default
On 2021-02-08 2:34 p.m., Daniel Vetter wrote: On Mon, Feb 8, 2021 at 12:49 PM Michel Dänzer wrote: On 2021-02-05 9:53 p.m., Daniel Vetter wrote: On Fri, Feb 5, 2021 at 7:37 PM Kees Cook wrote: On Fri, Feb 05, 2021 at 04:37:52PM +, Chris Wilson wrote: Userspace has discovered the functionality offered by SYS_kcmp and has started to depend upon it. In particular, Mesa uses SYS_kcmp for os_same_file_description() in order to identify when two fd (e.g. device or dmabuf) point to the same struct file. Since they depend on it for core functionality, lift SYS_kcmp out of the non-default CONFIG_CHECKPOINT_RESTORE into the selectable syscall category. Signed-off-by: Chris Wilson Cc: Kees Cook Cc: Andy Lutomirski Cc: Will Drewry Cc: Andrew Morton Cc: Dave Airlie Cc: Daniel Vetter Cc: Lucas Stach --- init/Kconfig | 11 +++ kernel/Makefile | 2 +- tools/testing/selftests/seccomp/seccomp_bpf.c | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index b77c60f8b963..f62fca13ac5b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1194,6 +1194,7 @@ endif # NAMESPACES config CHECKPOINT_RESTORE bool "Checkpoint/restore support" select PROC_CHILDREN + select KCMP default n help Enables additional kernel features in a sake of checkpoint/restore. @@ -1737,6 +1738,16 @@ config ARCH_HAS_MEMBARRIER_CALLBACKS config ARCH_HAS_MEMBARRIER_SYNC_CORE bool +config KCMP + bool "Enable kcmp() system call" if EXPERT + default y I would expect this to be not default-y, especially if CHECKPOINT_RESTORE does a "select" on it. This is a really powerful syscall, but it is bounded by ptrace access controls, and uses pointer address obfuscation, so it may be okay to expose this. As it is, at least Ubuntu already has CONFIG_CHECKPOINT_RESTORE, so really, there's probably not much difference on exposure. So, if you drop the "default y", I'm fine with this. It was maybe stupid, but our userspace started relying on fd comaprison through sys_kcomp. So for better or worse, if you want to run the mesa3d gl/vk stacks, you need this. That's overstating things somewhat. The vast majority of applications will work fine regardless (as they did before Mesa started using this functionality). Only some special ones will run into issues, because the user-space drivers incorrectly assume two file descriptors reference different descriptions. Was maybe not the brighest ideas, but since enough distros had this enabled by defaults, Right, that (and the above) is why I considered it fair game to use. What should I have done instead? (TBH I was surprised that this functionality isn't generally available) Yeah that one is fine, but I thought we've discussed (irc or something) more uses for de-duping dma-buf and stuff like that. But quick grep says that hasn't landed yet, so I got a bit confused (or just dreamt). Looking at this again I'm kinda surprised the drmfd de-duping blows up on normal linux distros, but I guess it can all happen. One example: GEM handle name-spaces are per file description. If user-space incorrectly assumes two DRM fds are independent, when they actually reference the same file description, closing a GEM handle with one file descriptor will make it unusable with the other file descriptor as well. Ofc we can leave the default n, but the select if CONFIG_DRM is unfortunately needed I think. Per above, not sure this is really true. We seem to be going boom on linux distros now, maybe userspace got more creative in abusing stuff? I don't know what you're referring to. 
I've only seen maybe two or three reports from people who didn't enable CHECKPOINT_RESTORE in their self-built kernels. The entire thing is small enough that imo we don't really have to care, e.g. we also unconditionally select dma-buf, despite that on most systems there's only 1 gpu, and you're never going to end up with a buffer sharing case that needs any of that code (aside from the "here's an fd" part). But I guess we can limit to just KCMP_FILE like you suggest in another reply. Just feels a bit like overkill. Making KCMP_FILE gated by DRM makes as little sense to me as by CHECKPOINT_RESTORE. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
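To make the GEM handle failure mode mentioned above concrete (a hypothetical sequence, driver-agnostic):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <drm/drm.h>

	/* Hypothetical: fd1 and fd2 were obtained separately but share a
	 * single file description (e.g. one came via dup() or /proc/self/fd). */
	static void close_handle(int fd1, uint32_t handle)
	{
		struct drm_gem_close args = { .handle = handle };

		ioctl(fd1, DRM_IOCTL_GEM_CLOSE, &args);
		/* GEM handles live in a per-file-description namespace, so
		 * `handle` is now invalid on fd2 as well, even though
		 * userspace assumed the two fds were independent. */
	}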
Re: Fw: [Bug 211587] New: X: page allocation failure: order:8, mode:0x190dc2(GFP_HIGHUSER|__GFP_NORETRY|__GFP_ZERO|__GFP_NOMEMALLOC), nodemask=(null),cpuset=/,mems_allowed=0
On 2021-02-08 11:08 p.m., Andrew Morton wrote: I'm not sure who this belongs to... Begin forwarded message: Date: Sat, 06 Feb 2021 01:49:51 + From: bugzilla-dae...@bugzilla.kernel.org To: a...@linux-foundation.org Subject: [Bug 211587] New: X: page allocation failure: order:8, mode:0x190dc2(GFP_HIGHUSER|__GFP_NORETRY|__GFP_ZERO|__GFP_NOMEMALLOC), nodemask=(null),cpuset=/,mems_allowed=0 https://bugzilla.kernel.org/show_bug.cgi?id=211587 Bug ID: 211587 Summary: X: page allocation failure: order:8, mode:0x190dc2(GFP_HIGHUSER|__GFP_NORETRY|__GFP_ZERO|__GFP_NOMEMALLOC), nodemask=(null),cpuset=/,mems_allowed=0 Fixed in v5.11-rc7 by 2b1b3e544f65 "drm/ttm: Use __GFP_NOWARN for huge pages in ttm_pool_alloc_page". -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: [PATCH v3] kcmp: Support selection of SYS_kcmp without CHECKPOINT_RESTORE
On 2021-02-12 1:57 p.m., Emil Velikov wrote: On Fri, 5 Feb 2021 at 22:01, Chris Wilson wrote: Userspace has discovered the functionality offered by SYS_kcmp and has started to depend upon it. In particular, Mesa uses SYS_kcmp for os_same_file_description() in order to identify when two fd (e.g. device or dmabuf) As you rightfully point out, SYS_kcmp is a bit of a two edged sword. While you mention the CONFIG issue, there is also a portability aspect (mesa runs on more than just linux) and as well as sandbox filtering of the extra syscall. Last time I looked, the latter was still an issue and mesa was using SYS_kcmp to compare device node fds. A far shorter and more portable solution is possible, so let me prepare a Mesa patch. Make sure to read my comments on https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6881 first. :) -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: KMSAN: kernel-infoleak in compat_drm_wait_vblank
On 2021-02-22 10:15 a.m., syzbot wrote: Hello, syzbot found the following issue on: HEAD commit:29ad81a1 arch/x86: add missing include to sparsemem.h git tree: https://github.com/google/kmsan.git master console output: https://syzkaller.appspot.com/x/log.txt?x=111e6312d0 kernel config: https://syzkaller.appspot.com/x/.config?x=c8e3b38ca92283e dashboard link: https://syzkaller.appspot.com/bug?extid=620cf21140fc7e772a5d compiler: Debian clang version 11.0.1-2 userspace arch: i386 Unfortunately, I don't have any reproducer for this issue yet. IMPORTANT: if you fix the issue, please add the following tag to the commit: Reported-by: syzbot+620cf21140fc7e772...@syzkaller.appspotmail.com = BUG: KMSAN: kernel-infoleak in kmsan_copy_to_user+0x9c/0xb0 mm/kmsan/kmsan_hooks.c:249 CPU: 1 PID: 26999 Comm: syz-executor.2 Not tainted 5.11.0-rc7-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x21c/0x280 lib/dump_stack.c:120 kmsan_report+0xfb/0x1e0 mm/kmsan/kmsan_report.c:118 kmsan_internal_check_memory+0x484/0x520 mm/kmsan/kmsan.c:437 kmsan_copy_to_user+0x9c/0xb0 mm/kmsan/kmsan_hooks.c:249 instrument_copy_to_user include/linux/instrumented.h:121 [inline] _copy_to_user+0x1ac/0x270 lib/usercopy.c:33 copy_to_user include/linux/uaccess.h:209 [inline] compat_drm_wait_vblank+0x36f/0x450 drivers/gpu/drm/drm_ioc32.c:866 drm_compat_ioctl+0x3f6/0x590 drivers/gpu/drm/drm_ioc32.c:995 __do_compat_sys_ioctl fs/ioctl.c:842 [inline] __se_compat_sys_ioctl+0x53d/0x1100 fs/ioctl.c:793 __ia32_compat_sys_ioctl+0x4a/0x70 fs/ioctl.c:793 do_syscall_32_irqs_on arch/x86/entry/common.c:79 [inline] __do_fast_syscall_32+0x102/0x160 arch/x86/entry/common.c:141 do_fast_syscall_32+0x6a/0xc0 arch/x86/entry/common.c:166 do_SYSENTER_32+0x73/0x90 arch/x86/entry/common.c:209 entry_SYSENTER_compat_after_hwframe+0x4d/0x5c RIP: 0023:0xf7f47549 Code: 03 74 c0 01 10 05 03 74 b8 01 10 06 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 00 00 00 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 8d b4 26 00 00 00 00 8d b4 26 00 00 00 00 RSP: 002b:f55415fc EFLAGS: 0296 ORIG_RAX: 0036 RAX: ffda RBX: 0003 RCX: c018643a RDX: 2100 RSI: RDI: RBP: R08: R09: R10: R11: R12: R13: R14: R15: Uninit was stored to memory at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:121 [inline] kmsan_internal_chain_origin+0xad/0x130 mm/kmsan/kmsan.c:289 __msan_chain_origin+0x57/0xa0 mm/kmsan/kmsan_instr.c:147 compat_drm_wait_vblank+0x43c/0x450 drivers/gpu/drm/drm_ioc32.c:865 drm_compat_ioctl+0x3f6/0x590 drivers/gpu/drm/drm_ioc32.c:995 __do_compat_sys_ioctl fs/ioctl.c:842 [inline] __se_compat_sys_ioctl+0x53d/0x1100 fs/ioctl.c:793 __ia32_compat_sys_ioctl+0x4a/0x70 fs/ioctl.c:793 do_syscall_32_irqs_on arch/x86/entry/common.c:79 [inline] __do_fast_syscall_32+0x102/0x160 arch/x86/entry/common.c:141 do_fast_syscall_32+0x6a/0xc0 arch/x86/entry/common.c:166 do_SYSENTER_32+0x73/0x90 arch/x86/entry/common.c:209 entry_SYSENTER_compat_after_hwframe+0x4d/0x5c Local variable req@compat_drm_wait_vblank created at: compat_drm_wait_vblank+0x7b/0x450 drivers/gpu/drm/drm_ioc32.c:849 compat_drm_wait_vblank+0x7b/0x450 drivers/gpu/drm/drm_ioc32.c:849 Bytes 12-15 of 16 are uninitialized Memory access of size 16 starts at 88814ffe3c98 Data copied to user address 2100 = compat_drm_wait_vblank would need to initialize req.reply.tval_usec = req32.reply.tval_usec; before calling drm_ioctl_kernel, since it's not aliased by any req.request.* member, and drm_wait_vblank_ioctl 
doesn't always write to it. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
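A sketch of the suggested fix in context (context reconstructed from drm_ioc32.c, not a tested patch):

	req.request.type = req32.request.type;
	req.request.sequence = req32.request.sequence;
	req.request.signal = req32.request.signal;
	/* tval_usec is the one reply member not aliased by any request.*
	 * member; initialize it explicitly so no uninitialized stack bytes
	 * can be copied to userspace when drm_wait_vblank_ioctl() returns
	 * without writing the reply. */
	req.reply.tval_usec = req32.reply.tval_usec;

	err = drm_ioctl_kernel(file, drm_wait_vblank_ioctl, &req, DRM_UNLOCKED);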
Re: [RFC] Add DMA_RESV_USAGE flags
On 2021-05-19 12:06 a.m., Jason Ekstrand wrote: > On Tue, May 18, 2021 at 4:17 PM Daniel Vetter wrote: >> >> On Tue, May 18, 2021 at 7:40 PM Christian König >> wrote: >>> >>> Am 18.05.21 um 18:48 schrieb Daniel Vetter: >>>> On Tue, May 18, 2021 at 2:49 PM Christian König >>>> wrote: >>>> >>>>> And as long as we are all inside amdgpu we also don't have any oversync, >>>>> the issue only happens when we share dma-bufs with i915 (radeon and >>>>> AFAIK nouveau does the right thing as well). >>>> Yeah because then you can't use the amdgpu dma_resv model anymore and >>>> have to use the one atomic helpers use. Which is also the one that >>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl, >>>> so as soon as that lands and someone starts using it, something has to >>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's >>>> shared with another device. >>> >>> Yeah, and that is exactly the reason why I will NAK this uAPI change. >>> >>> This doesn't works for amdgpu at all for the reasons outlined above. >> >> Uh that's really not how uapi works. "my driver is right, everyone >> else is wrong" is not how cross driver contracts are defined. If that >> means a perf impact until you've fixed your rules, that's on you. >> >> Also you're a few years too late with nacking this, it's already uapi >> in the form of the dma-buf poll() support. > > ^^ My fancy new ioctl doesn't expose anything that isn't already > there. It just lets you take a snap-shot of a wait instead of doing > an active wait which might end up with more fences added depending on > interrupts and retries. The dma-buf poll waits on all fences for > POLLOUT and only the exclusive fence for POLLIN. It's already uAPI. Note that the dma-buf poll support could be useful to Wayland compositors for the same purpose as Jason's new ioctl (only using client buffers which have finished drawing for an output frame, to avoid missing a refresh cycle due to client drawing), *if* it didn't work differently with amdgpu. Am I understanding correctly that Jason's new ioctl would also work differently with amdgpu as things stand currently? If so, that would be a real bummer and might hinder adoption of the ioctl by Wayland compositors. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
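For illustration, the compositor-side check could be as simple as this sketch, assuming the poll() semantics described above (all fences for EPOLLOUT, only the exclusive fence for EPOLLIN):

	#include <poll.h>
	#include <stdbool.h>

	/* Non-blocking snapshot: true if all writers (i.e. client
	 * rendering) have finished, so the buffer can be used for the
	 * next output frame without missing a refresh cycle. */
	static bool dmabuf_ready_for_scanout(int dmabuf_fd)
	{
		struct pollfd pfd = { .fd = dmabuf_fd, .events = POLLIN };

		return poll(&pfd, 1, 0) == 1 && (pfd.revents & POLLIN);
	}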
Re: [RFC] Add DMA_RESV_USAGE flags
On 2021-05-19 5:21 p.m., Jason Ekstrand wrote: > On Wed, May 19, 2021 at 5:52 AM Michel Dänzer wrote: >> >> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote: >>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter wrote: >>>> >>>> On Tue, May 18, 2021 at 7:40 PM Christian König >>>> wrote: >>>>> >>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter: >>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König >>>>>> wrote: >>>>>> >>>>>>> And as long as we are all inside amdgpu we also don't have any oversync, >>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and >>>>>>> AFAIK nouveau does the right thing as well). >>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and >>>>>> have to use the one atomic helpers use. Which is also the one that >>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl, >>>>>> so as soon as that lands and someone starts using it, something has to >>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's >>>>>> shared with another device. >>>>> >>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change. >>>>> >>>>> This doesn't works for amdgpu at all for the reasons outlined above. >>>> >>>> Uh that's really not how uapi works. "my driver is right, everyone >>>> else is wrong" is not how cross driver contracts are defined. If that >>>> means a perf impact until you've fixed your rules, that's on you. >>>> >>>> Also you're a few years too late with nacking this, it's already uapi >>>> in the form of the dma-buf poll() support. >>> >>> ^^ My fancy new ioctl doesn't expose anything that isn't already >>> there. It just lets you take a snap-shot of a wait instead of doing >>> an active wait which might end up with more fences added depending on >>> interrupts and retries. The dma-buf poll waits on all fences for >>> POLLOUT and only the exclusive fence for POLLIN. It's already uAPI. >> >> Note that the dma-buf poll support could be useful to Wayland compositors >> for the same purpose as Jason's new ioctl (only using client buffers which >> have finished drawing for an output frame, to avoid missing a refresh cycle >> due to client drawing), *if* it didn't work differently with amdgpu. >> >> Am I understanding correctly that Jason's new ioctl would also work >> differently with amdgpu as things stand currently? If so, that would be a >> real bummer and might hinder adoption of the ioctl by Wayland compositors. > > My new ioctl has identical semantics to poll(). It just lets you take > a snapshot in time to wait on later instead of waiting on whatever > happens to be set right now. IMO, having identical semantics to > poll() isn't something we want to change. Agreed. I'd argue then that making amdgpu poll semantics match those of other drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely that the ioctl will be widely adopted. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: [RFC] Add DMA_RESV_USAGE flags
On 2021-05-20 9:55 a.m., Daniel Vetter wrote: > On Wed, May 19, 2021 at 5:48 PM Michel Dänzer wrote: >> >> On 2021-05-19 5:21 p.m., Jason Ekstrand wrote: >>> On Wed, May 19, 2021 at 5:52 AM Michel Dänzer wrote: >>>> >>>> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote: >>>>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter wrote: >>>>>> >>>>>> On Tue, May 18, 2021 at 7:40 PM Christian König >>>>>> wrote: >>>>>>> >>>>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter: >>>>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König >>>>>>>> wrote: >>>>>>>> >>>>>>>>> And as long as we are all inside amdgpu we also don't have any >>>>>>>>> oversync, >>>>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and >>>>>>>>> AFAIK nouveau does the right thing as well). >>>>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and >>>>>>>> have to use the one atomic helpers use. Which is also the one that >>>>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl, >>>>>>>> so as soon as that lands and someone starts using it, something has to >>>>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's >>>>>>>> shared with another device. >>>>>>> >>>>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change. >>>>>>> >>>>>>> This doesn't works for amdgpu at all for the reasons outlined above. >>>>>> >>>>>> Uh that's really not how uapi works. "my driver is right, everyone >>>>>> else is wrong" is not how cross driver contracts are defined. If that >>>>>> means a perf impact until you've fixed your rules, that's on you. >>>>>> >>>>>> Also you're a few years too late with nacking this, it's already uapi >>>>>> in the form of the dma-buf poll() support. >>>>> >>>>> ^^ My fancy new ioctl doesn't expose anything that isn't already >>>>> there. It just lets you take a snap-shot of a wait instead of doing >>>>> an active wait which might end up with more fences added depending on >>>>> interrupts and retries. The dma-buf poll waits on all fences for >>>>> POLLOUT and only the exclusive fence for POLLIN. It's already uAPI. >>>> >>>> Note that the dma-buf poll support could be useful to Wayland compositors >>>> for the same purpose as Jason's new ioctl (only using client buffers which >>>> have finished drawing for an output frame, to avoid missing a refresh >>>> cycle due to client drawing), *if* it didn't work differently with amdgpu. >>>> >>>> Am I understanding correctly that Jason's new ioctl would also work >>>> differently with amdgpu as things stand currently? If so, that would be a >>>> real bummer and might hinder adoption of the ioctl by Wayland compositors. >>> >>> My new ioctl has identical semantics to poll(). It just lets you take >>> a snapshot in time to wait on later instead of waiting on whatever >>> happens to be set right now. IMO, having identical semantics to >>> poll() isn't something we want to change. >> >> Agreed. >> >> I'd argue then that making amdgpu poll semantics match those of other >> drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely >> that the ioctl will be widely adopted. > > This seems backwards, because that means useful improvements in all > other drivers are stalled until amdgpu is fixed. > > I think we need agreement on what the rules are, reasonable plan to > get there, and then that should be enough to unblock work in the wider > community. Holding the community at large hostage because one driver > is different is really not great. I think we're in violent agreement. 
:) The point I was trying to make is that amdgpu really needs to be fixed to be consistent with other drivers ASAP. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: [RFC] Add DMA_RESV_USAGE flags
On 2021-05-20 4:18 p.m., Daniel Vetter wrote: > On Thu, May 20, 2021 at 10:13:38AM +0200, Michel Dänzer wrote: >> On 2021-05-20 9:55 a.m., Daniel Vetter wrote: >>> On Wed, May 19, 2021 at 5:48 PM Michel Dänzer wrote: >>>> >>>> On 2021-05-19 5:21 p.m., Jason Ekstrand wrote: >>>>> On Wed, May 19, 2021 at 5:52 AM Michel Dänzer wrote: >>>>>> >>>>>> On 2021-05-19 12:06 a.m., Jason Ekstrand wrote: >>>>>>> On Tue, May 18, 2021 at 4:17 PM Daniel Vetter wrote: >>>>>>>> >>>>>>>> On Tue, May 18, 2021 at 7:40 PM Christian König >>>>>>>> wrote: >>>>>>>>> >>>>>>>>> Am 18.05.21 um 18:48 schrieb Daniel Vetter: >>>>>>>>>> On Tue, May 18, 2021 at 2:49 PM Christian König >>>>>>>>>> wrote: >>>>>>>>>> >>>>>>>>>>> And as long as we are all inside amdgpu we also don't have any >>>>>>>>>>> oversync, >>>>>>>>>>> the issue only happens when we share dma-bufs with i915 (radeon and >>>>>>>>>>> AFAIK nouveau does the right thing as well). >>>>>>>>>> Yeah because then you can't use the amdgpu dma_resv model anymore and >>>>>>>>>> have to use the one atomic helpers use. Which is also the one that >>>>>>>>>> e.g. Jason is threathening to bake in as uapi with his dma_buf ioctl, >>>>>>>>>> so as soon as that lands and someone starts using it, something has >>>>>>>>>> to >>>>>>>>>> adapt _anytime_ you have a dma-buf hanging around. Not just when it's >>>>>>>>>> shared with another device. >>>>>>>>> >>>>>>>>> Yeah, and that is exactly the reason why I will NAK this uAPI change. >>>>>>>>> >>>>>>>>> This doesn't works for amdgpu at all for the reasons outlined above. >>>>>>>> >>>>>>>> Uh that's really not how uapi works. "my driver is right, everyone >>>>>>>> else is wrong" is not how cross driver contracts are defined. If that >>>>>>>> means a perf impact until you've fixed your rules, that's on you. >>>>>>>> >>>>>>>> Also you're a few years too late with nacking this, it's already uapi >>>>>>>> in the form of the dma-buf poll() support. >>>>>>> >>>>>>> ^^ My fancy new ioctl doesn't expose anything that isn't already >>>>>>> there. It just lets you take a snap-shot of a wait instead of doing >>>>>>> an active wait which might end up with more fences added depending on >>>>>>> interrupts and retries. The dma-buf poll waits on all fences for >>>>>>> POLLOUT and only the exclusive fence for POLLIN. It's already uAPI. >>>>>> >>>>>> Note that the dma-buf poll support could be useful to Wayland >>>>>> compositors for the same purpose as Jason's new ioctl (only using client >>>>>> buffers which have finished drawing for an output frame, to avoid >>>>>> missing a refresh cycle due to client drawing), *if* it didn't work >>>>>> differently with amdgpu. >>>>>> >>>>>> Am I understanding correctly that Jason's new ioctl would also work >>>>>> differently with amdgpu as things stand currently? If so, that would be >>>>>> a real bummer and might hinder adoption of the ioctl by Wayland >>>>>> compositors. >>>>> >>>>> My new ioctl has identical semantics to poll(). It just lets you take >>>>> a snapshot in time to wait on later instead of waiting on whatever >>>>> happens to be set right now. IMO, having identical semantics to >>>>> poll() isn't something we want to change. >>>> >>>> Agreed. >>>> >>>> I'd argue then that making amdgpu poll semantics match those of other >>>> drivers is a pre-requisite for the new ioctl, otherwise it seems unlikely >>>> that the ioctl will be widely adopted. >>> >>> This seems
Re: [PATCH v3 05/12] drm/ttm: Expose ttm_tt_unpopulate for driver use
On 2020-11-25 1:57 p.m., Christian König wrote: Well thinking more about this, it seems to be another really good argument why mapping pages from DMA-bufs into application address space directly is a very bad idea :) Apologies for going off on a tangent here... Since allowing userspace mmap with dma-buf fds seems to be a trap in general[0], I wonder if there's any way we could stop supporting that? [0] E.g. mutter had to disable handing out dma-bufs for screen capture by default with non-i915 for now, because in particular with discrete GPUs, direct CPU reads can be unusably slow (think single-digit frames per second), and of course there's other userspace which goes "ooh, dma-buf, let's map and read!". -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
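To make the footnote concrete, a minimal sketch of the "map and read" pattern being warned about. It uses the real DMA_BUF_IOCTL_SYNC bracketing from the dma-buf uAPI; dmabuf_fd, size and dst are hypothetical. The code is correct per the uAPI, the problem is purely that each CPU read may cross the bus uncached when the pages back VRAM on a discrete GPU:

    #include <linux/dma-buf.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>

    static void read_dmabuf(int dmabuf_fd, size_t size, void *dst)
    {
            struct dma_buf_sync sync = {
                    .flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ,
            };
            void *src = mmap(NULL, size, PROT_READ, MAP_SHARED, dmabuf_fd, 0);

            if (src == MAP_FAILED)
                    return;

            ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, &sync);
            memcpy(dst, src, size); /* unusably slow if the pages live in VRAM */
            sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ;
            ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, &sync);
            munmap(src, size);
    }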
Re: GPU-side memory protection landscape
On 2020-11-30 3:07 p.m., Alexander Monakov wrote: My other concern is how easy it is to cause system instability or hangs by out-of-bounds writes from the GPU (via compute shaders or copy commands). In my experience of several years doing GPU computing with NVIDIA tech, I don't recall needing to lose time rebooting my PC after running a buggy CUDA "kernel". Heck, I could run the GCC C testsuite on the GPU without worrying about locking myself and others from the server. But now when I develop on a laptop with AMD's latest mobile SoC, every time I make a mistake in my GLSL code it more often than not forces a reboot. I hope you understand what a huge pain it is. What are the existing GPU hardware capabilities for memory protection (both in terms of preventing random accesses to system memory like with an IOMMU, and in terms of isolating different process contexts from each other), and to what extent Linux DRM drivers are taking advantage of them? Modern (or more like non-ancient at this point, basically anything which came out within the last decade) AMD GPUs have mostly perfect protection between different execution contexts (i.e. different processes normally, though it's not always a 1:1 mapping). Each context has its own virtual GPU address space and cannot access any memory which isn't mapped into that (which the kernel driver only does for memory belonging to a buffer object which the context has permission to access and has explicitly asked to be mapped into its address space). The instability you're seeing likely isn't due to lack of memory protection but due to any of a large number of other ways a GPU can end up in a hanging state, and the drivers and wider ecosystem not being very good at recovering from that yet. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: [PATCH] drm: Fix drm.h uapi header for Windows
On 2020-12-01 11:01 a.m., James Park wrote: This will allow Mesa to port code to Windows more easily. As discussed in https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6162#note_712779 , including drm.h makes no sense when building for Windows. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: [PATCH] drm: Fix drm.h uapi header for Windows
On 2020-12-02 1:46 p.m., Daniel Vetter wrote: On Wed, Dec 2, 2020 at 12:43 PM Michel Dänzer wrote: On 2020-12-01 11:01 a.m., James Park wrote: This will allow Mesa to port code to Windows more easily. As discussed in https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6162#note_712779 , including drm.h makes no sense when building for Windows. Yeah I think it'd be cleanest if we can avoid this. If not I think the right fix would be to split out the actually needed parts from drm.h into a new header (still included by drm.h for backwards compat reasons) which mesa can use. Since it looks like the problematic parts are the legacy gunk, and not the new ioctl structures. Pulling out drm_render.h for all the render stuff and maybe drm_vblank.h for the vblank stuff (which would fit better in drm_mode.h but mistakes were made, oops). If anything currently in drm.h is needed while building for Windows, it points to a broken abstraction somewhere in userspace. (Specifically, the Mesa Gallium/Vulkan winsys is supposed to abstract away platform details like these) -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: [PATCH] drm: Fix drm.h uapi header for Windows
On 2020-12-02 8:47 p.m., James Park wrote: If we're trying to cut ties with the drm-uapi folder entirely, the stuff ac_surface.c needs includes the AMD_FMT_MOD stuff in drm_fourcc.h, and AMDGPU_TILING_* under amdgpu_drm.h. Is there a better spot for these definitions? The Mesa src/amd/ code should use platform-neutral abstractions for these. This wasn't deemed necessary before, because nobody was trying to build these drivers for non-UNIX OSes. But now you are. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: [PATCH libdrm] headers: drm: Sync with drm-next
On 2021-07-03 9:59 p.m., Mario Kleiner wrote: > Generated using make headers_install from the drm-next > tree - git://anongit.freedesktop.org/drm/drm > branch - drm-next > commit - 8a02ea42bc1d4c448caf1bab0e05899dad503f74 > > The changes were as follows (shortlog from > b10733527bfd864605c33ab2e9a886eec317ec39..HEAD): libdrm uses GitLab merge requests now: https://gitlab.freedesktop.org/mesa/drm/-/merge_requests -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: [pull] amdgpu, amdkfd drm-fixes-5.14
On 2021-07-15 4:07 p.m., Alex Deucher wrote: > On Thu, Jul 15, 2021 at 12:52 AM Sam Ravnborg wrote: >> On Wed, Jul 14, 2021 at 06:08:58PM -0400, Alex Deucher wrote: >>> Hi Dave, Daniel, >>> >>> Fixes for 5.14. The big change here is unifying the SMU13 code. This was >>> new code added in 5.14 to support Yellow Carp, but we've since cleaned it >>> up and removed a lot of duplication, so better to merge it now to facilitate >>> any bug fixes in the future that need to go back to this kernel via stable. >>> Only affects Yellow Carp which is new for 5.14 anyway so not much chance for >>> regressions. The rest is just standard bug fixes. >> >> This pull seems not to include any fixes for the W=1 warnings that >> has crept in again. It would be nice if the amdgpu could be warning free >> again, this would maybe motivate the others to fix theirs too so we >> could keep most/all of drivers/gpu/ free of W=1 warnings. > > We haven't really been monitoring the W=1 stuff that closely. I'll > see what we can do going forward. IMHO keeping the W=1 build clean isn't realistic without enforcing it in CI. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: [PATCH v2 1/8] drm: Add dummy page per device or GEM object
[...] modify vm_area_struct in this case. I am not sure if it's legit to write-lock the mm_sem from this point. I found some discussions about this here http://lkml.iu.edu/hypermail/linux/kernel/1909.1/02754.html but it wasn't really clear to me what the solution is. In any case, it seems to me that an easier and more memory-saving solution would be to just switch to a per-TTM-BO dummy rw page that would be allocated on demand as you suggested here. This should also take care of imported BOs and flink cases. Then I can drop the per-device-FD and per-GEM-object-FD dummy BO and the ugly loop I am using in patch 2 to match the faulting BO to the right dummy page. Does this make sense? I still don't see the information leak as much of a problem, but if Daniel insists we should probably do this. Well amdgpu doesn't clear buffers by default, so indeed you guys are a lot more laissez-faire here. But in general we really don't do that kind of leaking. IIRC there are even radeonsi bugs because everyone else clears, and radeonsi happily displays gunk :-) btw I think not clearing at alloc breaks the render node model a bit. Without that this was all fine, since system pages still got cleared by alloc_page(), and we only leaked vram. And for the legacy node model with authentication of clients against the X server, leaking that all around was ok. With render nodes no leaking should happen, with no knob for userspace to opt out of the forced clearing. Seconded. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: [PATCH 3/3] drm/ttm: make up to 90% of system memory available
On 2020-11-17 3:06 p.m., Christian König wrote: Increase the amount of system memory drivers can use to about 90% of the total available. Signed-off-by: Christian König --- drivers/gpu/drm/ttm/ttm_bo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index a958135cb3fe..0a93df93dba4 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -1267,7 +1267,7 @@ static int ttm_bo_global_init(void) * the available system memory. */ num_pages = (u64)si.totalram * si.mem_unit; - num_pages = (num_pages * 50 / 100) >> PAGE_SHIFT; + num_pages = (num_pages * 90 / 100) >> PAGE_SHIFT; /* But for DMA32 we limit ourself to only use 2GiB maximum. */ num_dma32_pages = (u64)(si.totalram - si.totalhigh) * si.mem_unit; This should update the comment added in patch 1. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: [PATCH 1/2] drm/ttm: Fix a deadlock if the target BO is not idle during swap
Pan, you're sending patches to amd-gfx-boun...@lists.freedesktop.org, which doesn't work. You need to send them to amd-...@lists.freedesktop.org instead. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and X developer
Re: [RFC PATCH] drm/aperture: Add param to disable conflicting framebuffers removal
On 2021-10-24 22:32, Javier Martinez Canillas wrote: > Hello Ville, > > On 10/22/21 21:12, Ville Syrjälä wrote: >> On Fri, Oct 22, 2021 at 04:40:40PM +0200, Javier Martinez Canillas wrote: >>> The simpledrm driver allows to use the frame buffer that was set-up by the >>> firmware. This gives early video output before the platform DRM driver is >>> probed and takes over. >>> >>> But it would be useful to have a way to disable this take over by the real >>> DRM drivers. For example, there may be bugs in the DRM drivers that could >>> cause the display output to not work correctly. >>> >>> For those cases, it would be good to keep the simpledrm driver instead and >>> at least get a working display as set-up by the firmware. >>> >>> Let's add a drm.remove_fb boolean kernel command line parameter, that when >>> set to false will prevent the conflicting framebuffers to being removed. >>> >>> Since the drivers call drm_aperture_remove_conflicting_framebuffers() very >>> early in their probe callback, this will cause the drivers' probe to fail. >> >> Why is that better than just modprobe.blacklisting those drivers? > > Because would allow to deny list all native (as Thomas called it) DRM drivers > and only allow the simpledrm driver to be probed. This is useful for distros, > since could add a "Basic graphics mode" to the boot menu entries, that could > boot the kernel passing a "drm.disable_native_drivers=1" cmdline option. > > That way, if there's any problem with a given DRM driver, the distro may be > installed and booted using the simpledrm driver and troubleshoot why a native > DRM driver is not working. Or try updating the kernel package, etc. For troubleshooting, it'll be helpful if this new parameter can be enabled for the boot via the kernel command line, then disabled again after boot-up. One simple possibility for this would be allowing the parameter to be changed via /sys/module/drm/parameters/, which I suspect doesn't work with the patch as is (due to the 0600 permissions). -- Earthling Michel Dänzer| https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
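For reference, a minimal sketch of how the permission argument of a module parameter maps onto the sysfs node; the parameter and variable names here are stand-ins for whatever the final patch uses:

    #include <linux/moduleparam.h>

    static bool drm_disable_native_drivers;

    /* The last argument becomes the mode of
     * /sys/module/drm/parameters/disable_native_drivers:
     *   0600 - root can read and change it at runtime
     *   0644 - world-readable, root-writable
     *   0    - no sysfs node at all, boot-time only
     */
    module_param_named(disable_native_drivers, drm_disable_native_drivers, bool, 0600);
    MODULE_PARM_DESC(disable_native_drivers,
                     "Keep the firmware framebuffer by failing native DRM driver probe");

Note that 0600 already lets root flip the value after boot, which matches what the follow-up below confirms.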
Re: [RFC PATCH] drm/aperture: Add param to disable conflicting framebuffers removal
On 2021-10-25 14:28, Javier Martinez Canillas wrote: > Hello Michel, > > On 10/25/21 12:45, Michel Dänzer wrote: >> On 2021-10-24 22:32, Javier Martinez Canillas wrote: >>> Hello Ville, >>> >>> On 10/22/21 21:12, Ville Syrjälä wrote: >>>> On Fri, Oct 22, 2021 at 04:40:40PM +0200, Javier Martinez Canillas wrote: >>>>> The simpledrm driver allows to use the frame buffer that was set-up by the >>>>> firmware. This gives early video output before the platform DRM driver is >>>>> probed and takes over. >>>>> >>>>> But it would be useful to have a way to disable this take over by the real >>>>> DRM drivers. For example, there may be bugs in the DRM drivers that could >>>>> cause the display output to not work correctly. >>>>> >>>>> For those cases, it would be good to keep the simpledrm driver instead and >>>>> at least get a working display as set-up by the firmware. >>>>> >>>>> Let's add a drm.remove_fb boolean kernel command line parameter, that when >>>>> set to false will prevent the conflicting framebuffers to being removed. >>>>> >>>>> Since the drivers call drm_aperture_remove_conflicting_framebuffers() very >>>>> early in their probe callback, this will cause the drivers' probe to fail. >>>> >>>> Why is that better than just modprobe.blacklisting those drivers? >>> >>> Because would allow to deny list all native (as Thomas called it) DRM >>> drivers >>> and only allow the simpledrm driver to be probed. This is useful for >>> distros, >>> since could add a "Basic graphics mode" to the boot menu entries, that could >>> boot the kernel passing a "drm.disable_native_drivers=1" cmdline option. >>> >>> That way, if there's any problem with a given DRM driver, the distro may be >>> installed and booted using the simpledrm driver and troubleshoot why a >>> native >>> DRM driver is not working. Or try updating the kernel package, etc. >> >> For troubleshooting, it'll be helpful if this new parameter can be enabled >> for the boot via the kernel command line, then disabled again after boot-up. >> One simple possibility for this would be allowing the parameter to be >> changed via /sys/module > > That's already the case with the current patch, i.e: > > $ grep -o drm.* /proc/cmdline > drm.disable_native_drivers=1 > > $ cat /proc/fb > 0 simpledrm > > $ modprobe virtio_gpu > > $ dmesg > [ 125.731549] [drm] pci: virtio-vga detected at :00:01.0 > [ 125.732410] virtio_gpu: probe of virtio0 failed with error -16 > > $ echo 0 > /sys/module/drm/parameters/disable_native_drivers > > $ modprobe virtio_gpu > > $ dmesg > [ 187.889136] [drm] pci: virtio-vga detected at :00:01.0 > [ 187.894578] Console: switching to colour dummy device 80x25 > [ 187.897090] virtio-pci :00:01.0: vgaarb: deactivate vga console > [ 187.899983] [drm] features: -virgl +edid -resource_blob -host_visible > [ 187.907176] [drm] number of scanouts: 1 > [ 187.907714] [drm] number of cap sets: 0 > [ 187.914108] [drm] Initialized virtio_gpu 0.1.0 0 for virtio0 on minor 1 > [ 187.930807] Console: switching to colour frame buffer device 128x48 > [ 187.938737] virtio_gpu virtio0: [drm] fb0: virtio_gpu frame buffer device > > $ cat /proc/fb > 0 virtio_gpu > > /drm/parameters/, which I suspect doesn't work with the patch as is > (due to the 0600 permissions). >> >> > > I followed the convention used by other drm module parameters, hence the > 0600. Do you mean that for this parameter we should be less restrictive ? No, it was simply a brain fart on my part. :} Thanks for confirming this works! 
-- Earthling Michel Dänzer| https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
Re: [PATCH] drm/amdgpu: fix clang out-of-range warning
On 2021-09-27 14:19, Arnd Bergmann wrote: > From: Arnd Bergmann > > clang-14 points out that comparing an 'unsigned int' against a large > 64-bit constant is pointless: > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:1206:18: error: result of > comparison of constant 4294967296 with expression of type 'resource_size_t' > (aka 'unsigned int') is always false > [-Werror,-Wtautological-constant-out-of-range-compare] > res->start > 0x100000000ull) > > Rephrase the comparison using the upper_32_bits() macro, which should > keep it legible while avoiding the warning. > > Fixes: 31b8adab3247 ("drm/amdgpu: require a root bus window above 4GB for BAR > resize") > Signed-off-by: Arnd Bergmann > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index ab3794c42d36..741a55031ca1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -1203,7 +1203,7 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device > *adev) > > pci_bus_for_each_resource(root, res, i) { > if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && > - res->start > 0x100000000ull) > + upper_32_bits(res->start) != 0) New code matches 1ull << 32, old code didn't. If this difference matters, this would break if res->start is 64 bits? -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
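To spell out the boundary case being pointed at, a small standalone sketch (with upper_32_bits() re-defined locally just for illustration); the two checks disagree only for res->start == 1ull << 32:

    #include <stdint.h>
    #include <stdio.h>

    #define upper_32_bits(n) ((uint32_t)((n) >> 32)) /* local stand-in */

    int main(void)
    {
            uint64_t starts[] = { 0xffffffffull, 0x100000000ull, 0x100000001ull };

            for (int i = 0; i < 3; i++) {
                    printf("start=%#llx old=%d new=%d\n",
                           (unsigned long long)starts[i],
                           starts[i] > 0x100000000ull,     /* old check */
                           upper_32_bits(starts[i]) != 0); /* new check */
            }
            return 0; /* only 0x100000000 differs: old=0, new=1 */
    }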
Re: [PATCH v5 12/13] drm/i915/ttm: use cached system pages when evicting lmem
On 2021-09-29 13:54, Thomas Hellström wrote: > On Mon, 2021-09-27 at 12:41 +0100, Matthew Auld wrote: >> This should let us do an accelerated copy directly to the shmem pages >> when temporarily moving lmem-only objects, where the i915-gem >> shrinker >> can later kick in to swap out the pages, if needed. >> >> Signed-off-by: Matthew Auld >> Cc: Thomas Hellström >> --- >> drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 8 >> 1 file changed, 4 insertions(+), 4 deletions(-) >> >> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c >> b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c >> index 194e5f1deda8..46d57541c0b2 100644 >> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c >> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c >> @@ -134,11 +134,11 @@ static enum ttm_caching >> i915_ttm_select_tt_caching(const struct drm_i915_gem_object *obj) >> { >> /* >> - * Objects only allowed in system get cached cpu-mappings. >> - * Other objects get WC mapping for now. Even if in system. >> + * Objects only allowed in system get cached cpu-mappings, or >> when >> + * evicting lmem-only buffers to system for swapping. Other >> objects get >> + * WC mapping for now. Even if in system. >> */ >> - if (obj->mm.region->type == INTEL_MEMORY_SYSTEM && >> - obj->mm.n_placements <= 1) >> + if (obj->mm.n_placements <= 1) >> return ttm_cached; >> >> return ttm_write_combined; > > We should be aware that with TTM, even evicted bos can be mapped by > user-space while evicted, and this will appear to user-space like the > WC-mapped object suddenly became WB-mapped. But it appears like mesa > doesn't care about this as long as the mappings are fully coherent. FWIW, the Mesa radeonsi driver avoids surprises due to this (e.g. some path which involves CPU access suddenly goes faster if the BO was evicted from VRAM) by asking for WC mapping of BOs intended to be in VRAM even while they're evicted (via the AMDGPU_GEM_CREATE_CPU_GTT_USWC flag). -- Earthling Michel Dänzer| https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
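For reference, a rough sketch of how a user such as radeonsi requests this through libdrm; error handling is elided and the helper name is made up:

    #include <amdgpu.h>
    #include <amdgpu_drm.h>

    /* Allocate a VRAM BO whose CPU mapping stays write-combined even while
     * the BO is temporarily evicted to GTT. */
    static amdgpu_bo_handle alloc_uswc_vram_bo(amdgpu_device_handle dev,
                                               uint64_t size)
    {
            struct amdgpu_bo_alloc_request req = {
                    .alloc_size = size,
                    .phys_alignment = 4096,
                    .preferred_heap = AMDGPU_GEM_DOMAIN_VRAM,
                    .flags = AMDGPU_GEM_CREATE_CPU_GTT_USWC,
            };
            amdgpu_bo_handle bo = NULL;

            amdgpu_bo_alloc(dev, &req, &bo); /* error handling elided */
            return bo;
    }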
Re: [PATCH v5 12/13] drm/i915/ttm: use cached system pages when evicting lmem
On 2021-09-30 14:27, Matthew Auld wrote: > On 30/09/2021 11:04, Michel Dänzer wrote: >> On 2021-09-29 13:54, Thomas Hellström wrote: >>> On Mon, 2021-09-27 at 12:41 +0100, Matthew Auld wrote: >>>> This should let us do an accelerated copy directly to the shmem pages >>>> when temporarily moving lmem-only objects, where the i915-gem >>>> shrinker >>>> can later kick in to swap out the pages, if needed. >>>> >>>> Signed-off-by: Matthew Auld >>>> Cc: Thomas Hellström >>>> --- >>>> drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 8 >>>> 1 file changed, 4 insertions(+), 4 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c >>>> b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c >>>> index 194e5f1deda8..46d57541c0b2 100644 >>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c >>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c >>>> @@ -134,11 +134,11 @@ static enum ttm_caching >>>> i915_ttm_select_tt_caching(const struct drm_i915_gem_object *obj) >>>> { >>>> /* >>>> - * Objects only allowed in system get cached cpu-mappings. >>>> - * Other objects get WC mapping for now. Even if in system. >>>> + * Objects only allowed in system get cached cpu-mappings, or >>>> when >>>> + * evicting lmem-only buffers to system for swapping. Other >>>> objects get >>>> + * WC mapping for now. Even if in system. >>>> */ >>>> - if (obj->mm.region->type == INTEL_MEMORY_SYSTEM && >>>> - obj->mm.n_placements <= 1) >>>> + if (obj->mm.n_placements <= 1) >>>> return ttm_cached; >>>> return ttm_write_combined; >>> >>> We should be aware that with TTM, even evicted bos can be mapped by >>> user-space while evicted, and this will appear to user-space like the >>> WC-mapped object suddenly became WB-mapped. But it appears like mesa >>> doesn't care about this as long as the mappings are fully coherent. >> >> FWIW, the Mesa radeonsi driver avoids surprises due to this (e.g. some path >> which involves CPU access suddenly goes faster if the BO was evicted from >> VRAM) by asking for WC mapping of BOs intended to be in VRAM even while >> they're evicted (via the AMDGPU_GEM_CREATE_CPU_GTT_USWC flag). >> > > Ok, so amdgpu just defaults to cached system memory, even for evicted VRAM, > unless userspace requests USWC, in which case it will use WC? Right. -- Earthling Michel Dänzer| https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
Re: [PATCH] MAINTAINERS: Add Helge as fbdev maintainer
On 2022-01-17 19:47, Sven Schnelle wrote: > >> * There's no new development in fbdev and there are no new >>drivers. Everyone works on DRM, which is better in most >>regards. The consequence is that userspace is slowly losing the >> ability to use fbdev. > > That might be caused by the fact that no new drivers are accepted for > fbdev. I wrote a driver for the HP Visualize FX5/10 cards end of last > year which was rejected for inclusion into fbdev[1]. > > Based on your recommendation I re-wrote the whole thing in DRM. This > works but has several drawbacks: > > - no modesetting. With fbdev, I can nicely switch resolutions with > fbset. That doesn't work, and I've been told that this is not supported[2] > > - It is *much* slower than fbdev with hardware blitting. I would have to > dig out the numbers, but it's in the ratio of 1:15. The nice thing > with fbdev blitting is that I get an array of pixels and the > foreground/background colors all of these pixels should have. > With the help of the hardware blitting, I can write 32 pixels at once > with every 32-bit transfer. > > With DRM, the closest I could find was DRM_FORMAT_C8, which means one > byte per pixel. So I can put 4 pixels into one 32-bit transfer. > > fbdev also clears the lines with hardware blitting, which is much > faster than clearing them with memcpy. > > Based on your recommendation I also verified that pci coalescing is > enabled. > > These numbers are with DRM's unnatural scrolling behaviour - it seems > to scroll several (text)lines at once if it takes too much time. I > guess if DRM would scroll line by line it would be even slower. > > If DRM would add those things - hardware clearing of memory regions, > hw blitting for text with a FG/BG color and modesetting - I wouldn't > care about fbdev at all. But right now, it's working way faster for me. A DRM driver can implement the same fbdev acceleration hooks as an fbdev driver. (Most DRM drivers never bothered because the HW is more complex than traditional 2D accelerators, and can't safely be used under all circumstances where fbdev acceleration hooks could get called from. That's not an issue for a traditional 2D accelerator driver though) -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
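For the record, a minimal sketch of what wiring those hooks up looks like; the my_hw_* functions are hypothetical placeholders for driver code that programs the 2D engine:

    #include <linux/fb.h>
    #include <linux/module.h>

    static void my_hw_fillrect(struct fb_info *info, const struct fb_fillrect *rect);
    static void my_hw_copyarea(struct fb_info *info, const struct fb_copyarea *area);
    static void my_hw_imageblit(struct fb_info *info, const struct fb_image *image);

    static const struct fb_ops my_fb_ops = {
            .owner        = THIS_MODULE,
            .fb_fillrect  = my_hw_fillrect,  /* HW solid fills (clearing lines) */
            .fb_copyarea  = my_hw_copyarea,  /* HW screen-to-screen blits (scrolling) */
            .fb_imageblit = my_hw_imageblit, /* HW mono->color expansion: 1 bit per
                                              * pixel plus FG/BG colors, i.e. 32
                                              * glyph pixels per 32-bit transfer */
    };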
Re: [Linaro-mm-sig] [PATCH] dma-buf/poll: Get a file reference for outstanding fence callbacks
On 2021-07-23 10:22, Christian König wrote: > Am 23.07.21 um 10:19 schrieb Michel Dänzer: >> On 2021-07-23 10:04 a.m., Christian König wrote: >>> Am 23.07.21 um 09:58 schrieb Michel Dänzer: >>>> From: Michel Dänzer >>>> >>>> This makes sure we don't hit the >>>> >>>> BUG_ON(dmabuf->cb_in.active || dmabuf->cb_out.active); >>>> >>>> in dma_buf_release, which could be triggered by user space closing the >>>> dma-buf file description while there are outstanding fence callbacks >>>> from dma_buf_poll. >>> I was also wondering the same thing while working on this, but then thought >>> that the poll interface would take care of this. >> I was able to hit the BUG_ON with >> https://gitlab.gnome.org/GNOME/mutter/-/merge_requests/1880 . >> >> >>>> Cc: sta...@vger.kernel.org >>>> Signed-off-by: Michel Dänzer >>>> --- >>>> drivers/dma-buf/dma-buf.c | 18 -- >>>> 1 file changed, 12 insertions(+), 6 deletions(-) >>>> >>>> diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c >>>> index 6c520c9bd93c..ec25498a971f 100644 >>>> --- a/drivers/dma-buf/dma-buf.c >>>> +++ b/drivers/dma-buf/dma-buf.c >>>> @@ -65,12 +65,9 @@ static void dma_buf_release(struct dentry *dentry) >>>> BUG_ON(dmabuf->vmapping_counter); >>>> /* >>>> - * Any fences that a dma-buf poll can wait on should be signaled >>>> - * before releasing dma-buf. This is the responsibility of each >>>> - * driver that uses the reservation objects. >>>> - * >>>> - * If you hit this BUG() it means someone dropped their ref to the >>>> - * dma-buf while still having pending operation to the buffer. >>>> + * If you hit this BUG() it could mean: >>>> + * * There's a file reference imbalance in dma_buf_poll / >>>> dma_buf_poll_cb or somewhere else >>>> + * * dmabuf->cb_in/out.active are non-0 despite no pending fence >>>> callback >>>> */ >>>> BUG_ON(dmabuf->cb_in.active || dmabuf->cb_out.active); >>>> @@ -196,6 +193,7 @@ static loff_t dma_buf_llseek(struct file *file, >>>> loff_t offset, int whence) >>>> static void dma_buf_poll_cb(struct dma_fence *fence, struct >>>> dma_fence_cb *cb) >>>> { >>>> struct dma_buf_poll_cb_t *dcb = (struct dma_buf_poll_cb_t *)cb; >>>> + struct dma_buf *dmabuf = container_of(dcb->poll, struct dma_buf, >>>> poll); >>>> unsigned long flags; >>>> spin_lock_irqsave(&dcb->poll->lock, flags); >>>> @@ -203,6 +201,8 @@ static void dma_buf_poll_cb(struct dma_fence *fence, >>>> struct dma_fence_cb *cb) >>>> dcb->active = 0; >>>> spin_unlock_irqrestore(&dcb->poll->lock, flags); >>>> dma_fence_put(fence); >>>> + /* Paired with get_file in dma_buf_poll */ >>>> + fput(dmabuf->file); >>> Is calling fput() in interrupt context ok? IIRC that could potentially >>> sleep. >> Looks fine AFAICT: It has >> >> if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { >> >> and as a fallback for that, it adds the file to a lock-less >> delayed_fput_list which is processed by a workqueue. > > Ah, yes that makes sense. > > Fell free to add Reviewed-by: Christian König Thanks! AFAICT this fix can be merged now for 5.16? -- Earthling Michel Dänzer| https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
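For context, this is roughly the shape of fput() at the time, paraphrased from fs/file_table.c rather than quoted verbatim (details such as the exact task_work flags have varied across releases):

    void fput(struct file *file)
    {
            if (atomic_long_dec_and_test(&file->f_count)) {
                    struct task_struct *task = current;

                    /* Normal process context: defer the final __fput() to task
                     * work, which runs on return to userspace and may sleep. */
                    if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
                            init_task_work(&file->f_u.fu_rcuhead, ____fput);
                            if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME))
                                    return;
                    }

                    /* Interrupt context or kthread: push onto the lock-less
                     * delayed_fput_list, drained later by a workqueue. */
                    if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
                            schedule_delayed_work(&delayed_fput_work, 1);
            }
    }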
Re: [PATCH] drm/amd/display: Fix error handling on waiting for completion
[ Adding dri-devel ] On 2021-11-01 16:00, Wang, Chao-kai (Stylon) wrote: > > The problem with -ERESTARTSYS is the same half-baked atomic state with > modifications we made in the interrupted atomic check, is reused in the next > retry and fails the atomic check. What we expect in the next retry is with > the original atomic state. I am going to dig deeper and see if at DRM side we > can go back to use to the original atomic state in the retry. I suspect either DC/DM needs to be able to handle the modified state on retry, or it needs to restore the original state before it returns -ERESTARTSYS. -- Earthling Michel Dänzer| https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
Re: [PATCH] drm/fb-helper: Fix restore_fbdev when there are pending delayed hotplug
On 2021-11-04 17:32, Jocelyn Falempe wrote: > When using Xorg/Logind and an external monitor connected with an MST dock. > After disconnecting the external monitor, switching to VT may not work, > the (internal) monitor sill display Xorg, and you can't see what you are > typing in the VT. > > This is related to commit e2809c7db818df6bbd0edf843e1beb2fbc9d8541 which > introduced delayed hotplug for MST, and commit > dc5bdb68b5b369d5bc7d1de96fa64cc1737a6320 which introduced a workaround for > Xorg and logind, and add a force parameter to > __drm_fb_helper_restore_fbdev_mode_unlocked() in this case. > > When switching to VT, with Xorg and logind, if there > are pending hotplug event (like MST unplugged), the hotplug path > may not be fulfilled, because logind may drop the master a bit later. > It leads to the console not showing up on the monitor. > > So in this case, forward the "force" parameter to the hotplug event, > and ignore if there is a drm master in this case. I'm worried that this leaves a race condition which allows the still-master (which causes drm_fb_helper_hotplug_event to bail without this patch) to modify the state set by __drm_fb_helper_hotplug_event, which could still result in similar symptoms. I wonder if something like calling drm_fb_helper_hotplug_event when master is dropped and fb_helper->delayed_hotplug == true could work instead. > diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c > index 8e7a124d6c5a..c07080f661b1 100644 > --- a/drivers/gpu/drm/drm_fb_helper.c > +++ b/drivers/gpu/drm/drm_fb_helper.c > @@ -82,6 +82,8 @@ MODULE_PARM_DESC(drm_leak_fbdev_smem, > static LIST_HEAD(kernel_fb_helper_list); > static DEFINE_MUTEX(kernel_fb_helper_lock); > > +static int __drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper, > bool force); > + > /** > * DOC: fbdev helpers > * > @@ -258,7 +260,7 @@ __drm_fb_helper_restore_fbdev_mode_unlocked(struct > drm_fb_helper *fb_helper, > mutex_unlock(&fb_helper->lock); > > if (do_delayed) > - drm_fb_helper_hotplug_event(fb_helper); > + __drm_fb_helper_hotplug_event(fb_helper, force); > > return ret; > } > @@ -1930,6 +1932,38 @@ int drm_fb_helper_initial_config(struct drm_fb_helper > *fb_helper, int bpp_sel) > } > EXPORT_SYMBOL(drm_fb_helper_initial_config); > > +static int __drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper, > bool force) > +{ > + int err = 0; > + > + if (!drm_fbdev_emulation || !fb_helper) > + return 0; > + > + mutex_lock(&fb_helper->lock); > + if (fb_helper->deferred_setup) { > + err = __drm_fb_helper_initial_config_and_unlock(fb_helper, > + fb_helper->preferred_bpp); > + return err; > + } > + if (!force) { > + if (!fb_helper->fb || > !drm_master_internal_acquire(fb_helper->dev)) { > + fb_helper->delayed_hotplug = true; > + mutex_unlock(&fb_helper->lock); > + return err; > + } > + drm_master_internal_release(fb_helper->dev); > + } > + drm_dbg_kms(fb_helper->dev, "\n"); > + > + drm_client_modeset_probe(&fb_helper->client, fb_helper->fb->width, > fb_helper->fb->height); > + drm_setup_crtcs_fb(fb_helper); > + mutex_unlock(&fb_helper->lock); > + > + drm_fb_helper_set_par(fb_helper->fbdev); > + > + return 0; > +} > + > /** > * drm_fb_helper_hotplug_event - respond to a hotplug notification by > * probing all the outputs attached to the fb > @@ -1953,35 +1987,7 @@ EXPORT_SYMBOL(drm_fb_helper_initial_config); > */ > int drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper) > { > - int err = 0; > - > - if (!drm_fbdev_emulation || !fb_helper) > - return 0; > - > - 
mutex_lock(&fb_helper->lock); > - if (fb_helper->deferred_setup) { > - err = __drm_fb_helper_initial_config_and_unlock(fb_helper, > - fb_helper->preferred_bpp); > - return err; > - } > - > - if (!fb_helper->fb || !drm_master_internal_acquire(fb_helper->dev)) { > - fb_helper->delayed_hotplug = true; > - mutex_unlock(&fb_helper->lock); > - return err; > - } > - > - drm_master_internal_release(fb_helper->dev); > - > - drm_dbg_kms(fb_helper->dev, "\n"); > - > - drm_client_modeset_probe(&fb_helper->client, fb_helper->fb->width, > fb_helper->fb->height); > - drm_setup_crtcs_fb(fb_helper); > - mutex_unlock(&fb_helper->lock); > - > - drm_fb_helper_set_par(fb_helper->fbdev); > - > - return 0; > + return __drm_fb_helper_hotplug_event(fb_helper, false); > } > EXPORT_SYMBOL(drm_fb_helper_hotplug_event); > > -- Earthling Michel Dänzer| https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
Re: Questions about KMS flip
On 2021-11-12 13:47, Christian König wrote: > > Anyway this unfortunately turned out to be work for Harray and Nicholas. In > detail it's about this bug report here: > https://bugzilla.kernel.org/show_bug.cgi?id=214621 > > Lang was able to reproduce the issue and narrow it down to the pin in > amdgpu_display_crtc_page_flip_target(). > > In other words we somehow have an unbalanced pinning of the scanout buffer in > DC. DC doesn't use amdgpu_display_crtc_page_flip_target AFAICT. The corresponding pin with DC would be in dm_plane_helper_prepare_fb, paired with the unpin in dm_plane_helper_cleanup_fb. With non-DC, the pin in amdgpu_display_crtc_page_flip_target is paired with the unpin in dm_plane_helper_cleanup_fb & dce_v*_crtc_disable. One thing I notice is that the pin is guarded by if (!adev->enable_virtual_display), but the unpins seem unconditional. So could this be about virtual display, and the problem is actually trying to unpin a BO that was never pinned? -- Earthling Michel Dänzer| https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
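A toy model of the suspected imbalance (just the control flow, not the driver code): the pin is conditional on !adev->enable_virtual_display, the later unpin is not:

    #include <stdio.h>

    static int pin_count;
    static int enable_virtual_display = 1; /* virtual display in use */

    static void page_flip_target(void)
    {
            if (!enable_virtual_display)
                    pin_count++; /* amdgpu_bo_pin(), guarded */
    }

    static void unpin_work_func(void)
    {
            pin_count--; /* amdgpu_bo_unpin(), unconditional */
    }

    int main(void)
    {
            page_flip_target();
            unpin_work_func();
            printf("pin count: %d\n", pin_count); /* -1: underflow */
            return 0;
    }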
Re: Questions about KMS flip
On 2021-11-12 15:29, Michel Dänzer wrote: > On 2021-11-12 13:47, Christian König wrote: >> >> Anyway this unfortunately turned out to be work for Harray and Nicholas. In >> detail it's about this bug report here: >> https://bugzilla.kernel.org/show_bug.cgi?id=214621 >> >> Lang was able to reproduce the issue and narrow it down to the pin in >> amdgpu_display_crtc_page_flip_target(). >> >> In other words we somehow have an unbalanced pinning of the scanout buffer >> in DC. > > DC doesn't use amdgpu_display_crtc_page_flip_target AFAICT. The corresponding > pin with DC would be in dm_plane_helper_prepare_fb, paired with the unpin in > dm_plane_helper_cleanup_fb. > > > With non-DC, the pin in amdgpu_display_crtc_page_flip_target is paired with > the unpin in dm_plane_helper_cleanup_fb This should say amdgpu_display_unpin_work_func. > & dce_v*_crtc_disable. One thing I notice is that the pin is guarded by if > (!adev->enable_virtual_display), but the unpins seem unconditional. So could > this be about virtual display, and the problem is actually trying to unpin a > BO that was never pinned? > > -- Earthling Michel Dänzer| https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
Re: Questions about KMS flip
On 2021-11-12 16:03, Christian König wrote: > Am 12.11.21 um 15:30 schrieb Michel Dänzer: >> On 2021-11-12 15:29, Michel Dänzer wrote: >>> On 2021-11-12 13:47, Christian König wrote: >>>> Anyway this unfortunately turned out to be work for Harray and Nicholas. >>>> In detail it's about this bug report here: >>>> https://bugzilla.kernel.org/show_bug.cgi?id=214621 >>>> >>>> Lang was able to reproduce the issue and narrow it down to the pin in >>>> amdgpu_display_crtc_page_flip_target(). >>>> >>>> In other words we somehow have an unbalanced pinning of the scanout buffer >>>> in DC. >>> DC doesn't use amdgpu_display_crtc_page_flip_target AFAICT. The >>> corresponding pin with DC would be in dm_plane_helper_prepare_fb, paired >>> with the unpin in >>> dm_plane_helper_cleanup_fb. >>> >>> >>> With non-DC, the pin in amdgpu_display_crtc_page_flip_target is paired with >>> the unpin in dm_plane_helper_cleanup_fb >> This should say amdgpu_display_unpin_work_func. > > Ah! So that is the classic (e.g. non atomic) path? Presumably. >>> & dce_v*_crtc_disable. One thing I notice is that the pin is guarded by if >>> (!adev->enable_virtual_display), but the unpins seem unconditional. So >>> could this be about virtual display, and the problem is actually trying to >>> unpin a BO that was never pinned? > > Nope, my educated guess is rather that we free up the BO before > amdgpu_display_unpin_work_func is called. > > E.g. not pin unbalance, but rather use after free. amdgpu_display_crtc_page_flip_target calls amdgpu_bo_ref(work->old_abo), and amdgpu_display_unpin_work_func calls amdgpu_bo_unref(&work->old_abo) only after amdgpu_bo_unpin. So what you describe could only happen if there's an imbalance elsewhere such that amdgpu_bo_unref is called more often than amdgpu_bo_ref, or maybe if amdgpu_bo_reserve fails in amdgpu_display_unpin_work_func (in which case the "failed to reserve buffer after flip" error message should appear in dmesg). -- Earthling Michel Dänzer| https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
Re: Questions about KMS flip
On 2021-11-15 07:41, Lang Yu wrote: > On Fri, Nov 12, 2021 at 05:10:27PM +0100, Michel Dänzer wrote: >> On 2021-11-12 16:03, Christian König wrote: >>> Am 12.11.21 um 15:30 schrieb Michel Dänzer: >>>> On 2021-11-12 15:29, Michel Dänzer wrote: >>>>> On 2021-11-12 13:47, Christian König wrote: >>>>>> Anyway this unfortunately turned out to be work for Harray and Nicholas. >>>>>> In detail it's about this bug report here: >>>>>> https://bugzilla.kernel.org/show_bug.cgi?id=214621 >>>>>> >>>>>> Lang was able to reproduce the issue and narrow it down to the pin in >>>>>> amdgpu_display_crtc_page_flip_target(). >>>>>> >>>>>> In other words we somehow have an unbalanced pinning of the scanout buffer in >>>>>> DC. >>>>> DC doesn't use amdgpu_display_crtc_page_flip_target AFAICT. The >>>>> corresponding pin with DC would be in dm_plane_helper_prepare_fb, paired with the unpin in >>>>> dm_plane_helper_cleanup_fb. >>>>> >>>>> >>>>> With non-DC, the pin in amdgpu_display_crtc_page_flip_target is paired with >>>>> the unpin in dm_plane_helper_cleanup_fb >>>> This should say amdgpu_display_unpin_work_func. >>> >>> Ah! So that is the classic (e.g. non atomic) path? >> >> Presumably. >> >> >>>>> & dce_v*_crtc_disable. One thing I notice is that the pin is guarded by if >>>>> (!adev->enable_virtual_display), but the unpins seem unconditional. So could >>>>> this be about virtual display, and the problem is actually trying to unpin a >>>>> BO that was never pinned? >>> >>> Nope, my educated guess is rather that we free up the BO before >>> amdgpu_display_unpin_work_func is called. >>> >>> E.g. not pin unbalance, but rather use after free. >> >> amdgpu_display_crtc_page_flip_target calls amdgpu_bo_ref(work->old_abo), >> and amdgpu_display_unpin_work_func calls amdgpu_bo_unref(&work->old_abo) >> only after amdgpu_bo_unpin. So what you describe could only happen if >> there's an imbalance elsewhere such that amdgpu_bo_unref is called more >> often than amdgpu_bo_ref, or maybe if amdgpu_bo_reserve fails in >> amdgpu_display_unpin_work_func (in which case the "failed to reserve buffer >> after flip" error message should appear in dmesg). > > > Actually, each call to amdgpu_display_crtc_page_flip_target() will > > 1, init a work(amdgpu_display_unpin_work_func) to unpin an old buffer >(crtc->primary->fb), the work will be queued in dce_vX_0_pageflip_irq(). > > 2, pin a new buffer, assign it to crtc->primary->fb. But how to unpin it? >Next call. > > The problem is the pinned buffer of last call to > amdgpu_display_crtc_page_flip_target() is not unpinned. It's unpinned in dce_v*_0_crtc_disable. I think I've found the problem though: dce_v*_0_crtc_do_set_base pins the BO from target_fb unconditionally, but unpins the BO from the fb parameter only if it's different from the former. So if they're the same, the BO's pin count is incremented by 1. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
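A toy model of that leak (again just the control flow, not the driver code):

    #include <stdio.h>

    /* Pin counts of three fake BOs; BO1 starts pinned from an earlier flip. */
    static int bo_pin[3] = { 0, 1, 0 };

    static void do_set_base(int fb_bo, int target_fb_bo)
    {
            bo_pin[target_fb_bo]++;    /* amdgpu_bo_pin(), unconditional */
            if (fb_bo != target_fb_bo)
                    bo_pin[fb_bo]--;   /* amdgpu_bo_unpin(), conditional */
    }

    int main(void)
    {
            do_set_base(1, 2); /* BO1 -> BO2: balanced */
            do_set_base(2, 2); /* same BO on both sides */
            printf("BO2 pin count: %d\n", bo_pin[2]); /* 2 instead of 1 */
            return 0;
    }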
Re: Questions about KMS flip
On 2021-11-15 10:04, Lang Yu wrote: > On Mon, Nov 15, 2021 at 09:38:47AM +0100, Michel Dänzer wrote: >> On 2021-11-15 07:41, Lang Yu wrote: >>> On Fri, Nov 12, 2021 at 05:10:27PM +0100, Michel Dänzer wrote: >>>> On 2021-11-12 16:03, Christian König wrote: >>>>> Am 12.11.21 um 15:30 schrieb Michel Dänzer: >>>>>> On 2021-11-12 15:29, Michel Dänzer wrote: >>>>>>> On 2021-11-12 13:47, Christian König wrote: >>>>>>>> Anyway this unfortunately turned out to be work for Harray and >>>>>>>> Nicholas. In detail it's about this bug report here: >>>>>>>> https://bugzilla.kernel.org/show_bug.cgi?id=214621 >>>>>>>> >>>>>>>> Lang was able to reproduce the issue and narrow it down to the pin >>>>>>>> in amdgpu_display_crtc_page_flip_target(). >>>>>>>> >>>>>>>> In other words we somehow have an unbalanced pinning of the scanout >>>>>>>> buffer in DC. >>>>>>> DC doesn't use amdgpu_display_crtc_page_flip_target AFAICT. The >>>>>>> corresponding pin with DC would be in dm_plane_helper_prepare_fb, >>>>>>> paired with the unpin in >>>>>>> dm_plane_helper_cleanup_fb. >>>>>>> >>>>>>> >>>>>>> With non-DC, the pin in amdgpu_display_crtc_page_flip_target is >>>>>>> paired with the unpin in dm_plane_helper_cleanup_fb >>>>>> This should say amdgpu_display_unpin_work_func. >>>>> >>>>> Ah! So that is the classic (e.g. non atomic) path? >>>> >>>> Presumably. >>>> >>>> >>>>>>> & dce_v*_crtc_disable. One thing I notice is that the pin is guarded by >>>>>>> if (!adev->enable_virtual_display), but the unpins seem >>>>>>> unconditional. So could this be about virtual display, and the >>>>>>> problem is actually trying to unpin a BO that was never pinned? >>>>> >>>>> Nope, my educated guess is rather that we free up the BO before >>>>> amdgpu_display_unpin_work_func is called. >>>>> >>>>> E.g. not pin unbalance, but rather use after free. >>>> >>>> amdgpu_display_crtc_page_flip_target calls amdgpu_bo_ref(work->old_abo), >>>> and amdgpu_display_unpin_work_func calls amdgpu_bo_unref(&work->old_abo) >>>> only after amdgpu_bo_unpin. So what you describe could only happen if >>>> there's an imbalance elsewhere such that amdgpu_bo_unref is called more >>>> often than amdgpu_bo_ref, or maybe if amdgpu_bo_reserve fails in >>>> amdgpu_display_unpin_work_func (in which case the "failed to reserve >>>> buffer after flip" error message should appear in dmesg). >>> >>> >>> Actually, each call to amdgpu_display_crtc_page_flip_target() will >>> >>> 1, init a work(amdgpu_display_unpin_work_func) to unpin an old buffer >>>(crtc->primary->fb), the work will be queued in dce_vX_0_pageflip_irq(). >>> >>> 2, pin a new buffer, assign it to crtc->primary->fb. But how to unpin it? >>>Next call. >>> >>> The problem is the pinned buffer of last call to >>> amdgpu_display_crtc_page_flip_target() is not unpinned. >> >> It's unpinned in dce_v*_0_crtc_disable. > > I just found crtc->primary->fb is NULL when came in dce_v*_0_crtc_disable(). > So it's not unpinned... __drm_helper_disable_unused_functions sets crtc->primary->fb = NULL only after calling crtc_funcs->disable.
Maybe this path can get hit for a CRTC which was already disabled, in which case crtc->primary->fb == NULL in dce_v*_0_crtc_disable is harmless. Have you checked for the issue I described below? Should be pretty easy to catch. >> I think I've found the problem though: dce_v*_0_crtc_do_set_base pin the BO >> from target_fb unconditionally, but unpin the BO from the fb parameter only >> if it's different from the former. So if they're the same, the BO's pin >> count is incremented by 1. -- Earthling Michel Dänzer| https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
Re: Questions about KMS flip
On 2021-11-15 12:31, Lang Yu wrote: > On Mon, Nov 15, 2021 at 10:49:39AM +0100, Michel Dänzer wrote: >> On 2021-11-15 10:04, Lang Yu wrote: >>> On Mon, Nov 15, 2021 at 09:38:47AM +0100, Michel Dänzer wrote: >>>> On 2021-11-15 07:41, Lang Yu wrote: >>>>> On Fri, Nov 12, 2021 at 05:10:27PM +0100, Michel Dänzer wrote: >>>>>> On 2021-11-12 16:03, Christian König wrote: >>>>>>> Am 12.11.21 um 15:30 schrieb Michel Dänzer: >>>>>>>> On 2021-11-12 15:29, Michel Dänzer wrote: >>>>>>>>> On 2021-11-12 13:47, Christian König wrote: >>>>>>>>>> Anyway this unfortunately turned out to be work for Harray and >>>>>>>>>> Nicholas. In detail it's about this bug report here: >>>>>>>>>> https://bugzilla.kernel.org/show_bug.cgi?id=214621 >>>>>>>>>> >>>>>>>>>> Lang was able to reproduce the issue and narrow it down to the pin >>>>>>>>>> in amdgpu_display_crtc_page_flip_target(). >>>>>>>>>> >>>>>>>>>> In other words we somehow have an unbalanced pinning of the scanout >>>>>>>>>> buffer in DC. >>>>>>>>> DC doesn't use amdgpu_display_crtc_page_flip_target AFAICT. The >>>>>>>>> corresponding pin with DC would be in dm_plane_helper_prepare_fb, >>>>>>>>> paired with the unpin in >>>>>>>>> dm_plane_helper_cleanup_fb. >>>>>>>>> >>>>>>>>> >>>>>>>>> With non-DC, the pin in amdgpu_display_crtc_page_flip_target is >>>>>>>>> paired with the unpin in dm_plane_helper_cleanup_fb >>>>>>>> This should say amdgpu_display_unpin_work_func. >>>>>>> >>>>>>> Ah! So that is the classic (e.g. non atomic) path? >>>>>> >>>>>> Presumably. >>>>>> >>>>>> >>>>>>>>> & dce_v*_crtc_disable. One thing I notice is that the pin is guarded >>>>>>>>> by if (!adev->enable_virtual_display), but the unpins seem >>>>>>>>> unconditional. So could this be about virtual display, and the >>>>>>>>> problem is actually trying to unpin a BO that was never pinned? >>>>>>> >>>>>>> Nope, my educated guess is rather that we free up the BO before >>>>>>> amdgpu_display_unpin_work_func is called. >>>>>>> >>>>>>> E.g. not pin unbalance, but rather use after free. >>>>>> >>>>>> amdgpu_display_crtc_page_flip_target calls amdgpu_bo_ref(work->old_abo), >>>>>> and amdgpu_display_unpin_work_func calls amdgpu_bo_unref(&work->old_abo) >>>>>> only after amdgpu_bo_unpin. So what you describe could only happen if >>>>>> there's an imbalance elsewhere such that amdgpu_bo_unref is called more >>>>>> often than amdgpu_bo_ref, or maybe if amdgpu_bo_reserve fails in >>>>>> amdgpu_display_unpin_work_func (in which case the "failed to reserve >>>>>> buffer after flip" error message should appear in dmesg). >>>>> >>>>> >>>>> Actually, each call to amdgpu_display_crtc_page_flip_target() will >>>>> >>>>> 1, init a work(amdgpu_display_unpin_work_func) to unpin an old buffer >>>>>(crtc->primary->fb), the work will be queued in >>>>> dce_vX_0_pageflip_irq(). >>>>> >>>>> 2, pin a new buffer, assign it to crtc->primary->fb. But how to unpin it? >>>>>Next call. >>>>> >>>>> The problem is the pinned buffer of last call to >>>>> amdgpu_display_crtc_page_flip_target() is not unpinned. >>>> >>>> It's unpinned in dce_v*_0_crtc_disable. >>> >>> I just found crtc->primary->fb is NULL when came in dce_v*_0_crtc_disable().
Re: Questions about KMS flip
On 2021-11-16 15:10, Alex Deucher wrote: > On Tue, Nov 16, 2021 at 3:09 AM Christian König > wrote: >> >> Am 16.11.21 um 09:00 schrieb Lang Yu: >>> On Tue, Nov 16, 2021 at 08:14:08AM +0100, Christian König wrote: >>>> Am 16.11.21 um 04:27 schrieb Lang Yu: >>>>> On Mon, Nov 15, 2021 at 01:04:15PM +0100, Michel Dänzer wrote: >>>>>> [SNIP] >>>>>>> Though a single call to dce_v*_0_crtc_do_set_base() will >>>>>>> only pin the BO, I found it will be unpinned in next call to >>>>>>> dce_v*_0_crtc_do_set_base(). >>>>>> Yeah, that's the normal case when the new BO is different from the old >>>>>> one. >>>>>> >>>>>> To catch the case I described, try something like >>>>>> >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c >>>>>> b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c >>>>>> >>>>>> index 18a7b3bd633b..5726bd87a355 100644 >>>>>> >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c >>>>>> >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c >>>>>> >>>>>> @@ -1926,6 +1926,7 @@ static int dce_v11_0_crtc_do_set_base(struct >>>>>> drm_crtc *crtc, >>>>>> >>>>>> return r; >>>>>> >>>>>> >>>>>> >>>>>> if (!atomic) { >>>>>> >>>>>> + WARN_ON_ONCE(target_fb == fb); >>>>>> >>>>>> r = amdgpu_bo_pin(abo, AMDGPU_GEM_DOMAIN_VRAM); >>>>>> >>>>>> if (unlikely(r != 0)) { >>>>>> >>>>>> amdgpu_bo_unreserve(abo); >>>>>> >>>>> I did some tests, the warning can be triggered. >>>>> >>>>> pin/unpin operations in *_crtc_do_set_base() and >>>>> amdgpu_display_crtc_page_flip_target() are mixed. >>>> Ok sounds like we narrowed down the root cause pretty well. >>>> >>>> Question is now how can we fix this? Just not pin the BO when target_fb == >>>> fb? >>> That worked. I did a few simple tests and didn't observe ttm_bo_release >>> warnings >>> any more. >>> >>> The pin/unpin logic, >>> >>> 1, first crtc_mode_set, dce_v*_0_crtc_do_set_base() pins >>> crtc->primary->fb(new), >>> old_fb is NULL. >>> >>> 2, second crtc_mode_set, dce_v*_0_crtc_do_set_base() pins >>> crtc->primary->fb(new), >>> unpins old fb. >>> >>> 3, amdgpu_display_crtc_page_flip_target() pin/unpin operations. >>> >>> 4, third crtc_mode_set, dce_v*_0_crtc_do_set_base() pins >>> crtc->primary->fb(new), >>> unpins old fb (it is pinned in last call to >>> amdgpu_display_crtc_page_flip_target) >>> >>> 5, amdgpu_display_crtc_page_flip_target() pin/unpin operations. >>> >>> . >>> >>> x, reboot, amdgpu_display_suspend_helper() is called, the last pinned fb >>> was unpinned. >>> >>> And I didn't observe amdgpu_bo_unpin() in dce_v*_0_crtc_disable() is called. >>> >>> If the logic is wrong, please correct me. >> >> I can't fully judge because I'm not that deep with my head in the old >> display code, but from a ten mile high view it sounds sane to me. Michel >> what do you think? Sounds good to me. Would be nice to address the other issue identified in this thread as well: The pin in amdgpu_display_crtc_page_flip_target is guarded by if (!adev->enable_virtual_display), but the corresponding unpins in amdgpu_display_unpin_work_func & dce_v*_0_crtc_disable aren't. This probably results in pin count underflows with virtual display. -- Earthling Michel Dänzer | https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
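For concreteness, one possible shape of that fix in dce_v*_0_crtc_do_set_base(), a sketch of the direction agreed above rather than the final patch (the abo, atomic, target_fb and fb names come from the function as quoted in the debug diff):

    	/* Sketch, not the final patch: skip the redundant pin when the
    	 * framebuffer doesn't actually change, matching the conditional
    	 * unpin of the old fb later in the function. */
    	if (!atomic && target_fb != fb) {
    		r = amdgpu_bo_pin(abo, AMDGPU_GEM_DOMAIN_VRAM);
    		if (unlikely(r != 0)) {
    			amdgpu_bo_unreserve(abo);
    			return -EINVAL;
    		}
    	}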
Re: [PATCH v2] drm/fb-helper: Call drm_fb_helper_hotplug_event() when releasing drm master
On 2021-11-08 17:00, Daniel Vetter wrote: > On Mon, Nov 08, 2021 at 04:34:53PM +0100, Jocelyn Falempe wrote: >> When using Xorg/Logind and an external monitor connected with an MST dock. >> After disconnecting the external monitor, switching to VT may not work, >> the (internal) monitor sill display Xorg, and you can't see what you are >> typing in the VT. >> >> This is related to commit e2809c7db818 ("drm/fb_helper: move deferred fb >> checking into restore mode (v2)") >> >> When switching to VT, with Xorg and logind, if there >> are pending hotplug event (like MST unplugged), the hotplug path >> may not be fulfilled, because logind may drop the master a bit later. >> It leads to the console not showing up on the monitor. >> >> So when dropping the drm master, call the delayed hotplug function if >> needed. >> >> v2: rewrote the patch by calling the hotplug function after dropping master >> >> Signed-off-by: Jocelyn Falempe > > Lastclose console restore is a very gross hack, and generally doesn't work > well. > > The way this is supposed to work is: > - userspace drops drm master (because drm master always wins) > - userspace switches the console back to text mode (which will restore the > console) > > I guess we could also do this from dropmaster once more (like from > lastclose), but that really feels like papering over userspace bugs. And > given what a massive mess this entire area is already, I'm not eager to > add more hacks here. > > So ... can't we fix userspace? Sounds like the good old UMS days, when VT switching relied on user space doing the right things in the right order. With KMS, surely the kernel needs to be able to get to a known good state from scratch when it's in control of the VT, no matter what state user space left things in. -- Earthling Michel Dänzer| https://redhat.com Libre software enthusiast | Mesa and Xwayland developer
[PATCH 2/2] drm/amd/display: Reduce stack size for dml31 UseMinimumDCFCLK
From: Michel Dänzer

Use the struct display_mode_lib pointer instead of passing lots of
large arrays as parameters by value.

Addresses this warning (resulting in failure to build a RHEL debug
kernel with Werror enabled):

../drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn31/display_mode_vba_31.c: In function ‘UseMinimumDCFCLK’:
../drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn31/display_mode_vba_31.c:7478:1: warning: the frame size of 2128 bytes is larger than 2048 bytes [-Wframe-larger-than=]

NOTE: AFAICT this function previously had no observable effect, since
it only modified parameters passed by value and doesn't return
anything. Now it may modify some values in struct display_mode_lib
passed in by reference.

Signed-off-by: Michel Dänzer
---
 .../dc/dml/dcn31/display_mode_vba_31.c | 304 --
 1 file changed, 69 insertions(+), 235 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c
index 8965f9af9d0a..6feb23432f8d 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c
@@ -422,62 +422,8 @@ static void CalculateUrgentBurstFactor(
 static void UseMinimumDCFCLK(
 		struct display_mode_lib *mode_lib,
-		int MaxInterDCNTileRepeaters,
 		int MaxPrefetchMode,
-		double FinalDRAMClockChangeLatency,
-		double SREnterPlusExitTime,
-		int ReturnBusWidth,
-		int RoundTripPingLatencyCycles,
-		int ReorderingBytes,
-		int PixelChunkSizeInKByte,
-		int MetaChunkSize,
-		bool GPUVMEnable,
-		int GPUVMMaxPageTableLevels,
-		bool HostVMEnable,
-		int NumberOfActivePlanes,
-		double HostVMMinPageSize,
-		int HostVMMaxNonCachedPageTableLevels,
-		bool DynamicMetadataVMEnabled,
-		enum immediate_flip_requirement ImmediateFlipRequirement,
-		bool ProgressiveToInterlaceUnitInOPP,
-		double MaxAveragePercentOfIdealFabricAndSDPPortBWDisplayCanUseInNormalSystemOperation,
-		double PercentOfIdealFabricAndSDPPortBWReceivedAfterUrgLatency,
-		int VTotal[],
-		int VActive[],
-		int DynamicMetadataTransmittedBytes[],
-		int DynamicMetadataLinesBeforeActiveRequired[],
-		bool Interlace[],
-		double RequiredDPPCLK[][2][DC__NUM_DPP__MAX],
-		double RequiredDISPCLK[][2],
-		double UrgLatency[],
-		unsigned int NoOfDPP[][2][DC__NUM_DPP__MAX],
-		double ProjectedDCFCLKDeepSleep[][2],
-		double MaximumVStartup[][2][DC__NUM_DPP__MAX],
-		double TotalVActivePixelBandwidth[][2],
-		double TotalVActiveCursorBandwidth[][2],
-		double TotalMetaRowBandwidth[][2],
-		double TotalDPTERowBandwidth[][2],
-		unsigned int TotalNumberOfActiveDPP[][2],
-		unsigned int TotalNumberOfDCCActiveDPP[][2],
-		int dpte_group_bytes[],
-		double PrefetchLinesY[][2][DC__NUM_DPP__MAX],
-		double PrefetchLinesC[][2][DC__NUM_DPP__MAX],
-		int swath_width_luma_ub_all_states[][2][DC__NUM_DPP__MAX],
-		int swath_width_chroma_ub_all_states[][2][DC__NUM_DPP__MAX],
-		int BytePerPixelY[],
-		int BytePerPixelC[],
-		int HTotal[],
-		double PixelClock[],
-		double PDEAndMetaPTEBytesPerFrame[][2][DC__NUM_DPP__MAX],
-		double DPTEBytesPerRow[][2][DC__NUM_DPP__MAX],
-		double MetaRowBytes[][2][DC__NUM_DPP__MAX],
-		bool DynamicMetadataEnable[],
-		double VActivePixelBandwidth[][2][DC__NUM_DPP__MAX],
-		double VActiveCursorBandwidth[][2][DC__NUM_DPP__MAX],
-		double ReadBandwidthLuma[],
-		double ReadBandwidthChroma[],
-		double DCFCLKPerState[],
-		double DCFCLKState[][2]);
+		int ReorderingBytes);
 
 static void CalculatePixelDeliveryTimes(
 		unsigned int NumberOfActivePlanes,
@@ -5175,66 +5121,8 @@ void dml31_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l
 			}
 		}
 
-		if (v->UseMinimumRequiredDCFCLK == true) {
-			UseMinimumDCFCLK(
-					mode_lib,
-					v->MaxInterDCNTileRepeaters,
-					MaxPrefetchMode,
-					v->DRAMClockChangeLatency,
-					v->SREnterPlusExitTime,
-					v->ReturnBusWidth,
-					v->Rou
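The NOTE in the log is worth dwelling on: in C, a scalar or struct parameter is a copy, so writes to it vanish when the function returns. A minimal sketch of the difference (illustrative names only, not the actual DML code):

	#include <stdio.h>

	struct state {
		double dcfclk;
	};

	/* Writes only touch the callee's copy; the caller never sees them. */
	static void tune_by_value(struct state s)
	{
		s.dcfclk = 400.0;
	}

	/* Writes through the pointer are visible to the caller. */
	static void tune_by_reference(struct state *s)
	{
		s->dcfclk = 400.0;
	}

	int main(void)
	{
		struct state s = { 800.0 };

		tune_by_value(s);
		printf("%g\n", s.dcfclk);	/* still 800: no observable effect */

		tune_by_reference(&s);
		printf("%g\n", s.dcfclk);	/* 400: the caller sees the change */

		return 0;
	}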
[PATCH 1/2] drm/amd/display: Reduce stack size for dml31_ModeSupportAndSystemConfigurationFull
From: Michel Dänzer

Move code using the Pipe struct to a new helper function.

Works around[0] this warning (resulting in failure to build a RHEL debug
kernel with Werror enabled):

../drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn31/display_mode_vba_31.c: In function ‘dml31_ModeSupportAndSystemConfigurationFull’:
../drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn31/display_mode_vba_31.c:5740:1: warning: the frame size of 2144 bytes is larger than 2048 bytes [-Wframe-larger-than=]

The culprit seems to be the Pipe struct, so pull the relevant block out
into its own sub-function. (This is porting
a62427ef9b55 "drm/amd/display: Reduce stack size for dml21_ModeSupportAndSystemConfigurationFull"
from dml21 to dml31)

[0] AFAICT this doesn't actually reduce the total amount of stack which
can be used, just moves some of it from
dml31_ModeSupportAndSystemConfigurationFull to the new helper function,
so the former happens to no longer exceed the limit for a single
function.

Signed-off-by: Michel Dänzer
---
 .../dc/dml/dcn31/display_mode_vba_31.c | 185 ++
 1 file changed, 99 insertions(+), 86 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c
index 7e937bdcea00..8965f9af9d0a 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c
@@ -3949,6 +3949,102 @@ static double TruncToValidBPP(
 	return BPP_INVALID;
 }
 
+static noinline void CalculatePrefetchSchedulePerPlane(
+		struct display_mode_lib *mode_lib,
+		double HostVMInefficiencyFactor,
+		int i,
+		unsigned j,
+		unsigned k)
+{
+	struct vba_vars_st *v = &mode_lib->vba;
+	Pipe myPipe;
+
+	myPipe.DPPCLK = v->RequiredDPPCLK[i][j][k];
+	myPipe.DISPCLK = v->RequiredDISPCLK[i][j];
+	myPipe.PixelClock = v->PixelClock[k];
+	myPipe.DCFCLKDeepSleep = v->ProjectedDCFCLKDeepSleep[i][j];
+	myPipe.DPPPerPlane = v->NoOfDPP[i][j][k];
+	myPipe.ScalerEnabled = v->ScalerEnabled[k];
+	myPipe.SourceScan = v->SourceScan[k];
+	myPipe.BlockWidth256BytesY = v->Read256BlockWidthY[k];
+	myPipe.BlockHeight256BytesY = v->Read256BlockHeightY[k];
+	myPipe.BlockWidth256BytesC = v->Read256BlockWidthC[k];
+	myPipe.BlockHeight256BytesC = v->Read256BlockHeightC[k];
+	myPipe.InterlaceEnable = v->Interlace[k];
+	myPipe.NumberOfCursors = v->NumberOfCursors[k];
+	myPipe.VBlank = v->VTotal[k] - v->VActive[k];
+	myPipe.HTotal = v->HTotal[k];
+	myPipe.DCCEnable = v->DCCEnable[k];
+	myPipe.ODMCombineIsEnabled = v->ODMCombineEnablePerState[i][k] == dm_odm_combine_mode_4to1
+			|| v->ODMCombineEnablePerState[i][k] == dm_odm_combine_mode_2to1;
+	myPipe.SourcePixelFormat = v->SourcePixelFormat[k];
+	myPipe.BytePerPixelY = v->BytePerPixelY[k];
+	myPipe.BytePerPixelC = v->BytePerPixelC[k];
+	myPipe.ProgressiveToInterlaceUnitInOPP = v->ProgressiveToInterlaceUnitInOPP;
+	v->NoTimeForPrefetch[i][j][k] = CalculatePrefetchSchedule(
+			mode_lib,
+			HostVMInefficiencyFactor,
+			&myPipe,
+			v->DSCDelayPerState[i][k],
+			v->DPPCLKDelaySubtotal + v->DPPCLKDelayCNVCFormater,
+			v->DPPCLKDelaySCL,
+			v->DPPCLKDelaySCLLBOnly,
+			v->DPPCLKDelayCNVCCursor,
+			v->DISPCLKDelaySubtotal,
+			v->SwathWidthYThisState[k] / v->HRatio[k],
+			v->OutputFormat[k],
+			v->MaxInterDCNTileRepeaters,
+			dml_min(v->MaxVStartup, v->MaximumVStartup[i][j][k]),
+			v->MaximumVStartup[i][j][k],
+			v->GPUVMMaxPageTableLevels,
+			v->GPUVMEnable,
+			v->HostVMEnable,
+			v->HostVMMaxNonCachedPageTableLevels,
+			v->HostVMMinPageSize,
+			v->DynamicMetadataEnable[k],
+			v->DynamicMetadataVMEnabled,
+			v->DynamicMetadataLinesBeforeActiveRequired[k],
+			v->DynamicMetadataTransmittedBytes[k],
+			v->UrgLatency[i],
+			v->ExtraLatency,
+			v->TimeCalc,
+			v->PDEAndMetaPTEBytesPerFrame[i][j][k],
+			v->MetaRowBytes[i][j][k],
+			v->DPTEBytesPerRow[i][j][k],
+			v->PrefetchLinesY[i][j][k],
+			v->SwathWidthYThisState[k],
+			v->PrefillY[k],
+			v->MaxNumSwY[k],
+			v->PrefetchLinesC[i][j][k],
+			v->SwathWidthCThisState[k],
+			v->PrefillC[k],
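The general shape of the workaround — moving a large local into a separate noinline helper so that no single function's frame exceeds the -Wframe-larger-than= limit — can be sketched like this (illustrative names; `__attribute__((noinline))` stands in for the kernel's noinline macro):

	struct big_scratch {
		double samples[256];	/* ~2 KiB that used to sit in the caller's frame */
	};

	/*
	 * noinline matters: if the compiler inlined this helper, the scratch
	 * space would land back in the caller's frame and the warning would
	 * return. As the [0] note above says, the total stack depth is
	 * unchanged; it is merely split across two frames.
	 */
	static __attribute__((noinline)) double process_one_plane(int k)
	{
		struct big_scratch scratch = { { 0.0 } };

		scratch.samples[0] = (double)k;
		/* ... fill and reduce scratch ... */
		return scratch.samples[0];
	}

	double process_all_planes(int num_planes)
	{
		double total = 0.0;
		int k;

		for (k = 0; k < num_planes; k++)	/* caller keeps a tiny frame */
			total += process_one_plane(k);
		return total;
	}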
Re: [PATCH 1/2] drm/amd/display: Reduce stack size for dml31_ModeSupportAndSystemConfigurationFull
On 2021-12-11 13:20, Rodrigo Siqueira Jordao wrote:
>
>
> On 2021-12-09 11:46 a.m., Michel Dänzer wrote:
>> From: Michel Dänzer
>>
>> Move code using the Pipe struct to a new helper function.
>>
>> Works around[0] this warning (resulting in failure to build a RHEL debug
>> kernel with Werror enabled):
>>
>> ../drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn31/display_mode_vba_31.c:
>> In function ‘dml31_ModeSupportAndSystemConfigurationFull’:
>> ../drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn31/display_mode_vba_31.c:5740:1:
>> warning: the frame size of 2144 bytes is larger than 2048 bytes
>> [-Wframe-larger-than=]
>>
>> The culprit seems to be the Pipe struct, so pull the relevant block out
>> into its own sub-function. (This is porting
>> a62427ef9b55 "drm/amd/display: Reduce stack size for
>> dml21_ModeSupportAndSystemConfigurationFull"
>> from dml21 to dml31)
>>
>> [0] AFAICT this doesn't actually reduce the total amount of stack which
>> can be used, just moves some of it from
>> dml31_ModeSupportAndSystemConfigurationFull to the new helper function,
>> so the former happens to no longer exceed the limit for a single
>> function.
>>
>> Signed-off-by: Michel Dänzer
>> ---
>>   .../dc/dml/dcn31/display_mode_vba_31.c | 185 ++
>>   1 file changed, 99 insertions(+), 86 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c
>> b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c
>> index 7e937bdcea00..8965f9af9d0a 100644
>> --- a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c
>> +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c
>> @@ -3949,6 +3949,102 @@ static double TruncToValidBPP(
>>   	return BPP_INVALID;
>>   }
>> +static noinline void CalculatePrefetchSchedulePerPlane(
>> +		struct display_mode_lib *mode_lib,
>> +		double HostVMInefficiencyFactor,
>> +		int i,
>> +		unsigned j,
>> +		unsigned k)
>> +{
>> +	struct vba_vars_st *v = &mode_lib->vba;
>> +	Pipe myPipe;
>> +
>> +	myPipe.DPPCLK = v->RequiredDPPCLK[i][j][k];
>> +	myPipe.DISPCLK = v->RequiredDISPCLK[i][j];
>> +	myPipe.PixelClock = v->PixelClock[k];
>> +	myPipe.DCFCLKDeepSleep = v->ProjectedDCFCLKDeepSleep[i][j];
>> +	myPipe.DPPPerPlane = v->NoOfDPP[i][j][k];
>> +	myPipe.ScalerEnabled = v->ScalerEnabled[k];
>> +	myPipe.SourceScan = v->SourceScan[k];
>> +	myPipe.BlockWidth256BytesY = v->Read256BlockWidthY[k];
>> +	myPipe.BlockHeight256BytesY = v->Read256BlockHeightY[k];
>> +	myPipe.BlockWidth256BytesC = v->Read256BlockWidthC[k];
>> +	myPipe.BlockHeight256BytesC = v->Read256BlockHeightC[k];
>> +	myPipe.InterlaceEnable = v->Interlace[k];
>> +	myPipe.NumberOfCursors = v->NumberOfCursors[k];
>> +	myPipe.VBlank = v->VTotal[k] - v->VActive[k];
>> +	myPipe.HTotal = v->HTotal[k];
>> +	myPipe.DCCEnable = v->DCCEnable[k];
>> +	myPipe.ODMCombineIsEnabled = v->ODMCombineEnablePerState[i][k] ==
>> dm_odm_combine_mode_4to1
>> +			|| v->ODMCombineEnablePerState[i][k] == dm_odm_combine_mode_2to1;
>> +	myPipe.SourcePixelFormat = v->SourcePixelFormat[k];
>> +	myPipe.BytePerPixelY = v->BytePerPixelY[k];
>> +	myPipe.BytePerPixelC = v->BytePerPixelC[k];
>> +	myPipe.ProgressiveToInterlaceUnitInOPP =
>> v->ProgressiveToInterlaceUnitInOPP;
>> +	v->NoTimeForPrefetch[i][j][k] = CalculatePrefetchSchedule(
>> +			mode_lib,
>> +			HostVMInefficiencyFactor,
>> +			&myPipe,
>> +			v->DSCDelayPerState[i][k],
>> +			v->DPPCLKDelaySubtotal + v->DPPCLKDelayCNVCFormater,
>> +			v->DPPCLKDelaySCL,
>> +			v->DPPCLKDelaySCLLBOnly,
>> +			v->DPPCLKDelayCNVCCursor,
>> +			v->DISPCLKDelaySubtotal,
>> +			v->SwathWidthYThisState[k] / v->HRatio[k],
>> +			v->OutputFormat[k],
>> +			v->MaxInterDCNTileRepeaters,
>> +			dml_min(v->MaxVStartup, v->MaximumVStartup[i][j][k]),
>> +			v->MaximumVStartup[i][j][k],
>> +			v->GPUVMMaxPageTableLevels,
>> +			v->GPUVMEnable,
>> +			v->HostVMEnable,
>> +			v->HostVMMaxNonCachedPageTableLevels,
>> +			v->HostVMMinPageSize,
>> +			v->DynamicMetadataEnable[k],
>> +			v->Dynamic
Re: [PATCH v2 0/3] Experimental freesync video mode optimization
On 2020-12-14 9:57 p.m., Christian König wrote:
> On 11.12.20 at 13:20, Pekka Paalanen wrote:
>> On Fri, 11 Dec 2020 11:28:36 +0100 Christian König wrote:
>>> I think the general idea we settled on is that we specify an earliest
>>> display time for each frame and give feedback to the application when
>>> a frame was actually displayed.
>>
>> That is indeed something completely different, and feels like it would
>> be several years in the future, while the proposed scheme or the
>> min/max properties could be done fairly quickly. But I'm not in a hurry.
>
> X11 already supports that with the DRI3 extension. Also see VDPAU
> present and the Vulkan extension for reference application API
> implementations, so that stuff is already present.

I suspect you mean the Present extension, specifically PresentOptionUST.
There is no working implementation of this yet (the xserver tree has
never had any code which would even advertise PresentCapabilityUST, let
alone do anything with a timestamp passed in by the client).

-- 
Earthling Michel Dänzer | https://redhat.com
Libre software enthusiast | Mesa and X developer
[PATCH] amdgpu: Kill off stale dal directories
On 30.04.2016 12:18, Edward O'Callaghan wrote:
> Signed-off-by: Edward O'Callaghan
> ---
>  drivers/gpu/drm/amd/dal/utils/bw_calc_test_harness/.gitignore | 4
>  drivers/gpu/drm/amd/dal/utils/vba_to_c_converter/.gitignore   | 4
>  2 files changed, 8 deletions(-)
>  delete mode 100644 drivers/gpu/drm/amd/dal/utils/bw_calc_test_harness/.gitignore
>  delete mode 100644 drivers/gpu/drm/amd/dal/utils/vba_to_c_converter/.gitignore
>
> diff --git a/drivers/gpu/drm/amd/dal/utils/bw_calc_test_harness/.gitignore b/drivers/gpu/drm/amd/dal/utils/bw_calc_test_harness/.gitignore
> deleted file mode 100644
> index 4d12de1..0000000
> --- a/drivers/gpu/drm/amd/dal/utils/bw_calc_test_harness/.gitignore
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -x64
> -Debug
> -*.user
> -*.sdf
> \ No newline at end of file
> diff --git a/drivers/gpu/drm/amd/dal/utils/vba_to_c_converter/.gitignore b/drivers/gpu/drm/amd/dal/utils/vba_to_c_converter/.gitignore
> deleted file mode 100644
> index 7b285df..0000000
> --- a/drivers/gpu/drm/amd/dal/utils/vba_to_c_converter/.gitignore
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -bin
> -obj
> -*.user
> -*.sdf
> \ No newline at end of file
>

You could add a single new .gitignore file, e.g.
drivers/gpu/drm/amd/dal/.gitignore .

-- 
Earthling Michel Dänzer | http://www.amd.com
Libre software enthusiast | Mesa and X developer
[PATCH] amdgpu: Kill off stale dal directories
On 30.04.2016 16:44, Michel Dänzer wrote:
> On 30.04.2016 12:18, Edward O'Callaghan wrote:
>> Signed-off-by: Edward O'Callaghan
>> ---
>>  drivers/gpu/drm/amd/dal/utils/bw_calc_test_harness/.gitignore | 4
>>  drivers/gpu/drm/amd/dal/utils/vba_to_c_converter/.gitignore   | 4
>>  2 files changed, 8 deletions(-)
>>  delete mode 100644 drivers/gpu/drm/amd/dal/utils/bw_calc_test_harness/.gitignore
>>  delete mode 100644 drivers/gpu/drm/amd/dal/utils/vba_to_c_converter/.gitignore
>>
>> diff --git a/drivers/gpu/drm/amd/dal/utils/bw_calc_test_harness/.gitignore b/drivers/gpu/drm/amd/dal/utils/bw_calc_test_harness/.gitignore
>> deleted file mode 100644
>> index 4d12de1..0000000
>> --- a/drivers/gpu/drm/amd/dal/utils/bw_calc_test_harness/.gitignore
>> +++ /dev/null
>> @@ -1,4 +0,0 @@
>> -x64
>> -Debug
>> -*.user
>> -*.sdf
>> \ No newline at end of file
>> diff --git a/drivers/gpu/drm/amd/dal/utils/vba_to_c_converter/.gitignore b/drivers/gpu/drm/amd/dal/utils/vba_to_c_converter/.gitignore
>> deleted file mode 100644
>> index 7b285df..0000000
>> --- a/drivers/gpu/drm/amd/dal/utils/vba_to_c_converter/.gitignore
>> +++ /dev/null
>> @@ -1,4 +0,0 @@
>> -bin
>> -obj
>> -*.user
>> -*.sdf
>> \ No newline at end of file
>>
>
> You could add a single new .gitignore file, e.g.
> drivers/gpu/drm/amd/dal/.gitignore .

Never mind, I read the patch backwards. In my defence, I think the
shortlog is slightly confusing, how about "amdgpu: Remove stale
.gitignore files" instead?

-- 
Earthling Michel Dänzer | http://www.amd.com
Libre software enthusiast | Mesa and X developer
[PATCH] amdgpu: Kill off stale dal directories
On 30.04.2016 17:50, Michel Dänzer wrote:
> On 30.04.2016 16:44, Michel Dänzer wrote:
>> On 30.04.2016 12:18, Edward O'Callaghan wrote:
>>> Signed-off-by: Edward O'Callaghan
>>> ---
>>>  drivers/gpu/drm/amd/dal/utils/bw_calc_test_harness/.gitignore | 4
>>>  drivers/gpu/drm/amd/dal/utils/vba_to_c_converter/.gitignore   | 4
>>>  2 files changed, 8 deletions(-)
>>>  delete mode 100644 drivers/gpu/drm/amd/dal/utils/bw_calc_test_harness/.gitignore
>>>  delete mode 100644 drivers/gpu/drm/amd/dal/utils/vba_to_c_converter/.gitignore
>>>
>>> diff --git a/drivers/gpu/drm/amd/dal/utils/bw_calc_test_harness/.gitignore b/drivers/gpu/drm/amd/dal/utils/bw_calc_test_harness/.gitignore
>>> deleted file mode 100644
>>> index 4d12de1..0000000
>>> --- a/drivers/gpu/drm/amd/dal/utils/bw_calc_test_harness/.gitignore
>>> +++ /dev/null
>>> @@ -1,4 +0,0 @@
>>> -x64
>>> -Debug
>>> -*.user
>>> -*.sdf
>>> \ No newline at end of file
>>> diff --git a/drivers/gpu/drm/amd/dal/utils/vba_to_c_converter/.gitignore b/drivers/gpu/drm/amd/dal/utils/vba_to_c_converter/.gitignore
>>> deleted file mode 100644
>>> index 7b285df..0000000
>>> --- a/drivers/gpu/drm/amd/dal/utils/vba_to_c_converter/.gitignore
>>> +++ /dev/null
>>> @@ -1,4 +0,0 @@
>>> -bin
>>> -obj
>>> -*.user
>>> -*.sdf
>>> \ No newline at end of file
>>>
>>
>> You could add a single new .gitignore file, e.g.
>> drivers/gpu/drm/amd/dal/.gitignore .
>
> Never mind, I read the patch backwards. In my defence, I think the
> shortlog is slightly confusing, how about "amdgpu: Remove stale
> .gitignore files" instead?

No, it's fine as is, since the patch presumably removes the now empty
directories containing those files as well.

Sorry for the noise,

Reviewed-by: Michel Dänzer

-- 
Earthling Michel Dänzer | http://www.amd.com
Libre software enthusiast | Mesa and X developer
[PATCH libdrm] Make sure the second argument to container_of() is initialized
From: Michel Dänzer

Fixes warnings and miscompilation resulting in crashes with clang.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94249
Signed-off-by: Michel Dänzer
---
 amdgpu/amdgpu_vamgr.c |  2 +-
 util_double_list.h    | 10 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/amdgpu/amdgpu_vamgr.c b/amdgpu/amdgpu_vamgr.c
index 8a707cb..3eaef70 100644
--- a/amdgpu/amdgpu_vamgr.c
+++ b/amdgpu/amdgpu_vamgr.c
@@ -157,7 +157,7 @@ amdgpu_vamgr_find_va(struct amdgpu_bo_va_mgr *mgr, uint64_t size,
 drm_private void
 amdgpu_vamgr_free_va(struct amdgpu_bo_va_mgr *mgr, uint64_t va, uint64_t size)
 {
-	struct amdgpu_bo_va_hole *hole;
+	struct amdgpu_bo_va_hole *hole = NULL;
 
 	if (va == AMDGPU_INVALID_VA_ADDRESS)
 		return;
 
diff --git a/util_double_list.h b/util_double_list.h
index 5d01f52..fc32da5 100644
--- a/util_double_list.h
+++ b/util_double_list.h
@@ -114,29 +114,29 @@ static inline void list_delinit(struct list_head *item)
 #endif
 
 #define LIST_FOR_EACH_ENTRY(pos, head, member)				\
-    for (pos = container_of((head)->next, pos, member);		\
+    for (pos = NULL, pos = container_of((head)->next, pos, member);	\
	 &pos->member != (head);					\
	 pos = container_of(pos->member.next, pos, member))
 
 #define LIST_FOR_EACH_ENTRY_SAFE(pos, storage, head, member)		\
-    for (pos = container_of((head)->next, pos, member),		\
+    for (pos = NULL, pos = container_of((head)->next, pos, member),	\
	 storage = container_of(pos->member.next, pos, member);	\
	 &pos->member != (head);					\
	 pos = storage, storage = container_of(storage->member.next, storage, member))
 
 #define LIST_FOR_EACH_ENTRY_SAFE_REV(pos, storage, head, member)	\
-    for (pos = container_of((head)->prev, pos, member),		\
+    for (pos = NULL, pos = container_of((head)->prev, pos, member),	\
	 storage = container_of(pos->member.prev, pos, member);	\
	 &pos->member != (head);					\
	 pos = storage, storage = container_of(storage->member.prev, storage, member))
 
 #define LIST_FOR_EACH_ENTRY_FROM(pos, start, head, member)		\
-    for (pos = container_of((start), pos, member);			\
+    for (pos = NULL, pos = container_of((start), pos, member);		\
	 &pos->member != (head);					\
	 pos = container_of(pos->member.next, pos, member))
 
 #define LIST_FOR_EACH_ENTRY_FROM_REV(pos, start, head, member)		\
-    for (pos = container_of((start), pos, member);			\
+    for (pos = NULL, pos = container_of((start), pos, member);		\
	 &pos->member != (head);					\
	 pos = container_of(pos->member.prev, pos, member))
-- 
2.8.1
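For context on why the initialization matters: this style of container_of() computes the member offset from the sample pointer itself, so the first loop iteration evaluates `&pos->member` while `pos` is still uninitialized — undefined behavior that clang is entitled to miscompile. A sketch of the macro shape assumed here (xserver-style; see util_double_list.h for the exact definition):

	/*
	 * Sketch of an xserver-style container_of(): 'sample' is only used
	 * for address arithmetic, but reading an uninitialized pointer is
	 * still UB. Initializing pos to NULL first turns the subtraction
	 * into the classic offsetof() idiom on a known pointer value.
	 */
	#define container_of_sketch(ptr, sample, member)		\
		(void *)((char *)(ptr) -				\
			 ((char *)&(sample)->member - (char *)(sample)))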
[PATCH libdrm] Make sure the second argument to container_of() is initialized
On 03.08.2016 02:49, Nicolai Hähnle wrote:
> On 02.08.2016 12:08, Michel Dänzer wrote:
>> From: Michel Dänzer
>>
>> Fixes warnings and miscompilation resulting in crashes with clang.
>
> How annoying. Seems like this would affect most (current and future)
> uses of container_of, and so it would be better to switch to the Linux
> kernel version of container_of, where the second parameter is the type...

Rob proposed http://hastebin.com/vefenavoje.coffee on IRC, using typeof().

I'm fine with going for that instead of my patch, just FYI: My patch was
based on xserver's list.h, which works with compilers which don't support
typeof(). Not sure that's relevant for libdrm though.

-- 
Earthling Michel Dänzer | http://www.amd.com
Libre software enthusiast | Mesa and X developer
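For reference, the typeof()-based shape being discussed looks roughly like this (reconstructed from the description, not copied from the hastebin paste; the kernel's container_of() likewise takes a type as the second argument, so nothing uninitialized is ever read — at the cost of requiring the GNU typeof() extension):

	#include <stddef.h>

	/* Kernel-style container_of(): second argument is a type, not a pointer. */
	#define container_of_t(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	/* The iterators can then recover the entry type from pos via typeof(). */
	#define list_for_each_entry_sketch(pos, head, member)			\
		for (pos = container_of_t((head)->next, typeof(*pos), member);	\
		     &pos->member != (head);					\
		     pos = container_of_t(pos->member.next, typeof(*pos), member))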
[PATCH 0/6] Allow DC state to reset the counter on screen enabled.
On 04.08.2016 06:33, Rodrigo Vivi wrote:
> For now DC is only helping on screen off scenarios since PSR is disabled.
>
> But if we want to enable PSR first we need to make DC reliable with screen on.
> Biggest challenge is to deal with vblank counters since frame counter register
> is read only and can be reset in DC state.
>
> This series is one of possible approaches, but brings the down side of not
> being possible to use runtime pm with vblank enabled. Some test cases needs
> to be adapted to represent this new vision.

Note that the frame counter register must not reset before
drm_crtc_vblank_off / after drm_crtc_vblank_on, even while the vblank
interrupt is disabled. Otherwise the DRM vblank counter as seen by
userspace via the DRM_IOCTL_WAIT_VBLANK ioctl will jump around, because
the core DRM code uses the frame counter register to keep track of how
many vertical blank periods occurred since the interrupt was disabled.

> But also this series is not fully tested. Apparently I have an issue yet
> with flip-vs-expired-vblank_* tests and pm_rpm basic tests.

Maybe some of the test failures are due to the above.

-- 
Earthling Michel Dänzer | http://www.amd.com
Libre software enthusiast | Mesa and X developer
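To illustrate the constraint Michel describes: the core derives the cooked, user-visible counter from deltas of the hardware frame counter, roughly like this (a simplified sketch of the drm vblank bookkeeping, not the actual drm_irq.c code):

	#include <linux/types.h>

	struct vblank_sketch {
		u32 last_hw;	/* hw frame counter at the last update */
		u32 count;	/* cooked counter reported to userspace */
	};

	/*
	 * Run e.g. when vblank interrupts are re-enabled. This only works if
	 * the hardware counter kept counting monotonically while interrupts
	 * were off; if a DC state reset it to 0 in between, 'diff' wraps to
	 * a huge bogus value and the cooked counter jumps.
	 */
	static void vblank_sketch_update(struct vblank_sketch *vb, u32 hw_now)
	{
		u32 diff = hw_now - vb->last_hw;

		vb->count += diff;
		vb->last_hw = hw_now;
	}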
[PATCH 3/6] drm/amdgpu: Set MASTER_UPDATE_MODE to 0 again
From: Michel Dänzer

With the previous change, it's safe to let page flips take effect
anytime during a vertical blank period. This can avoid delaying a flip
by a frame in some cases where we get to amdgpu_flip_work_func ->
adev->mode_info.funcs->page_flip during a vertical blank period.

Signed-off-by: Michel Dänzer
---
 drivers/gpu/drm/amd/amdgpu/dce_v10_0.c | 8
 drivers/gpu/drm/amd/amdgpu/dce_v11_0.c | 4 ++--
 drivers/gpu/drm/amd/amdgpu/dce_v8_0.c  | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
index a158942..ddf5c7a 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
@@ -646,8 +646,8 @@ static void dce_v10_0_resume_mc_access(struct amdgpu_device *adev,
 		if (save->crtc_enabled[i]) {
 			tmp = RREG32(mmMASTER_UPDATE_MODE + crtc_offsets[i]);
-			if (REG_GET_FIELD(tmp, MASTER_UPDATE_MODE, MASTER_UPDATE_MODE) != 3) {
-				tmp = REG_SET_FIELD(tmp, MASTER_UPDATE_MODE, MASTER_UPDATE_MODE, 3);
+			if (REG_GET_FIELD(tmp, MASTER_UPDATE_MODE, MASTER_UPDATE_MODE) != 0) {
+				tmp = REG_SET_FIELD(tmp, MASTER_UPDATE_MODE, MASTER_UPDATE_MODE, 0);
 				WREG32(mmMASTER_UPDATE_MODE + crtc_offsets[i], tmp);
 			}
 			tmp = RREG32(mmGRPH_UPDATE + crtc_offsets[i]);
@@ -2275,8 +2275,8 @@ static int dce_v10_0_crtc_do_set_base(struct drm_crtc *crtc,
 	WREG32(mmVIEWPORT_SIZE + amdgpu_crtc->crtc_offset,
 	       (viewport_w << 16) | viewport_h);
 
-	/* set pageflip to happen only at start of vblank interval (front porch) */
-	WREG32(mmMASTER_UPDATE_MODE + amdgpu_crtc->crtc_offset, 3);
+	/* set pageflip to happen anywhere in vblank interval */
+	WREG32(mmMASTER_UPDATE_MODE + amdgpu_crtc->crtc_offset, 0);
 
 	if (!atomic && fb && fb != crtc->primary->fb) {
 		amdgpu_fb = to_amdgpu_framebuffer(fb);
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
index 4455d63..97ae822 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
@@ -2250,8 +2250,8 @@ static int dce_v11_0_crtc_do_set_base(struct drm_crtc *crtc,
 	WREG32(mmVIEWPORT_SIZE + amdgpu_crtc->crtc_offset,
 	       (viewport_w << 16) | viewport_h);
 
-	/* set pageflip to happen only at start of vblank interval (front porch) */
-	WREG32(mmCRTC_MASTER_UPDATE_MODE + amdgpu_crtc->crtc_offset, 3);
+	/* set pageflip to happen anywhere in vblank interval */
+	WREG32(mmCRTC_MASTER_UPDATE_MODE + amdgpu_crtc->crtc_offset, 0);
 
 	if (!atomic && fb && fb != crtc->primary->fb) {
 		amdgpu_fb = to_amdgpu_framebuffer(fb);
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
index ad4884a..9a876db 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
@@ -2137,8 +2137,8 @@ static int dce_v8_0_crtc_do_set_base(struct drm_crtc *crtc,
 	WREG32(mmVIEWPORT_SIZE + amdgpu_crtc->crtc_offset,
 	       (viewport_w << 16) | viewport_h);
 
-	/* set pageflip to happen only at start of vblank interval (front porch) */
-	WREG32(mmMASTER_UPDATE_MODE + amdgpu_crtc->crtc_offset, 3);
+	/* set pageflip to happen anywhere in vblank interval */
+	WREG32(mmMASTER_UPDATE_MODE + amdgpu_crtc->crtc_offset, 0);
 
 	if (!atomic && fb && fb != crtc->primary->fb) {
 		amdgpu_fb = to_amdgpu_framebuffer(fb);
-- 
2.8.1
[PATCH 1/6] drm: Add page_flip_target CRTC hook
From: Michel Dänzer

Mostly the same as the existing page_flip hook, but takes an additional
parameter specifying the target vertical blank period when the flip
should take effect.

Signed-off-by: Michel Dänzer
---
 drivers/gpu/drm/drm_crtc.c | 23 +++
 include/drm/drm_crtc.h     | 14 ++
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c
index 9d3f80e..15ad7e2 100644
--- a/drivers/gpu/drm/drm_crtc.c
+++ b/drivers/gpu/drm/drm_crtc.c
@@ -5421,6 +5421,7 @@ int drm_mode_page_flip_ioctl(struct drm_device *dev,
 	struct drm_crtc *crtc;
 	struct drm_framebuffer *fb = NULL;
 	struct drm_pending_vblank_event *e = NULL;
+	u32 target_vblank = 0;
 	int ret = -EINVAL;
 
 	if (page_flip->flags & ~DRM_MODE_PAGE_FLIP_FLAGS ||
@@ -5434,6 +5435,18 @@ int drm_mode_page_flip_ioctl(struct drm_device *dev,
 	if (!crtc)
 		return -ENOENT;
 
+	if (crtc->funcs->page_flip_target) {
+		int r;
+
+		r = drm_crtc_vblank_get(crtc);
+		if (r)
+			return r;
+
+		target_vblank = drm_crtc_vblank_count(crtc) +
+			!(page_flip->flags & DRM_MODE_PAGE_FLIP_ASYNC);
+	} else if (crtc->funcs->page_flip == NULL)
+		return -EINVAL;
+
 	drm_modeset_lock_crtc(crtc, crtc->primary);
 	if (crtc->primary->fb == NULL) {
 		/* The framebuffer is currently unbound, presumably
@@ -5444,9 +5457,6 @@ int drm_mode_page_flip_ioctl(struct drm_device *dev,
 		goto out;
 	}
 
-	if (crtc->funcs->page_flip == NULL)
-		goto out;
-
 	fb = drm_framebuffer_lookup(dev, page_flip->fb_id);
 	if (!fb) {
 		ret = -ENOENT;
@@ -5487,7 +5497,12 @@ int drm_mode_page_flip_ioctl(struct drm_device *dev,
 	}
 
 	crtc->primary->old_fb = crtc->primary->fb;
-	ret = crtc->funcs->page_flip(crtc, fb, e, page_flip->flags);
+	if (crtc->funcs->page_flip_target)
+		ret = crtc->funcs->page_flip_target(crtc, fb, e,
+						    page_flip->flags,
+						    target_vblank);
+	else
+		ret = crtc->funcs->page_flip(crtc, fb, e, page_flip->flags);
 	if (ret) {
 		if (page_flip->flags & DRM_MODE_PAGE_FLIP_EVENT)
 			drm_event_cancel_free(dev, &e->base);
diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h
index 9e6ab4a..ae1d9b6 100644
--- a/include/drm/drm_crtc.h
+++ b/include/drm/drm_crtc.h
@@ -579,6 +579,20 @@ struct drm_crtc_funcs {
 		      uint32_t flags);
 
 	/**
+	 * @page_flip_target:
+	 *
+	 * Same as @page_flip but with an additional parameter specifying the
+	 * target vertical blank period when the flip should take effect.
+	 *
+	 * Note that the core code calls drm_crtc_vblank_get before this entry
+	 * point.
+	 */
+	int (*page_flip_target)(struct drm_crtc *crtc,
+				struct drm_framebuffer *fb,
+				struct drm_pending_vblank_event *event,
+				uint32_t flags, uint32_t target);
+
+	/**
 	 * @set_property:
 	 *
 	 * This is the legacy entry point to update a property attached to the
-- 
2.8.1
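For driver writers, the contract of the new hook might be used roughly like this (a hypothetical skeleton, not code from this series; it assumes one plausible convention, namely that the driver releases the vblank reference taken by the core if it fails):

	#include <drm/drm_crtc.h>
	#include <linux/slab.h>

	struct sketch_flip_work {
		struct drm_pending_vblank_event *event;
		u32 target_vblank;
		/* ... framebuffer/fence state ... */
	};

	static int sketch_page_flip_target(struct drm_crtc *crtc,
					   struct drm_framebuffer *fb,
					   struct drm_pending_vblank_event *event,
					   uint32_t flags, uint32_t target)
	{
		struct sketch_flip_work *work = kzalloc(sizeof(*work), GFP_KERNEL);

		if (!work) {
			/* Balance the drm_crtc_vblank_get() done by the core. */
			drm_crtc_vblank_put(crtc);
			return -ENOMEM;
		}

		work->event = event;
		work->target_vblank = target;	/* flip in this vblank or later */
		/* ... pin fb, then queue work that programs the flip once
		 * drm_crtc_vblank_count(crtc) has caught up with target ... */
		return 0;
	}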
[PATCH 6/6] drm: Add DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags
From: Michel Dänzer

These flags allow userspace to explicitly specify the target vertical
blank period when a flip should take effect.

Signed-off-by: Michel Dänzer
---
Note that the previous patches in this series can avoid delaying page
flips in some cases even without this patch or any corresponding
userspace changes. Here are examples of how userspace can take advantage
of this patch to avoid delaying page flips in even more cases:

https://cgit.freedesktop.org/~daenzer/xf86-video-ati/commit/?id=fc884a8af25345c32bd4104c864ecfeb9bb3db9b
https://cgit.freedesktop.org/~daenzer/xf86-video-amdgpu/commit/?id=b8631a9ba49c0d0ebe5dcd1dbfb68fcfe907296f

 drivers/gpu/drm/drm_crtc.c  | 46 +++--
 drivers/gpu/drm/drm_ioctl.c |  8
 include/uapi/drm/drm.h      |  1 +
 include/uapi/drm/drm_mode.h | 24 +++
 4 files changed, 69 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c
index 15ad7e2..b2fd860 100644
--- a/drivers/gpu/drm/drm_crtc.c
+++ b/drivers/gpu/drm/drm_crtc.c
@@ -5421,11 +5421,19 @@ int drm_mode_page_flip_ioctl(struct drm_device *dev,
 	struct drm_crtc *crtc;
 	struct drm_framebuffer *fb = NULL;
 	struct drm_pending_vblank_event *e = NULL;
-	u32 target_vblank = 0;
+	u32 target_vblank = page_flip->sequence;
 	int ret = -EINVAL;
 
-	if (page_flip->flags & ~DRM_MODE_PAGE_FLIP_FLAGS ||
-	    page_flip->reserved != 0)
+	if (page_flip->flags & ~DRM_MODE_PAGE_FLIP_FLAGS)
+		return -EINVAL;
+
+	if (page_flip->sequence != 0 && !(page_flip->flags & DRM_MODE_PAGE_FLIP_TARGET))
+		return -EINVAL;
+
+	/* Only one of the DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags
+	 * can be specified
+	 */
+	if ((page_flip->flags & DRM_MODE_PAGE_FLIP_TARGET) == DRM_MODE_PAGE_FLIP_TARGET)
 		return -EINVAL;
 
 	if ((page_flip->flags & DRM_MODE_PAGE_FLIP_ASYNC) && !dev->mode_config.async_page_flip)
@@ -5436,15 +5444,41 @@ int drm_mode_page_flip_ioctl(struct drm_device *dev,
 		return -ENOENT;
 
 	if (crtc->funcs->page_flip_target) {
+		u32 current_vblank;
 		int r;
 
 		r = drm_crtc_vblank_get(crtc);
 		if (r)
 			return r;
 
-		target_vblank = drm_crtc_vblank_count(crtc) +
-			!(page_flip->flags & DRM_MODE_PAGE_FLIP_ASYNC);
+		current_vblank = drm_crtc_vblank_count(crtc);
+
+		switch (page_flip->flags & DRM_MODE_PAGE_FLIP_TARGET) {
+		case DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE:
+			if ((int)(target_vblank - current_vblank) > 1) {
+				DRM_DEBUG("Invalid absolute flip target %u, "
+					  "must be <= %u\n", target_vblank,
+					  current_vblank + 1);
+				drm_crtc_vblank_put(crtc);
+				return -EINVAL;
+			}
+			break;
+		case DRM_MODE_PAGE_FLIP_TARGET_RELATIVE:
+			if (target_vblank != 0 && target_vblank != 1) {
+				DRM_DEBUG("Invalid relative flip target %u, "
+					  "must be 0 or 1\n", target_vblank);
+				drm_crtc_vblank_put(crtc);
+				return -EINVAL;
+			}
+			target_vblank += current_vblank;
+			break;
+		default:
+			target_vblank = current_vblank +
+				!(page_flip->flags & DRM_MODE_PAGE_FLIP_ASYNC);
+			break;
+		}
+	} else if (crtc->funcs->page_flip == NULL ||
+		   (page_flip->flags & DRM_MODE_PAGE_FLIP_TARGET))
 		return -EINVAL;
 
 	drm_modeset_lock_crtc(crtc, crtc->primary);
diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c
index 33af4a5..0099d2a 100644
--- a/drivers/gpu/drm/drm_ioctl.c
+++ b/drivers/gpu/drm/drm_ioctl.c
@@ -228,6 +228,7 @@ static int drm_getstats(struct drm_device *dev, void *data,
 static int drm_getcap(struct drm_device *dev, void *data, struct drm_file *file_priv)
 {
 	struct drm_get_cap *req = data;
+	struct drm_crtc *crtc;
 
 	req->value = 0;
 	switch (req->capability) {
@@ -254,6 +255,13 @@ static int drm_getcap(struct drm_device *dev, void *data, struct drm_file *file_
 	case DRM_CAP_ASYNC_PAGE_FLIP:
 		req->value = dev->mode_config.async_page_flip;
 		break;
+	case DRM_CAP_PAGE_FLIP_TARGET:
+		req->value = 1;
+		drm_for_each_crtc(crtc, dev) {
+			if (!crtc->funcs->page_flip
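On the userspace side, beyond the xf86-video-ati/amdgpu commits linked above, usage might look like this (a sketch assuming the libdrm wrapper drmModePageFlipTarget() and the DRM_CAP_PAGE_FLIP_TARGET capability; error handling trimmed):

	#include <stdint.h>
	#include <xf86drm.h>
	#include <xf86drmMode.h>

	/*
	 * Flip to fb_id in the current vblank period if we are already inside
	 * it (relative target 0), instead of being pushed out to the next
	 * one. Falls back to the plain flip ioctl on kernels without the
	 * capability.
	 */
	static int flip_asap(int fd, uint32_t crtc_id, uint32_t fb_id,
			     void *user_data)
	{
		uint64_t cap = 0;

		if (drmGetCap(fd, DRM_CAP_PAGE_FLIP_TARGET, &cap) == 0 && cap)
			return drmModePageFlipTarget(fd, crtc_id, fb_id,
						     DRM_MODE_PAGE_FLIP_EVENT |
						     DRM_MODE_PAGE_FLIP_TARGET_RELATIVE,
						     user_data,
						     0 /* relative: this vblank */);

		return drmModePageFlip(fd, crtc_id, fb_id,
				       DRM_MODE_PAGE_FLIP_EVENT, user_data);
	}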
[PATCH 5/6] drm/radeon: Set MASTER_UPDATE_MODE to 0 again
From: Michel Dänzer

With the previous change, it's safe to let page flips take effect
anytime during a vertical blank period. This can avoid delaying a flip
by a frame in some cases where we get to radeon_flip_work_func ->
adev->mode_info.funcs->page_flip during a vertical blank period.

Signed-off-by: Michel Dänzer
---
 drivers/gpu/drm/radeon/atombios_crtc.c | 8
 drivers/gpu/drm/radeon/evergreen.c     | 3 +--
 drivers/gpu/drm/radeon/rv515.c         | 3 +--
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/radeon/atombios_crtc.c b/drivers/gpu/drm/radeon/atombios_crtc.c
index a97abc8..2029e35 100644
--- a/drivers/gpu/drm/radeon/atombios_crtc.c
+++ b/drivers/gpu/drm/radeon/atombios_crtc.c
@@ -1433,8 +1433,8 @@ static int dce4_crtc_do_set_base(struct drm_crtc *crtc,
 	WREG32(EVERGREEN_VIEWPORT_SIZE + radeon_crtc->crtc_offset,
 	       (viewport_w << 16) | viewport_h);
 
-	/* set pageflip to happen only at start of vblank interval (front porch) */
-	WREG32(EVERGREEN_MASTER_UPDATE_MODE + radeon_crtc->crtc_offset, 3);
+	/* set pageflip to happen anywhere in vblank interval */
+	WREG32(EVERGREEN_MASTER_UPDATE_MODE + radeon_crtc->crtc_offset, 0);
 
 	if (!atomic && fb && fb != crtc->primary->fb) {
 		radeon_fb = to_radeon_framebuffer(fb);
@@ -1632,8 +1632,8 @@ static int avivo_crtc_do_set_base(struct drm_crtc *crtc,
 	WREG32(AVIVO_D1MODE_VIEWPORT_SIZE + radeon_crtc->crtc_offset,
 	       (viewport_w << 16) | viewport_h);
 
-	/* set pageflip to happen only at start of vblank interval (front porch) */
-	WREG32(AVIVO_D1MODE_MASTER_UPDATE_MODE + radeon_crtc->crtc_offset, 3);
+	/* set pageflip to happen anywhere in vblank interval */
+	WREG32(AVIVO_D1MODE_MASTER_UPDATE_MODE + radeon_crtc->crtc_offset, 0);
 
 	if (!atomic && fb && fb != crtc->primary->fb) {
 		radeon_fb = to_radeon_framebuffer(fb);
diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c
index db275b7..f95db0c 100644
--- a/drivers/gpu/drm/radeon/evergreen.c
+++ b/drivers/gpu/drm/radeon/evergreen.c
@@ -2878,9 +2878,8 @@ void evergreen_mc_resume(struct radeon_device *rdev, struct evergreen_mc_save *s
 	for (i = 0; i < rdev->num_crtc; i++) {
 		if (save->crtc_enabled[i]) {
 			tmp = RREG32(EVERGREEN_MASTER_UPDATE_MODE + crtc_offsets[i]);
-			if ((tmp & 0x7) != 3) {
+			if ((tmp & 0x7) != 0) {
 				tmp &= ~0x7;
-				tmp |= 0x3;
 				WREG32(EVERGREEN_MASTER_UPDATE_MODE + crtc_offsets[i], tmp);
 			}
 			tmp = RREG32(EVERGREEN_GRPH_UPDATE + crtc_offsets[i]);
diff --git a/drivers/gpu/drm/radeon/rv515.c b/drivers/gpu/drm/radeon/rv515.c
index c55d653..76c55c5 100644
--- a/drivers/gpu/drm/radeon/rv515.c
+++ b/drivers/gpu/drm/radeon/rv515.c
@@ -406,9 +406,8 @@ void rv515_mc_resume(struct radeon_device *rdev, struct rv515_mc_save *save)
 	for (i = 0; i < rdev->num_crtc; i++) {
 		if (save->crtc_enabled[i]) {
 			tmp = RREG32(AVIVO_D1MODE_MASTER_UPDATE_MODE + crtc_offsets[i]);
-			if ((tmp & 0x7) != 3) {
+			if ((tmp & 0x7) != 0) {
 				tmp &= ~0x7;
-				tmp |= 0x3;
 				WREG32(AVIVO_D1MODE_MASTER_UPDATE_MODE + crtc_offsets[i], tmp);
 			}
 			tmp = RREG32(AVIVO_D1GRPH_UPDATE + crtc_offsets[i]);
-- 
2.8.1
[PATCH 0/6] drm: Explicit target vblank seqno for page flips
The purpose of this series is to allow drivers to avoid unnecessarily
delaying page flips, by explicitly telling the driver which vblank seqno
a flip is supposed to take effect in.

Patch 1 sets the target to the vblank seqno after the current one when
the DRM_IOCTL_MODE_PAGE_FLIP ioctl is called, preserving the ioctl
contract with existing userspace.

Patches 2-5 take advantage of this in the amdgpu and radeon drivers, by
allowing a flip to be programmed and executed during the target vertical
blank period (or a later one).

Patch 6 extends the ioctl with new flags, which allow userspace to
explicitly specify the target vblank seqno. This can also avoid delaying
flips in some cases where we are already in the target vertical blank
period when the ioctl is called.

[PATCH 1/6] drm: Add page_flip_target CRTC hook
[PATCH 2/6] drm/amdgpu: Provide page_flip_target hook
[PATCH 3/6] drm/amdgpu: Set MASTER_UPDATE_MODE to 0 again
[PATCH 4/6] drm/radeon: Provide page_flip_target hook
[PATCH 5/6] drm/radeon: Set MASTER_UPDATE_MODE to 0 again
[PATCH 6/6] drm: Add DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE
[PATCH 2/6] drm/amdgpu: Provide page_flip_target hook
From: Michel Dänzer

Now we can program a flip during a vertical blank period, if it's the
one targeted by the flip (or a later one). This allows simplifying
amdgpu_flip_work_func considerably.

Signed-off-by: Michel Dänzer
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 98 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h    |  8 +--
 drivers/gpu/drm/amd/amdgpu/dce_v10_0.c      |  2 +-
 drivers/gpu/drm/amd/amdgpu/dce_v11_0.c      |  2 +-
 drivers/gpu/drm/amd/amdgpu/dce_v8_0.c       |  2 +-
 6 files changed, 39 insertions(+), 76 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index eb09037..71f4a4c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -718,10 +718,11 @@ void amdgpu_doorbell_get_kfd_info(struct amdgpu_device *adev,
  */
 
 struct amdgpu_flip_work {
-	struct work_struct		flip_work;
+	struct delayed_work		flip_work;
 	struct work_struct		unpin_work;
 	struct amdgpu_device		*adev;
 	int				crtc_id;
+	u32				target_vblank;
 	uint64_t			base;
 	struct drm_pending_vblank_event *event;
 	struct amdgpu_bo		*old_rbo;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
index 7dbe8d0..ce1f2bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
@@ -41,7 +41,7 @@ static void amdgpu_flip_callback(struct fence *f, struct fence_cb *cb)
 		container_of(cb, struct amdgpu_flip_work, cb);
 
 	fence_put(f);
-	schedule_work(&work->flip_work);
+	schedule_work(&work->flip_work.work);
 }
 
 static bool amdgpu_flip_handle_fence(struct amdgpu_flip_work *work,
@@ -63,16 +63,17 @@ static bool amdgpu_flip_handle_fence(struct amdgpu_flip_work *work,
 
 static void amdgpu_flip_work_func(struct work_struct *__work)
 {
+	struct delayed_work *delayed_work =
+		container_of(__work, struct delayed_work, work);
 	struct amdgpu_flip_work *work =
-		container_of(__work, struct amdgpu_flip_work, flip_work);
+		container_of(delayed_work, struct amdgpu_flip_work, flip_work);
 	struct amdgpu_device *adev = work->adev;
 	struct amdgpu_crtc *amdgpuCrtc = adev->mode_info.crtcs[work->crtc_id];
 
 	struct drm_crtc *crtc = &amdgpuCrtc->base;
 	unsigned long flags;
-	unsigned i, repcnt = 4;
-	int vpos, hpos, stat, min_udelay = 0;
-	struct drm_vblank_crtc *vblank = &crtc->dev->vblank[work->crtc_id];
+	unsigned i;
+	int vpos, hpos;
 
 	if (amdgpu_flip_handle_fence(work, &work->excl))
 		return;
@@ -81,55 +82,23 @@ static void amdgpu_flip_work_func(struct work_struct *__work)
 		if (amdgpu_flip_handle_fence(work, &work->shared[i]))
 			return;
 
-	/* We borrow the event spin lock for protecting flip_status */
-	spin_lock_irqsave(&crtc->dev->event_lock, flags);
-
-	/* If this happens to execute within the "virtually extended" vblank
-	 * interval before the start of the real vblank interval then it needs
-	 * to delay programming the mmio flip until the real vblank is entered.
-	 * This prevents completing a flip too early due to the way we fudge
-	 * our vblank counter and vblank timestamps in order to work around the
-	 * problem that the hw fires vblank interrupts before actual start of
-	 * vblank (when line buffer refilling is done for a frame). It
-	 * complements the fudging logic in amdgpu_get_crtc_scanoutpos() for
-	 * timestamping and amdgpu_get_vblank_counter_kms() for vblank counts.
-	 *
-	 * In practice this won't execute very often unless on very fast
-	 * machines because the time window for this to happen is very small.
+	/* Wait until we're out of the vertical blank period before the one
+	 * targeted by the flip
 	 */
-	while (amdgpuCrtc->enabled && --repcnt) {
-		/* GET_DISTANCE_TO_VBLANKSTART returns distance to real vblank
-		 * start in hpos, and to the "fudged earlier" vblank start in
-		 * vpos.
-		 */
-		stat = amdgpu_get_crtc_scanoutpos(adev->ddev, work->crtc_id,
-						  GET_DISTANCE_TO_VBLANKSTART,
-						  &vpos, &hpos, NULL, NULL,
-						  &crtc->hwmode);
-
-		if ((stat & (DRM_SCANOUTPOS_VALID | DRM_SCANOUTPOS_ACCURATE)) !=
-		    (DRM_SCANOUTPOS_VALID | DRM_SCANOUTPOS_ACCURATE) ||
-		    !(vpos >= 0 && hpos <= 0))
-			break;
-