By caching some values in local variables, we allow the compiler to emit
more compact code, because it no longer has to reload those values from
the ring structure every time they are used.

Before and after size comparisons:

     text          data     bss      dec            hex filename
  10708384       547307  213512 11469203         af0193 amdgpu.ko.0
  10688632       547307  213512 11449451         aeb46b amdgpu.ko.1

 add/remove: 0/0 grow/shrink: 3/340 up/down: 29/-20025 (-19996)
  Function                                     old     new   delta
   amdgpu_ring_write_multiple                   600     612     +12
   amdgpu_umsch_mm_submit_pkt                   196     207     +11
   amdgpu_ring_write_multiple.constprop         453     459      +6
   vcn_v2_0_enc_ring_insert_end                  69      64      -5
 ...
   jpeg_v4_0_3_dec_ring_emit_ib                1281    1045    -236
   jpeg_v2_0_dec_ring_emit_ib                  1402    1147    -255
   jpeg_v1_0_decode_ring_emit_fence            1788    1507    -281
 Total: Before=8949691, After=8929695, chg -0.22%

Signed-off-by: Tvrtko Ursulin <tvrtko.ursu...@igalia.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index afaf951b0b78..d37e822ff46e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -467,8 +467,10 @@ static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
 
 static inline void amdgpu_ring_write(struct amdgpu_ring *ring, uint32_t v)
 {
-       ring->ring[ring->wptr++ & ring->buf_mask] = v;
-       ring->wptr &= ring->ptr_mask;
+       u64 wptr = ring->wptr;
+
+       ring->ring[wptr++ & ring->buf_mask] = v;
+       ring->wptr = wptr & ring->ptr_mask;
        ring->count_dw--;
 }
 
@@ -476,9 +478,11 @@ static inline void amdgpu_ring_write_multiple(struct amdgpu_ring *ring,
                                              void *src, int count_dw)
 {
        unsigned occupied, chunk1, chunk2;
+       u32 buf_mask = ring->buf_mask;
+       u64 wptr = ring->wptr;
 
-       occupied = ring->wptr & ring->buf_mask;
-       chunk1 = ring->buf_mask + 1 - occupied;
+       occupied = wptr & buf_mask;
+       chunk1 = buf_mask + 1 - occupied;
        chunk1 = (chunk1 >= count_dw) ? count_dw : chunk1;
        chunk2 = count_dw - chunk1;
        chunk1 <<= 2;
@@ -492,8 +496,8 @@ static inline void amdgpu_ring_write_multiple(struct amdgpu_ring *ring,
                memcpy(ring->ring, src, chunk2);
        }
 
-       ring->wptr += count_dw;
-       ring->wptr &= ring->ptr_mask;
+       wptr += count_dw;
+       ring->wptr = wptr & ring->ptr_mask;
        ring->count_dw -= count_dw;
 }
 
-- 
2.48.0

Reply via email to