[FFmpeg-cvslog] [ffmpeg] aarch64: vp9lpf: Fix GCS violations (branch master)

Martin Storsjö via ffmpeg-cvslog Wed, 10 Jun 2026 11:04:44 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


The following commit(s) were added to refs/heads/master by this push:
     new d05786cf23 aarch64: vp9lpf: Fix GCS violations
d05786cf23 is described below

commit d05786cf23ead5a92f722b3ba4516d99d7f108ad
Author:     Martin Storsjö <[email protected]>
AuthorDate: Thu Jun 4 23:41:11 2026 +0300
Commit:     Martin Storsjö <[email protected]>
CommitDate: Wed Jun 10 18:03:01 2026 +0000

    aarch64: vp9lpf: Fix GCS violations
    
    The aarch64 VP9 loopfilters actually violate aarch64 GCS
    (Guarded Control Stack), even though we marked the code as GCS
    compliant in 846746be4b8edd66be8be2c123b0072de2636e9d.
    
    This means that builds with GCS enabled, after that commit,
    will crash when decoding VP9, on future hardware (or current
    QEMU) that supports GCS. This also goes for ffmpeg version 8.1.1
    where the GCS enabling was backported.
    
    This matches the fix that was done for hevcdsp in
    1f7ed8a78de1da743a359913ce05cc258a400b5d.
    
    This issue wasn't observed when running checkasm in QEMU - therefore,
    I thought all GCS issues had been fixed by
    846746be4b8edd66be8be2c123b0072de2636e9d. (If I had tested the
    full "make fate" with QEMU, the issue would have appeared,
    though.)
    
    However with the new checkasm, some of the GCS violations
    do appear even in checkasm.
    
    The reason is that the checkasm vp9 test intentionally crafts
    input pixels that attempt to trigger all the individual
    separate cases in each input buffer (in
    randomize_loopfilter_buffers). This means that the checkasm
    tests actually never test or exercise the early exit cases,
    which are the ones that violate GCS.
    
    With the new checkasm, the call to "bench_new" always tests
    running the code at least once, even if not benchmarking.
    
    As the input buffers weren't reinitialized between the test
    and "bench_new", the pixel differences now differ from the
    initial setup, so that the code now sometimes (often) would
    end up hitting the early exit cases.
    
    Ideally, the vp9 checkasm test would be repeated to cover all
    cases of input buffers that allow early exits, in addition to
    covering the case with all different cases in one block.
---
 libavcodec/aarch64/vp9lpf_16bpp_neon.S |  62 +++++++++++--------
 libavcodec/aarch64/vp9lpf_neon.S       | 106 +++++++++++++++++++--------------
 2 files changed, 97 insertions(+), 71 deletions(-)

diff --git a/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
index e3e70491c6..589775b0f1 100644
--- a/libavcodec/aarch64/vp9lpf_16bpp_neon.S
+++ b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
@@ -56,9 +56,7 @@
         mov             x11, v4.d[0]
         mov             x12, v4.d[1]
         adds            x11, x11, x12
-        b.ne            1f
-        ret             x10
-1:
+        b.eq            9f
 
 .if \wd >= 8
         dup             v0.8h,  w5
@@ -189,13 +187,7 @@
         // If no pixels need flat8in, jump to flat8out
         // (or to a writeout of the inner 4 pixels, for wd=8)
 .if \wd >= 8
-.if \wd == 16
         b.eq            6f
-.else
-        b.ne            1f
-        ret             x13
-1:
-.endif
 
         // flat8in
         add             \tmp1\().8h, v20.8h, v21.8h
@@ -249,20 +241,16 @@
         mov             x11, v2.d[0]
         mov             x12, v2.d[1]
         adds            x11, x11, x12
-        b.ne            1f
         // If no pixels needed flat8in nor flat8out, jump to a
         // writeout of the inner 4 pixels
-        ret             x14
-1:
+        b.eq            7f
 
         mov             x11, v7.d[0]
         mov             x12, v7.d[1]
         adds            x11, x11, x12
-        b.ne            1f
         // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
-        ret             x15
+        b.eq            8f
 
-1:
         // flat8out
         // This writes all outputs into v2-v17 (skipping v6 and v16).
         // If this part is skipped, the output is read from v21-v26 (which is the input
@@ -378,35 +366,57 @@
 // while we need those for inputs/outputs in wd=16 and use v8-v15
 // for temp registers there instead.
 function vp9_loop_filter_4
+        mov             x13, #0
         loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31
         ret
+9:
+        mov             x13, #(1<<9)
+        ret
 endfunc
 
 function vp9_loop_filter_8
+        mov             x13, #0
         loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31
         ret
+6:
+        mov             x13, #(1<<6)
+        ret
+9:
+        mov             x13, #(1<<9)
+        ret
 endfunc
 
 function vp9_loop_filter_16
+        mov             x13, #0
         loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15
         ret
+7:
+        mov             x13, #(1<<7)
+        ret
+8:
+        mov             x13, #(1<<8)
+        ret
+9:
+        mov             x13, #(1<<9)
+        ret
 endfunc
 
 .macro loop_filter_4
         bl              vp9_loop_filter_4
+        tbnz            x13, #9, 9f
 .endm
 
 .macro loop_filter_8
-        // calculate alternative 'return' targets
-        adr             x13, 6f
         bl              vp9_loop_filter_8
+        tbnz            x13, #6, 6f
+        tbnz            x13, #9, 9f
 .endm
 
 .macro loop_filter_16
-        // calculate alternative 'return' targets
-        adr             x14, 7f
-        adr             x15, 8f
         bl              vp9_loop_filter_16
+        tbnz            x13, #7, 7f
+        tbnz            x13, #8, 8f
+        tbnz            x13, #9, 9f
 .endm
 
 
@@ -540,7 +550,7 @@ function vp9_loop_filter_v_4_8_16_neon
         st1             {v23.8h}, [x9], x1
         st1             {v25.8h}, [x0], x1
         sub             x0,  x0,  x1, lsl #1
-
+9:
         ret             x10
 endfunc
 
@@ -588,7 +598,7 @@ function vp9_loop_filter_h_4_8_16_neon
         st1             {v25.d}[1], [x0], x1
         sub             x0,  x0,  x1, lsl #3
         add             x0,  x0,  #4
-
+9:
         ret             x10
 endfunc
 
@@ -619,7 +629,7 @@ function vp9_loop_filter_v_8_8_16_neon
         st1             {v26.8h}, [x0], x1
         sub             x0,  x0,  x1, lsl #1
         sub             x0,  x0,  x1
-
+9:
         ret             x10
 6:
         sub             x9,  x0,  x1, lsl #1
@@ -670,7 +680,7 @@ function vp9_loop_filter_h_8_8_16_neon
         st1             {v27.8h}, [x0], x1
         sub             x0,  x0,  x1, lsl #3
         add             x0,  x0,  #8
-
+9:
         ret             x10
 6:
         // If we didn't need to do the flat8in part, we use the same writeback
@@ -742,7 +752,7 @@ function vp9_loop_filter_v_16_8_16_neon
         st1             {v17.8h}, [x0], x1
         sub             x0,  x0,  x1, lsl #3
         add             x0,  x0,  x1
-
+9:
         ret             x10
 8:
         add             x9,  x9,  x1, lsl #2
@@ -820,7 +830,7 @@ function vp9_loop_filter_h_16_8_16_neon
         st1             {v9.8h},  [x9], x1
         st1             {v31.8h}, [x0], x1
         sub             x0,  x0,  x1, lsl #3
-
+9:
         ret             x10
 8:
         // The same writeback as in loop_filter_h_8_8
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 9a79f48df3..2ffebcdfc3 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -388,32 +388,28 @@
 .endif
 .if \wd == 16
 6:
+        // If no pixels needed flat8in nor flat8out, jump to a
+        // writeout of the inner 4 pixels
         orr             v2\sz,  v6\sz,  v7\sz
         mov             x5,  v2.d[0]
 .ifc \sz, .16b
         mov             x6,  v2.d[1]
         adds            x5,  x5,  x6
-        b.ne            1f
+        b.eq            7f
 .else
-        cbnz            x5,  1f
+        cbz             x5,  7f
 .endif
-        // If no pixels needed flat8in nor flat8out, jump to a
-        // writeout of the inner 4 pixels
-        ret             x14
-1:
 
+        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
         mov             x5,  v7.d[0]
 .ifc \sz, .16b
         mov             x6,  v7.d[1]
         adds            x5,  x5,  x6
-        b.ne            1f
+        b.eq            8f
 .else
-        cbnz            x5,  1f
+        cbz             x5,  8f
 .endif
-        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
-        ret             x15
 
-1:
         // flat8out
         // This writes all outputs into v2-v17 (skipping v6 and v16).
         // If this part is skipped, the output is read from v21-v26 (which is the input
@@ -529,76 +525,94 @@
 // while we need those for inputs/outputs in wd=16 and use v8-v15
 // for temp registers there instead.
 function vp9_loop_filter_4
+        mov             x13, #0
         loop_filter     4,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
         ret
 9:
-        ret             x10
+        mov             x13, #(1<<9)
+        ret
 endfunc
 
 function vp9_loop_filter_4_16b_mix_44
+        mov             x13, #0
         loop_filter     4,  .16b, 44,   v16, v17, v18, v19, v28, v29, v30, v31
         ret
 9:
-        ret             x10
+        mov             x13, #(1<<9)
+        ret
 endfunc
 
 function vp9_loop_filter_8
+        mov             x13, #0
         loop_filter     8,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
         ret
 6:
-        ret             x13
+        mov             x13, #(1<<6)
+        ret
 9:
-        ret             x10
+        mov             x13, #(1<<9)
+        ret
 endfunc
 
 function vp9_loop_filter_8_16b_mix
+        mov             x13, #0
         loop_filter     8,  .16b, 88,   v16, v17, v18, v19, v28, v29, v30, v31
         ret
 6:
-        ret             x13
+        mov             x13, #(1<<6)
+        ret
 9:
-        ret             x10
+        mov             x13, #(1<<9)
+        ret
 endfunc
 
 function vp9_loop_filter_16
+        mov             x13, #0
         loop_filter     16, .8b,  0,    v8,  v9,  v10, v11, v12, v13, v14, v15
         ret
+7:
+        mov             x13, #(1<<7)
+        ret
+8:
+        mov             x13, #(1<<8)
+        ret
 9:
-        ldp             d10, d11, [sp, #0x10]
-        ldp             d12, d13, [sp, #0x20]
-        ldp             d14, d15, [sp, #0x30]
-        ldp             d8,  d9,  [sp], #0x40
-        ret             x10
+        mov             x13, #(1<<9)
+        ret
 endfunc
 
 function vp9_loop_filter_16_16b
+        mov             x13, #0
         loop_filter     16, .16b, 0,    v8,  v9,  v10, v11, v12, v13, v14, v15
         ret
+7:
+        mov             x13, #(1<<7)
+        ret
+8:
+        mov             x13, #(1<<8)
+        ret
 9:
-        ldp             d10, d11, [sp, #0x10]
-        ldp             d12, d13, [sp, #0x20]
-        ldp             d14, d15, [sp, #0x30]
-        ldp             d8,  d9,  [sp], #0x40
-        ret             x10
+        mov             x13, #(1<<9)
+        ret
 endfunc
 
 .macro loop_filter_4
         bl              vp9_loop_filter_4
+        tbnz            x13, #9, 9f
 .endm
 
 .macro loop_filter_4_16b_mix mix
         bl              vp9_loop_filter_4_16b_mix_\mix
+        tbnz            x13, #9, 9f
 .endm
 
 .macro loop_filter_8
-        // calculate alternative 'return' targets
-        adr             x13, 6f
         bl              vp9_loop_filter_8
+        tbnz            x13, #6, 6f
+        tbnz            x13, #9, 9f
 .endm
 
 .macro loop_filter_8_16b_mix mix
-        // calculate alternative 'return' targets
-        adr             x13, 6f
 .if \mix == 48
         mov             x11, #0xffffffff00000000
 .elseif \mix == 84
@@ -607,20 +621,22 @@ endfunc
         mov             x11, #0xffffffffffffffff
 .endif
         bl              vp9_loop_filter_8_16b_mix
+        tbnz            x13, #6, 6f
+        tbnz            x13, #9, 9f
 .endm
 
 .macro loop_filter_16
-        // calculate alternative 'return' targets
-        adr             x14, 7f
-        adr             x15, 8f
         bl              vp9_loop_filter_16
+        tbnz            x13, #7, 7f
+        tbnz            x13, #8, 8f
+        tbnz            x13, #9, 9f
 .endm
 
 .macro loop_filter_16_16b
-        // calculate alternative 'return' targets
-        adr             x14, 7f
-        adr             x15, 8f
         bl              vp9_loop_filter_16_16b
+        tbnz            x13, #7, 7f
+        tbnz            x13, #8, 8f
+        tbnz            x13, #9, 9f
 .endm
 
 
@@ -647,7 +663,7 @@ function ff_vp9_loop_filter_v_4_8_neon, export=1
         st1             {v24.8b}, [x0], x1
         st1             {v23.8b}, [x9], x1
         st1             {v25.8b}, [x0], x1
-
+9:
         ret             x10
 endfunc
 
@@ -671,7 +687,7 @@ function ff_vp9_loop_filter_v_44_16_neon, export=1
         st1             {v24.16b}, [x0], x1
         st1             {v23.16b}, [x9], x1
         st1             {v25.16b}, [x0], x1
-
+9:
         ret             x10
 endfunc
 
@@ -713,7 +729,7 @@ function ff_vp9_loop_filter_h_4_8_neon, export=1
         st1             {v24.s}[1], [x0], x1
         st1             {v25.s}[0], [x9], x1
         st1             {v25.s}[1], [x0], x1
-
+9:
         ret             x10
 endfunc
 
@@ -765,7 +781,7 @@ function ff_vp9_loop_filter_h_44_16_neon, export=1
         st1             {v24.s}[3], [x0], x1
         st1             {v25.s}[1], [x9], x1
         st1             {v25.s}[3], [x0], x1
-
+9:
         ret             x10
 endfunc
 
@@ -792,7 +808,7 @@ function ff_vp9_loop_filter_v_8_8_neon, export=1
         st1             {v25.8b}, [x0], x1
         st1             {v23.8b}, [x9], x1
         st1             {v26.8b}, [x0], x1
-
+9:
         ret             x10
 6:
         sub             x9,  x0,  x1, lsl #1
@@ -827,7 +843,7 @@ function ff_vp9_loop_filter_v_\mix\()_16_neon, export=1
         st1             {v25.16b}, [x0], x1
         st1             {v23.16b}, [x9], x1
         st1             {v26.16b}, [x0], x1
-
+9:
         ret             x10
 6:
         sub             x9,  x0,  x1, lsl #1
@@ -875,7 +891,7 @@ function ff_vp9_loop_filter_h_8_8_neon, export=1
         st1             {v26.8b}, [x0], x1
         st1             {v23.8b}, [x9], x1
         st1             {v27.8b}, [x0], x1
-
+9:
         ret             x10
 6:
         // If we didn't need to do the flat8in part, we use the same writeback
@@ -941,7 +957,7 @@ function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
         st1             {v26.d}[1], [x0], x1
         st1             {v27.8b},   [x9], x1
         st1             {v27.d}[1], [x0], x1
-
+9:
         ret             x10
 6:
         add             x9,  x9,  #2

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] aarch64: vp9lpf: Fix GCS violations (branch master)

Reply via email to