On Thu, 19 Aug 2021, Mikhail Nitenko wrote:

diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index d27cfac494..eb18469b7f 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -932,3 +932,518 @@ endfunc

        h264_qpel16 put
        h264_qpel16 avg
+
+//trashes v0-v5, v7
+.macro  lowpass_8_10    r0,  r1,  r2,  r3,  d0,  d1
+        ext             v2.16B,     \r0\().16B,  \r1\().16B, #4
+        ext             v3.16B,     \r0\().16B,  \r1\().16B, #6
+        add             v2.8H,      v2.8H,       v3.8H
+        ext             v4.16B,     \r0\().16B,  \r1\().16B, #2
+        ext             v5.16B,     \r0\().16B,  \r1\().16B, #8
+        add             v4.8H,      v4.8H,       v5.8H
+        ext             v1.16B,     \r0\().16B,  \r1\().16B, #10
+        uaddl2          \d1\().4S,  \r0\().8H,   v1.8H
+        uaddl           \d0\().4S,  \r0\().4H,   v1.4H
+        ext             v0.16B,      \r2\().16B, \r3\().16B, #4

Nit: Indentation is off for the center column
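
I.e. something like this, matching the surrounding lines:

        ext             v0.16B,     \r2\().16B,  \r3\().16B, #4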

+        umlal           \d0\().4S,  v2.4H,       v6.H[1]
+        umlal2          \d1\().4S,  v2.8H,       v6.H[1]
+        ext             v1.16B,     \r2\().16B, \r3\().16B, #6
+        add             v0.8H,      v0.8H,       v1.8H
+        ext             v1.16B,     \r2\().16B,  \r3\().16B, #2
+        umlsl           \d0\().4S,  v4.4H,       v6.H[0]
+        umlsl2          \d1\().4S,  v4.8H,       v6.H[0]

I see why you need to go to 32 bit here, but I think this could be kept in 16 bit with this trick:

First do add + mla of the two positive coefficients. This can go outside the range of a signed 16 bit integer, so it must be treated as unsigned 16 bit. Then do a mul of the negative coefficient (corresponding to the umlsl here) into a separate register, also treated as an unsigned 16 bit value.

Then do a uqsub of these two 16 bit values; the result is nonnegative, but possibly still larger than the signed 16 bit range. So finally you do urshr instead of sqrshrun (and maybe also umin instead of smin).

Previously you had:
- 2 uaddl (16->32)
- 2 umlal (16->32)
- 2 umlsl (16->32)
- 2 sqrshrun (32->16)

With this, you'd get this down to:
- 1 add
- 1 mla
- 1 mul
- 1 uqsub
- 1 urshr

So 5 instructions instead of 8.

As there are fewer of each operation, it might be good to interleave it more with the second calculation if there are enough registers, to avoid stalling on a long sequential dependency chain through a single register.
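
As a rough sketch of what I mean for one half of the macro (untested, register numbers purely illustrative; v2, v4 and v1 as already computed by the ext/add instructions above, v6.H[0]/v6.H[1] holding the 5/20 coefficients as in the existing code):

        add             v16.8H,     \r0\().8H,   v1.8H      // x0 + x5, fits in unsigned 16 bit
        mla             v16.8H,     v2.8H,       v6.H[1]    // += 20*(x2 + x3), still unsigned 16 bit
        mul             v17.8H,     v4.8H,       v6.H[0]    // 5*(x1 + x4) into a separate register
        uqsub           v16.8H,     v16.8H,      v17.8H     // saturates at 0, result stays unsigned
        urshr           \d0\().8H,  v16.8H,      #5         // unsigned rounding shift instead of sqrshrun
        umin            \d0\().8H,  \d0\().8H,   v5.8H      // clip to 1023 with umin instead of smin

That maps directly onto the add/mla/mul/uqsub/urshr counted above; the clipping min is needed in both variants anyway.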

+        sqrshrun        \d0\().4H,  \d0\().4S,   #5
+        sqrshrun2       \d0\().8H,  \d1\().4S,   #5
+        ext             v3.16B,     \r2\().16B,  \r3\().16B, #8
+        add             v1.8H,      v1.8H,       v3.8H
+        ext             v2.16B,     \r2\().16B,  \r3\().16B, #10
+        uaddl           v3.4S,      \r2\().4H,   v2.4H
+        uaddl2          v4.4S,      \r2\().8H,   v2.8H
+        umlal           v3.4S,      v0.4H,       v6.H[1]
+        umlal2          v4.4S,      v0.8H,       v6.H[1]
+        umlsl           v3.4S,      v1.4H,       v6.H[0]
+        umlsl2          v4.4S,      v1.8H,       v6.H[0]
+        mvni            v5.8h,      #0xFC,       lsl #8 // 1023 for clipping
+        sqrshrun        \d1\().4H,  v3.4S,       #5
+        sqrshrun2       \d1\().8H,  v4.4S,       #5
+        smin            \d0\().8H,  \d0\().8H,   v5.8h
+        smin            \d1\().8H,  \d1\().8H,   v5.8h
+.endm
+
+function put_h264_qpel16_h_lowpass_neon_packed_10
+        mov             x4,  x30
+        mov             x12, #32
+        mov             x3,  #16
+        bl              put_h264_qpel8_h_lowpass_neon_10
+        sub             x1,  x1,  x2, lsl #4
+        add             x1,  x1,  #16
+        mov             x12, #32
+        mov             x30, x4
+        b               put_h264_qpel8_h_lowpass_neon_10
+endfunc
+
+.macro  h264_qpel_h_lowpass_10 type
+function \type\()_h264_qpel16_h_lowpass_neon_10
+        mov             x13, x30
+        mov             x12, #32
+        bl              \type\()_h264_qpel8_h_lowpass_neon_10
+        sub             x0,  x0,  x3, lsl #4
+        sub             x1,  x1,  x2, lsl #4
+        add             x0,  x0,  #16
+        add             x1,  x1,  #16
+        mov             x12, #32
+        mov             x30, x13
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_neon_10
+1:      ld1             {v28.8H, v29.8H}, [x1], x2
+        ld1             {v16.8H, v17.8H}, [x1], x2
+        subs            x12, x12, #4
+        lowpass_8_10    v28, v29, v16, v17, v28, v20
+  .ifc \type,avg
+        ld1             {v2.8H},    [x0], x3
+        urhadd          v28.8H, v28.8H,  v2.8H
+        ld1             {v3.8H},    [x0]
+        urhadd          v20.8H, v20.8H, v3.8H
+        sub             x0,  x0,  x3
+  .endif
+        st1             {v28.8H},    [x0], x3
+        st1             {v20.8H},    [x0], x3
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+        h264_qpel_h_lowpass_10 put
+        h264_qpel_h_lowpass_10 avg
+
+.macro h264_qpel_h_lowpass_l2_10 type
+function \type\()_h264_qpel16_h_lowpass_l2_neon_10
+        mov             x13, x30
+        mov             x12, #32
+        bl              \type\()_h264_qpel8_h_lowpass_l2_neon_10
+        sub             x0,  x0,  x2, lsl #4
+        sub             x1,  x1,  x2, lsl #4
+        sub             x3,  x3,  x2, lsl #4
+        add             x0,  x0,  #16
+        add             x1,  x1,  #16
+        add             x3,  x3,  #16
+        mov             x12, #32
+        mov             x30, x13
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_l2_neon_10
+1:      ld1             {v26.8H, v27.8H}, [x1], x2
+        ld1             {v16.8H, v17.8H}, [x1], x2
+        ld1             {v28.8H},     [x3], x2
+        ld1             {v29.8H},     [x3], x2
+        subs            x12, x12, #4
+        lowpass_8_10    v26, v27, v16, v17, v26, v27
+        urhadd          v26.8H, v26.8H, v28.8H
+        urhadd          v27.8H, v27.8H, v29.8H
+  .ifc \type,avg
+        ld1             {v2.8H},      [x0], x2
+        urhadd          v26.8H, v26.8H, v2.8H
+        ld1             {v3.8H},      [x0]
+        urhadd          v27.8H, v27.8H, v3.8H
+        sub             x0,  x0,  x2
+  .endif
+        st1             {v26.8H},     [x0], x2
+        st1             {v27.8H},     [x0], x2
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+        h264_qpel_h_lowpass_l2_10 put
+        h264_qpel_h_lowpass_l2_10 avg
+
+function put_h264_qpel16_v_lowpass_neon_packed_10
+        mov             x4,  x30
+        mov             x2,  #8
+        bl              put_h264_qpel8_v_lowpass_neon
+        sub             x1,  x1,  x3, lsl #2
+        bl              put_h264_qpel8_v_lowpass_neon
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #8
+        bl              put_h264_qpel8_v_lowpass_neon
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+        b               put_h264_qpel8_v_lowpass_neon
+endfunc
+
+.macro  h264_qpel_v_lowpass_10 type
+function \type\()_h264_qpel16_v_lowpass_neon_10
+        mov             x4,  x30
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x0,  x0,  x2, lsl #4
+        add             x0,  x0,  #16
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #16
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_neon_10
+        ld1             {v16.8H}, [x1], x3
+        ld1             {v18.8H}, [x1], x3
+        ld1             {v20.8H}, [x1], x3
+        ld1             {v22.8H}, [x1], x3
+        ld1             {v24.8H}, [x1], x3
+        ld1             {v26.8H}, [x1], x3
+        ld1             {v28.8H}, [x1], x3
+        ld1             {v30.8H}, [x1], x3
+        ld1             {v17.8H}, [x1], x3
+        ld1             {v19.8H}, [x1], x3
+        ld1             {v21.8H}, [x1], x3
+        ld1             {v23.8H}, [x1], x3
+        ld1             {v25.8H}, [x1]
+
+        transpose_8x8H  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
+        transpose_8x8H  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
+        lowpass_8_10    v16, v17, v18, v19, v16, v17
+        lowpass_8_10    v20, v21, v22, v23, v18, v19
+        lowpass_8_10    v24, v25, v26, v27, v20, v21
+        lowpass_8_10    v28, v29, v30, v31, v22, v23
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

I'm a bit surprised to see this kind of vertical filtering done by transposing and filtering horizontally, when vertical filtering can be done so efficiently as-is, without needing any extra 'ext' instructions and such. But I see that the existing code does it this way. I'll try to make a PoC of rewriting the existing code for some case, to see how it behaves without the transposes.
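
Roughly, per output row, something like this (untested sketch, combined with the 16 bit trick above; register numbers arbitrary, with the six source rows already loaded as v16-v21 and v5/v6 set up as before):

        add             v2.8H,      v18.8H,      v19.8H     // row2 + row3
        add             v3.8H,      v17.8H,      v20.8H     // row1 + row4
        add             v22.8H,     v16.8H,      v21.8H     // row0 + row5
        mla             v22.8H,     v2.8H,       v6.H[1]    // += 20*(row2 + row3)
        mul             v23.8H,     v3.8H,       v6.H[0]    // 5*(row1 + row4)
        uqsub           v22.8H,     v22.8H,      v23.8H     // clamp negative results to 0
        urshr           v22.8H,     v22.8H,      #5         // rounding shift by 5
        umin            v22.8H,     v22.8H,      v5.8H      // clip to 1023

Each row already sits in its own register, so no ext is needed; only the row sums change from one output row to the next.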

// Martin
