On Thu, 19 Aug 2021, Mikhail Nitenko wrote:

diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index d27cfac494..eb18469b7f 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -932,3 +932,518 @@ endfunc

        h264_qpel16 put
        h264_qpel16 avg
+
+//trashes v0-v5, v7
+.macro  lowpass_8_10    r0,  r1,  r2,  r3,  d0,  d1
+        ext             v2.16B,     \r0\().16B,  \r1\().16B, #4
+        ext             v3.16B,     \r0\().16B,  \r1\().16B, #6
+        add             v2.8H,      v2.8H,       v3.8H
+        ext             v4.16B,     \r0\().16B,  \r1\().16B, #2
+        ext             v5.16B,     \r0\().16B,  \r1\().16B, #8
+        add             v4.8H,      v4.8H,       v5.8H
+        ext             v1.16B,     \r0\().16B,  \r1\().16B, #10
+        uaddl2          \d1\().4S,  \r0\().8H,   v1.8H
+        uaddl           \d0\().4S,  \r0\().4H,   v1.4H
+        ext             v0.16B,      \r2\().16B, \r3\().16B, #4

Nit: Indentation is off for the center column
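
I.e. something like this, matching the surrounding lines:

        ext             v0.16B,     \r2\().16B,  \r3\().16B, #4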

+        umlal           \d0\().4S,  v2.4H,       v6.H[1]
+        umlal2          \d1\().4S,  v2.8H,       v6.H[1]
+        ext             v1.16B,     \r2\().16B, \r3\().16B, #6
+        add             v0.8H,      v0.8H,       v1.8H
+        ext             v1.16B,     \r2\().16B,  \r3\().16B, #2
+        umlsl           \d0\().4S,  v4.4H,       v6.H[0]
+        umlsl2          \d1\().4S,  v4.8H,       v6.H[0]

I see why you need to go to 32 bit here, but I think this could be kept in 16 bit with this trick:

First do add + mla of the two positive coefficients. This can go outside the range of a signed 16 bit integer, so it must be treated as unsigned 16 bit. Then do a mul of the negative coefficient (corresponding to the umlsl here) into a separate register, also treated as an unsigned 16 bit value.

Then do a uqsub of these two 16 bit values; the result is nonnegative, but possibly still larger than the signed 16 bit range. So finally you do urshr instead of sqrshrun (and maybe also umin instead of smin).

Previously you had:
- 2 uaddl (16->32)
- 2 umlal (16->32)
- 2 umlsl (16->32)
- 2 sqrshrun (32->16)

With this, you'd get this down to:
- 1 add
- 1 mla
- 1 mul
- 1 uqsub
- 1 urshr

So 5 instructions instead of 8.

As there are fewer of each operation, it might be good to interleave it more with the second calculation if there are enough registers, to avoid stalling on a long sequential dependency chain through a single register.
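
As a rough sketch of what I mean for one half of the macro (untested, register numbers purely illustrative; v2, v4 and v1 as already computed by the ext/add instructions above, v6.H[0]/v6.H[1] holding the 5/20 coefficients as in the existing code):

        add             v16.8H,     \r0\().8H,   v1.8H      // x0 + x5, fits in unsigned 16 bit
        mla             v16.8H,     v2.8H,       v6.H[1]    // += 20*(x2 + x3), still unsigned 16 bit
        mul             v17.8H,     v4.8H,       v6.H[0]    // 5*(x1 + x4) into a separate register
        uqsub           v16.8H,     v16.8H,      v17.8H     // saturates at 0, result stays unsigned
        urshr           \d0\().8H,  v16.8H,      #5         // unsigned rounding shift instead of sqrshrun
        umin            \d0\().8H,  \d0\().8H,   v5.8H      // clip to 1023 with umin instead of smin

That maps directly onto the add/mla/mul/uqsub/urshr counted above; the clipping min is needed in both variants anyway.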

+        sqrshrun        \d0\().4H,  \d0\().4S,   #5
+        sqrshrun2       \d0\().8H,  \d1\().4S,   #5
+        ext             v3.16B,     \r2\().16B,  \r3\().16B, #8
+        add             v1.8H,      v1.8H,       v3.8H
+        ext             v2.16B,     \r2\().16B,  \r3\().16B, #10
+        uaddl           v3.4S,      \r2\().4H,   v2.4H
+        uaddl2          v4.4S,      \r2\().8H,   v2.8H
+        umlal           v3.4S,      v0.4H,       v6.H[1]
+        umlal2          v4.4S,      v0.8H,       v6.H[1]
+        umlsl           v3.4S,      v1.4H,       v6.H[0]
+        umlsl2          v4.4S,      v1.8H,       v6.H[0]
+        mvni            v5.8h,      #0xFC,       lsl #8 // 1023 for clipping
+        sqrshrun        \d1\().4H,  v3.4S,       #5
+        sqrshrun2       \d1\().8H,  v4.4S,       #5
+        smin            \d0\().8H,  \d0\().8H,   v5.8h
+        smin            \d1\().8H,  \d1\().8H,   v5.8h
+.endm
+
+function put_h264_qpel16_h_lowpass_neon_packed_10
+        mov             x4,  x30
+        mov             x12, #32
+        mov             x3,  #16
+        bl              put_h264_qpel8_h_lowpass_neon_10
+        sub             x1,  x1,  x2, lsl #4
+        add             x1,  x1,  #16
+        mov             x12, #32
+        mov             x30, x4
+        b               put_h264_qpel8_h_lowpass_neon_10
+endfunc
+
+.macro  h264_qpel_h_lowpass_10 type
+function \type\()_h264_qpel16_h_lowpass_neon_10
+        mov             x13, x30
+        mov             x12, #32
+        bl              \type\()_h264_qpel8_h_lowpass_neon_10
+        sub             x0,  x0,  x3, lsl #4
+        sub             x1,  x1,  x2, lsl #4
+        add             x0,  x0,  #16
+        add             x1,  x1,  #16
+        mov             x12, #32
+        mov             x30, x13
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_neon_10
+1:      ld1             {v28.8H, v29.8H}, [x1], x2
+        ld1             {v16.8H, v17.8H}, [x1], x2
+        subs            x12, x12, #4
+        lowpass_8_10    v28, v29, v16, v17, v28, v20
+  .ifc \type,avg
+        ld1             {v2.8H},    [x0], x3
+        urhadd          v28.8H, v28.8H,  v2.8H
+        ld1             {v3.8H},    [x0]
+        urhadd          v20.8H, v20.8H, v3.8H
+        sub             x0,  x0,  x3
+  .endif
+        st1             {v28.8H},    [x0], x3
+        st1             {v20.8H},    [x0], x3
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+        h264_qpel_h_lowpass_10 put
+        h264_qpel_h_lowpass_10 avg
+
+.macro h264_qpel_h_lowpass_l2_10 type
+function \type\()_h264_qpel16_h_lowpass_l2_neon_10
+        mov             x13, x30
+        mov             x12, #32
+        bl              \type\()_h264_qpel8_h_lowpass_l2_neon_10
+        sub             x0,  x0,  x2, lsl #4
+        sub             x1,  x1,  x2, lsl #4
+        sub             x3,  x3,  x2, lsl #4
+        add             x0,  x0,  #16
+        add             x1,  x1,  #16
+        add             x3,  x3,  #16
+        mov             x12, #32
+        mov             x30, x13
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_l2_neon_10
+1:      ld1             {v26.8H, v27.8H}, [x1], x2
+        ld1             {v16.8H, v17.8H}, [x1], x2
+        ld1             {v28.8H},     [x3], x2
+        ld1             {v29.8H},     [x3], x2
+        subs            x12, x12, #4
+        lowpass_8_10    v26, v27, v16, v17, v26, v27
+        urhadd          v26.8H, v26.8H, v28.8H
+        urhadd          v27.8H, v27.8H, v29.8H
+  .ifc \type,avg
+        ld1             {v2.8H},      [x0], x2
+        urhadd          v26.8H, v26.8H, v2.8H
+        ld1             {v3.8H},      [x0]
+        urhadd          v27.8H, v27.8H, v3.8H
+        sub             x0,  x0,  x2
+  .endif
+        st1             {v26.8H},     [x0], x2
+        st1             {v27.8H},     [x0], x2
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+        h264_qpel_h_lowpass_l2_10 put
+        h264_qpel_h_lowpass_l2_10 avg
+
+function put_h264_qpel16_v_lowpass_neon_packed_10
+        mov             x4,  x30
+        mov             x2,  #8
+        bl              put_h264_qpel8_v_lowpass_neon
+        sub             x1,  x1,  x3, lsl #2
+        bl              put_h264_qpel8_v_lowpass_neon
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #8
+        bl              put_h264_qpel8_v_lowpass_neon
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+        b               put_h264_qpel8_v_lowpass_neon
+endfunc
+
+.macro  h264_qpel_v_lowpass_10 type
+function \type\()_h264_qpel16_v_lowpass_neon_10
+        mov             x4,  x30
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x0,  x0,  x2, lsl #4
+        add             x0,  x0,  #16
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #16
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_neon_10
+        ld1             {v16.8H}, [x1], x3
+        ld1             {v18.8H}, [x1], x3
+        ld1             {v20.8H}, [x1], x3
+        ld1             {v22.8H}, [x1], x3
+        ld1             {v24.8H}, [x1], x3
+        ld1             {v26.8H}, [x1], x3
+        ld1             {v28.8H}, [x1], x3
+        ld1             {v30.8H}, [x1], x3
+        ld1             {v17.8H}, [x1], x3
+        ld1             {v19.8H}, [x1], x3
+        ld1             {v21.8H}, [x1], x3
+        ld1             {v23.8H}, [x1], x3
+        ld1             {v25.8H}, [x1]
+
+        transpose_8x8H  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
+        transpose_8x8H  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
+        lowpass_8_10    v16, v17, v18, v19, v16, v17
+        lowpass_8_10    v20, v21, v22, v23, v18, v19
+        lowpass_8_10    v24, v25, v26, v27, v20, v21
+        lowpass_8_10    v28, v29, v30, v31, v22, v23
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

I'm a bit surprised to see this kind of vertical filtering done by transposing and filtering horizontally, when vertical filtering can be done so efficiently as-is, without needing any extra 'ext' instructions and such. But I see that the existing code does it this way. I'll try to make a PoC of rewriting the existing code for some case, to see how it behaves without the transposes.
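
Roughly, per output row, something like this (untested sketch, combined with the 16 bit trick above; register numbers arbitrary, with the six source rows already loaded as v16-v21 and v5/v6 set up as before):

        add             v2.8H,      v18.8H,      v19.8H     // row2 + row3
        add             v3.8H,      v17.8H,      v20.8H     // row1 + row4
        add             v22.8H,     v16.8H,      v21.8H     // row0 + row5
        mla             v22.8H,     v2.8H,       v6.H[1]    // += 20*(row2 + row3)
        mul             v23.8H,     v3.8H,       v6.H[0]    // 5*(row1 + row4)
        uqsub           v22.8H,     v22.8H,      v23.8H     // clamp negative results to 0
        urshr           v22.8H,     v22.8H,      #5         // rounding shift by 5
        umin            v22.8H,     v22.8H,      v5.8H      // clip to 1023

Each row already sits in its own register, so no ext is needed; only the row sums change from one output row to the next.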

// Martin
