hevc_idct_32x32_8_c: 40128.5
hevc_idct_32x32_8_neon: 7102.0

Co-developed-by: Rafal Dabrowa <fatwild...@gmail.com>
Signed-off-by: J. Dekker <j...@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 1265 +++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |    2 +
 2 files changed, 1267 insertions(+)
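For reference when reading the new function: the timings above correspond to a speedup of about 5.65x over the C version. The e_32[]/o_32[] comments in the assembly refer to the usual even/odd split of the 32-point inverse transform, and every output pair is produced by an add/sub of the even and odd partial sums followed by a rounding narrowing shift (sqrshrn #7 in the vertical pass, #12 in the horizontal pass). The C below is only a minimal model of that final recombination step for one column of the vertical pass; the helper names are illustrative and not part of FFmpeg, and building the e[]/o[] sums from the .Lo8/.Lo16/.Lo32 tables is not repeated here.

#include <stdint.h>
#include <stddef.h>

/* Rounding shift by 7 with saturation to int16_t: the scalar equivalent
 * of one sqrshrn #7 lane used in the vertical pass. */
static inline int16_t round_shift7(int32_t v)
{
    v = (v + 64) >> 7;
    if (v < -32768) v = -32768;
    if (v >  32767) v =  32767;
    return (int16_t)v;
}

/* Recombine one column: e[k] holds the even-part sum (e_32[k]) and o[k]
 * the odd-part sum (o_32[k]); outputs k and 31-k are their sum and
 * difference, rounded and narrowed. */
static void recombine_column(int16_t *tmp, ptrdiff_t stride,
                             const int32_t e[16], const int32_t o[16])
{
    for (int k = 0; k < 16; k++) {
        tmp[k        * stride] = round_shift7(e[k] + o[k]);
        tmp[(31 - k) * stride] = round_shift7(e[k] - o[k]);
    }
}

With stride == 32 this reproduces the tmp[k * 32] / tmp[(31 - k) * 32] stores made after each e_32[k] +/- o_32[k] pair in the patch.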
Written by the same author as the other 16x16 idct; the same concern applies to this one as well.

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 784bae33b3..3b6e95153f 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -644,6 +644,40 @@ idct_dc 32, 10
 .Lo16transform5: .hword 43, -90, 57, 25, -87, 70, 9, -80 // transform[22][0-7]
 .Lo16transform6: .hword 25, -70, 90, -80, 43, 9, -57, 87 // transform[26][0-7]
 .Lo16transform7: .hword 9, -25, 43, -57, 70, -80, 87, -90 // transform[30][0-7]
+.Lo32transform:
+        .hword 90, 90, 88, 85, 82, 78, 73, 67          // transform[1,3,5,7..15][1]
+        .hword 61, 54, 46, 38, 31, 22, 13, 4           // transform[17,19,21..31][1]
+        .hword 90, 82, 67, 46, 22, -4, -31, -54        // transform[1,3,5,7..15][3]
+        .hword -73, -85, -90, -88, -78, -61, -38, -13  // transform[17,19,21..31][3]
+        .hword 88, 67, 31, -13, -54, -82, -90, -78     // ..
+        .hword -46, -4, 38, 73, 90, 85, 61, 22
+        .hword 85, 46, -13, -67, -90, -73, -22, 38
+        .hword 82, 88, 54, -4, -61, -90, -78, -31
+.Lo32transform9_31:
+        .hword 82, 22, -54, -90, -61, 13, 78, 85
+        .hword 31, -46, -90, -67, 4, 73, 88, 38
+        .hword 78, -4, -82, -73, 13, 85, 67, -22
+        .hword -88, -61, 31, 90, 54, -38, -90, -46
+        .hword 73, -31, -90, -22, 78, 67, -38, -90
+        .hword -13, 82, 61, -46, -88, -4, 85, 54
+        .hword 67, -54, -78, 38, 85, -22, -90, 4
+        .hword 90, 13, -88, -31, 82, 46, -73, -61
+        .hword 61, -73, -46, 82, 31, -88, -13, 90
+        .hword -4, -90, 22, 85, -38, -78, 54, 67
+        .hword 54, -85, -4, 88, -46, -61, 82, 13
+        .hword -90, 38, 67, -78, -22, 90, -31, -73
+        .hword 46, -90, 38, 54, -90, 31, 61, -88
+        .hword 22, 67, -85, 13, 73, -82, 4, 78
+        .hword 38, -88, 73, -4, -67, 90, -46, -31
+        .hword 85, -78, 13, 61, -90, 54, 22, -82
+        .hword 31, -78, 90, -61, 4, 54, -88, 82
+        .hword -38, -22, 73, -90, 67, -13, -46, 85
+        .hword 22, -61, 85, -90, 73, -38, -4, 46
+        .hword -78, 90, -82, 54, -13, -31, 67, -88
+        .hword 13, -38, 61, -78, 88, -90, 85, -73
+        .hword 54, -31, 4, 22, -46, 67, -82, 90
+        .hword 4, -13, 22, -31, 38, -46, 54, -61       // transform[1,3,5,7..15][31]
+        .hword 67, -73, 78, -82, 85, -88, 90, -90      // transform[17,19,21..31][31]
 
 // void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit)
 function ff_hevc_idct_16x16_8_neon_new, export=1
@@ -1284,3 +1318,1234 @@ function ff_hevc_idct_16x16_8_neon_new, export=1
         ld1     {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], 64
         ret
 endfunc
+
+function ff_hevc_idct_32x32_8_neon, export=1
+        sub     sp, sp, 64
+        st1     {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+        sub     sp, sp, 64
+        st1     {v12.16b, v13.16b, v14.16b, v15.16b}, [sp]
+        sub     sp, sp, 16 * 32 * 4        // room for o_32: 16 * 32 values
+        mov     x3, 0                      // loop counter
+        mov     x2, x0
+        mov     x7, 83
+        add     x7, x7, 36 * 65536         // o0, o1 coeff. factors
+1:      mov     x9, 128
+        // loading odd lines
+        add     x4, x2, 64                 // odd lines
+        ld1     {v16.8h}, [x4], x9         // line 1
+        ld1     {v17.8h}, [x4], x9         // line 3
+        ld1     {v18.8h}, [x4], x9         // line 5
+        ld1     {v19.8h}, [x4], x9         // line 7
+        ld1     {v20.8h}, [x4], x9         // line 9
+        ld1     {v21.8h}, [x4], x9         // line 11
+        ld1     {v22.8h}, [x4], x9         // line 13
+        ld1     {v23.8h}, [x4], x9         // line 15
+        ld1     {v24.8h}, [x4], x9         // line 17
+        ld1     {v25.8h}, [x4], x9         // line 19
+        ld1     {v26.8h}, [x4], x9         // line 21
+        ld1     {v27.8h}, [x4], x9         // line 23
+        ld1     {v28.8h}, [x4], x9         // line 25
+        ld1     {v29.8h}, [x4], x9         // line 27
+        ld1     {v30.8h}, [x4], x9         // line 29
+        ld1     {v31.8h}, [x4], x9         // line 31
+
+        cmp     x1, 28
+        b.hs    5f
+        // limit2 below 32
+        bic     x4, x1, 1
+        adr     x5, .LimitMask
+        cbnz    x3, 3f
+        // columns 0 ..
7 - cleanup of indexes 5 .. 7 + ld1 {v0.8h}, [x5] + adr x5, 2f + add x5, x5, x4, lsl 2 + add x5, x5, x4, lsl 1 + br x5 +2: and v16.16b, v16.16b, v0.16b // col_limit 0..1 -> limit2 == 4..5 + and v17.16b, v17.16b, v0.16b + b 5f + and v17.16b, v17.16b, v0.16b // col_limit 2..3 -> limit2 == 6..7 + and v18.16b, v18.16b, v0.16b + b 5f + and v18.16b, v18.16b, v0.16b // col_limit 4..5 -> limit2 == 8..9 + and v19.16b, v19.16b, v0.16b + b 5f + and v19.16b, v19.16b, v0.16b // col_limit 6..7 -> limit2 == 10..11 + and v20.16b, v20.16b, v0.16b + b 5f + and v20.16b, v20.16b, v0.16b // col_limit 8..9 -> limit2 == 12..13 + and v21.16b, v21.16b, v0.16b + b 5f + and v21.16b, v21.16b, v0.16b // col_limit 10..11 -> limit2 == 14..15 + and v22.16b, v22.16b, v0.16b + b 5f + and v22.16b, v22.16b, v0.16b // col_limit 12..13 -> limit2 == 16..17 + and v23.16b, v23.16b, v0.16b + b 5f + and v23.16b, v23.16b, v0.16b // col_limit 14..15 -> limit2 == 18..19 + and v24.16b, v24.16b, v0.16b + b 5f + and v24.16b, v24.16b, v0.16b // col_limit 16..17 -> limit2 == 20..21 + and v25.16b, v25.16b, v0.16b + b 5f + and v25.16b, v25.16b, v0.16b // col_limit 18..19 -> limit2 == 22..23 + and v26.16b, v26.16b, v0.16b + b 5f + and v26.16b, v26.16b, v0.16b // col_limit 20..21 -> limit2 == 24..25 + and v27.16b, v27.16b, v0.16b + b 5f + and v27.16b, v27.16b, v0.16b // col_limit 22..23 -> limit2 == 26..27 + and v28.16b, v28.16b, v0.16b + b 5f + and v28.16b, v28.16b, v0.16b // col_limit 24..25 -> limit2 == 28..29 + and v29.16b, v29.16b, v0.16b + b 5f + and v29.16b, v29.16b, v0.16b // col_limit 26..27 -> limit2 == 30..31 + and v30.16b, v30.16b, v0.16b + b 5f + // columns 8 .. 31 +3: add x4, x4, 6 + subs x4, x4, x3, lsl 3 + b.lo 5f + ld1 {v0.8h, v1.8h}, [x5] + adr x5, 4f + add x5, x5, x4, lsl 3 + add x5, x5, x4, lsl 1 + br x5 +4: and v16.16b, v16.16b, v1.16b // limit2 == 2..3 + b 5f + nop + nop + nop + and v16.16b, v16.16b, v1.16b // limit2 == 4..5 + and v17.16b, v17.16b, v1.16b + b 5f + nop + nop + and v16.16b, v16.16b, v0.16b // limit2 == 6..7 + and v17.16b, v17.16b, v1.16b + and v18.16b, v18.16b, v1.16b + b 5f + nop + and v16.16b, v16.16b, v0.16b // limit2 == 8..9 + and v17.16b, v17.16b, v0.16b + and v18.16b, v18.16b, v1.16b + and v19.16b, v19.16b, v1.16b + b 5f + and v17.16b, v17.16b, v0.16b // limit2 == 10..11 + and v18.16b, v18.16b, v0.16b + and v19.16b, v19.16b, v1.16b + and v20.16b, v20.16b, v1.16b + b 5f + and v18.16b, v18.16b, v0.16b // limit2 == 12..13 + and v19.16b, v19.16b, v0.16b + and v20.16b, v20.16b, v1.16b + and v21.16b, v21.16b, v1.16b + b 5f + and v19.16b, v19.16b, v0.16b // limit2 == 14..15 + and v20.16b, v20.16b, v0.16b + and v21.16b, v21.16b, v1.16b + and v22.16b, v22.16b, v1.16b + b 5f + and v20.16b, v20.16b, v0.16b // limit2 == 16..17 + and v21.16b, v21.16b, v0.16b + and v22.16b, v22.16b, v1.16b + and v23.16b, v23.16b, v1.16b + b 5f + and v21.16b, v21.16b, v0.16b // limit2 == 18..19 + and v22.16b, v22.16b, v0.16b + and v23.16b, v23.16b, v1.16b + and v24.16b, v24.16b, v1.16b + b 5f + and v22.16b, v22.16b, v0.16b // limit2 == 20..21 + and v23.16b, v23.16b, v0.16b + and v24.16b, v24.16b, v1.16b + and v25.16b, v25.16b, v1.16b + b 5f + and v23.16b, v23.16b, v0.16b // limit2 == 22..23 + and v24.16b, v24.16b, v0.16b + and v25.16b, v25.16b, v1.16b + and v26.16b, v26.16b, v1.16b + b 5f + and v24.16b, v24.16b, v0.16b // limit2 == 24..25 + and v25.16b, v25.16b, v0.16b + and v26.16b, v26.16b, v1.16b + and v27.16b, v27.16b, v1.16b + b 5f + and v25.16b, v25.16b, v0.16b // limit2 == 26..27 + and v26.16b, v26.16b, v0.16b + and v27.16b, 
v27.16b, v1.16b + and v28.16b, v28.16b, v1.16b + b 5f + + + // o_32 +5: mov x5, 32 + cmp x1, 28 + b.hs 6f + add x5, x1, 4 + bic x5, x5, 1 + cbz x3, 6f + add x5, x1, 6 + orr x5, x5, 1 + subs x5, x5, x3, lsl 3 + csel x5, x5, xzr, hs +6: mov x4, 128 + sub x4, x4, x5, lsl 2 + adr x5, 8f + add x5, x5, x4 + adr x4, .Lo32transform + mov x8, sp + mov x6, 16 +7: ld1 {v2.8h, v3.8h}, [x4], 32 + movi v8.4s, 0 + movi v9.4s, 0 + br x5 +8: smlal2 v9.4s, v31.8h, v3.h[7] + smlal v8.4s, v31.4h, v3.h[7] + smlal2 v9.4s, v30.8h, v3.h[6] + smlal v8.4s, v30.4h, v3.h[6] + smlal2 v9.4s, v29.8h, v3.h[5] + smlal v8.4s, v29.4h, v3.h[5] + smlal2 v9.4s, v28.8h, v3.h[4] + smlal v8.4s, v28.4h, v3.h[4] + smlal2 v9.4s, v27.8h, v3.h[3] + smlal v8.4s, v27.4h, v3.h[3] + smlal2 v9.4s, v26.8h, v3.h[2] + smlal v8.4s, v26.4h, v3.h[2] + smlal2 v9.4s, v25.8h, v3.h[1] + smlal v8.4s, v25.4h, v3.h[1] + smlal2 v9.4s, v24.8h, v3.h[0] + smlal v8.4s, v24.4h, v3.h[0] + smlal2 v9.4s, v23.8h, v2.h[7] + smlal v8.4s, v23.4h, v2.h[7] + smlal2 v9.4s, v22.8h, v2.h[6] + smlal v8.4s, v22.4h, v2.h[6] + smlal2 v9.4s, v21.8h, v2.h[5] + smlal v8.4s, v21.4h, v2.h[5] + smlal2 v9.4s, v20.8h, v2.h[4] + smlal v8.4s, v20.4h, v2.h[4] + smlal2 v9.4s, v19.8h, v2.h[3] + smlal v8.4s, v19.4h, v2.h[3] + smlal2 v9.4s, v18.8h, v2.h[2] + smlal v8.4s, v18.4h, v2.h[2] + smlal2 v9.4s, v17.8h, v2.h[1] + smlal v8.4s, v17.4h, v2.h[1] + smlal2 v9.4s, v16.8h, v2.h[0] + smlal v8.4s, v16.4h, v2.h[0] + st1 {v8.4s, v9.4s}, [x8], 32 + subs x6, x6, 1 + b.ne 7b + + mov x4, x2 + ld1 {v16.8h}, [x4], x9 // line 0 + ld1 {v17.8h}, [x4], x9 // line 2 + ld1 {v18.8h}, [x4], x9 // line 4 + ld1 {v19.8h}, [x4], x9 // line 6 + ld1 {v20.8h}, [x4], x9 // line 8 + ld1 {v21.8h}, [x4], x9 // line 10 + ld1 {v22.8h}, [x4], x9 // line 12 + ld1 {v23.8h}, [x4], x9 // line 14 + ld1 {v24.8h}, [x4], x9 // line 16 + ld1 {v25.8h}, [x4], x9 // line 18 + ld1 {v26.8h}, [x4], x9 // line 20 + ld1 {v27.8h}, [x4], x9 // line 22 + ld1 {v28.8h}, [x4], x9 // line 24 + ld1 {v29.8h}, [x4], x9 // line 26 + ld1 {v30.8h}, [x4], x9 // line 28 + ld1 {v31.8h}, [x4], x9 // line 30 + cmp x1, 28 + b.hs 12f + // limit2 below 32 + bic x4, x1, 3 + cbnz x3, 10f + // columns 0 .. 7 - cleanup of indexes 5 .. 7 + adr x5, 9f + add x5, x5, x4, lsl 1 + br x5 +9: and v17.16b, v17.16b, v0.16b // col_limit 0..3 -> limit2/2 == 2..3 + b 12f + and v19.16b, v19.16b, v0.16b // col_limit 4..7 -> limit2/2 == 4..5 + b 12f + and v21.16b, v21.16b, v0.16b // col_limit 8..11 -> limit2/2 == 6..7 + b 12f + and v23.16b, v23.16b, v0.16b // col_limit 12..15 -> limit2/2 == 8..9 + b 12f + and v25.16b, v25.16b, v0.16b // col_limit 16..19 -> limit2/2 == 10..11 + b 12f + and v27.16b, v27.16b, v0.16b // col_limit 20..23 -> limit2/2 == 12..13 + b 12f + and v29.16b, v29.16b, v0.16b // col_limit 24..27 -> limit2/2 == 14..15 + b 12f + // columns 8 .. 
31 +10: add x4, x4, 4 + subs x4, x4, x3, lsl 3 // x4 = (limit2 & ~3)-4 for column 8 * x3 + b.lo 12f + adr x5, 11f + add x5, x5, x4, lsl 1 + add x5, x5, x4 + br x5 +11: and v17.16b, v17.16b, v1.16b // limit2 == 4..7 + b 12f + nop + and v17.16b, v17.16b, v0.16b // limit2 == 8..11 + and v19.16b, v19.16b, v1.16b + b 12f + and v19.16b, v19.16b, v0.16b // limit2 == 12..15 + and v21.16b, v21.16b, v1.16b + b 12f + and v21.16b, v21.16b, v0.16b // limit2 == 16..19 + and v23.16b, v23.16b, v1.16b + b 12f + and v23.16b, v23.16b, v0.16b // limit2 == 20..23 + and v25.16b, v25.16b, v1.16b + b 12f + and v25.16b, v25.16b, v0.16b // limit2 == 24..27 + and v27.16b, v27.16b, v1.16b + b 12f + + // v0,v1 = e0 +12: sshll v0.4s, v16.4h, 6 + sshll v1.4s, v24.4h, 6 + add v0.4s, v0.4s, v1.4s + sshll2 v1.4s, v16.8h, 6 + sshll2 v2.4s, v24.8h, 6 + add v1.4s, v1.4s, v2.4s + + // v2,v3 = o0 + mov v14.s[0], w7 + smull v2.4s, v20.4h, v14.h[0] + smlal v2.4s, v28.4h, v14.h[1] + smull2 v3.4s, v20.8h, v14.h[0] + smlal2 v3.4s, v28.8h, v14.h[1] + + // v4,v5 = e_8[0] + add v4.4s, v0.4s, v2.4s + add v5.4s, v1.4s, v3.4s + + // v6,v7 = e_8[3] + sub v6.4s, v0.4s, v2.4s + sub v7.4s, v1.4s, v3.4s + + + // v0,v1 = o_8[0] + adr x4, .Lo8transform0 + ld1 {v15.4h}, [x4] + smull v0.4s, v18.4h, v15.h[0] + smlal v0.4s, v22.4h, v15.h[1] + smlal v0.4s, v26.4h, v15.h[2] + smlal v0.4s, v30.4h, v15.h[3] + smull2 v1.4s, v18.8h, v15.h[0] + smlal2 v1.4s, v22.8h, v15.h[1] + smlal2 v1.4s, v26.8h, v15.h[2] + smlal2 v1.4s, v30.8h, v15.h[3] + + // v2,v3 = e_16[0] + add v2.4s, v4.4s, v0.4s + add v3.4s, v5.4s, v1.4s + + // v8,v9 = o_16[0] + adr x4, .Lo16transform0 + ld1 {v15.8h}, [x4] + mov x5, 32 + cmp x1, 28 + b.hs 13f + add x5, x1, 4 + bic x5, x5, 3 + cbz x3, 13f + orr x5, x5, 2 + subs x5, x5, x3, lsl 3 + csel x5, x5, xzr, hs +13: mov x4, 64 + sub x6, x4, x5, lsl 1 + adr x5, 14f + add x5, x5, x6 + movi v8.4s, 0 + movi v9.4s, 0 + br x5 +14: smlal2 v9.4s, v31.8h, v15.h[7] + smlal v8.4s, v31.4h, v15.h[7] + smlal2 v9.4s, v29.8h, v15.h[6] + smlal v8.4s, v29.4h, v15.h[6] + smlal2 v9.4s, v27.8h, v15.h[5] + smlal v8.4s, v27.4h, v15.h[5] + smlal2 v9.4s, v25.8h, v15.h[4] + smlal v8.4s, v25.4h, v15.h[4] + smlal2 v9.4s, v23.8h, v15.h[3] + smlal v8.4s, v23.4h, v15.h[3] + smlal2 v9.4s, v21.8h, v15.h[2] + smlal v8.4s, v21.4h, v15.h[2] + smlal2 v9.4s, v19.8h, v15.h[1] + smlal v8.4s, v19.4h, v15.h[1] + smlal2 v9.4s, v17.8h, v15.h[0] + smlal v8.4s, v17.4h, v15.h[0] + + // v12,v13 = e_32[0] + add v12.4s, v2.4s, v8.4s + add v13.4s, v3.4s, v9.4s + + // v14,v15 = o_32[0] + ld1 {v14.4s, v15.4s}, [sp] + + // tmp[0 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + st1 {v10.8h}, [x2] + + // tmp[31 * 32] + sub v10.4s, v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 31 * 64 + st1 {v10.8h}, [x4] + + // v12,v13 = e_32[15] + sub v12.4s, v2.4s, v8.4s + sub v13.4s, v3.4s, v9.4s + + // v14,v15 = o_32[15] + add x4, sp, 15 * 32 + ld1 {v14.4s, v15.4s}, [x4] + + // tmp[15 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 15 * 64 + st1 {v10.8h}, [x4] + + // tmp[16 * 32] + sub v10.4s, v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 16 * 64 + st1 {v10.8h}, [x4] + + // v2,v3 = e_16[7] + sub v2.4s, v4.4s, v0.4s + sub v3.4s, v5.4s, v1.4s + + // v8,v9 = o_16[7] + adr x4, .Lo16transform7 + ld1 {v15.8h}, [x4] + adr 
x5, 15f + add x5, x5, x6 + movi v8.4s, 0 + movi v9.4s, 0 + br x5 +15: smlal2 v9.4s, v31.8h, v15.h[7] + smlal v8.4s, v31.4h, v15.h[7] + smlal2 v9.4s, v29.8h, v15.h[6] + smlal v8.4s, v29.4h, v15.h[6] + smlal2 v9.4s, v27.8h, v15.h[5] + smlal v8.4s, v27.4h, v15.h[5] + smlal2 v9.4s, v25.8h, v15.h[4] + smlal v8.4s, v25.4h, v15.h[4] + smlal2 v9.4s, v23.8h, v15.h[3] + smlal v8.4s, v23.4h, v15.h[3] + smlal2 v9.4s, v21.8h, v15.h[2] + smlal v8.4s, v21.4h, v15.h[2] + smlal2 v9.4s, v19.8h, v15.h[1] + smlal v8.4s, v19.4h, v15.h[1] + smlal2 v9.4s, v17.8h, v15.h[0] + smlal v8.4s, v17.4h, v15.h[0] + + // v12,v13 = e_32[7] + add v12.4s, v2.4s, v8.4s + add v13.4s, v3.4s, v9.4s + + // v14,v15 = o_32[7] + add x4, sp, 7 * 32 + ld1 {v14.4s, v15.4s}, [x4] + + // tmp[7 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 7 * 64 + st1 {v10.8h}, [x4] + + // tmp[24 * 32] + sub v10.4s, v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 24 * 64 + st1 {v10.8h}, [x4] + + // v12,v13 = e_32[8] + sub v12.4s, v2.4s, v8.4s + sub v13.4s, v3.4s, v9.4s + + // v14,v15 = o_32[8] + add x4, sp, 8 * 32 + ld1 {v14.4s, v15.4s}, [x4] + + // tmp[8 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 8 * 64 + st1 {v10.8h}, [x4] + + // tmp[23 * 32] + sub v10.4s, v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 23 * 64 + st1 {v10.8h}, [x4] + + // v0,v1 = o_8[3] + adr x4, .Lo8transform3 + ld1 {v15.4h}, [x4] + smull v0.4s, v18.4h, v15.h[0] + smlal v0.4s, v22.4h, v15.h[1] + smlal v0.4s, v26.4h, v15.h[2] + smlal v0.4s, v30.4h, v15.h[3] + smull2 v1.4s, v18.8h, v15.h[0] + smlal2 v1.4s, v22.8h, v15.h[1] + smlal2 v1.4s, v26.8h, v15.h[2] + smlal2 v1.4s, v30.8h, v15.h[3] + + // v2,v3 = e_16[3] + add v2.4s, v6.4s, v0.4s + add v3.4s, v7.4s, v1.4s + + // v8,v9 = o_16[3] + adr x4, .Lo16transform3 + ld1 {v15.8h}, [x4] + adr x5, 16f + add x5, x5, x6 + movi v8.4s, 0 + movi v9.4s, 0 + br x5 +16: smlal2 v9.4s, v31.8h, v15.h[7] + smlal v8.4s, v31.4h, v15.h[7] + smlal2 v9.4s, v29.8h, v15.h[6] + smlal v8.4s, v29.4h, v15.h[6] + smlal2 v9.4s, v27.8h, v15.h[5] + smlal v8.4s, v27.4h, v15.h[5] + smlal2 v9.4s, v25.8h, v15.h[4] + smlal v8.4s, v25.4h, v15.h[4] + smlal2 v9.4s, v23.8h, v15.h[3] + smlal v8.4s, v23.4h, v15.h[3] + smlal2 v9.4s, v21.8h, v15.h[2] + smlal v8.4s, v21.4h, v15.h[2] + smlal2 v9.4s, v19.8h, v15.h[1] + smlal v8.4s, v19.4h, v15.h[1] + smlal2 v9.4s, v17.8h, v15.h[0] + smlal v8.4s, v17.4h, v15.h[0] + + // v12,v13 = e_32[3] + add v12.4s, v2.4s, v8.4s + add v13.4s, v3.4s, v9.4s + + // v14,v15 = o_32[3] + add x4, sp, 3 * 32 + ld1 {v14.4s, v15.4s}, [x4] + + // tmp[3 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 3 * 64 + st1 {v10.8h}, [x4] + + // tmp[28 * 32] + sub v10.4s, v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 28 * 64 + st1 {v10.8h}, [x4] + + // v12,v13 = e_32[12] + sub v12.4s, v2.4s, v8.4s + sub v13.4s, v3.4s, v9.4s + + // v14,v15 = o_32[12] + add x4, sp, 12 * 32 + ld1 {v14.4s, v15.4s}, [x4] + + // tmp[12 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 12 * 64 + st1 {v10.8h}, [x4] + + // tmp[19 * 32] + sub v10.4s, 
v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 19 * 64 + st1 {v10.8h}, [x4] + + // v2,v3 = e_16[4] + sub v2.4s, v6.4s, v0.4s + sub v3.4s, v7.4s, v1.4s + + // v8,v9 = o_16[4] + adr x4, .Lo16transform4 + ld1 {v15.8h}, [x4] + adr x5, 17f + add x5, x5, x6 + movi v8.4s, 0 + movi v9.4s, 0 + br x5 +17: smlal2 v9.4s, v31.8h, v15.h[7] + smlal v8.4s, v31.4h, v15.h[7] + smlal2 v9.4s, v29.8h, v15.h[6] + smlal v8.4s, v29.4h, v15.h[6] + smlal2 v9.4s, v27.8h, v15.h[5] + smlal v8.4s, v27.4h, v15.h[5] + smlal2 v9.4s, v25.8h, v15.h[4] + smlal v8.4s, v25.4h, v15.h[4] + smlal2 v9.4s, v23.8h, v15.h[3] + smlal v8.4s, v23.4h, v15.h[3] + smlal2 v9.4s, v21.8h, v15.h[2] + smlal v8.4s, v21.4h, v15.h[2] + smlal2 v9.4s, v19.8h, v15.h[1] + smlal v8.4s, v19.4h, v15.h[1] + smlal2 v9.4s, v17.8h, v15.h[0] + smlal v8.4s, v17.4h, v15.h[0] + + // v12,v13 = e_32[4] + add v12.4s, v2.4s, v8.4s + add v13.4s, v3.4s, v9.4s + + // v14,v15 = o_32[4] + add x4, sp, 4 * 32 + ld1 {v14.4s, v15.4s}, [x4] + + // tmp[4 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 4 * 64 + st1 {v10.8h}, [x4] + + // tmp[27 * 32] + sub v10.4s, v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 27 * 64 + st1 {v10.8h}, [x4] + + // v12,v13 = e_32[11] + sub v12.4s, v2.4s, v8.4s + sub v13.4s, v3.4s, v9.4s + + // v14,v15 = o_32[11] + add x4, sp, 11 * 32 + ld1 {v14.4s, v15.4s}, [x4] + + // tmp[11 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 11 * 64 + st1 {v10.8h}, [x4] + + // tmp[20 * 32] + sub v10.4s, v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 20 * 64 + st1 {v10.8h}, [x4] + + // v0,v1 = e1 + sshll v0.4s, v16.4h, 6 + sshll v1.4s, v24.4h, 6 + sub v0.4s, v0.4s, v1.4s + sshll2 v1.4s, v16.8h, 6 + sshll2 v2.4s, v24.8h, 6 + sub v1.4s, v1.4s, v2.4s + + // v2,v3 = o1 + mov v14.s[0], w7 + smull v2.4s, v20.4h, v14.h[1] + smlsl v2.4s, v28.4h, v14.h[0] + smull2 v3.4s, v20.8h, v14.h[1] + smlsl2 v3.4s, v28.8h, v14.h[0] + + // v4,v5 = e_8[1] + add v4.4s, v0.4s, v2.4s + add v5.4s, v1.4s, v3.4s + + // v6,v7 = e_8[2] + sub v6.4s, v0.4s, v2.4s + sub v7.4s, v1.4s, v3.4s + + // v0,v1 = o_8[1] + adr x4, .Lo8transform1 + ld1 {v15.4h}, [x4] + smull v0.4s, v18.4h, v15.h[0] + smlal v0.4s, v22.4h, v15.h[1] + smlal v0.4s, v26.4h, v15.h[2] + smlal v0.4s, v30.4h, v15.h[3] + smull2 v1.4s, v18.8h, v15.h[0] + smlal2 v1.4s, v22.8h, v15.h[1] + smlal2 v1.4s, v26.8h, v15.h[2] + smlal2 v1.4s, v30.8h, v15.h[3] + + // v2,v3 = e_16[1] + add v2.4s, v4.4s, v0.4s + add v3.4s, v5.4s, v1.4s + + // v8,v9 = o_16[1] + adr x4, .Lo16transform1 + ld1 {v15.8h}, [x4] + adr x5, 18f + add x5, x5, x6 + movi v8.4s, 0 + movi v9.4s, 0 + br x5 +18: smlal2 v9.4s, v31.8h, v15.h[7] + smlal v8.4s, v31.4h, v15.h[7] + smlal2 v9.4s, v29.8h, v15.h[6] + smlal v8.4s, v29.4h, v15.h[6] + smlal2 v9.4s, v27.8h, v15.h[5] + smlal v8.4s, v27.4h, v15.h[5] + smlal2 v9.4s, v25.8h, v15.h[4] + smlal v8.4s, v25.4h, v15.h[4] + smlal2 v9.4s, v23.8h, v15.h[3] + smlal v8.4s, v23.4h, v15.h[3] + smlal2 v9.4s, v21.8h, v15.h[2] + smlal v8.4s, v21.4h, v15.h[2] + smlal2 v9.4s, v19.8h, v15.h[1] + smlal v8.4s, v19.4h, v15.h[1] + smlal2 v9.4s, v17.8h, v15.h[0] + smlal v8.4s, v17.4h, v15.h[0] + + // v12,v13 = e_32[1] + add v12.4s, v2.4s, v8.4s + add v13.4s, v3.4s, v9.4s + + // v14,v15 = 
o_32[1] + add x4, sp, 1 * 32 + ld1 {v14.4s, v15.4s}, [x4] + + // tmp[1 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 1 * 64 + st1 {v10.8h}, [x4] + + // tmp[30 * 32] + sub v10.4s, v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 30 * 64 + st1 {v10.8h}, [x4] + + // v12,v13 = e_32[14] + sub v12.4s, v2.4s, v8.4s + sub v13.4s, v3.4s, v9.4s + + // v14,v15 = o_32[14] + add x4, sp, 14 * 32 + ld1 {v14.4s, v15.4s}, [x4] + + // tmp[14 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 14 * 64 + st1 {v10.8h}, [x4] + + // tmp[17 * 32] + sub v10.4s, v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 17 * 64 + st1 {v10.8h}, [x4] + + // v2,v3 = e_16[6] + sub v2.4s, v4.4s, v0.4s + sub v3.4s, v5.4s, v1.4s + + // v8,v9 = o_16[6] + adr x4, .Lo16transform6 + ld1 {v15.8h}, [x4] + adr x5, 19f + add x5, x5, x6 + movi v8.4s, 0 + movi v9.4s, 0 + br x5 +19: smlal2 v9.4s, v31.8h, v15.h[7] + smlal v8.4s, v31.4h, v15.h[7] + smlal2 v9.4s, v29.8h, v15.h[6] + smlal v8.4s, v29.4h, v15.h[6] + smlal2 v9.4s, v27.8h, v15.h[5] + smlal v8.4s, v27.4h, v15.h[5] + smlal2 v9.4s, v25.8h, v15.h[4] + smlal v8.4s, v25.4h, v15.h[4] + smlal2 v9.4s, v23.8h, v15.h[3] + smlal v8.4s, v23.4h, v15.h[3] + smlal2 v9.4s, v21.8h, v15.h[2] + smlal v8.4s, v21.4h, v15.h[2] + smlal2 v9.4s, v19.8h, v15.h[1] + smlal v8.4s, v19.4h, v15.h[1] + smlal2 v9.4s, v17.8h, v15.h[0] + smlal v8.4s, v17.4h, v15.h[0] + + // v12,v13 = e_32[6] + add v12.4s, v2.4s, v8.4s + add v13.4s, v3.4s, v9.4s + + // v14,v15 = o_32[6] + add x4, sp, 6 * 32 + ld1 {v14.4s, v15.4s}, [x4] + + // tmp[6 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 6 * 64 + st1 {v10.8h}, [x4] + + // tmp[25 * 32] + sub v10.4s, v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 25 * 64 + st1 {v10.8h}, [x4] + + // v12,v13 = e_32[9] + sub v12.4s, v2.4s, v8.4s + sub v13.4s, v3.4s, v9.4s + + // v14,v15 = o_32[9] + add x4, sp, 9 * 32 + ld1 {v14.4s, v15.4s}, [x4] + + // tmp[9 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 9 * 64 + st1 {v10.8h}, [x4] + + // tmp[22 * 32] + sub v10.4s, v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 22 * 64 + st1 {v10.8h}, [x4] + + // v0,v1 = o_8[2] + adr x4, .Lo8transform2 + ld1 {v15.4h}, [x4] + smull v0.4s, v18.4h, v15.h[0] + smlal v0.4s, v22.4h, v15.h[1] + smlal v0.4s, v26.4h, v15.h[2] + smlal v0.4s, v30.4h, v15.h[3] + smull2 v1.4s, v18.8h, v15.h[0] + smlal2 v1.4s, v22.8h, v15.h[1] + smlal2 v1.4s, v26.8h, v15.h[2] + smlal2 v1.4s, v30.8h, v15.h[3] + + // v2,v3 = e_16[2] + add v2.4s, v6.4s, v0.4s + add v3.4s, v7.4s, v1.4s + + // v8,v9 = o_16[2] + adr x4, .Lo16transform2 + ld1 {v15.8h}, [x4] + adr x5, 20f + add x5, x5, x6 + movi v8.4s, 0 + movi v9.4s, 0 + br x5 +20: smlal2 v9.4s, v31.8h, v15.h[7] + smlal v8.4s, v31.4h, v15.h[7] + smlal2 v9.4s, v29.8h, v15.h[6] + smlal v8.4s, v29.4h, v15.h[6] + smlal2 v9.4s, v27.8h, v15.h[5] + smlal v8.4s, v27.4h, v15.h[5] + smlal2 v9.4s, v25.8h, v15.h[4] + smlal v8.4s, v25.4h, v15.h[4] + smlal2 v9.4s, v23.8h, v15.h[3] + smlal v8.4s, v23.4h, v15.h[3] + 
smlal2 v9.4s, v21.8h, v15.h[2] + smlal v8.4s, v21.4h, v15.h[2] + smlal2 v9.4s, v19.8h, v15.h[1] + smlal v8.4s, v19.4h, v15.h[1] + smlal2 v9.4s, v17.8h, v15.h[0] + smlal v8.4s, v17.4h, v15.h[0] + + // v12,v13 = e_32[2] + add v12.4s, v2.4s, v8.4s + add v13.4s, v3.4s, v9.4s + + // v14,v15 = o_32[2] + add x4, sp, 2 * 32 + ld1 {v14.4s, v15.4s}, [x4] + + // tmp[2 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 2 * 64 + st1 {v10.8h}, [x4] + + // tmp[29 * 32] + sub v10.4s, v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 29 * 64 + st1 {v10.8h}, [x4] + + // v12,v13 = e_32[13] + sub v12.4s, v2.4s, v8.4s + sub v13.4s, v3.4s, v9.4s + + // v14,v15 = o_32[13] + add x4, sp, 13 * 32 + ld1 {v14.4s, v15.4s}, [x4] + + // tmp[13 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 13 * 64 + st1 {v10.8h}, [x4] + + // tmp[18 * 32] + sub v10.4s, v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 18 * 64 + st1 {v10.8h}, [x4] + + // v2,v3 = e_16[5] + sub v2.4s, v6.4s, v0.4s + sub v3.4s, v7.4s, v1.4s + + // v8,v9 = o_16[5] + adr x4, .Lo16transform5 + ld1 {v15.8h}, [x4] + adr x5, 21f + add x5, x5, x6 + movi v8.4s, 0 + movi v9.4s, 0 + br x5 +21: smlal2 v9.4s, v31.8h, v15.h[7] + smlal v8.4s, v31.4h, v15.h[7] + smlal2 v9.4s, v29.8h, v15.h[6] + smlal v8.4s, v29.4h, v15.h[6] + smlal2 v9.4s, v27.8h, v15.h[5] + smlal v8.4s, v27.4h, v15.h[5] + smlal2 v9.4s, v25.8h, v15.h[4] + smlal v8.4s, v25.4h, v15.h[4] + smlal2 v9.4s, v23.8h, v15.h[3] + smlal v8.4s, v23.4h, v15.h[3] + smlal2 v9.4s, v21.8h, v15.h[2] + smlal v8.4s, v21.4h, v15.h[2] + smlal2 v9.4s, v19.8h, v15.h[1] + smlal v8.4s, v19.4h, v15.h[1] + smlal2 v9.4s, v17.8h, v15.h[0] + smlal v8.4s, v17.4h, v15.h[0] + + // v12,v13 = e_32[5] + add v12.4s, v2.4s, v8.4s + add v13.4s, v3.4s, v9.4s + + // v14,v15 = o_32[5] + add x4, sp, 5 * 32 + ld1 {v14.4s, v15.4s}, [x4] + + // tmp[5 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 5 * 64 + st1 {v10.8h}, [x4] + + // tmp[26 * 32] + sub v10.4s, v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 26 * 64 + st1 {v10.8h}, [x4] + + // v12,v13 = e_32[10] + sub v12.4s, v2.4s, v8.4s + sub v13.4s, v3.4s, v9.4s + + // v14,v15 = o_32[10] + add x4, sp, 10 * 32 + ld1 {v14.4s, v15.4s}, [x4] + + // tmp[10 * 32] + add v10.4s, v12.4s, v14.4s + add v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 10 * 64 + st1 {v10.8h}, [x4] + + // tmp[21 * 32] + sub v10.4s, v12.4s, v14.4s + sub v11.4s, v13.4s, v15.4s + sqrshrn v10.4h, v10.4s, 7 + sqrshrn2 v10.8h, v11.4s, 7 + add x4, x2, 21 * 64 + st1 {v10.8h}, [x4] + + + add x2, x2, 16 + add x3, x3, 1 + cmp x3, 4 + b.ne 1b + + // horizontal transform + cmp x1, 9 + b.ls 24f + // o_32 partially (last 12 sum components) + adr x4, .Lo32transform9_31 + ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x4], 64 + ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x4], 64 + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], 64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], 64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], 64 + bic x5, x1, 1 + subs x5, x5, 8 + csel x5, x5, xzr, hs + mov x4, 24 + subs x4, x4, x5 + csel x5, x4, 
xzr, hs + adr x4, 23f + add x5, x4, x5, lsl 3 + add x2, x0, 16 + mov x8, sp + mov x3, 64 + mov x6, 32 +22: ld1 {v0.8h, v1.8h, v2.8h}, [x2], x3 + movi v4.4s, 0 + movi v5.4s, 0 + movi v6.4s, 0 + movi v7.4s, 0 + br x5 +23: smlal v4.4s, v30.4h, v2.h[7] + smlal2 v5.4s, v30.8h, v2.h[7] + smlal v6.4s, v31.4h, v2.h[7] + smlal2 v7.4s, v31.8h, v2.h[7] + smlal v4.4s, v28.4h, v2.h[5] + smlal2 v5.4s, v28.8h, v2.h[5] + smlal v6.4s, v29.4h, v2.h[5] + smlal2 v7.4s, v29.8h, v2.h[5] + smlal v4.4s, v26.4h, v2.h[3] + smlal2 v5.4s, v26.8h, v2.h[3] + smlal v6.4s, v27.4h, v2.h[3] + smlal2 v7.4s, v27.8h, v2.h[3] + smlal v4.4s, v24.4h, v2.h[1] + smlal2 v5.4s, v24.8h, v2.h[1] + smlal v6.4s, v25.4h, v2.h[1] + smlal2 v7.4s, v25.8h, v2.h[1] + smlal v4.4s, v22.4h, v1.h[7] + smlal2 v5.4s, v22.8h, v1.h[7] + smlal v6.4s, v23.4h, v1.h[7] + smlal2 v7.4s, v23.8h, v1.h[7] + smlal v4.4s, v20.4h, v1.h[5] + smlal2 v5.4s, v20.8h, v1.h[5] + smlal v6.4s, v21.4h, v1.h[5] + smlal2 v7.4s, v21.8h, v1.h[5] + smlal v4.4s, v18.4h, v1.h[3] + smlal2 v5.4s, v18.8h, v1.h[3] + smlal v6.4s, v19.4h, v1.h[3] + smlal2 v7.4s, v19.8h, v1.h[3] + smlal v4.4s, v16.4h, v1.h[1] + smlal2 v5.4s, v16.8h, v1.h[1] + smlal v6.4s, v17.4h, v1.h[1] + smlal2 v7.4s, v17.8h, v1.h[1] + smlal v4.4s, v14.4h, v0.h[7] + smlal2 v5.4s, v14.8h, v0.h[7] + smlal v6.4s, v15.4h, v0.h[7] + smlal2 v7.4s, v15.8h, v0.h[7] + smlal v4.4s, v12.4h, v0.h[5] + smlal2 v5.4s, v12.8h, v0.h[5] + smlal v6.4s, v13.4h, v0.h[5] + smlal2 v7.4s, v13.8h, v0.h[5] + smlal v4.4s, v10.4h, v0.h[3] + smlal2 v5.4s, v10.8h, v0.h[3] + smlal v6.4s, v11.4h, v0.h[3] + smlal2 v7.4s, v11.8h, v0.h[3] + smlal v4.4s, v8.4h, v0.h[1] + smlal2 v5.4s, v8.8h, v0.h[1] + smlal v6.4s, v9.4h, v0.h[1] + smlal2 v7.4s, v9.8h, v0.h[1] + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x8], 64 + subs x6, x6, 1 + b.ne 22b + + +24: adr x4, .Leo_coeff + ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x4], 64 + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x4], 64 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], 64 + adr x4, .Lo32transform + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], 64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], 64 + // o_16 jump address + mov x4, 64 + bic x5, x1, 3 + subs x4, x4, x5, lsl 1 + csel x4, x4, xzr, hs + adr x5, 26f + add x5, x5, x4 + // o_32 jump address + bic x6, x1, 1 + mov x4, 8 + subs x4, x4, x6 + csel x6, x4, xzr, hs + adr x4, 29f + add x6, x4, x6, lsl 3 + + mov x8, sp + mov x3, 32 +25: ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0] + + // v2 = e_8 + smull v2.4s, v12.4h, v8.h[0] + smlal2 v2.4s, v12.8h, v9.h[0] + smlal v2.4s, v13.4h, v10.h[0] + smlal2 v2.4s, v13.8h, v11.h[0] + + // v3 = o_8 + smull v3.4s, v14.4h, v8.h[4] + smlal2 v3.4s, v14.8h, v9.h[4] + smlal v3.4s, v15.4h, v10.h[4] + smlal2 v3.4s, v15.8h, v11.h[4] + + // v0,v1 = e_16 + add v0.4s, v2.4s, v3.4s + sub v2.4s, v2.4s, v3.4s + mov v1.d[0], v2.d[1] + mov v1.d[1], v2.d[0] + rev64 v1.4s, v1.4s + + // v2,v3 = o_16 + movi v2.4s, 0 + movi v3.4s, 0 + br x5 +26: smlal v2.4s, v23.4h, v11.h[6] + smlal2 v3.4s, v23.8h, v11.h[6] + smlal v2.4s, v22.4h, v11.h[2] + smlal2 v3.4s, v22.8h, v11.h[2] + smlal v2.4s, v21.4h, v10.h[6] + smlal2 v3.4s, v21.8h, v10.h[6] + smlal v2.4s, v20.4h, v10.h[2] + smlal2 v3.4s, v20.8h, v10.h[2] + smlal v2.4s, v19.4h, v9.h[6] + smlal2 v3.4s, v19.8h, v9.h[6] + smlal v2.4s, v18.4h, v9.h[2] + smlal2 v3.4s, v18.8h, v9.h[2] + smlal v2.4s, v17.4h, v8.h[6] + smlal2 v3.4s, v17.8h, v8.h[6] + smlal v2.4s, v16.4h, v8.h[2] + smlal2 v3.4s, v16.8h, v8.h[2] + + // v4,v5,v6,v7 = e_32 + add v4.4s, v0.4s, v2.4s + add v5.4s, v1.4s, v3.4s + sub v11.4s, v0.4s, v2.4s + mov v7.d[0], 
v11.d[1]
+        mov     v7.d[1], v11.d[0]
+        rev64   v7.4s, v7.4s
+        sub     v11.4s, v1.4s, v3.4s
+        mov     v6.d[0], v11.d[1]
+        mov     v6.d[1], v11.d[0]
+        rev64   v6.4s, v6.4s
+
+        // v0,v1,v2,v3 = o_32
+        cmp     x1, 9
+        b.hi    28f
+        movi    v0.4s, 0
+        movi    v1.4s, 0
+        movi    v2.4s, 0
+        movi    v3.4s, 0
+        br      x6
+28:     ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], 64
+        br      x6
+29:     smlal   v0.4s, v30.4h, v8.h[7]
+        smlal2  v1.4s, v30.8h, v8.h[7]
+        smlal   v2.4s, v31.4h, v8.h[7]
+        smlal2  v3.4s, v31.8h, v8.h[7]
+        smlal   v0.4s, v28.4h, v8.h[5]
+        smlal2  v1.4s, v28.8h, v8.h[5]
+        smlal   v2.4s, v29.4h, v8.h[5]
+        smlal2  v3.4s, v29.8h, v8.h[5]
+        smlal   v0.4s, v26.4h, v8.h[3]
+        smlal2  v1.4s, v26.8h, v8.h[3]
+        smlal   v2.4s, v27.4h, v8.h[3]
+        smlal2  v3.4s, v27.8h, v8.h[3]
+        smlal   v0.4s, v24.4h, v8.h[1]
+        smlal2  v1.4s, v24.8h, v8.h[1]
+        smlal   v2.4s, v25.4h, v8.h[1]
+        smlal2  v3.4s, v25.8h, v8.h[1]
+
+        // coeff
+        add     v8.4s, v4.4s, v0.4s
+        add     v9.4s, v5.4s, v1.4s
+        add     v10.4s, v6.4s, v2.4s
+        add     v11.4s, v7.4s, v3.4s
+        sub     v4.4s, v4.4s, v0.4s
+        sub     v5.4s, v5.4s, v1.4s
+        sub     v6.4s, v6.4s, v2.4s
+        sub     v7.4s, v7.4s, v3.4s
+        sqrshrn  v8.4h, v8.4s, 12
+        sqrshrn2 v8.8h, v9.4s, 12
+        sqrshrn  v9.4h, v10.4s, 12
+        sqrshrn2 v9.8h, v11.4s, 12
+        sqrshrn  v4.4h, v4.4s, 12
+        sqrshrn2 v4.8h, v5.4s, 12
+        sqrshrn  v5.4h, v6.4s, 12
+        sqrshrn2 v5.8h, v7.4s, 12
+        mov     v10.d[0], v5.d[1]
+        mov     v10.d[1], v5.d[0]
+        rev64   v10.8h, v10.8h
+        mov     v11.d[0], v4.d[1]
+        mov     v11.d[1], v4.d[0]
+        rev64   v11.8h, v11.8h
+        st1     {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], 64
+        subs    x3, x3, 1
+        b.ne    25b
+
+        add     sp, sp, 16 * 32 * 4
+        ld1     {v12.16b, v13.16b, v14.16b, v15.16b}, [sp], 64
+        ld1     {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], 64
+        ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 612ebb9541..bb2a6b2502 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -46,6 +46,7 @@ void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon_new(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
 void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
 void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
@@ -74,6 +75,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->add_residual[3] = ff_hevc_add_residual_32x32_8_neon;
         c->idct[1] = ff_hevc_idct_8x8_8_neon;
         c->idct[2] = ff_hevc_idct_16x16_8_neon_new;
+        c->idct[3] = ff_hevc_idct_32x32_8_neon;
         c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_neon;
         c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon;
         c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
-- 
2.32.0 (Apple Git-132)