ffmpeg | branch: master | Henrik Gramner <gram...@twoorioles.com> | Fri May 16 15:18:14 2025 +0200| [fd18ae88ae736b5aabff34e17394fcd103f9e5ad] | committer: Henrik Gramner
avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 8bpc inverse transforms

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=fd18ae88ae736b5aabff34e17394fcd103f9e5ad
---
 libavcodec/x86/Makefile            |    1 +
 libavcodec/x86/vp9dsp_init.c       |   15 +
 libavcodec/x86/vp9itxfm_avx512.asm | 1629 ++++++++++++++++++++++++++++++++++++
 libavutil/mem_internal.h           |    2 +
 tests/checkasm/vp9dsp.c            |   14 +-
 5 files changed, 1654 insertions(+), 7 deletions(-)

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 821c410a0f..bf752f5da2 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -184,6 +184,7 @@ X86ASM-OBJS-$(CONFIG_VP6_DECODER)      += x86/vp6dsp.o
 X86ASM-OBJS-$(CONFIG_VP9_DECODER)      += x86/vp9intrapred.o            \
                                           x86/vp9intrapred_16bpp.o      \
                                           x86/vp9itxfm.o                \
+                                          x86/vp9itxfm_avx512.o         \
                                           x86/vp9itxfm_16bpp.o          \
                                           x86/vp9lpf.o                  \
                                           x86/vp9lpf_16bpp.o            \
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 8d11dbc348..4373fa3f04 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -114,7 +114,9 @@ itxfm_func(idct, idct, 32, ssse3);
 itxfm_func(idct, idct, 32, avx);
 itxfm_func(iwht, iwht, 4, mmx);
 itxfm_funcs(16, avx2);
+itxfm_funcs(16, avx512icl);
 itxfm_func(idct, idct, 32, avx2);
+itxfm_func(idct, idct, 32, avx512icl);
 
 #undef itxfm_func
 #undef itxfm_funcs
@@ -406,6 +408,19 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
         init_ipred(32, avx2, tm, TM_VP8);
     }
 
+#if ARCH_X86_64
+    if (EXTERNAL_AVX512ICL(cpu_flags)) {
+        dsp->itxfm_add[TX_16X16][DCT_DCT]   = ff_vp9_idct_idct_16x16_add_avx512icl;
+        dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_avx512icl;
+        dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_avx512icl;
+        dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx512icl;
+        dsp->itxfm_add[TX_32X32][ADST_ADST] =
+        dsp->itxfm_add[TX_32X32][ADST_DCT]  =
+        dsp->itxfm_add[TX_32X32][DCT_ADST]  =
+        dsp->itxfm_add[TX_32X32][DCT_DCT]   = ff_vp9_idct_idct_32x32_add_avx512icl;
+    }
+#endif
+
 #undef init_fpel
 #undef init_subpel1
 #undef init_subpel2
diff --git a/libavcodec/x86/vp9itxfm_avx512.asm b/libavcodec/x86/vp9itxfm_avx512.asm
new file mode 100644
index 0000000000..d51c50756d
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm_avx512.asm
@@ -0,0 +1,1629 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2025 Two Orioles, LLC
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 && HAVE_AVX512ICL_EXTERNAL + +SECTION_RODATA 64 + +dup16_perm: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 + db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15 + db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23 + db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31 +itx_perm: dq 0x0000000820150440, 0x0000000231372604 + dq 0x0000000ca8041551, 0x00000006b9263715 + dq 0x00000001ec9d8c62, 0x0000000bfdbfae26 + dq 0x00000005648c9d73, 0x0000000f75aebf37 +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 +pw_512: times 4 dw 512 +pw_m512: times 4 dw -512 +pw_15137_6270x2x4: times 4 dw 15137*2 + times 4 dw 6270*2 +pw_11585_m11585x2x4: times 4 dw 11585*2 +pw_m11585_11585x2x4: times 4 dw -11585*2 +pw_11585_11585x2: times 4 dw 11585*2 +int_mshift: db 142, 150, 0, 0, 174, 182, 0, 0 +pd_8192: dd 8192 +pw_804x2: times 2 dw 804*2 +pw_1606x2: times 2 dw 1606*2 +pw_3196x2: times 2 dw 3196*2 +pw_3981x2: times 2 dw 3981*2 +pw_6270x2: times 2 dw 6270*2 +pw_7005x2: times 2 dw 7005*2 +pw_7723x2: times 2 dw 7723*2 +pw_9760x2: times 2 dw 9760*2 +pw_12140x2: times 2 dw 12140*2 +pw_12665x2: times 2 dw 12665*2 +pw_13160x2: times 2 dw 13160*2 +pw_13623x2: times 2 dw 13623*2 +pw_14053x2: times 2 dw 14053*2 +pw_14449x2: times 2 dw 14449*2 +pw_14811x2: times 2 dw 14811*2 +pw_15137x2: times 2 dw 15137*2 +pw_15426x2: times 2 dw 15426*2 +pw_15679x2: times 2 dw 15679*2 +pw_15893x2: times 2 dw 15893*2 +pw_16069x2: times 2 dw 16069*2 +pw_16207x2: times 2 dw 16207*2 +pw_16305x2: times 2 dw 16305*2 +pw_16364x2: times 2 dw 16364*2 +pw_m2404x2: times 2 dw -2404*2 +pw_m4756x2: times 2 dw -4756*2 +pw_m5520x2: times 2 dw -5520*2 +pw_m8423x2: times 2 dw -8423*2 +pw_m9102x2: times 2 dw -9102*2 +pw_m10394x2: times 2 dw -10394*2 +pw_m11003x2: times 2 dw -11003*2 +pw_804_16364x2: dw 804*2, 16364*2 +pw_1606_16305x2: dw 1606*2, 16305*2 +pw_3196_16069x2: dw 3196*2, 16069*2 +pw_3981_15893x2: dw 3981*2, 15893*2 +pw_7005_14811x2: dw 7005*2, 14811*2 +pw_7723_14449x2: dw 7723*2, 14449*2 +pw_9760_13160x2: dw 9760*2, 13160*2 +pw_m2404_16207x2: dw -2404*2, 16207*2 +pw_m4756_15679x2: dw -4756*2, 15679*2 +pw_m5520_15426x2: dw -5520*2, 15426*2 +pw_m8423_14053x2: dw -8423*2, 14053*2 +pw_m9102_13623x2: dw -9102*2, 13623*2 +pw_m10394_12665x2: dw -10394*2, 12665*2 +pw_m11003_12140x2: dw -11003*2, 12140*2 + +%macro COEF_PAIR 2-3 0 +%if %3 & 4 +pw_%1_m%2: dw %1, -%2 +%else +pw_%1_%2: dw %1, %2 +%if %3 & 2 +pw_m%1_%2: dw -%1, %2 +%else +pw_m%2_%1: dw -%2, %1 +%endif +%endif +%if %3 & 1 +pw_m%1_m%2: dw -%1, -%2 +%endif +%endmacro + +COEF_PAIR 804, 16364 +COEF_PAIR 1606, 16305 +COEF_PAIR 3196, 16069, 1 +COEF_PAIR 3981, 15893 +COEF_PAIR 6270, 15137, 1 +COEF_PAIR 7005, 14811 +COEF_PAIR 7723, 14449 +COEF_PAIR 9102, 13623 +COEF_PAIR 9760, 13160 +COEF_PAIR 11585, 11585, 1 +COEF_PAIR 12140, 11003 +COEF_PAIR 12665, 10394 +COEF_PAIR 13623, 9102, 1 +COEF_PAIR 14053, 8423 +COEF_PAIR 15137, 6270 +COEF_PAIR 15426, 5520 +COEF_PAIR 15679, 4756 +COEF_PAIR 16069, 3196 +COEF_PAIR 16207, 2404 + +; ADST16-only: +COEF_PAIR 2404, 
9760, 2 +COEF_PAIR 5520, 7005, 2 +COEF_PAIR 8423, 3981, 2 +COEF_PAIR 11003, 804, 2 +COEF_PAIR 12140, 16364, 5 +COEF_PAIR 14053, 15893, 5 +COEF_PAIR 15426, 14811, 5 +COEF_PAIR 16207, 13160, 5 +pw_11585_m11585: dw 11585, -11585 +pw_16069_m3196: dw 16069, -3196 +pw_9102_m13623: dw 9102, -13623 +pw_15137_m6270: dw 15137, -6270 +pw_6270_m15137: dw 6270, -15137 + +%define pw_11585x2 pw_11585_11585x2 +%define pw_m11585x2 pw_m11585_11585x2x4 + +SECTION .text + +%define o_base pw_512 + 128 +%define o(x) (r6 - (o_base) + (x)) +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack, +; 16 = special_mul1, 32 = special_mul2, 64 = dst_in_tmp1 +%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags + mova m%2, m%4 +%if %7 & 16 + vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd} + mova m%3, m%4 +%if %7 & 32 + vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd} +%else + vpdpwssd m%3, m%1, m%6 +%endif +%elif %7 & 32 + vpdpwssd m%2, m%1, m%5 + mova m%3, m%4 + vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd} +%elif %6 < 32 + vpdpwssd m%2, m%1, m%5 + mova m%3, m%4 + vpdpwssd m%3, m%1, m%6 +%elif %7 & 1 + vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd} + mova m%3, m%4 + vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd} +%else + vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd} + mova m%3, m%4 + vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd} +%endif +%if %7 & 2 + psrld m%2, 14 + pslld m%3, 2 + vpshrdd m%1, m%3, m%2, 16 +%elif %7 & 4 + ; compared to using shifts (as above) this has better throughput, + ; but worse latency and requires setting up the opmask/index + ; registers, so only use this method for the larger transforms +%if %7 & 64 + pslld m%2, 2 + vpmultishiftqb m%2{k7}, m13, m%3 +%else + pslld m%1, m%2, 2 + vpmultishiftqb m%1{k7}, m13, m%3 +%endif +%else + psrad m%2, 14 + psrad m%3, 14 +%if %7 & 8 == 0 + packssdw m%1, m%3, m%2 +%endif +%endif +%endmacro + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2] + punpcklwd m%3, m%2, m%1 + punpckhwd m%2, m%1 +%if %7 < 32 + mova m%1, m%5 + vpdpwssd m%1, m%3, m%7 + mova m%4, m%5 + vpdpwssd m%4, m%2, m%7 +%else + mova m%1, m%5 + vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd} + mova m%4, m%5 + vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd} +%endif + psrad m%1, 14 + psrad m%4, 14 + packssdw m%1, m%4 + mova m%4, m%5 +%if %7 < 32 + vpdpwssd m%4, m%2, m%6 + mova m%2, m%5 + vpdpwssd m%2, m%3, m%6 +%else + vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd} + mova m%2, m%5 + vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd} +%endif + psrad m%4, 14 + psrad m%2, 14 + packssdw m%2, m%4 +%endmacro + +; flags: 1 = swap, 2 = invert2, 4 = invert1 +%macro ADST_MULSUB_4W 10-11 0 ; dst1/src1, src2, dst2, tmp[1-2], rnd, coef[1-4], flags + mova m%3, m%6 +%if %11 & 1 + vpdpwssd m%3, m%1, [o(pw_m%8_%7)] {bcstd} +%else + vpdpwssd m%3, m%1, [o(pw_%7_%8)] {bcstd} +%endif +%if %11 & 4 + vpbroadcastd m%4, [o(pw_m%9_%10)] +%elif %11 & 2 + vpbroadcastd m%4, [o(pw_%9_m%10)] +%elif %11 & 1 + vpbroadcastd m%4, [o(pw_%10_%9)] +%else + vpbroadcastd m%4, [o(pw_%9_%10)] +%endif + pmaddwd m%4, m%2 + mova m%5, m%6 +%if %11 & 4 + vpdpwssd m%5, m%1, [o(pw_%8_m%7)] {bcstd} +%elif %11 & 1 + vpdpwssd m%5, m%1, [o(pw_%7_%8)] {bcstd} +%else + vpdpwssd m%5, m%1, [o(pw_m%8_%7)] {bcstd} +%endif +%if %11 & 2 + vpbroadcastd m%1, [o(pw_%10_%9)] +%elif %11 & 1 + vpbroadcastd m%1, [o(pw_%9_m%10)] +%else + vpbroadcastd m%1, [o(pw_m%10_%9)] +%endif + pmaddwd m%2, m%1 + paddd m%1, m%3, m%4 + psubd m%3, m%4 
+ paddd m%4, m%5, m%2 + psubd m%5, m%2 + pslld m%1, 2 + pslld m%3, 2 + vpmultishiftqb m%1{k7}, m13, m%4 + vpmultishiftqb m%3{k7}, m13, m%5 +%endmacro + +%macro WRAP_YMM 1+ + INIT_YMM cpuname + %1 + INIT_ZMM cpuname +%endmacro + +%macro INV_TXFM_FN 3-4 0 ; type1, type2, size, eob_offset +cglobal vp9_i%1_i%2_%3_add, 4, 5, 0, dst, stride, c, eob, tx2 + %undef cmp + %define %%p1 m(vp9_i%1_%3_internal) + lea r6, [o_base] + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. + lea tx2q, [m(vp9_i%2_%3_internal).pass2] +%ifidn %1_%2, dct_dct + cmp eobd, 1 + jne %%p1 +%else +%if %4 + add eobd, %4 +%endif + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, 16x16, %3 +%ifidn %1_%2, dct_dct + movd xmm0, [o(pw_11585x2)] + pmulhrsw xmm3, xmm0, [cq] + pxor ym2, ym2 + pmulhrsw xmm3, xmm0 + pmulhrsw xmm3, [o(pw_512)] + mova [cq], xm2 + add r3d, 7 + vpbroadcastw ym3, xmm3 +.dconly_loop: + mova xm1, [dstq+strideq*0] + vinserti32x4 ym1, [dstq+strideq*1], 1 + punpcklbw ym0, ym1, ym2 + punpckhbw ym1, ym2 + paddw ym0, ym3 + paddw ym1, ym3 + packuswb ym0, ym1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + lea dstq, [dstq+strideq*2] + dec r3d + jg .dconly_loop + RET +%endif +%endmacro + +%macro IDCT16_MAIN 0-1 0 ; idct32 +%if mmsize == 64 && %1 == 0 +.main_fast: +%endif + vpbroadcastd m2, [o(pw_1606_16305x2)] + vpbroadcastd m4, [o(pw_m10394_12665x2)] + vpbroadcastd m11, [o(pw_7723_14449x2)] + vpbroadcastd m12, [o(pw_m4756_15679x2)] + pmulhrsw m8, m2 ; t8a t15a + vpbroadcastd m2, [o(pw_3196_16069x2)] + pmulhrsw m0, m4 ; t9a t14a + vpbroadcastd m4, [o(pw_m9102_13623x2)] + pmulhrsw m5, m11 ; t10a t13a + vpbroadcastd m11, [o(pw_11585_11585x2)] + pmulhrsw m1, m12 ; t11a t12a + vbroadcasti32x4 m12, [o(pw_15137_6270x2x4)] + pmulhrsw m7, m2 ; t4a t7a + pmulhrsw m3, m4 ; t5a t6a + pmulhrsw m9, m11 ; t0 t1 + pmulhrsw m6, m12 ; t3 t2 +%if mmsize == 64 && %1 == 0 + jmp %%main2 +ALIGN function_align +.main: + punpckhwd m8, m7, m0 ; dct16 in15 in1 + punpcklwd m9, m4, m0 ; dct4 in2 in0 + punpckhwd m0, m3, m4 ; dct16 in7 in9 + punpcklwd m7, m1 ; dct8 in7 in1 + punpckhwd m1, m6 ; dct16 in3 in13 + punpcklwd m3, m5 ; dct8 in3 in5 + punpckhwd m5, m2 ; dct16 in11 in5 + punpcklwd m6, m2 ; dct4 in3 in1 + ITX_MUL2X_PACK 8, 2, 4, 10, 1606, 16305, 5 ; t8a t15a + ITX_MUL2X_PACK 0, 2, 4, 10, 12665, 10394, 5 ; t9a t14a + ITX_MUL2X_PACK 5, 2, 4, 10, 7723, 14449, 5 ; t10a t13a + ITX_MUL2X_PACK 1, 2, 4, 10, 15679, 4756, 5 ; t11a t12a + ITX_MUL2X_PACK 7, 2, 4, 10, 3196, 16069, 5 ; t4a t7a + ITX_MUL2X_PACK 3, 2, 4, 10, 13623, 9102, 5 ; t5a t6a + ITX_MUL2X_PACK 9, 2, 4, 10, 11585, 11585 ; t0 t1 + ITX_MUL2X_PACK 6, 2, 4, 10, 6270, 15137 ; t3 t2 +%%main2: +%endif + psubw m2, m8, m0 ; t9 t14 + paddw m8, m0 ; t8 t15 + psubw m4, m1, m5 ; t10 t13 + paddw m1, m5 ; t11 t12 + ITX_MUL2X_PACK 2, 0, 5, 10, 6270, 15137, (1|%1*4) ; t9a t14a + ITX_MUL2X_PACK 4, 0, 5, 10, m15137, 6270, (1|%1*4) ; t10a t13a + vbroadcasti32x4 m5, [o(deint_shuf)] + psubw m0, m8, m1 ; t11a t12a + paddw m8, m1 ; t8a t15a + psubw m1, m7, m3 ; t5a t6a + paddw m7, m3 ; t4 t7 + pshufb m8, m5 + pshufb m7, m5 + paddw m3, m2, m4 ; t9 t14 + psubw m2, m4 ; t10 t13 +%if %1 + vpbroadcastd m12, [o(pw_11585_11585)] + vpbroadcastd m11, [o(pw_m11585_11585)] + pshufb m3, m5 + ITX_MUL2X_PACK 1, 4, 5, 
10, 12, 11 ; t5 t6 + ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12 + ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a + packssdw m5, m11 ; t12 t13a + packssdw m4, m0 ; t11 t10a +%else + pshufb m0, m5 + ITX_MUL2X_PACK 1, 4, 5, 10, 11585_11585, m11585_11585, 48 ; t5 t6 + vpbroadcastd m11, [o(pw_11585x2)] + punpckhqdq m5, m0, m2 ; t12a t13 + punpcklqdq m0, m2 ; t11a t10 + psubw m4, m5, m0 + paddw m5, m0 + pmulhrsw m4, m11 ; t11 t10a + pmulhrsw m5, m11 ; t12 t13a +%endif + punpckhqdq m2, m7, m1 ; t7 t6 + punpcklqdq m7, m1 ; t4 t5 + psubw m1, m9, m6 ; t3 t2 + paddw m9, m6 ; t0 t1 + punpckhqdq m0, m8, m3 ; t15a t14 + punpcklqdq m8, m3 ; t8a t9 + psubw m3, m9, m2 ; t7 t6 + paddw m9, m2 ; t0 t1 + psubw m2, m1, m7 ; t4 t5 + paddw m1, m7 ; t3 t2 + psubw m7, m9, m0 ; out15 out14 + paddw m0, m9 ; out0 out1 + psubw m6, m1, m5 ; out12 out13 + paddw m1, m5 ; out3 out2 + psubw m5, m2, m4 ; out11 out10 + paddw m2, m4 ; out4 out5 + psubw m4, m3, m8 ; out8 out9 + paddw m3, m8 ; out7 out6 +%endmacro + +INIT_ZMM avx512icl +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, adst, 39-23 + +cglobal vp9_idct_16x16_internal, 0, 5, 16, dst, stride, c, eob, tx2 + mova m15, [o(itx_perm)] + vpbroadcastd m10, [o(pd_8192)] + vpbroadcastq m13, [o(int_mshift)] + vpcmpub k7, m13, m10, 6 + sub eobd, 39 + jl .pass1_fast + vpermq m0, m15, [cq+64*0] + vpermq m1, m15, [cq+64*1] + vpermq m2, m15, [cq+64*2] + vpermq m3, m15, [cq+64*3] + vpermq m4, m15, [cq+64*4] + vpermq m5, m15, [cq+64*5] + vpermq m6, m15, [cq+64*6] + vpermq m7, m15, [cq+64*7] + call .main + vbroadcasti32x4 m12, [o(int_shuf1)] + vbroadcasti32x4 m11, [o(int_shuf2)] + pshufb m0, m12 + pshufb m8, m1, m11 + pshufb m2, m12 + pshufb m9, m3, m11 + pshufb m4, m12 + pshufb m14, m5, m11 + pshufb m6, m12 + pshufb m11, m7, m11 + punpckhdq m1, m0, m8 + punpckldq m0, m8 + punpckhdq m3, m2, m9 + punpckldq m2, m9 + punpckhdq m5, m4, m14 + punpckldq m4, m14 + punpckhdq m7, m6, m11 + punpckldq m6, m11 +.pass1_end: + vshufi32x4 m8, m4, m6, q3232 + vinserti32x8 m4, ym6, 1 + vshufi32x4 m6, m0, m2, q3232 + vinserti32x8 m0, ym2, 1 + vshufi32x4 m9, m5, m7, q3232 + vinserti32x8 m5, ym7, 1 + vshufi32x4 m7, m1, m3, q3232 + vinserti32x8 m1, ym3, 1 + vshufi32x4 m2, m0, m4, q3131 ; 4 5 + vshufi32x4 m0, m4, q2020 ; 0 1 + vshufi32x4 m4, m6, m8, q2020 ; 8 9 + vshufi32x4 m6, m8, q3131 ; 12 13 + vshufi32x4 m3, m1, m5, q3131 ; 6 7 + vshufi32x4 m1, m5, q2020 ; 2 3 + vshufi32x4 m5, m7, m9, q2020 ; 10 11 + vshufi32x4 m7, m9, q3131 ; 14 1 + jmp tx2q +.pass1_fast: + mova ym3, [o(dup16_perm)] + vbroadcasti32x4 ym9, [cq+32*0] + vbroadcasti32x4 ym6, [cq+32*4] + vpermb ym8, ym3, [cq+32*1] + vpermb ym0, ym3, [cq+32*7] + vpermb ym5, ym3, [cq+32*5] + vpermb ym1, ym3, [cq+32*3] + vpermb ym7, ym3, [cq+32*2] + vpermb ym3, ym3, [cq+32*6] + shufpd ym9, ym9, 0x0c + shufpd ym6, ym6, 0x0c + WRAP_YMM IDCT16_MAIN + vbroadcasti32x4 m8, [o(int_shuf1)] + vbroadcasti32x4 m9, [o(int_shuf2)] + vinserti32x8 m0, ym2, 1 ; 0 1 | 4 5 + vinserti32x8 m4, ym6, 1 ; 8 9 | 12 13 + vinserti32x8 m1, ym3, 1 ; 3 2 | 7 6 + vinserti32x8 m5, ym7, 1 ; 11 10 | 15 14 + vshufi32x4 m2, m0, m4, q3131 + vshufi32x4 m0, m4, q2020 + vshufi32x4 m4, m1, m5, q2020 + vshufi32x4 m1, m5, q3131 + pshufb m2, m8 + pshufb m0, m8 + pshufb m4, m9 + pshufb m1, m9 + punpckhdq m3, m2, m1 ; 6-7 + punpckldq m2, m1 ; 4-5 + punpckhdq m1, m0, m4 ; 2-3 + punpckldq m0, m4 ; 0-1 + jmp tx2q +.pass2: + test eobd, eobd + jl .pass2_fast + call .main + jmp .pass2_end +.pass2_fast: + punpcklqdq m9, m0, m0 + punpckhwd m8, m0, m0 + punpcklwd m7, m1, m1 + punpckhwd m1, m1 + 
punpcklqdq m6, m2, m2 + punpckhwd m5, m2, m2 + punpckhwd m0, m3, m3 + punpcklwd m3, m3 + call .main_fast +.pass2_end: + psrldq m8, m15, 1 + psrlq m12, m15, 12 + psrldq m9, m15, 2 + psrlq m13, m15, 20 + mova m10, m8 + vpermi2q m8, m0, m2 ; 0 1 4 5 + vpermt2q m0, m12, m2 + mova m11, m9 + vpermi2q m9, m1, m3 ; 2 3 6 7 + vpermt2q m1, m13, m3 + vpbroadcastd m2, [o(pw_512)] + vpermi2q m10, m4, m6 ; 8 9 12 13 + vpermt2q m4, m12, m6 + vpermi2q m11, m5, m7 ; 10 11 14 15 + vpermt2q m5, m13, m7 + REPX {pmulhrsw x, m2}, m0, m1, m4, m5, m8, m9, m10, m11 +.pass2_end2: + lea r3, [strideq*3] + lea r4, [dstq+strideq*4] + lea r5, [dstq+strideq*8] + lea r6, [r4 +strideq*8] + mova xm3, [dstq+strideq*0] + mova xm6, [dstq+strideq*2] + vinserti32x4 ym3, [dstq+strideq*1], 1 + vinserti32x4 ym6, [dstq+r3 ], 1 + vinserti32x4 m3, [r4+strideq*0], 2 + vinserti32x4 m6, [r4+strideq*2], 2 + vinserti32x4 m3, [r4+strideq*1], 3 + vinserti32x4 m6, [r4+r3 ], 3 + mova xm12, [r5+strideq*0] + mova xm13, [r5+strideq*2] + vinserti32x4 ym12, [r5+strideq*1], 1 + vinserti32x4 ym13, [r5+r3 ], 1 + vinserti32x4 m12, [r6+strideq*0], 2 + vinserti32x4 m13, [r6+strideq*2], 2 + vinserti32x4 m12, [r6+strideq*1], 3 + vinserti32x4 m13, [r6+r3 ], 3 + pxor m7, m7 + REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + paddw m0, m2 + paddw m8, m3 + packuswb m0, m8 + punpcklbw m2, m6, m7 + punpckhbw m6, m7 + paddw m1, m2 + paddw m9, m6 + packuswb m1, m9 + punpcklbw m2, m12, m7 + punpckhbw m12, m7 + paddw m2, m4 + paddw m10, m12 + packuswb m2, m10 + punpcklbw m3, m13, m7 + punpckhbw m13, m7 + paddw m3, m5 + paddw m11, m13 + packuswb m3, m11 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + mova [dstq+strideq*2], xm1 + vextracti32x4 [dstq+r3 ], ym1, 1 + vextracti32x4 [r4+strideq*0], m0, 2 + vextracti32x4 [r4+strideq*1], m0, 3 + vextracti32x4 [r4+strideq*2], m1, 2 + vextracti32x4 [r4+r3 ], m1, 3 + mova [r5+strideq*0], xm2 + vextracti32x4 [r5+strideq*1], ym2, 1 + mova [r5+strideq*2], xm3 + vextracti32x4 [r5+r3 ], ym3, 1 + vextracti32x4 [r6+strideq*0], m2, 2 + vextracti32x4 [r6+strideq*1], m2, 3 + vextracti32x4 [r6+strideq*2], m3, 2 + vextracti32x4 [r6+r3 ], m3, 3 + RET +ALIGN function_align + IDCT16_MAIN + ret + +%macro IADST16_MAIN 0 +%if mmsize == 64 +.main_fast: +%endif + punpcklwd m4, m3, m0 ; in7 in0 + punpcklwd m11, m1, m2 ; in3 in4 + punpckhwd m9, m2, m1 ; in5 in2 + punpckhwd m7, m0, m3 ; in1 in6 + ITX_MUL2X_PACK 4, 0, 6, 10, 11003_804, 12140_m16364, 116 ; t1a t0a + ITX_MUL2X_PACK 4, 5, 6, 10, m11003_804, m12140_m16364, 52 ; t9a t8a + ITX_MUL2X_PACK 11, 2, 6, 10, 5520_7005, 15426_m14811, 116 ; t5a t4a + ITX_MUL2X_PACK 11, 5, 6, 10, m5520_7005, m15426_m14811, 52 ; t13a t12a + ITX_MUL2X_PACK 9, 1, 6, 10, 8423_3981, 14053_m15893, 116 ; t3a t2a + ITX_MUL2X_PACK 9, 5, 6, 10, m8423_3981, m14053_m15893, 52 ; t11a t10a + ITX_MUL2X_PACK 7, 3, 6, 10, 2404_9760, 16207_m13160, 116 ; t7a t6a + ITX_MUL2X_PACK 7, 5, 6, 10, m2404_9760, m16207_m13160, 52 ; t15a t14a +%if mmsize == 64 ; for the ymm variant we only ever use the fast path + jmp %%main2 +ALIGN function_align +.main: + punpckhwd m8, m7, m0 ; in14 in1 + punpcklwd m0, m7 ; in0 in15 + punpcklwd m7, m6, m1 ; in12 in3 + punpckhwd m1, m6 ; in2 in13 + punpckhwd m6, m5, m2 ; in10 in5 + punpcklwd m2, m5 ; in4 in11 + punpcklwd m5, m4, m3 ; in8 in7 + punpckhwd m3, m4 ; in6 in9 + ADST_MULSUB_4W 0, 5, 4, 9, 11, 10, 804, 16364, 12140, 11003 ; t1a t0a, t9a t8a + ADST_MULSUB_4W 2, 7, 11, 5, 9, 10, 7005, 14811, 15426, 5520 ; t5a t4a, t13a t12a + 
ADST_MULSUB_4W 1, 6, 9, 5, 7, 10, 3981, 15893, 14053, 8423 ; t3a t2a, t11a t10a + ADST_MULSUB_4W 3, 8, 7, 5, 6, 10, 9760, 13160, 16207, 2404 ; t7a t6a, t15a t14a +%%main2: +%endif + psubw m5, m1, m3 ; t7 t6 + paddw m6, m1, m3 ; t3 t2 + psubw m1, m0, m2 ; t5 t4 + paddw m2, m0 ; t1 t0 + ADST_MULSUB_4W 4, 11, 8, 3, 0, 10, 3196, 16069, 16069, 3196, 1 ; t8a t9a, t12a t13a + ADST_MULSUB_4W 9, 7, 0, 3, 11, 10, 13623, 9102, 9102, 13623, 1 ; t10a t11a, t14a t15a + ADST_MULSUB_4W 1, 5, 11, 3, 7, 10, 6270, 15137, 15137, 6270, 2 ; out12 -out3, t7 t6 + psubw m3, m2, m6 ; t3a t2a + paddw m2, m6 ; -out15 out0 + ADST_MULSUB_4W 8, 0, 5, 6, 7, 10, 15137, 6270, 6270, 15137, 6 ; -out13 out2, t15a t14 + vbroadcasti32x4 m12, [o(deint_shuf)] + paddw m0, m4, m9 ; -out1 out14 + psubw m4, m9 ; t10 t11 + pshufb m2, m12 + pshufb m1, m12 + pshufb m8, m12 + pshufb m0, m12 + punpcklqdq m6, m1, m8 ; out12 -out13 + shufps m7, m0, m2, q1032 ; out14 -out15 +%endmacro + +%macro IADST16_PASS1_END 0 + shufps m0, m2, m0, q1032 ; out0 -out1 + punpckhqdq m1, m8, m1 ; out2 -out3 + mova m2, m10 + vpdpwssd m2, m5, [o(pw_m11585_m11585)] {bcstd} ; out5 + mova m8, m10 + vpdpwssd m8, m11, [o(pw_11585_11585)] {bcstd} ; out4 + mova m9, m10 + vpdpwssd m9, m5, [o(pw_m11585_11585)] {bcstd} ; out10 + mova m5, m10 + vpdpwssd m5, m11, [o(pw_11585_m11585)] {bcstd} ; out11 + mova m11, m10 + vpdpwssd m11, m3, [o(pw_m11585_m11585)] {bcstd} ; out7 + mova m14, m10 + vpdpwssd m14, m4, [o(pw_11585_11585)] {bcstd} ; out6 + mova m12, m10 + vpdpwssd m12, m3, [o(pw_m11585_11585)] {bcstd} ; out8 + mova m3, m10 + vpdpwssd m3, m4, [o(pw_m11585_11585)] {bcstd} ; out9 +%endmacro + +INV_TXFM_16X16_FN adst, dct, 39-18 +INV_TXFM_16X16_FN adst, adst + +cglobal vp9_iadst_16x16_internal, 0, 5, 16, dst, stride, c, eob, tx2 + mova m15, [o(itx_perm)] + psrlq m7, m15, 4 + vpermq m0, m15, [cq+64*0] ; 0 1 + vpermq m1, m7, [cq+64*1] ; 3 2 + vpermq m2, m15, [cq+64*2] ; 4 5 + vpermq m3, m7, [cq+64*3] ; 7 6 + vpbroadcastd m10, [o(pd_8192)] + vpbroadcastq m13, [o(int_mshift)] + vpcmpub k7, m13, m10, 6 + sub eobd, 39 + jl .pass1_fast + vpermq m4, m15, [cq+64*4] ; 8 9 + vpermq m5, m7, [cq+64*5] ; 11 10 + vpermq m6, m15, [cq+64*6] ; 12 13 + vpermq m7, m7, [cq+64*7] ; 15 14 + call .main + IADST16_PASS1_END + REPX {psrad x, 14}, m2, m8, m9, m5, m11, m14, m12, m3 + packssdw m2, m8, m2 ; out4 out5 + packssdw m5, m9, m5 ; out10 out11 + packssdw m4, m12, m3 ; out8 out9 + packssdw m3, m14, m11 ; out6 out7 + pxor m9, m9 + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + psubw m8, m9, m8 + punpckhwd m1, m0, m8 + punpcklwd m0, m8 + punpckhwd m8, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m2, m8 + punpcklwd m2, m8 + punpckhwd m8, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m4, m8 + punpcklwd m4, m8 + punpckhwd m8, m6, m7 + punpcklwd m6, m7 + psubw m8, m9, m8 + punpckhwd m7, m6, m8 + punpcklwd m6, m8 + jmp m(vp9_idct_16x16_internal).pass1_end +.pass1_fast: + WRAP_YMM IADST16_MAIN + WRAP_YMM IADST16_PASS1_END + vinserti32x8 m0, ym6, 1 + vinserti32x8 m1, ym7, 1 + vinserti32x8 m8, ym12, 1 + vinserti32x8 m2, ym3, 1 + vinserti32x8 m14, ym9, 1 + vinserti32x8 m11, ym5, 1 + pslld m14, 2 + pslld m11, 2 + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + vpmultishiftqb m14{k7}, m13, m8 + vpmultishiftqb m11{k7}, m13, m2 + psrlq m1, m15, 24 + pxor m2, m2 + psubw m2, m4 + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + psrlq m2, m15, 28 + punpckhwd m4, m14, m11 + punpcklwd m14, m11 + mova m5, m2 + vpermi2q m2, m0, m14 + vpermt2q m0, m1, m14 + vpermi2q m1, m3, m4 + vpermt2q m3, m5, m4 + jmp tx2q +.pass2: + pshufd m1, m1, q1032 + 
pshufd m3, m3, q1032 + test eobd, eobd + jl .pass2_fast + pshufd m5, m5, q1032 + pshufd m7, m7, q1032 + call .main + jmp .pass2_end +.pass2_fast: + call .main_fast +.pass2_end: + vbroadcasti32x4 m9, [o(pw_11585_m11585x2x4)] + vbroadcasti32x4 m10, [o(pw_m11585_11585x2x4)] + punpckhqdq m1, m8 ; -out3 out2 + shufps m0, m2, q3210 ; -out1 out0 + pshufb m2, m11, m12 + pshufb m5, m12 + pshufb m3, m12 + pshufb m4, m12 + vbroadcasti32x4 m11, [o(pw_512)] + vpbroadcastd m12, [o(pw_512)] + punpcklqdq m8, m5, m2 ; t15a t7 + punpckhqdq m5, m2 ; t14a t6 + shufps m2, m3, m4, q1032 ; t2a t10 + shufps m3, m4, q3210 ; t3a t11 + psubsw m4, m2, m3 + paddsw m3, m2 + paddsw m2, m5, m8 + psubsw m5, m8 + pmulhrsw m4, m9 ; out8 out9 + pmulhrsw m3, m10 ; out7 out6 + pmulhrsw m2, m10 ; out5 out4 + pmulhrsw m5, m9 ; out10 out11 + pmulhrsw m6, m11 + pmulhrsw m7, m11 + pshufd m11, m11, q1032 + pmulhrsw m0, m11 + pmulhrsw m1, m11 + REPX {pmulhrsw x, m12}, m2, m3, m4, m5 + psrldq m8, m15, 2 + psrlq m12, m15, 20 + psrldq m10, m15, 1 + psrlq m13, m15, 12 + mova m9, m8 + vpermi2q m8, m0, m2 ; 0 1 4 5 + vpermt2q m0, m12, m2 + vpermi2q m9, m1, m3 ; 2 3 6 7 + vpermt2q m1, m12, m3 + mova m11, m10 + vpermi2q m10, m4, m6 ; 8 9 12 13 + vpermt2q m4, m13, m6 + vpermi2q m11, m5, m7 ; 10 11 14 15 + vpermt2q m5, m13, m7 + jmp m(vp9_idct_16x16_internal).pass2_end2 +ALIGN function_align + IADST16_MAIN + ret + +%macro IDCT_32x32_END 4 ; src, mem, stride[1-2] + pmovzxbw m10, [dstq+%3] + pmovzxbw m11, [r3 +%4] +%if %2 < 8 + paddw m8, m%2, m%1 + psubw m9, m%2, m%1 +%else + mova m9, [rsp+64*(%2-8)] + paddw m8, m9, m%1 + psubw m9, m%1 +%endif + pmulhrsw m8, m12 + pmulhrsw m9, m12 + paddw m8, m10 + paddw m9, m11 + packuswb m8, m9 + vpermq m8, m13, m8 + mova [dstq+%3], ym8 + vextracti32x8 [r3 +%4], m8, 1 +%if %2 == 3 || %2 == 7 || %2 == 11 + add dstq, r5 + sub r3, r5 +%endif +%endmacro + +cglobal vp9_idct_idct_32x32_add, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r6, [o_base] + cmp eobd, 1 + jne .pass1 + movd xmm0, [o(pw_11585x2)] + pmulhrsw xmm3, xmm0, [cq] + pxor m2, m2 + pmulhrsw xmm3, xmm0 + pmulhrsw xmm3, [o(pw_512)] + movd [cq], xm2 + add r3d, 15 + vpbroadcastw m3, xmm3 +.dconly_loop: + mova ym1, [dstq+strideq*0] + vinserti32x8 m1, [dstq+strideq*1], 1 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + paddw m0, m3 + paddw m1, m3 + packuswb m0, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + dec r3d + jg .dconly_loop + RET +.pass1: + PROLOGUE 0, 7, 30, 64*16, dst, stride, c, eob + sub eobd, 135 + jl .fast + mova m0, [cq+64* 0] + mova m14, [cq+64* 2] + mova m1, [cq+64* 4] + mova m15, [cq+64* 6] + mova m2, [cq+64* 8] + mova m16, [cq+64*10] + mova m3, [cq+64*12] + mova m17, [cq+64*14] + mova m4, [cq+64*16] + mova m18, [cq+64*18] + mova m5, [cq+64*20] + mova m19, [cq+64*22] + mova m6, [cq+64*24] + mova m20, [cq+64*26] + mova m7, [cq+64*28] + mova m21, [cq+64*30] + call .idct16 + mova [rsp+64*0], m14 + mova [rsp+64*1], m15 + mova [rsp+64*2], m16 + mova [rsp+64*3], m17 + mova [rsp+64*4], m18 + mova [rsp+64*5], m19 + mova [rsp+64*6], m20 + mova [rsp+64*7], m21 + mova m22, [cq+64* 1] + mova m23, [cq+64* 3] + mova m24, [cq+64* 5] + mova m25, [cq+64* 7] + mova m26, [cq+64* 9] + mova m27, [cq+64*11] + mova m28, [cq+64*13] + mova m29, [cq+64*15] + mova m14, [cq+64*17] + mova m15, [cq+64*19] + mova m16, [cq+64*21] + mova m17, [cq+64*23] + mova m18, [cq+64*25] + mova m19, [cq+64*27] + mova m20, [cq+64*29] + mova m21, [cq+64*31] + call .main + psubw m13, m0, m29 ; 31 + paddw m0, m29 ; 0 + psubw m29, m1, 
m28 ; 30 + paddw m1, m28 ; 1 + psubw m28, m2, m27 ; 29 + paddw m2, m27 ; 2 + psubw m27, m3, m26 ; 28 + paddw m3, m26 ; 3 + psubw m26, m4, m25 ; 27 + paddw m4, m25 ; 4 + psubw m25, m5, m24 ; 26 + paddw m5, m24 ; 5 + psubw m24, m6, m23 ; 25 + paddw m6, m23 ; 6 + psubw m23, m7, m22 ; 24 + paddw m7, m22 ; 7 + punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 + punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 + punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 + punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 + punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 + punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 + punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 + punpckhwd m3, m23, m24 + punpcklwd m23, m24 + punpckhwd m24, m25, m26 + punpcklwd m25, m26 + punpckhwd m26, m27, m28 + punpcklwd m27, m28 + punpckhwd m28, m29, m13 + punpcklwd m29, m13 + punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 + punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 + punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 + punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5 + punpckhdq m13, m23, m25 + punpckldq m23, m25 + punpckhdq m25, m27, m29 + punpckldq m27, m29 + punpckhdq m9, m3, m24 + punpckldq m3, m24 + punpckhdq m24, m26, m28 + punpckldq m26, m28 + punpcklqdq m5, m23, m27 ; d00 d08 d16 d24 + punpckhqdq m23, m27 ; d01 d09 d17 d25 + punpckhqdq m27, m13, m25 ; d03 d11 d19 d27 + punpcklqdq m13, m25 ; d02 d10 d18 d26 + punpckhqdq m25, m3, m26 ; d05 d13 d21 d29 + punpcklqdq m3, m26 ; d04 d12 d20 d28 + punpckhqdq m26, m9, m24 ; d07 d15 d23 d31 + punpcklqdq m9, m24 ; d06 d14 d22 d30 + mova [rsp+64*12], m23 + mova [rsp+64*13], m27 + mova [rsp+64*14], m25 + mova [rsp+64*15], m26 + punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 + punpcklqdq m8, m22 ; a04 a12 a20 a28 + punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 + punpcklqdq m0, m4 ; a00 a08 a16 a24 + punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 + punpcklqdq m7, m2 ; a02 a10 a18 a26 + punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 + punpcklqdq m6, m1 ; a06 a14 a22 a30 + mova m2, [rsp+64*0] + mova m11, [rsp+64*1] + mova m12, [rsp+64*2] + mova m29, [rsp+64*3] + mova m27, [rsp+64*4] + mova m26, [rsp+64*5] + mova m4, [rsp+64*6] + mova m28, [rsp+64*7] + psubw m1, m2, m21 ; 23 + paddw m2, m21 ; 8 + psubw m21, m11, m20 ; 22 + paddw m11, m20 ; 9 + psubw m20, m12, m19 ; 21 + paddw m12, m19 ; 10 + psubw m19, m29, m18 ; 20 + paddw m29, m18 ; 11 + psubw m18, m27, m17 ; 19 + paddw m27, m17 ; 12 + psubw m17, m26, m16 ; 18 + paddw m26, m16 ; 13 + paddw m16, m4, m15 ; 14 + psubw m4, m15 ; 17 + mova m15, m6 + psubw m6, m28, m14 ; 16 + paddw m28, m14 ; 15 + mova m14, m7 + punpcklwd m7, m6, m4 + punpckhwd m6, m4 + punpckhwd m4, m17, m18 + punpcklwd m17, m18 + punpckhwd m18, m19, m20 + punpcklwd m19, m20 + punpckhwd m20, m21, m1 + punpcklwd m21, m1 + punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 + punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3 + punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 + punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 + punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 + punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 + punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 + punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 + punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3 + punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3 + punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1 
+ punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 + punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 + punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 + punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 + punpckhdq m26, m19, m21 + punpckldq m19, m21 + punpckhdq m21, m6, m4 + punpckldq m6, m4 + punpckhdq m4, m18, m20 + punpckldq m18, m20 + punpckhdq m20, m7, m17 + punpckldq m7, m17 + punpcklqdq m17, m28, m12 ; b02 b10 b18 b26 + punpckhqdq m28, m12 ; b03 b11 b19 b27 + punpckhqdq m12, m2, m27 ; b01 b09 b17 b25 + punpcklqdq m2, m27 ; b00 b08 b16 b24 + punpckhqdq m27, m1, m29 ; b05 b13 b21 b29 + punpcklqdq m1, m29 ; b04 b12 b20 b28 + punpckhqdq m29, m16, m11 ; b07 b15 b23 b31 + punpcklqdq m16, m11 ; b06 b14 b22 b30 + mova [rsp+64* 8], m12 + mova [rsp+64* 9], m28 + mova [rsp+64*10], m27 + mova [rsp+64*11], m29 + punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 + punpcklqdq m20, m26 ; c02 c10 c18 c26 + punpckhqdq m26, m7, m19 ; c01 c09 c17 c25 + punpcklqdq m7, m19 ; c00 c08 c16 c24 + punpckhqdq m28, m6, m18 ; c05 c13 c21 c29 + punpcklqdq m6, m18 ; c04 c12 c20 c28 + punpckhqdq m29, m21, m4 ; c07 c15 c23 c31 + punpcklqdq m21, m4 ; c06 c14 c22 c30 + mov r3d, 64*28 + pxor m4, m4 +.zero_loop: + mova [cq+r3+64*0], m4 + mova [cq+r3+64*1], m4 + mova [cq+r3+64*2], m4 + mova [cq+r3+64*3], m4 + sub r3d, 64*4 + jge .zero_loop + vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24 + vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08 + vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24 + vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08 + vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28 + vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12 + vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28 + vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12 + vshufi32x4 m3, m1, m6, q3131 ; 12 + vshufi32x4 m1, m6, q2020 ; 4 + vshufi32x4 m6, m4, m2, q3131 ; 24 + vshufi32x4 m4, m2, q2020 ; 16 + vshufi32x4 m2, m0, m7, q3131 ; 8 + vshufi32x4 m0, m7, q2020 ; 0 + vshufi32x4 m7, m5, m8, q3131 ; 28 + vshufi32x4 m5, m8, q2020 ; 20 + vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26 + vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10 + vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26 + vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10 + vshufi32x4 m13, m21, m9, q3232 ; c22 c30 d22 d30 + vinserti32x8 m21, ym9, 1 ; c06 c14 d06 d14 + vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30 + vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14 + vshufi32x4 m16, m14, m20, q3131 ; 10 + vshufi32x4 m14, m20, q2020 ; 2 + vshufi32x4 m20, m18, m17, q3131 ; 26 + vshufi32x4 m18, m17, q2020 ; 18 + vshufi32x4 m17, m15, m21, q3131 ; 14 + vshufi32x4 m15, m21, q2020 ; 6 + vshufi32x4 m21, m19, m13, q3131 ; 30 + vshufi32x4 m19, m13, q2020 ; 22 + call .idct16 + mova [rsp+64*0], m14 + mova [rsp+64*1], m15 + mova [rsp+64*2], m16 + mova [rsp+64*3], m17 + mova [rsp+64*4], m18 + mova [rsp+64*5], m19 + mova [rsp+64*6], m20 + mova [rsp+64*7], m21 + mova m15, [rsp+64* 8] + mova m16, [rsp+64* 9] + mova m17, [rsp+64*10] + mova m19, [rsp+64*11] + mova m20, [rsp+64*12] + mova m21, [rsp+64*13] + mova m13, [rsp+64*14] + mova m18, [rsp+64*15] + vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25 + vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09 + vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27 + vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11 + vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29 + vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13 + vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31 + vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15 + vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09 + vshufi32x4 m26, m20, q3232 ; c17 c25 d17 d25 + vinserti32x8 m9, m27, ym21, 1 ; 
c03 c11 d03 d11 + vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27 + vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13 + vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29 + vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15 + vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31 + vshufi32x4 m18, m14, m26, q3131 ; 25 + vshufi32x4 m14, m26, q2020 ; 17 + vshufi32x4 m19, m15, m27, q3131 ; 27 + vshufi32x4 m15, m27, q2020 ; 19 + vshufi32x4 m20, m16, m28, q3131 ; 29 + vshufi32x4 m16, m28, q2020 ; 21 + vshufi32x4 m21, m17, m29, q3131 ; 31 + vshufi32x4 m17, m29, q2020 ; 23 + vshufi32x4 m26, m22, m8, q3131 ; 9 + vshufi32x4 m22, m8, q2020 ; 1 + vshufi32x4 m27, m23, m9, q3131 ; 11 + vshufi32x4 m23, m9, q2020 ; 3 + vshufi32x4 m28, m24, m11, q3131 ; 13 + vshufi32x4 m24, m11, q2020 ; 5 + vshufi32x4 m29, m25, m12, q3131 ; 15 + vshufi32x4 m25, m12, q2020 ; 7 + call .main + jmp .end +.fast: + mova m14, [o(dup16_perm)] + pmovzxbw m9, [cq+64*0] + pmovzxbw m6, [cq+64*8] + vpermb m8, m14, [cq+64* 2] + vpermb m0, m14, [cq+64*14] + vpermb m5, m14, [cq+64*10] + vpermb m1, m14, [cq+64* 6] + vpermb m7, m14, [cq+64* 4] + vpermb m3, m14, [cq+64*12] + vpbroadcastd m10, [o(pd_8192)] + vpbroadcastq m13, [o(int_mshift)] + packuswb m9, m9 + packuswb m6, m6 + vpcmpub k7, m13, m10, 6 + IDCT16_MAIN 1 + vpermb m21, m14, [cq+64* 1] + vpermb m17, m14, [cq+64*15] + vpermb m20, m14, [cq+64* 9] + vpermb m15, m14, [cq+64* 7] + vpermb m18, m14, [cq+64* 5] + vpermb m16, m14, [cq+64*11] + vpermb m19, m14, [cq+64*13] + vpermb m14, m14, [cq+64* 3] + call .main_packed_fast + punpcklwd m8, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m1, m3 + punpckhwd m1, m3 + punpcklwd m3, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m5, m7 + punpckhwd m5, m7 + punpcklwd m7, m14, m16 + punpckhwd m14, m16 + punpcklwd m16, m15, m17 + punpckhwd m15, m17 + punpcklwd m17, m19, m21 + punpckhwd m19, m21 + punpckhwd m21, m18, m20 + punpcklwd m18, m20 + punpcklwd m20, m8, m1 + punpckhwd m8, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m7, m15 + punpckhwd m7, m15 + punpcklwd m15, m14, m16 + punpckhwd m14, m16 + punpckhwd m16, m18, m19 + punpcklwd m18, m19 + punpcklwd m19, m21, m17 + punpckhwd m21, m17 + punpcklwd m17, m8, m0 ; a2 a6 aa ae + punpckhwd m8, m0 ; a3 a7 ab af + punpcklwd m0, m20, m1 ; a0 a4 a8 ac + punpckhwd m20, m1 ; a1 a5 a9 ad + punpcklwd m1, m2, m5 ; b0 b4 b8 bc + punpckhwd m2, m5 ; b1 b5 b9 bd + punpcklwd m5, m3, m4 ; b2 b6 ba be + punpckhwd m3, m4 ; b3 b7 bb bf + punpcklwd m4, m6, m15 ; c0 c4 c8 cc + punpckhwd m6, m15 ; c1 c5 c9 cd + punpcklwd m15, m7, m14 ; c2 c6 ca ce + punpckhwd m7, m14 ; c3 c7 cb cf + punpcklwd m14, m18, m19 ; d0 d4 d8 dc + punpckhwd m18, m19 ; d1 d5 d9 dd + punpcklwd m9, m16, m21 ; d2 d6 da de + punpckhwd m16, m21 ; d3 d7 db df + mov r3d, 64*12 + pxor ym21, ym21 +.fast_zero_loop: + mova [cq+r3+64*0], ym21 + mova [cq+r3+64*1], ym21 + mova [cq+r3+64*2], ym21 + mova [cq+r3+64*3], ym21 + sub r3d, 64*4 + jge .fast_zero_loop + vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc + vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4 + vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6 + vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be + vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7 + vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf + vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4 + vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc + vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5 + vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd + vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5 + vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd + 
vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6 + vshufi32x4 m15, m9, q3232 ; ca ce da de + vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7 + vshufi32x4 m7, m16, q3232 ; cb cf db df + vshufi32x4 m22, m14, m2, q2020 ; 1 + vshufi32x4 m24, m14, m2, q3131 ; 5 + vshufi32x4 m23, m17, m9, q2020 ; 3 + vshufi32x4 m25, m17, m9, q3131 ; 7 + vshufi32x4 m16, m5, m15, q2020 ; 10 + vshufi32x4 m17, m5, m15, q3131 ; 14 + vshufi32x4 m14, m1, m18, q2020 ; 2 + vshufi32x4 m15, m1, m18, q3131 ; 6 + vshufi32x4 m1, m0, m3, q3131 ; 4 + vshufi32x4 m0, m3, q2020 ; 0 + vshufi32x4 m3, m21, m4, q3131 ; 12 + vshufi32x4 m2, m21, m4, q2020 ; 8 + vshufi32x4 m26, m20, m6, q2020 ; 9 + vshufi32x4 m28, m20, m6, q3131 ; 13 + vshufi32x4 m27, m19, m7, q2020 ; 11 + vshufi32x4 m29, m19, m7, q3131 ; 15 + call .idct16_fast + mova [rsp+64*0], m14 + mova [rsp+64*1], m15 + mova [rsp+64*2], m16 + mova [rsp+64*3], m17 + mova [rsp+64*4], m18 + mova [rsp+64*5], m19 + mova [rsp+64*6], m20 + mova [rsp+64*7], m21 + call .main_fast +.end: + lea r4, [strideq*3] + vpbroadcastd m12, [o(pw_512)] + movshdup m13, [o(itx_perm)] + lea r3, [dstq+r4*8] + lea r5, [strideq+r4] ; stride*4 + add r3, r5 ; dst+stride*28 + IDCT_32x32_END 29, 0, strideq*0, r4 + IDCT_32x32_END 28, 1, strideq*1, strideq*2 + IDCT_32x32_END 27, 2, strideq*2, strideq*1 + IDCT_32x32_END 26, 3, r4 , strideq*0 + IDCT_32x32_END 25, 4, strideq*0, r4 + IDCT_32x32_END 24, 5, strideq*1, strideq*2 + IDCT_32x32_END 23, 6, strideq*2, strideq*1 + IDCT_32x32_END 22, 7, r4 , strideq*0 + IDCT_32x32_END 21, 8, strideq*0, r4 + IDCT_32x32_END 20, 9, strideq*1, strideq*2 + IDCT_32x32_END 19, 10, strideq*2, strideq*1 + IDCT_32x32_END 18, 11, r4 , strideq*0 + IDCT_32x32_END 17, 12, strideq*0, r4 + IDCT_32x32_END 16, 13, strideq*1, strideq*2 + IDCT_32x32_END 15, 14, strideq*2, strideq*1 + IDCT_32x32_END 14, 15, r4 , strideq*0 + RET +ALIGN function_align +.idct16_fast: + vpbroadcastd m21, [o(pw_16305x2)] + vpbroadcastd m8, [o(pw_1606x2)] + vpbroadcastd m18, [o(pw_m10394x2)] + vpbroadcastd m9, [o(pw_12665x2)] + pmulhrsw m21, m14 ; t15a + vpbroadcastd m19, [o(pw_14449x2)] + pmulhrsw m14, m8 ; t8a + vpbroadcastd m8, [o(pw_7723x2)] + pmulhrsw m18, m17 ; t9a + vpbroadcastd m20, [o(pw_m4756x2)] + pmulhrsw m17, m9 ; t14a + vpbroadcastd m9, [o(pw_15679x2)] + pmulhrsw m19, m16 ; t13a + vpbroadcastd m5, [o(pw_m9102x2)] + pmulhrsw m16, m8 ; t10a + vpbroadcastd m8, [o(pw_13623x2)] + pmulhrsw m20, m15 ; t11a + vpbroadcastd m7, [o(pw_16069x2)] + pmulhrsw m15, m9 ; t12a + vpbroadcastd m9, [o(pw_3196x2)] + pmulhrsw m5, m3 ; t5a + vpbroadcastd m6, [o(pw_15137x2)] + pmulhrsw m3, m8 ; t6a + vpbroadcastd m8, [o(pw_6270x2)] + pmulhrsw m7, m1 ; t7a + vpbroadcastd m4, [o(pw_11585x2)] + pmulhrsw m1, m9 ; t4 + vpbroadcastd m10, [o(pd_8192)] + pmulhrsw m6, m2 ; t3 + pmulhrsw m2, m8 ; t2 + pmulhrsw m4, m0 ; t0 + mova m0, m4 ; t1 + jmp .idct16b +ALIGN function_align +.idct16: + vpbroadcastd m10, [o(pd_8192)] + ITX_MULSUB_2W 14, 21, 8, 9, 10, 1606, 16305 ; t8a, t15a + ITX_MULSUB_2W 18, 17, 8, 9, 10, 12665, 10394 ; t9a, t14a + ITX_MULSUB_2W 16, 19, 8, 9, 10, 7723, 14449 ; t10a, t13a + ITX_MULSUB_2W 20, 15, 8, 9, 10, 15679, 4756 ; t11a, t12 + ITX_MULSUB_2W 5, 3, 8, 9, 10, 13623, 9102 ; t5a, t6a + ITX_MULSUB_2W 1, 7, 8, 9, 10, 3196, 16069 ; t4a, t7a + ITX_MULSUB_2W 2, 6, 8, 9, 10, 6270, 15137 ; t2, t3 + ITX_MULSUB_2W 0, 4, 8, 9, 10, 11585, 11585 ; t1, t0 +.idct16b: + paddw m8, m20, m16 ; t11 + psubw m20, m16 ; t10 + paddw m16, m15, m19 ; t12 + psubw m15, m19 ; t13 + psubw m19, m14, m18 ; t9 + paddw m14, m18 ; t8 + psubw m18, m21, m17 ; t14 + 
paddw m21, m17 ; t15 + vpbroadcastd m11, [o(pw_6270_15137)] + vpbroadcastd m12, [o(pw_m15137_6270)] + ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a + vpbroadcastd m11, [o(pw_m6270_m15137)] + ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a + vpbroadcastd m11, [o(pw_11585_11585)] + vpbroadcastd m12, [o(pw_m11585_11585)] + paddw m9, m7, m3 ; t7 + psubw m3, m7, m3 ; t6a + paddw m7, m1, m5 ; t4 + psubw m1, m5 ; t5a + psubw m17, m14, m8 ; t11a + paddw m8, m14 ; t8a + paddw m14, m18, m15 ; t9 + psubw m18, m15 ; t10 + psubw m15, m19, m20 ; t13 + paddw m19, m20 ; t14 + paddw m20, m21, m16 ; t15a + psubw m16, m21, m16 ; t12a + ITX_MULSUB_2W 3, 1, 5, 21, 10, 11, 12 ; t5, t6 + ITX_MULSUB_2W 15, 18, 5, 21, 10, 11, 12 ; t10a, t13a + ITX_MULSUB_2W 16, 17, 5, 21, 10, 11, 12 ; t11, t12 + psubw m5, m0, m2 ; t2 + paddw m2, m0 ; t1 + paddw m0, m4, m6 ; t0 + psubw m4, m6 ; t3 + psubw m6, m2, m1 ; t6 + paddw m1, m2 ; t1 + paddw m2, m5, m3 ; t2 + psubw m5, m3 ; t5 + paddw m3, m4, m7 ; t3 + psubw m4, m7 ; t4 + psubw m7, m0, m9 ; t7 + paddw m0, m9 ; t0 + psubw m21, m0, m20 ; out15 + paddw m0, m20 ; out0 + psubw m20, m1, m19 ; out14 + paddw m1, m19 ; out1 + psubw m19, m2, m18 ; out13 + paddw m2, m18 ; out2 + psubw m18, m3, m17 ; out12 + paddw m3, m17 ; out3 + psubw m17, m4, m16 ; out11 + paddw m4, m16 ; out4 + psubw m16, m5, m15 ; out10 + paddw m5, m15 ; out5 + psubw m15, m6, m14 ; out9 + paddw m6, m14 ; out6 + psubw m14, m7, m8 ; out8 + paddw m7, m8 ; out7 + ret +ALIGN function_align +.main_fast: + vpbroadcastd m21, [o(pw_16364x2)] + vpbroadcastd m8, [o(pw_804x2)] + vpbroadcastd m14, [o(pw_m11003x2)] + vpbroadcastd m9, [o(pw_12140x2)] + pmulhrsw m21, m22 ; t31a + vpbroadcastd m17, [o(pw_14811x2)] + pmulhrsw m22, m8 ; t16a + vpbroadcastd m8, [o(pw_7005x2)] + pmulhrsw m14, m29 ; t30a + vpbroadcastd m18, [o(pw_m5520x2)] + pmulhrsw m29, m9 ; t17a + vpbroadcastd m9, [o(pw_15426x2)] + pmulhrsw m17, m26 ; t29a + vpbroadcastd m19, [o(pw_15893x2)] + pmulhrsw m26, m8 ; t18a + vpbroadcastd m8, [o(pw_3981x2)] + pmulhrsw m18, m25 ; t19a + vpbroadcastd m16, [o(pw_m8423x2)] + pmulhrsw m25, m9 ; t28a + vpbroadcastd m9, [o(pw_14053x2)] + pmulhrsw m19, m24 ; t27a + vpbroadcastd m15, [o(pw_13160x2)] + pmulhrsw m24, m8 ; t20a + vpbroadcastd m8, [o(pw_9760x2)] + pmulhrsw m16, m27 ; t21a + vpbroadcastd m20, [o(pw_m2404x2)] + pmulhrsw m27, m9 ; t26a + vpbroadcastd m9, [o(pw_16207x2)] + pmulhrsw m15, m28 ; t25a + pmulhrsw m28, m8 ; t22a + pmulhrsw m20, m23 ; t23a + pmulhrsw m23, m9 ; t24a + jmp .main2 +ALIGN function_align +.main: + ITX_MULSUB_2W 22, 21, 8, 9, 10, 804, 16364 ; t16a, t31a + ITX_MULSUB_2W 14, 29, 8, 9, 10, 12140, 11003 ; t17a, t30a + ITX_MULSUB_2W 26, 17, 8, 9, 10, 7005, 14811 ; t18a, t29a + ITX_MULSUB_2W 18, 25, 8, 9, 10, 15426, 5520 ; t19a, t28a + ITX_MULSUB_2W 24, 19, 8, 9, 10, 3981, 15893 ; t20a, t27a + ITX_MULSUB_2W 16, 27, 8, 9, 10, 14053, 8423 ; t21a, t26a + ITX_MULSUB_2W 28, 15, 8, 9, 10, 9760, 13160 ; t22a, t25a + ITX_MULSUB_2W 20, 23, 8, 9, 10, 16207, 2404 ; t23a, t24a +.main2: + psubw m8, m22, m14 ; t17 + paddw m22, m14 ; t16 + paddw m14, m18, m26 ; t19 + psubw m18, m26 ; t18 + psubw m26, m24, m16 ; t21 + paddw m24, m16 ; t20 + psubw m16, m20, m28 ; t22 + paddw m28, m20 ; t23 + psubw m20, m23, m15 ; t25 + paddw m23, m15 ; t24 + psubw m15, m21, m29 ; t30 + paddw m21, m29 ; t31 + psubw m29, m19, m27 ; t26 + paddw m19, m27 ; t27 + paddw m27, m25, m17 ; t28 + psubw m25, m17 ; t29 + ITX_MULSUB_2W 15, 8, 9, 17, 10, 3196, 16069 ; t17a, t30a + ITX_MULSUB_2W 25, 18, 9, 17, 10, m16069, 3196 ; t18a, t29a + 
ITX_MULSUB_2W 29, 26, 9, 17, 10, 13623, 9102 ; t21a, t26a + ITX_MULSUB_2W 20, 16, 9, 17, 10, m9102, 13623 ; t22a, t25a + psubw m17, m21, m27 ; t28a + paddw m21, m27 ; t31a + psubw m27, m15, m25 ; t18 + paddw m15, m25 ; t17 + psubw m25, m20, m29 ; t21 + paddw m20, m29 ; t22 + psubw m29, m8, m18 ; t29 + paddw m8, m18 ; t30 + psubw m18, m22, m14 ; t19a + paddw m22, m14 ; t16a + psubw m14, m28, m24 ; t20a + paddw m24, m28 ; t23a + paddw m28, m16, m26 ; t25 + psubw m16, m26 ; t26 + psubw m26, m23, m19 ; t27a + paddw m23, m19 ; t24a + vpbroadcastd m12, [o(pw_m15137_6270)] + vpbroadcastd m11, [o(pw_6270_15137)] + ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a + ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28 + vpbroadcastd m11, [o(pw_m6270_m15137)] + ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a + ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20, t27 + vpbroadcastd m12, [o(pw_m11585_11585)] + vpbroadcastd m11, [o(pw_11585_11585)] + psubw m19, m27, m25 ; t26 + paddw m27, m25 ; t29 + psubw m25, m17, m26 ; t20a + paddw m17, m26 ; t19a + paddw m26, m18, m14 ; t28a + psubw m18, m14 ; t27a + paddw m14, m22, m24 ; t16 + psubw m22, m24 ; t23 + psubw m24, m29, m16 ; t21 + paddw m16, m29 ; t18 + paddw m29, m21, m23 ; t31 + psubw m21, m23 ; t24 + psubw m23, m15, m20 ; t22a + paddw m15, m20 ; t17a + psubw m20, m8, m28 ; t25a + paddw m28, m8 ; t30a + ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20, t27 + ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12 ; t21a, t26a + ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12 ; t23a, t24a + ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12 ; t22, t25 + ret +ALIGN function_align +.main_packed_fast: + vpbroadcastd m8, [o(pw_804_16364x2)] + vpbroadcastd m9, [o(pw_m11003_12140x2)] + vpbroadcastd m11, [o(pw_7005_14811x2)] + vpbroadcastd m12, [o(pw_m5520_15426x2)] + pmulhrsw m21, m8 ; t16a, t31a + vpbroadcastd m8, [o(pw_3981_15893x2)] + pmulhrsw m17, m9 ; t17a, t30a + vpbroadcastd m9, [o(pw_m8423_14053x2)] + pmulhrsw m20, m11 ; t18a, t29a + vpbroadcastd m11, [o(pw_9760_13160x2)] + pmulhrsw m15, m12 ; t19a, t28a + vpbroadcastd m12, [o(pw_m2404_16207x2)] + pmulhrsw m18, m8 ; t20a, t27a + pmulhrsw m16, m9 ; t21a, t26a + pmulhrsw m19, m11 ; t22a, t25a + pmulhrsw m14, m12 ; t23a, t24a + psubw m8, m21, m17 ; t17 t30 + paddw m21, m17 ; t16 t31 + psubw m17, m15, m20 ; t18 t29 + paddw m20, m15 ; t19 t28 + psubw m15, m18, m16 ; t21 t26 + paddw m18, m16 ; t20 t27 + psubw m16, m14, m19 ; t22 t25 + paddw m14, m19 ; t23 t24 + ITX_MUL2X_PACK 8, 9, 19, 10, 3196, 16069, 5 ; t17a t30a + ITX_MUL2X_PACK 17, 9, 19, 10, m16069, 3196, 5 ; t18a t29a + ITX_MUL2X_PACK 15, 9, 19, 10, 13623, 9102, 5 ; t21a t26a + ITX_MUL2X_PACK 16, 9, 19, 10, m9102, 13623, 5 ; t22a t25a + vpbroadcastd m11, [o(pw_m15137_6270)] + psubw m19, m21, m20 ; t19a t28a + paddw m21, m20 ; t16a t31a + psubw m20, m14, m18 ; t20a t27a + paddw m14, m18 ; t23a t24a + psubw m18, m8, m17 ; t18 t29 + paddw m8, m17 ; t17 t30 + psubw m17, m16, m15 ; t21 t26 + paddw m15, m16 ; t22 t25 + ITX_MUL2X_PACK 18, 9, 16, 10, 6270_15137, 11, 20 ; t18a t29a + ITX_MUL2X_PACK 19, 9, 16, 10, 6270_15137, 11, 20 ; t19 t28 + ITX_MUL2X_PACK 20, 9, 16, 10, 11, m6270_m15137, 36 ; t20 t27 + ITX_MUL2X_PACK 17, 9, 16, 10, 11, m6270_m15137, 36 ; t21a t26a + vbroadcasti32x4 m9, [o(deint_shuf)] + psubw m16, m21, m14 ; t23 t24 + paddw m14, m21 ; t16 t31 + psubw m21, m8, m15 ; t22a t25a + paddw m15, m8 ; t17a t30a + psubw m8, m18, m17 ; t21 t26 + paddw m18, m17 ; t18 t29 + paddw m17, m19, m20 ; t19a t28a + psubw m19, m20 ; t20a t27a + vpbroadcastd m11, [o(pw_m11585_11585)] + vpbroadcastd 
m12, [o(pw_11585_11585)]
+    REPX      {pshufb x, m9}, m14, m15, m18, m17
+    mova          m9, m10
+    vpdpwssd      m9, m16, m11
+    mova         m20, m10
+    vpdpwssd     m20, m21, m11
+    psrad         m9, 14
+    psrad        m20, 14
+    packssdw      m9, m20           ; t23a t22
+    mova         m20, m10
+    vpdpwssd     m20, m16, m12
+    mova         m16, m10
+    vpdpwssd     m16, m21, m12
+    psrad        m20, 14
+    psrad        m16, 14
+    packssdw     m16, m20, m16      ; t24a t25
+    ITX_MUL2X_PACK  8, 21, 20, 10, 11, 12, 8 ; t21a t26a
+    ITX_MUL2X_PACK 19,  8, 11, 10, 11, 12, 8 ; t20 t27
+    packssdw     m11, m20           ; t27 t26a
+    packssdw      m8, m21           ; t20 t21a
+    punpcklqdq   m20, m14, m15      ; t16 t17a
+    punpckhqdq   m14, m15           ; t31 t30a
+    punpckhqdq   m15, m17, m18      ; t28a t29
+    punpcklqdq   m17, m18           ; t19a t18
+    psubw        m21, m0, m14       ; out31 out30
+    paddw         m0, m14           ; out0 out1
+    psubw        m14, m7, m20       ; out16 out17
+    paddw         m7, m20           ; out15 out14
+    psubw        m20, m1, m15       ; out28 out29
+    paddw         m1, m15           ; out3 out2
+    psubw        m15, m6, m17       ; out19 out18
+    paddw         m6, m17           ; out12 out13
+    psubw        m17, m4, m9        ; out23 out22
+    paddw         m4, m9            ; out8 out9
+    psubw        m18, m3, m16       ; out24 out25
+    paddw         m3, m16           ; out7 out6
+    psubw        m16, m5, m8        ; out20 out21
+    paddw         m5, m8            ; out11 out10
+    psubw        m19, m2, m11       ; out27 out26
+    paddw         m2, m11           ; out4 out5
+    ret
+
+%endif
diff --git a/libavutil/mem_internal.h b/libavutil/mem_internal.h
index c027fa51c3..d58881d09c 100644
--- a/libavutil/mem_internal.h
+++ b/libavutil/mem_internal.h
@@ -131,4 +131,6 @@
 #define LOCAL_ALIGNED_32(t, v, ...) E1(LOCAL_ALIGNED_D(32, t, v, __VA_ARGS__,,))
 
+#define LOCAL_ALIGNED_64(t, v, ...) E1(LOCAL_ALIGNED_D(64, t, v, __VA_ARGS__,,))
+
 #endif /* AVUTIL_MEM_INTERNAL_H */
diff --git a/tests/checkasm/vp9dsp.c b/tests/checkasm/vp9dsp.c
index cecd0dee0f..bddc9a79fc 100644
--- a/tests/checkasm/vp9dsp.c
+++ b/tests/checkasm/vp9dsp.c
@@ -310,13 +310,13 @@ static int is_zero(const int16_t *c, int sz)
 
 static void check_itxfm(void)
 {
-    LOCAL_ALIGNED_32(uint8_t, src, [32 * 32 * 2]);
-    LOCAL_ALIGNED_32(uint8_t, dst, [32 * 32 * 2]);
-    LOCAL_ALIGNED_32(uint8_t, dst0, [32 * 32 * 2]);
-    LOCAL_ALIGNED_32(uint8_t, dst1, [32 * 32 * 2]);
-    LOCAL_ALIGNED_32(int16_t, coef, [32 * 32 * 2]);
-    LOCAL_ALIGNED_32(int16_t, subcoef0, [32 * 32 * 2]);
-    LOCAL_ALIGNED_32(int16_t, subcoef1, [32 * 32 * 2]);
+    LOCAL_ALIGNED_64(uint8_t, src, [32 * 32 * 2]);
+    LOCAL_ALIGNED_64(uint8_t, dst, [32 * 32 * 2]);
+    LOCAL_ALIGNED_64(uint8_t, dst0, [32 * 32 * 2]);
+    LOCAL_ALIGNED_64(uint8_t, dst1, [32 * 32 * 2]);
+    LOCAL_ALIGNED_64(int16_t, coef, [32 * 32 * 2]);
+    LOCAL_ALIGNED_64(int16_t, subcoef0, [32 * 32 * 2]);
+    LOCAL_ALIGNED_64(int16_t, subcoef1, [32 * 32 * 2]);
     declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
     VP9DSPContext dsp;
     int y, x, tx, txtp, bit_depth, sub;
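
A note on the fixed-point convention used by the new assembly: the pw_*x2 constants store each VP9 cosine coefficient doubled because pmulhrsw computes (a*b + 0x4000) >> 15, so multiplying by 2*coef yields the rounded 14-bit product the scalar transform specifies, and the final pmulhrsw against pw_512 performs the (x + 32) >> 6 output rounding. Below is a minimal scalar model of the 16x16 DC-only path; the C is illustrative and not part of the patch.

    #include <stdint.h>

    /* Scalar equivalent of pmulhrsw(x, c): (x*c + 0x4000) >> 15. With a
     * doubled coefficient 2*c this is round(x*c / 2^14), the fixed-point
     * multiply used throughout the VP9 inverse transforms. */
    static inline int16_t mulhrs(int16_t x, int16_t c)
    {
        return (int16_t)(((int32_t)x * c + 0x4000) >> 15);
    }

    /* Model of the .dconly path: the DC coefficient is scaled by
     * 11585/2^14 once per pass, then rounded down by 6 bits via
     * pmulhrsw with 512, giving the offset added to every pixel. */
    static int16_t dc_only_pixel_offset(int16_t dc)
    {
        int16_t t = mulhrs(dc, 11585 * 2); /* pw_11585x2, row pass    */
        t = mulhrs(t, 11585 * 2);          /* pw_11585x2, column pass */
        return mulhrs(t, 512);             /* pw_512 -> (t + 32) >> 6 */
    }

The vector code broadcasts this per-block offset and adds it to the packed destination pixels in the .dconly_loop.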