ffmpeg | branch: master | Henrik Gramner <gram...@twoorioles.com> | Wed May 21 14:49:17 2025 +0200| [eda0ac7e5f09e179bcdc946ddef543e8c44a4792] | committer: Henrik Gramner
avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 10bpc inverse transforms

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=eda0ac7e5f09e179bcdc946ddef543e8c44a4792
---
 libavcodec/vp9dec.h                         |    4 +-
 libavcodec/x86/Makefile                     |    1 +
 libavcodec/x86/vp9dsp_init_16bpp_template.c |    8 +
 libavcodec/x86/vp9itxfm_16bpp_avx512.asm    | 1165 +++++++++++++++++++++++++++
 4 files changed, 1176 insertions(+), 2 deletions(-)

diff --git a/libavcodec/vp9dec.h b/libavcodec/vp9dec.h
index 851ee9f6dd..e41f47a82a 100644
--- a/libavcodec/vp9dec.h
+++ b/libavcodec/vp9dec.h
@@ -220,8 +220,8 @@ struct VP9TileData {
     DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
     DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
     // block reconstruction intermediates
-    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
-    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
+    DECLARE_ALIGNED(64, uint8_t, tmp_y)[64 * 64 * 2];
+    DECLARE_ALIGNED(64, uint8_t, tmp_uv)[2][64 * 64 * 2];
     struct { int x, y; } min_mv, max_mv;
     int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
     uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index bf752f5da2..89ee8dc726 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -186,6 +186,7 @@ X86ASM-OBJS-$(CONFIG_VP9_DECODER)      += x86/vp9intrapred.o            \
                                           x86/vp9itxfm.o                \
                                           x86/vp9itxfm_avx512.o         \
                                           x86/vp9itxfm_16bpp.o          \
+                                          x86/vp9itxfm_16bpp_avx512.o   \
                                           x86/vp9lpf.o                  \
                                           x86/vp9lpf_16bpp.o            \
                                           x86/vp9mc.o                   \
diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
index f93ea2468e..db775f7c1a 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp_template.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -127,6 +127,8 @@ decl_itxfm_func(iwht, iwht, 4, BPC, mmxext);
 #if BPC == 10
 decl_itxfm_func(idct, idct, 4, BPC, mmxext);
 decl_itxfm_funcs(4, BPC, ssse3);
+decl_itxfm_funcs(16, BPC, avx512icl);
+decl_itxfm_func(idct, idct, 32, BPC, avx512icl);
 #else
 decl_itxfm_func(idct, idct, 4, BPC, sse2);
 #endif
@@ -233,6 +235,12 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
 #endif
     }
+#if ARCH_X86_64 && BPC == 10
+    if (EXTERNAL_AVX512ICL(cpu_flags)) {
+        init_itx_funcs(TX_16X16, 16, BPC, avx512icl);
+        init_itx_func_one(TX_32X32, idct, idct, 32, BPC, avx512icl);
+    }
+#endif
 #endif /* HAVE_X86ASM */
 
     ff_vp9dsp_init_16bpp_x86(dsp);
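
[Editorial note on the vp9dec.h hunk: aligned 512-bit loads and stores require
64-byte alignment, so the intermediate buffers that the new ZMM code touches
grow from 32- to 64-byte alignment. A minimal standalone C illustration of
what the DECLARE_ALIGNED(64, ...) declarations amount to; the struct name is
hypothetical and not part of the patch:]

    #include <stdint.h>

    /* A ZMM register is 64 bytes wide; an aligned 512-bit store
     * (vmovdqa32/64) to an address with only 32-byte alignment faults.
     * DECLARE_ALIGNED(64, ...) boils down to an alignment-qualified
     * declaration such as: */
    struct vp9_tmp_buffers {                         /* hypothetical name */
        _Alignas(64) uint8_t tmp_y[64 * 64 * 2];     /* 2 bytes/px at 10 bpc */
        _Alignas(64) uint8_t tmp_uv[2][64 * 64 * 2];
    };
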
diff --git a/libavcodec/x86/vp9itxfm_16bpp_avx512.asm b/libavcodec/x86/vp9itxfm_16bpp_avx512.asm
new file mode 100644
index 0000000000..11d1e453a7
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm_16bpp_avx512.asm
@@ -0,0 +1,1165 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2025 Two Orioles, LLC
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64 && HAVE_AVX512ICL_EXTERNAL
+
+SECTION_RODATA 64
+
+; The following set of constants is ordered such that, pairing the dwords
+; into qwords, the low bits of each qword form the shuffle mask
+; { 0, 2, 4, 6, 1, 3, 5, 7 } (vpermq only looks at the low bits of each
+; qword index, so the coefficient table doubles as the permutation)
+%define deintq_perm pd_5520
+pd_5520:  dd 5520
+pd_9760:  dd 9760
+pd_10394: dd 10394
+pd_15426: dd 15426
+pd_804:   dd 804
+pd_2404:  dd 2404
+pd_6270:  dd 6270
+pd_9102:  dd 9102
+pd_11585: dd 11585
+pd_12665: dd 12665
+pd_7723:  dd 7723
+pd_14811: dd 14811
+pd_7005:  dd 7005
+pd_14053: dd 14053
+pd_8423:  dd 8423
+pd_13623: dd 13623
+
+pixel_clip:  times 2 dw 0x7c00
+pixel_clip6: dd 2031648 ; 32 + (pixel_clip << 6)
+pd_532480:   dd 532480  ; 8192 + (32 << 14)
+pd_8192:     dd 8192
+
+pd_1606:  dd 1606
+pd_3196:  dd 3196
+pd_3981:  dd 3981
+pd_4756:  dd 4756
+pd_11003: dd 11003
+pd_12140: dd 12140
+pd_13160: dd 13160
+pd_14449: dd 14449
+pd_15137: dd 15137
+pd_15679: dd 15679
+pd_15893: dd 15893
+pd_16069: dd 16069
+pd_16207: dd 16207
+pd_16305: dd 16305
+pd_16364: dd 16364
+
+SECTION .text
+
+%define o_base (deintq_perm+128)
+%define o(x) (r5 - o_base + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 14
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 14
+; skip the round/shift if rnd is not a number
+%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], inv_dst2
+%if %8 < 32
+    pmulld m%4, m%1, m%8
+    pmulld m%3, m%2, m%8
+%else
+    vpbroadcastd m%3, [o(pd_%8)]
+    pmulld m%4, m%1, m%3
+    pmulld m%3, m%2
+%endif
+%if %7 < 32
+    pmulld m%1, m%7
+    pmulld m%2, m%7
+%else
+    vpbroadcastd m%5, [o(pd_%7)]
+    pmulld m%1, m%5
+    pmulld m%2, m%5
+%endif
+%if %9
+    psubd m%4, m%6, m%4
+    psubd m%2, m%4, m%2
+%else
+%ifnum %6
+    paddd m%4, m%6
+%endif
+    paddd m%2, m%4
+%endif
+%ifnum %6
+    paddd m%1, m%6
+%endif
+    psubd m%1, m%3
+%ifnum %6
+    psrad m%2, 14
+    psrad m%1, 14
+%endif
+%endmacro
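
[Editorial note: for reference, a scalar model of the rotation that
ITX_MULSUB_2D vectorizes; the helper name is hypothetical and not part of the
patch. The asm performs the multiplies with 32-bit pmulld, which is safe
because VP9's intermediate coefficient ranges are bounded; the macro can also
defer the round/shift (rnd = _) or negate dst2 (inv_dst2):]

    #include <stdint.h>

    /* dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 14
     * dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 14
     * with rnd = 8192 = 1 << 13, i.e. a rounded Q14 rotation. */
    static void itx_mulsub_2d(int32_t *src1, int32_t *src2,
                              int32_t coef1, int32_t coef2)
    {
        int64_t s1 = *src1, s2 = *src2; /* widen to sidestep C overflow UB */
        *src1 = (int32_t)((s1 * coef1 - s2 * coef2 + 8192) >> 14);
        *src2 = (int32_t)((s1 * coef2 + s2 * coef1 + 8192) >> 14);
    }
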
+
+%macro WRAP_YMM 1+
+    INIT_YMM cpuname
+    %1
+    INIT_ZMM cpuname
+%endmacro
+
+%macro TRANSPOSE_4D 5 ; in[1-4], tmp
+    punpckhdq m%5, m%3, m%4  ; c2 d2 c3 d3
+    punpckldq m%3, m%4       ; c0 d0 c1 d1
+    punpckhdq m%4, m%1, m%2  ; a2 b2 a3 b3
+    punpckldq m%1, m%2       ; a0 b0 a1 b1
+    punpckhqdq m%2, m%1, m%3 ; a1 b1 c1 d1
+    punpcklqdq m%1, m%3      ; a0 b0 c0 d0
+    punpcklqdq m%3, m%4, m%5 ; a2 b2 c2 d2
+    punpckhqdq m%4, m%5      ; a3 b3 c3 d3
+%endmacro
+
+%macro TRANSPOSE_4DQ 5 ; in[1-4], tmp
+    vshufi32x4 m%5, m%3, m%4, q3232 ; c2 c3 d2 d3
+    vinserti32x8 m%3, ym%4, 1       ; c0 c1 d0 d1
+    vshufi32x4 m%4, m%1, m%2, q3232 ; a2 a3 b2 b3
+    vinserti32x8 m%1, ym%2, 1       ; a0 a1 b0 b1
+    vshufi32x4 m%2, m%1, m%3, q3131 ; a1 b1 c1 d1
+    vshufi32x4 m%1, m%3, q2020      ; a0 b0 c0 d0
+    vshufi32x4 m%3, m%4, m%5, q2020 ; a2 b2 c2 d2
+    vshufi32x4 m%4, m%5, q3131      ; a3 b3 c3 d3
+%endmacro
+
+%macro INV_TXFM_FN 3-4 0 ; type1, type2, size, eob_offset
+cglobal vp9_i%1_i%2_%3_add_10, 4, 5, 0, dst, stride, c, eob, tx2
+    %define %%p1 m(vp9_i%1_%3_internal_10)
+    lea r5, [o_base]
+    ; Jump to the 1st txfm function if we're not taking the fast path, which
+    ; in turn performs an indirect jump to the 2nd txfm function.
+    lea tx2q, [m(vp9_i%2_%3_internal_10).pass2]
+%ifidn %1_%2, dct_dct
+    dec eobd
+    jnz %%p1
+%else
+%if %4
+    add eobd, %4
+%endif
+    ; jump to the 1st txfm function unless it's located directly after this
+    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
+    INV_TXFM_FN %1, %2, 16x16, %3
+%ifidn %1_%2, dct_dct
+    imul r6d, [cq], 11585
+    vpbroadcastd ym3, [o(pixel_clip)]
+    mov [cq], r3d
+    add r6d, 8192
+    sar r6d, 14
+    imul r6d, 11585
+    or r3d, 8
+    add r6d, 532480
+    sar r6d, 20
+    vpbroadcastw ym2, r6d
+    paddsw ym2, ym3
+.dconly_loop:
+    paddsw ym0, ym2, [dstq+strideq*0]
+    paddsw ym1, ym2, [dstq+strideq*1]
+    psubusw ym0, ym3
+    psubusw ym1, ym3
+    mova [dstq+strideq*0], ym0
+    mova [dstq+strideq*1], ym1
+    lea dstq, [dstq+strideq*2]
+    dec r3d
+    jg .dconly_loop
+    RET
+%endif
+%endmacro
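
[Editorial note: the dct_dct branch above is the DC-only shortcut taken when
eob == 1: both transform passes collapse to a single multiply by 11585
(cos(pi/4) in Q14), and 532480 = 8192 + (32 << 14) folds the pass-2 rounding
together with the final (x + 32) >> 6 output rounding. A scalar model for
10 bpc; the helper name is hypothetical and not part of the patch:]

    #include <stdint.h>

    static uint16_t dconly_pixel_10bpc(uint16_t dst, int32_t dc)
    {
        dc = (dc * 11585 + 8192) >> 14;   /* pass 1 DC */
        dc = (dc * 11585 + 532480) >> 20; /* pass 2 DC + (x + 32) >> 6 */
        int32_t v = dst + dc;
        return v < 0 ? 0 : v > 1023 ? 1023 : v; /* clip to 10-bit range */
    }

[The asm avoids the explicit two-sided clamp: adding pixel_clip
(0x7c00 = 32767 - 1023) biases the residual so that paddsw saturates the high
side at 32767, and psubusw then removes the bias while clamping the low side
at zero.]
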
+
+%macro IDCT16_PART1 0
+%if mmsize == 64
+.main_part1_fast:
+%endif
+    pmulld m15, m1, [o(pd_16305)] {bcstd} ; t15a
+    pmulld m1, [o(pd_1606)] {bcstd}       ; t8a
+    pmulld m9, m7, [o(pd_10394)] {bcstd}  ; t9a
+    pmulld m7, [o(pd_12665)] {bcstd}      ; t14a
+    pmulld m11, m5, [o(pd_14449)] {bcstd} ; t13a
+    pmulld m5, [o(pd_7723)] {bcstd}       ; t10a
+    pmulld m13, m3, [o(pd_4756)] {bcstd}  ; t11a
+    pmulld m3, [o(pd_15679)] {bcstd}      ; t12a
+    pmulld m10, m6, [o(pd_9102)] {bcstd}  ; t5a
+    pmulld m6, [o(pd_13623)] {bcstd}      ; t6a
+    pmulld m14, m2, [o(pd_16069)] {bcstd} ; t7a
+    pmulld m2, [o(pd_3196)] {bcstd}       ; t4a
+    pmulld m12, m4, [o(pd_15137)] {bcstd} ; t3
+    pmulld m4, [o(pd_6270)] {bcstd}       ; t2
+    pmulld m0, m21
+    REPX {psubd x, m20, x}, m9, m13, m10
+    paddd m0, m20
+    mova m18, m0
+%if mmsize == 64 ; for the ymm variant we only ever use the fast path
+    jmp %%main_part1b
+.main_part1:
+    ITX_MULSUB_2D 1, 15, 16, 17, 18, _, 1606, 16305  ; t8a, t15a
+    ITX_MULSUB_2D 9, 7, 16, 17, 18, _, 12665, 10394  ; t9a, t14a
+    ITX_MULSUB_2D 5, 11, 16, 17, 18, _, 7723, 14449  ; t10a, t13a
+    ITX_MULSUB_2D 13, 3, 16, 17, 18, _, 15679, 4756  ; t11a, t12a
+    ITX_MULSUB_2D 10, 6, 16, 17, 18, _, 13623, 9102  ; t5a, t6a
+    ITX_MULSUB_2D 2, 14, 16, 17, 18, _, 3196, 16069  ; t4a, t7a
+    ITX_MULSUB_2D 4, 12, 16, 17, 18, _, 6270, 15137  ; t2, t3
+    pmulld m0, m21
+    pmulld m8, m21
+    REPX {paddd x, m20}, m0, m9, m13, m10
+    psubd m18, m0, m8 ; t1
+    paddd m0, m8      ; t0
+%%main_part1b:
+%endif
+    vpbroadcastd m19, [o(pd_15137)]
+    vpbroadcastd m16, [o(pd_6270)]
+    REPX {paddd x, m20}, m15, m7, m1, m11, m3, m5
+    REPX {psrad x, 14}, m15, m7, m1, m9, m11, m3, m5, m13
+    paddd m17, m15, m7 ; t15
+    psubd m15, m7      ; t14
+    psubd m7, m3, m11  ; t13
+    paddd m3, m11      ; t12
+    psubd m11, m13, m5 ; t10
+    paddd m5, m13      ; t11
+    psubd m13, m1, m9  ; t9
+    paddd m1, m9       ; t8
+    ITX_MULSUB_2D 15, 13, 8, 9, _, 20, 16, 19   ; t9a, t14a
+    ITX_MULSUB_2D 7, 11, 8, 9, _, 20, 16, 19, 2 ; t13a, t10a
+    paddd m16, m1, m5  ; t8a
+    psubd m1, m5       ; t11a
+    paddd m8, m15, m11 ; t9
+    psubd m15, m11     ; t10
+    psubd m11, m17, m3 ; t12a
+    paddd m17, m3      ; t15a
+    psubd m9, m13, m7  ; t13
+    paddd m13, m7      ; t14
+    REPX {pmulld x, m21}, m11, m9, m1, m15
+    REPX {paddd x, m20}, m2, m6, m14
+    REPX {psrad x, 14}, m10, m2, m6, m14
+    psubd m3, m2, m10  ; t5a
+    paddd m10, m2      ; t4
+    paddd m11, m20
+    psubd m5, m11, m1  ; t11
+    paddd m11, m1      ; t12
+    psubd m1, m14, m6  ; t6a
+    paddd m14, m6      ; t7
+    pmulld m1, m21
+    pmulld m3, m21
+    paddd m4, m20
+    paddd m12, m20
+    REPX {psrad x, 14}, m4, m12, m0, m18
+    paddd m9, m20
+    paddd m2, m9, m15  ; t13a
+    psubd m9, m15      ; t10a
+    paddd m1, m20
+    psubd m6, m1, m3   ; t5
+    paddd m1, m3       ; t6
+    REPX {psrad x, 14}, m6, m1, m11, m5, m2, m9
+%endmacro
+
+%macro IDCT16_PART2 0
+    psubd m3, m0, m12  ; t3
+    paddd m0, m12      ; t0
+    psubd m12, m18, m4 ; t2
+    paddd m18, m4      ; t1
+    psubd m4, m3, m10  ; t4
+    paddd m3, m10      ; t3
+    psubd m10, m12, m6 ; t5
+    paddd m12, m6      ; t2
+    psubd m6, m18, m1  ; t6
+    paddd m1, m18      ; t1
+    psubd m7, m0, m14  ; t7
+    paddd m0, m14      ; t0
+    psubd m15, m0, m17 ; out15
+    paddd m0, m17      ; out0
+    psubd m14, m1, m13 ; out14
+    paddd m1, m13      ; out1
+    psubd m13, m12, m2 ; out13
+    paddd m2, m12      ; out2
+    psubd m12, m3, m11 ; out12
+    paddd m3, m11      ; out3
+    psubd m11, m4, m5  ; out11
+    paddd m4, m5       ; out4
+    paddd m5, m10, m9  ; out5
+    psubd m10, m9      ; out10
+    psubd m9, m6, m8   ; out9
+    paddd m6, m8       ; out6
+    psubd m8, m7, m16  ; out8
+    paddd m7, m16      ; out7
+%endmacro
+
+INIT_ZMM avx512icl
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, adst, 39-23-1
+
+cglobal vp9_idct_16x16_internal_10, 0, 7, 22, dst, stride, c, eob, tx2
+    mova m0, [cq+64* 0]
+    mova m1, [cq+64* 1]
+    mova m2, [cq+64* 2]
+    mova m3, [cq+64* 3]
+    mova m4, [cq+64* 4]
+    mova m5, [cq+64* 5]
+    mova m6, [cq+64* 6]
+    mova m7, [cq+64* 7]
+    vpbroadcastd m20, [o(pd_8192)]
+    vpbroadcastd m21, [o(pd_11585)]
+    sub eobd, 38
+    jl .pass1_fast
+    mova m8, [cq+64* 8]
+    mova m9, [cq+64* 9]
+    mova m10, [cq+64*10]
+    mova m11, [cq+64*11]
+    mova m12, [cq+64*12]
+    mova m13, [cq+64*13]
+    mova m14, [cq+64*14]
+    mova m15, [cq+64*15]
+    call .main_part1
+    call .main_part2
+.pass1_end:
+    TRANSPOSE_4DQ 0, 4, 8, 12, 16
+    TRANSPOSE_4DQ 1, 5, 9, 13, 16
+    TRANSPOSE_4DQ 2, 6, 10, 14, 16
+    TRANSPOSE_4DQ 3, 7, 11, 15, 16
+    TRANSPOSE_4D 8, 9, 10, 11, 16
+    TRANSPOSE_4D 12, 13, 14, 15, 16
+    mov r6d, 64*12
+    jmp .pass1_transpose_end
+.pass1_fast:
+    WRAP_YMM IDCT16_PART1
+    WRAP_YMM IDCT16_PART2
+.pass1_fast_end:
+    vinserti32x8 m0, ym4, 1
+    vinserti32x8 m8, ym12, 1
+    vinserti32x8 m1, ym5, 1
+    vinserti32x8 m9, ym13, 1
+    vinserti32x8 m2, ym6, 1
+    vinserti32x8 m10, ym14, 1
+    vinserti32x8 m3, ym7, 1
+    vinserti32x8 m11, ym15, 1
+    vshufi32x4 m4, m0, m8, q3131
+    vshufi32x4 m0, m8, q2020
+    vshufi32x4 m5, m1, m9, q3131
+    vshufi32x4 m1, m9, q2020
+    vshufi32x4 m6, m2, m10, q3131
+    vshufi32x4 m2, m10, q2020
+    vshufi32x4 m7, m3, m11, q3131
+    vshufi32x4 m3, m11, q2020
+    mov r6d, 64*4
+.pass1_transpose_end:
+    pxor m16, m16
+.zero_loop:
+    mova [cq+r6+64*0], m16
+    mova [cq+r6+64*1], m16
+    mova [cq+r6+64*2], m16
+    mova [cq+r6+64*3], m16
+    sub r6d, 64*4
+    jge .zero_loop
+    TRANSPOSE_4D 0, 1, 2, 3, 16
+    TRANSPOSE_4D 4, 5, 6, 7, 16
+    jmp tx2q
+.pass2:
+    test eobd, eobd
+    jl .pass2_fast
+    call .main_part1
+    jmp .pass2_end
+.pass2_fast:
+    call .main_part1_fast
+.pass2_end:
+    vpbroadcastd m3, [o(pixel_clip6)]
+    paddd m0, m3
+    paddd m18, m3
+    call .main_part2
+    REPX {psrad x, 6}, m0, m1, m2, m3
+    packssdw m0, m1
+    lea r6, [strideq*3]
+    packssdw m1, m2, m3
+    mova m2, [o(deintq_perm)]
+    vpbroadcastd m3, [o(pixel_clip)]
+    REPX {psrad x, 6}, m4, m5, m6, m7
+    call .write_16x4
+    packssdw m0, m4, m5
+    packssdw m1, m6, m7
+    REPX {psrad x, 6}, m8, m9, m10, m11
+    call .write_16x4
+    packssdw m0, m8, m9
+    packssdw m1, m10, m11
+.pass2_end2:
+    REPX {psrad x, 6}, m12, m13, m14, m15
+    call .write_16x4
+    packssdw m0, m12, m13
+    packssdw m1, m14, m15
+    call .write_16x4
+    RET
+ALIGN function_align
+.write_16x4:
+    mova ym16, [dstq+strideq*0]
+    vinserti32x8 m16, [dstq+strideq*1], 1
+    mova ym17, [dstq+strideq*2]
+    vinserti32x8 m17, [dstq+r6], 1
+    vpermq m0, m2, m0
+    vpermq m1, m2, m1
+    paddsw m16, m0
+    paddsw m17, m1
+    psubusw m16, m3
+    psubusw m17, m3
+    mova [dstq+strideq*0], ym16
+    vextracti32x8 [dstq+strideq*1], m16, 1
+    mova [dstq+strideq*2], ym17
+    vextracti32x8 [dstq+r6], m17, 1
+    lea dstq, [dstq+strideq*4]
+    ret
+ALIGN function_align
+    IDCT16_PART1
+    ret
+ALIGN function_align
+.main_part2:
+    IDCT16_PART2
+    ret
+
+%macro IADST16_PART1 0
+%if mmsize == 64
+.main_part1_fast:
+%endif
+    pmulld m15, m0, [o(pd_16364)] {bcstd} ; t1
+    pmulld m0, [o(pd_804)] {bcstd}        ; t0
+    pmulld m13, m2, [o(pd_15893)] {bcstd} ; t3
+    pmulld m2, [o(pd_3981)] {bcstd}       ; t2
+    pmulld m11, m4, [o(pd_14811)] {bcstd} ; t5
+    pmulld m4, [o(pd_7005)] {bcstd}       ; t4
+    pmulld m9, m6, [o(pd_13160)] {bcstd}  ; t7
+    pmulld m6, [o(pd_9760)] {bcstd}       ; t6
+    pmulld m8, m7, [o(pd_11003)] {bcstd}  ; t8
+    pmulld m7, [o(pd_12140)] {bcstd}      ; t9
+    pmulld m10, m5, [o(pd_8423)] {bcstd}  ; t10
+    pmulld m5, [o(pd_14053)] {bcstd}      ; t11
+    pmulld m12, m3, [o(pd_5520)] {bcstd}  ; t12
+    pmulld m3, [o(pd_15426)] {bcstd}      ; t13
+    pmulld m14, m1, [o(pd_2404)] {bcstd}  ; t14
+    pmulld m1, [o(pd_16207)] {bcstd}      ; t15
+    REPX {psubd x, m20, x}, m15, m13, m11, m9
+%if mmsize == 64 ; for the ymm variant we only ever use the fast path
+    jmp %%main_part1b
+ALIGN function_align
+.main_part1:
+    ITX_MULSUB_2D 15, 0, 16, 17, 18, _, 804, 16364   ; t1, t0
+    ITX_MULSUB_2D 13, 2, 16, 17, 18, _, 3981, 15893  ; t3, t2
+    ITX_MULSUB_2D 11, 4, 16, 17, 18, _, 7005, 14811  ; t5, t4
+    ITX_MULSUB_2D 9, 6, 16, 17, 18, _, 9760, 13160   ; t7, t6
+    ITX_MULSUB_2D 7, 8, 16, 17, 18, _, 12140, 11003  ; t9, t8
+    ITX_MULSUB_2D 5, 10, 16, 17, 18, _, 14053, 8423  ; t11, t10
+    ITX_MULSUB_2D 3, 12, 16, 17, 18, _, 15426, 5520  ; t13, t12
+    ITX_MULSUB_2D 1, 14, 16, 17, 18, _, 16207, 2404  ; t15, t14
+    REPX {paddd x, m20}, m15, m13, m11, m9
+%%main_part1b:
+%endif
+    REPX {paddd x, m20}, m0, m2, m4, m6
+    psubd m16, m2, m10 ; t10a
+    paddd m2, m10      ; t2a
+    psubd m10, m9, m1  ; t15a
+    paddd m9, m1       ; t7a
+    psubd m1, m13, m5  ; t11a
+    paddd m13, m5      ; t3a
+    psubd m5, m6, m14  ; t14a
+    paddd m6, m14      ; t6a
+    REPX {psrad x, 14}, m16, m10, m1, m5
+    psubd m14, m0, m8  ; t8a
+    paddd m0, m8       ; t0a
+    psubd m8, m15, m7  ; t9a
+    paddd m15, m7      ; t1a
+    psubd m7, m4, m12  ; t12a
+    paddd m4, m12      ; t4a
+    paddd m12, m11, m3 ; t5a
+    psubd m11, m3      ; t13a
+    REPX {psrad x, 14}, m14, m8, m7, m11
+    vpbroadcastd m19, [o(pd_9102)]
+    vpbroadcastd m18, [o(pd_13623)]
+    ITX_MULSUB_2D 16, 1, 3, 17, _, _, 18, 19 ; t11, t10
+    ITX_MULSUB_2D 10, 5, 3, 17, _, _, 19, 18 ; t14, t15
+    vpbroadcastd m19, [o(pd_16069)]
+    vpbroadcastd m18, [o(pd_3196)]
+    ITX_MULSUB_2D 14, 8, 3, 17, _, _, 18, 19 ; t9, t8
+    ITX_MULSUB_2D 11, 7, 3, 17, _, _, 19, 18 ; t12, t13
+    vpbroadcastd m19, [o(pd_6270)]
+    vpbroadcastd m18, [o(pd_15137)]
+    REPX {psrad x, 14}, m15, m12, m0, m4
+    psubd m3, m15, m12 ; t5
+    paddd m15, m12     ; t1
+    psubd m12, m0, m4  ; t4
+    paddd m0, m4       ; t0
+    REPX {psrad x, 14}, m2, m6, m13, m9
+    psubd m4, m2, m6   ; t6
+    paddd m2, m6       ; t2
+    psubd m6, m13, m9  ; t7
+    paddd m9, m13      ; t3
+    REPX {paddd x, m20}, m8, m14, m1, m16
+    psubd m13, m8, m11 ; t12a
+    paddd m8, m11      ; t8a
+    psubd m11, m14, m7 ; t13a
+    paddd m14, m7      ; t9a
+    psubd m7, m1, m10  ; t14a
+    paddd m1, m10      ; t10a
+    psubd m10, m16, m5 ; t15a
+    paddd m16, m5      ; t11a
+    REPX {psrad x, 14}, m13, m11, m7, m10
+    ITX_MULSUB_2D 12, 3, 5, 17, _, _, 19, 18  ; t5a, t4a
+    ITX_MULSUB_2D 6, 4, 5, 17, _, _, 18, 19   ; t6a, t7a
+    ITX_MULSUB_2D 13, 11, 5, 17, _, _, 19, 18 ; t13, t12
+    ITX_MULSUB_2D 10, 7, 5, 17, _, _, 18, 19  ; t14, t15
+    REPX {psrad x, 14}, m8, m1, m14, m16
+    psubd m5, m8, m1   ; t10
+    paddd m1, m8       ; -out1
+    psubd m8, m15, m9  ; t3a
+    paddd m15, m9      ; -out15
+    psubd m9, m14, m16 ; t11
+    paddd m14, m16     ; out14
+    psubd m16, m0, m2  ; t2a
+    paddd m0, m2       ; out0
+    REPX {paddd x, m20}, m11, m13, m12, m3
+    paddd m2, m11, m10 ; out2
+    psubd m11, m10     ; t14a
+    psubd m10, m13, m7 ; t15a
+    paddd m13, m7      ; -out13
+    psubd m7, m12, m4  ; t7
+    paddd m12, m4      ; out12
+    psubd m4, m3, m6   ; t6
+    paddd m3, m6       ; -out3
+    REPX {psrad x, 14}, m10, m7, m11, m4
+    REPX {pmulld x, m21}, m9, m10, m7, m8, m5, m11, m4, m16
+    REPX {psrad x, 14}, m2, m13, m12, m3
+%endmacro
+
+%macro IADST16_PART2 0
+    paddd m9, m20
+    psubd m10, m20, m10
+    paddd m7, m20
+    psubd m8, m20, m8
+    paddd m6, m9, m5   ; out6
+    psubd m9, m5       ; out9
+    psubd m5, m10, m11 ; out5
+    paddd m10, m11     ; out10
+    psubd m11, m7, m4  ; out11
+    paddd m4, m7       ; out4
+    psubd m7, m8, m16  ; out7
+    paddd m8, m16      ; out8
+%endmacro
+
+%macro IADST16_PASS1_END 0
+    pxor m16, m16
+    psubd m1, m16, m1
+    psubd m3, m16, m3
+    psubd m13, m16, m13
+    psubd m15, m16, m15
+    REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11
+%endmacro
+
+INV_TXFM_16X16_FN adst, dct, 39-18
+INV_TXFM_16X16_FN adst, adst
+
+cglobal vp9_iadst_16x16_internal_10, 0, 7, 22, dst, stride, c, eob, tx2
+    mova m0, [cq+64* 0]
+    mova m1, [cq+64* 1]
+    mova m2, [cq+64* 2]
+    mova m3, [cq+64* 3]
+    mova m4, [cq+64* 4]
+    mova m5, [cq+64* 5]
+    mova m6, [cq+64* 6]
+    mova m7, [cq+64* 7]
+    vpbroadcastd m20, [o(pd_8192)]
+    vpbroadcastd m21, [o(pd_11585)]
+    sub eobd, 39
+    jl .pass1_fast
+    mova m8, [cq+64* 8]
+    mova m9, [cq+64* 9]
+    mova m10, [cq+64*10]
+    mova m11, [cq+64*11]
+    mova m12, [cq+64*12]
+    mova m13, [cq+64*13]
+    mova m14, [cq+64*14]
+    mova m15, [cq+64*15]
+    call .main_part1
+    call .main_part2
+    IADST16_PASS1_END
+    jmp m(vp9_idct_16x16_internal_10).pass1_end
+.pass1_fast:
+    WRAP_YMM IADST16_PART1
+    WRAP_YMM IADST16_PART2
+    WRAP_YMM IADST16_PASS1_END
+    jmp m(vp9_idct_16x16_internal_10).pass1_fast_end
+.pass2:
+    test eobd, eobd
+    jl .pass2_fast
+    call .main_part1
+    jmp .pass2_end
+.pass2_fast:
+    call .main_part1_fast
+.pass2_end:
+    vpbroadcastd m20, [o(pd_532480)]
+    call .main_part2
+    vpbroadcastd m16, [o(pixel_clip6)]
+    REPX {paddd x, m16}, m0, m2, m12, m14
+    REPX {psubd x, m16, x}, m1, m3, m13, m15
+    REPX {psrad x, 6}, m0, m1, m2, m3
+    packssdw m0, m1
+    lea r6, [strideq*3]
+    packssdw m1, m2, m3
+    mova m2, [o(deintq_perm)]
+    vpbroadcastd m3, [o(pixel_clip)]
+    REPX {psrad x, 20}, m4, m5, m6, m7
+    call m(vp9_idct_16x16_internal_10).write_16x4
+    packssdw m0, m4, m5
+    packssdw m1, m6, m7
+    paddsw m0, m3
+    paddsw m1, m3
+    REPX {psrad x, 20}, m8, m9, m10, m11
+    call m(vp9_idct_16x16_internal_10).write_16x4
+    packssdw m0, m8, m9
+    packssdw m1, m10, m11
+    paddsw m0, m3
+    paddsw m1, m3
+    jmp m(vp9_idct_16x16_internal_10).pass2_end2
+ALIGN function_align
+    IADST16_PART1
+    ret
+ALIGN function_align
+.main_part2:
+    IADST16_PART2
+    ret
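
[Editorial note: the 32x32 function below selects one of four code paths from
the scan position of the last nonzero coefficient, since a small eob
guarantees that only certain quadrants of the coefficient block can be
populated under VP9's scan order. A rough C model; hypothetical helper, and
note the asm compares against eob - 1:]

    /* eob-based dispatch in vp9_idct_idct_32x32_add_10 */
    static int pick_idct32_path(int eob)
    {
        if (eob == 1)   return 0; /* DC only: .dconly path              */
        if (eob <= 135) return 1; /* top-left 16x16 only: .pass1_fast   */
        if (eob <= 579) return 2; /* bottom-right zero: .pass1_right_fast */
        return 3;                 /* full 32x32 */
    }
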
+
+cglobal vp9_idct_idct_32x32_add_10, 4, 7, 23, 64*64, dst, stride, c, eob
+%undef cmp
+    lea r5, [o_base]
+    dec eobd
+    jnz .pass1
+    imul r6d, [cq], 11585
+    vpbroadcastd m3, [o(pixel_clip)]
+    mov [cq], r3d
+    add r6d, 8192
+    sar r6d, 14
+    imul r6d, 11585
+    or r3d, 16
+    add r6d, 532480
+    sar r6d, 20
+    vpbroadcastw m2, r6d
+    paddsw m2, m3
+.dconly_loop:
+    paddsw m0, m2, [dstq+strideq*0]
+    paddsw m1, m2, [dstq+strideq*1]
+    psubusw m0, m3
+    psubusw m1, m3
+    mova [dstq+strideq*0], m0
+    mova [dstq+strideq*1], m1
+    lea dstq, [dstq+strideq*2]
+    dec r3d
+    jg .dconly_loop
+    RET
+.pass1:
+    vpbroadcastd m20, [o(pd_8192)]
+    vpbroadcastd m21, [o(pd_11585)]
+    cmp eobd, 135
+    jl .pass1_fast
+    add cq, 64
+    lea r4, [rsp+64*8]
+    cmp eobd, 579
+    jl .pass1_right_fast
+    mov r6d, 128*28
+    call .pass1_main
+    jmp .pass1_right_end
+.pass1_right_fast: ; bottomright quadrant is zero
+    mova m0, [cq+128* 1]
+    mova m1, [cq+128* 3]
+    mova m2, [cq+128* 5]
+    mova m3, [cq+128* 7]
+    mova m4, [cq+128* 9]
+    mova m5, [cq+128*11]
+    mova m6, [cq+128*13]
+    mova m7, [cq+128*15]
+    call .main_fast
+    mova m0, [cq+128* 0]
+    mova m1, [cq+128* 2]
+    mova m2, [cq+128* 4]
+    mova m3, [cq+128* 6]
+    mova m4, [cq+128* 8]
+    mova m5, [cq+128*10]
+    mova m6, [cq+128*12]
+    mova m7, [cq+128*14]
+    call m(vp9_idct_16x16_internal_10).main_part1_fast
+    mov r6d, 128*12
+    call .pass1_main_end
+.pass1_right_end:
+    mova [r4+64* 8], m0
+    mova [r4+64* 9], m1
+    mova [r4+64*10], m2
+    mova [r4+64*11], m3
+    mova [r4+64*12], m4
+    mova [r4+64*13], m5
+    mova [r4+64*14], m6
+    mova [r4+64*15], m7
+    mova [r4+64*16], m16
+    mova [r4+64*17], m17
+    mova [r4+64*18], m18
+    mova [r4+64*19], m19
+    mova [r4+64*20], m8
+    mova [r4+64*21], m9
+    mova [r4+64*22], m10
+    mova [r4+64*23], m11
+    sub cq, 64
+    sub r4, 64*8
+    mov r6d, 128*28
+    call .pass1_main
+    mova m12, [r4+64*20]
+    mova m13, [r4+64*21]
+    mova m14, [r4+64*22]
+    mova m15, [r4+64*23]
+    mova [r4+64*20], m8
+    mova [r4+64*21], m9
+    mova [r4+64*22], m10
+    mova [r4+64*23], m11
+    mova m8, [r4+64*16]
+    mova m9, [r4+64*17]
+    mova m10, [r4+64*18]
+    mova m11, [r4+64*19]
+    mova [r4+64*16], m16
+    mova [r4+64*17], m17
+    mova [r4+64*18], m18
+    mova [r4+64*19], m19
+    call .main
+    mova m0, [r4+64*16]
+    mova m1, [r4+64*17]
+    mova m2, [r4+64*18]
+    mova m3, [r4+64*19]
+    mova m4, [r4+64*20]
+    mova m5, [r4+64*21]
+    mova m6, [r4+64*22]
+    mova m7, [r4+64*23]
+    mova m8, [r4+64*24]
+    mova m9, [r4+64*25]
+    mova m10, [r4+64*26]
+    mova m11, [r4+64*27]
+    mova m12, [r4+64*28]
+    mova m13, [r4+64*29]
+    mova m14, [r4+64*30]
+    mova m15, [r4+64*31]
+    call m(vp9_idct_16x16_internal_10).main_part1
+    call .pass2_main_left
+    mova m8, [r4+64* 8]
+    mova m9, [r4+64* 9]
+    mova m10, [r4+64*10]
+    mova m11, [r4+64*11]
+    mova m12, [r4+64*12]
+    mova m13, [r4+64*13]
+    mova m14, [r4+64*14]
+    mova m15, [r4+64*15]
+    TRANSPOSE_4DQ 8, 10, 12, 14, 16
+    TRANSPOSE_4DQ 9, 11, 13, 15, 16
+    call .main
+    call .pass2_main_right
+    mova m8, [r4+64*24]
+    mova m9, [r4+64*25]
+    mova m10, [r4+64*26]
+    mova m11, [r4+64*27]
+    mova m12, [r4+64*28]
+    mova m13, [r4+64*29]
+    mova m14, [r4+64*30]
+    mova m15, [r4+64*31]
+    TRANSPOSE_4DQ 8, 10, 12, 14, 16
+    TRANSPOSE_4DQ 9, 11, 13, 15, 16
+    call m(vp9_idct_16x16_internal_10).main_part1
+    jmp .pass2_end
+.pass1_fast:
+    mova m0, [cq+128* 1]
+    mova m1, [cq+128* 3]
+    mova m2, [cq+128* 5]
+    mova m3, [cq+128* 7]
+    mova m4, [cq+128* 9]
+    mova m5, [cq+128*11]
+    mova m6, [cq+128*13]
+    mova m7, [cq+128*15]
+    mov r4, rsp
+    call .main_fast
+    mova m0, [cq+128* 0]
+    mova m1, [cq+128* 2]
+    mova m2, [cq+128* 4]
+    mova m3, [cq+128* 6]
+    mova m4, [cq+128* 8]
+    mova m5, [cq+128*10]
+    mova m6, [cq+128*12]
+    mova m7, [cq+128*14]
+    call m(vp9_idct_16x16_internal_10).main_part1_fast
+    call m(vp9_idct_16x16_internal_10).main_part2
+    mov r6d, 128*12
+    call .pass1_main_end2
+    mova [r4+64*16], m16
+    mova [r4+64*17], m17
+    mova [r4+64*18], m18
+    mova [r4+64*19], m19
+    mova [r4+64*20], m8
+    mova [r4+64*21], m9
+    mova [r4+64*22], m10
+    mova [r4+64*23], m11
+    call .main_fast
+    mova m0, [r4+64*16]
+    mova m1, [r4+64*17]
+    mova m2, [r4+64*18]
+    mova m3, [r4+64*19]
+    mova m4, [r4+64*20]
+    mova m5, [r4+64*21]
+    mova m6, [r4+64*22]
+    mova m7, [r4+64*23]
+    call m(vp9_idct_16x16_internal_10).main_part1_fast
+    call .pass2_main_left
+    call .main_fast
+    call .pass2_main_right
+    call m(vp9_idct_16x16_internal_10).main_part1_fast
+.pass2_end:
+    paddd m0, m22
+    paddd m18, m22
+    call m(vp9_idct_16x16_internal_10).main_part2
+    mova m20, [o(deintq_perm)]
+    rorx r2, strideq, 59 ; strideq*32
+    vpbroadcastd m21, [o(pixel_clip)]
+    add r2, dstq
+%assign i 0
+%rep 16
+    mova m16, [r4+64*(15-i)]
+    mova m17, [r4+64*(i-16)]
+    mova m18, [r4-64*(17+i)]
+    paddd m19, m %+ i, m16
+    psubd m0, m %+ i, m16
+    call .write_32x2
+    %assign i i+1
+%endrep
+    RET
+ALIGN function_align
+.write_32x2:
+    paddd m16, m17, m18
+    psubd m17, m18
+    REPX {psrad x, 6}, m19, m16, m0, m17
+    packssdw m16, m19
+    packssdw m17, m0
+    sub r2, strideq
+    vpermq m16, m20, m16
+    vpermq m17, m20, m17
+    paddsw m16, [dstq]
+    paddsw m17, [r2]
+    psubusw m16, m21
+    psubusw m17, m21
+    mova [dstq], m16
+    mova [r2], m17
+    add dstq, strideq
+    ret
+ALIGN function_align
+.pass1_main:
+    mova m0, [cq+128* 1]
+    mova m1, [cq+128* 3]
+    mova m2, [cq+128* 5]
+    mova m3, [cq+128* 7]
+    mova m4, [cq+128* 9]
+    mova m5, [cq+128*11]
+    mova m6, [cq+128*13]
+    mova m7, [cq+128*15]
+    mova m8, [cq+128*17]
+    mova m9, [cq+128*19]
+    mova m10, [cq+128*21]
+    mova m11, [cq+128*23]
+    mova m12, [cq+128*25]
+    mova m13, [cq+128*27]
+    mova m14, [cq+128*29]
+    mova m15, [cq+128*31]
+    call .main
+    mova m0, [cq+128* 0]
+    mova m1, [cq+128* 2]
+    mova m2, [cq+128* 4]
+    mova m3, [cq+128* 6]
+    mova m4, [cq+128* 8]
+    mova m5, [cq+128*10]
+    mova m6, [cq+128*12]
+    mova m7, [cq+128*14]
+    mova m8, [cq+128*16]
+    mova m9, [cq+128*18]
+    mova m10, [cq+128*20]
+    mova m11, [cq+128*22]
+    mova m12, [cq+128*24]
+    mova m13, [cq+128*26]
+    mova m14, [cq+128*28]
+    mova m15, [cq+128*30]
+    call m(vp9_idct_16x16_internal_10).main_part1
+.pass1_main_end:
+    call m(vp9_idct_16x16_internal_10).main_part2
+.pass1_main_end2:
+    pxor m16, m16
+.pass1_zero_loop:
+    mova [cq+r6+128*0], m16
+    mova [cq+r6+128*1], m16
+    mova [cq+r6+128*2], m16
+    mova [cq+r6+128*3], m16
+    sub r6d, 128*4
+    jge .pass1_zero_loop
+    mova m16, [r4+64*15]
+    mova m19, [r4+64*14]
+    mova m22, [r4+64*13]
+    mova m17, [r4+64*12]
+    psubd m18, m0, m16
+    paddd m16, m0
+    paddd m0, m19, m1
+    psubd m19, m1, m19
+    paddd m1, m17, m3
+    psubd m3, m17
+    paddd m17, m2, m22
+    psubd m2, m22
+    TRANSPOSE_4D 3, 2, 19, 18, 22   ; 28 29 30 31
+    TRANSPOSE_4D 16, 0, 17, 1, 22   ; 0 1 2 3
+    mova [r4+64*54], m3
+    mova [r4+64*55], m19
+    mova [r4+64*38], m2
+    mova [r4+64*39], m18
+    mova m2, [r4+64*11]
+    mova m19, [r4+64*10]
+    mova m3, [r4+64* 9]
+    mova m22, [r4+64* 8]
+    paddd m18, m4, m2
+    psubd m4, m2
+    paddd m2, m5, m19
+    psubd m5, m19
+    paddd m19, m6, m3
+    psubd m6, m3
+    paddd m3, m7, m22
+    psubd m7, m22
+    TRANSPOSE_4D 7, 6, 5, 4, 22     ; 24 25 26 27
+    TRANSPOSE_4D 18, 2, 19, 3, 22   ; 4 5 6 7
+    mova [r4+64*52], m7
+    mova [r4+64*53], m5
+    mova [r4+64*36], m6
+    mova [r4+64*37], m4
+    mova m7, [r4+64* 7]
+    mova m4, [r4+64* 6]
+    mova m5, [r4+64* 5]
+    mova m22, [r4+64* 4]
+    psubd m6, m8, m7
+    paddd m8, m7
+    psubd m7, m9, m4
+    paddd m4, m9
+    paddd m9, m10, m5
+    psubd m10, m5
+    paddd m5, m11, m22
+    psubd m11, m22
+    TRANSPOSE_4D 11, 10, 7, 6, 22   ; 20 21 22 23
+    TRANSPOSE_4D 8, 4, 9, 5, 22     ; 8 9 10 11
+    mova [r4+64*50], m11
+    mova [r4+64*51], m7
+    mova [r4+64*34], m10
+    mova [r4+64*35], m6
+    mova m6, [r4+64* 3]
+    mova m11, [r4+64* 2]
+    mova m7, [r4+64* 1]
+    mova m22, [r4+64* 0]
+    paddd m10, m12, m6
+    psubd m12, m6
+    paddd m6, m13, m11
+    psubd m13, m11
+    paddd m11, m14, m7
+    psubd m14, m7
+    paddd m7, m15, m22
+    psubd m15, m22
+    TRANSPOSE_4D 15, 14, 13, 12, 22 ; 16 17 18 19
+    TRANSPOSE_4D 10, 6, 11, 7, 22   ; 12 13 14 15
+    mova [r4+64*48], m15
+    mova [r4+64*49], m13
+    mova [r4+64*32], m14
+    mova [r4+64*33], m12
+    TRANSPOSE_4DQ 0, 2, 4, 6, 22
+    TRANSPOSE_4DQ 1, 3, 5, 7, 22
+    TRANSPOSE_4DQ 16, 18, 8, 10, 22
+    TRANSPOSE_4DQ 17, 19, 9, 11, 22
+    ret
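
[Editorial note: the %rep loop at .pass2_end and its .write_32x2 helper
implement the standard 32-point recombination: the 16-point transform of the
even-indexed inputs (left in m0-m15) is butterflied against the stored
odd-half results so that output rows n and 31 - n come from one sum and one
difference. A simplified scalar model matching the reversed [r4+64*(15-i)]
addressing; hypothetical helper, reduced to a single odd-half array:]

    #include <stdint.h>

    static void idct32_recombine(int32_t *out, const int32_t *even,
                                 const int32_t *odd /* stored reversed */)
    {
        for (int n = 0; n < 16; n++) {
            out[n]      = even[n] + odd[15 - n];
            out[31 - n] = even[n] - odd[15 - n];
        }
    }
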
+ALIGN function_align
+.pass2_main_left:
+    vpbroadcastd m22, [o(pixel_clip6)]
+    paddd m0, m22
+    paddd m18, m22
+    call m(vp9_idct_16x16_internal_10).main_part2
+    mova [r4+64*16], m0
+    mova [r4+64*17], m1
+    mova [r4+64*18], m2
+    mova [r4+64*19], m3
+    mova [r4+64*20], m4
+    mova [r4+64*21], m5
+    mova [r4+64*22], m6
+    mova [r4+64*23], m7
+    mova [r4+64*24], m8
+    mova [r4+64*25], m9
+    mova [r4+64*26], m10
+    mova [r4+64*27], m11
+    mova [r4+64*28], m12
+    mova [r4+64*29], m13
+    mova [r4+64*30], m14
+    mova [r4+64*31], m15
+    add r4, 64*32
+    mova m0, [r4+64* 0]
+    mova m1, [r4+64* 1]
+    mova m2, [r4+64* 2]
+    mova m3, [r4+64* 3]
+    mova m4, [r4+64* 4]
+    mova m5, [r4+64* 5]
+    mova m6, [r4+64* 6]
+    mova m7, [r4+64* 7]
+    jmp .pass2_main_transpose
+ALIGN function_align
+.pass2_main_right:
+    mova m0, [r4+64*16]
+    mova m1, [r4+64*17]
+    mova m2, [r4+64*18]
+    mova m3, [r4+64*19]
+    mova m4, [r4+64*20]
+    mova m5, [r4+64*21]
+    mova m6, [r4+64*22]
+    mova m7, [r4+64*23]
+.pass2_main_transpose:
+    TRANSPOSE_4DQ 0, 2, 4, 6, 8
+    TRANSPOSE_4DQ 1, 3, 5, 7, 8
+    ret
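
[Editorial note: the .main_fast entry point below serves the reduced-eob
paths, where the second input of every initial rotation is known to be zero:
each ITX_MULSUB_2D pair then degenerates to two independent multiplies (the
REPX psubd against m20 supplies the sign of the term that would have been
subtracted), with the rounding and >> 14 deferred to .main2. In scalar terms;
hypothetical helper, not part of the patch:]

    #include <stdint.h>

    /* ITX_MULSUB_2D with src2 == 0: just two products, round/shift later */
    static void itx_mul_2d_fast(int32_t src, int32_t coef1, int32_t coef2,
                                int32_t *dst1, int32_t *dst2)
    {
        *dst1 = src * coef1;
        *dst2 = src * coef2;
    }
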
+ALIGN function_align
+.main_fast:
+    pmulld m15, m0, [o(pd_16364)] {1to16} ; t31a
+    pmulld m0, [o(pd_804)] {1to16}        ; t16a
+    pmulld m8, m7, [o(pd_11003)] {1to16}  ; t17a
+    pmulld m7, [o(pd_12140)] {1to16}      ; t30a
+    pmulld m11, m4, [o(pd_14811)] {1to16} ; t29a
+    pmulld m4, [o(pd_7005)] {1to16}       ; t18a
+    pmulld m12, m3, [o(pd_5520)] {1to16}  ; t19a
+    pmulld m3, [o(pd_15426)] {1to16}      ; t28a
+    pmulld m13, m2, [o(pd_15893)] {1to16} ; t27a
+    pmulld m2, [o(pd_3981)] {1to16}       ; t20a
+    pmulld m10, m5, [o(pd_8423)] {1to16}  ; t21a
+    pmulld m5, [o(pd_14053)] {1to16}      ; t26a
+    pmulld m9, m6, [o(pd_13160)] {1to16}  ; t25a
+    pmulld m6, [o(pd_9760)] {1to16}       ; t22a
+    pmulld m14, m1, [o(pd_2404)] {1to16}  ; t23a
+    pmulld m1, [o(pd_16207)] {1to16}      ; t24a
+    REPX {psubd x, m20, x}, m8, m12, m10, m14
+    jmp .main2
+ALIGN function_align
+.main:
+    ITX_MULSUB_2D 0, 15, 16, 17, 18, _, 804, 16364   ; t16a, t31a
+    ITX_MULSUB_2D 8, 7, 16, 17, 18, _, 12140, 11003  ; t17a, t30a
+    ITX_MULSUB_2D 4, 11, 16, 17, 18, _, 7005, 14811  ; t18a, t29a
+    ITX_MULSUB_2D 12, 3, 16, 17, 18, _, 15426, 5520  ; t19a, t28a
+    ITX_MULSUB_2D 2, 13, 16, 17, 18, _, 3981, 15893  ; t20a, t27a
+    ITX_MULSUB_2D 10, 5, 16, 17, 18, _, 14053, 8423  ; t21a, t26a
+    ITX_MULSUB_2D 6, 9, 16, 17, 18, _, 9760, 13160   ; t22a, t25a
+    ITX_MULSUB_2D 14, 1, 16, 17, 18, _, 16207, 2404  ; t23a, t24a
+    REPX {paddd x, m20}, m8, m12, m10, m14
+.main2:
+    REPX {paddd x, m20}, m0, m15, m7, m4, m3, m11
+    REPX {psrad x, 14}, m8, m0, m15, m7, m12, m4, m3, m11
+    psubd m16, m0, m8  ; t17
+    paddd m0, m8       ; t16
+    psubd m8, m15, m7  ; t30
+    paddd m15, m7      ; t31
+    paddd m7, m12, m4  ; t19
+    psubd m12, m4      ; t18
+    paddd m4, m3, m11  ; t28
+    psubd m3, m11      ; t29
+    REPX {paddd x, m20}, m2, m13, m5, m6, m1, m9
+    REPX {psrad x, 14}, m10, m2, m13, m5, m14, m6, m1, m9
+    psubd m11, m2, m10 ; t21
+    paddd m2, m10      ; t20
+    psubd m10, m13, m5 ; t26
+    paddd m13, m5      ; t27
+    psubd m5, m14, m6  ; t22
+    paddd m6, m14      ; t23
+    psubd m14, m1, m9  ; t25
+    paddd m9, m1       ; t24
+    vpbroadcastd m19, [o(pd_16069)]
+    vpbroadcastd m18, [o(pd_3196)]
+    ITX_MULSUB_2D 8, 16, 1, 17, _, 20, 18, 19    ; t17a, t30a
+    ITX_MULSUB_2D 3, 12, 1, 17, _, 20, 18, 19, 1 ; t29a, t18a
+    vpbroadcastd m19, [o(pd_9102)]
+    vpbroadcastd m18, [o(pd_13623)]
+    ITX_MULSUB_2D 10, 11, 1, 17, _, 20, 18, 19   ; t21a, t26a
+    ITX_MULSUB_2D 14, 5, 1, 17, _, 20, 18, 19, 1 ; t25a, t22a
+    paddd m1, m6, m2   ; t23a
+    psubd m6, m2       ; t20a
+    psubd m2, m9, m13  ; t27a
+    paddd m9, m13      ; t24a
+    psubd m13, m15, m4 ; t28a
+    paddd m15, m4      ; t31a
+    psubd m4, m8, m12  ; t18
+    paddd m8, m12      ; t17
+    psubd m12, m0, m7  ; t19a
+    paddd m0, m7       ; t16a
+    psubd m7, m16, m3  ; t29
+    paddd m3, m16      ; t30
+    paddd m16, m5, m10 ; t22
+    psubd m5, m10      ; t21
+    psubd m10, m14, m11 ; t26
+    paddd m14, m11     ; t25
+    vpbroadcastd m19, [o(pd_15137)]
+    vpbroadcastd m18, [o(pd_6270)]
+    ITX_MULSUB_2D 13, 12, 11, 17, _, 20, 18, 19   ; t19, t28
+    ITX_MULSUB_2D 2, 6, 11, 17, _, 20, 18, 19, 1  ; t27, t20
+    ITX_MULSUB_2D 7, 4, 11, 17, _, 20, 18, 19     ; t18a, t29a
+    ITX_MULSUB_2D 10, 5, 11, 17, _, 20, 18, 19, 1 ; t26a, t21a
+    psubd m11, m0, m1  ; t23
+    paddd m0, m1       ; t16
+    paddd m1, m16, m8  ; t17a
+    psubd m16, m8, m16 ; t22a
+    psubd m8, m15, m9  ; t24
+    paddd m15, m9      ; t31
+    psubd m9, m3, m14  ; t25a
+    paddd m14, m3      ; t30a
+    paddd m3, m6, m13  ; t19a
+    psubd m6, m13, m6  ; t20a
+    paddd m13, m10, m4 ; t29
+    psubd m10, m4, m10 ; t26
+    psubd m4, m12, m2  ; t27a
+    paddd m12, m2      ; t28a
+    paddd m2, m7, m5   ; t18
+    psubd m7, m5       ; t21
+    REPX {pmulld x, m21}, m10, m8, m4, m9, m7, m11, m6, m16
+    mova [r4+64* 0], m0
+    mova [r4+64* 1], m1
+    mova [r4+64* 2], m2
+    mova [r4+64* 3], m3
+    mova [r4+64*12], m12
+    mova [r4+64*13], m13
+    mova [r4+64*14], m14
+    mova [r4+64*15], m15
+    REPX {paddd x, m20}, m10, m8, m4, m9
+    psubd m5, m10, m7  ; t21a
+    paddd m10, m7      ; t26a
+    psubd m7, m8, m11  ; t23a
+    paddd m8, m11      ; t24a
+    REPX {psrad x, 14}, m5, m10, m7, m8
+    paddd m11, m4, m6  ; t27
+    psubd m4, m6       ; t20
+    psubd m6, m9, m16  ; t22
+    paddd m9, m16      ; t25
+    REPX {psrad x, 14}, m11, m4, m6, m9
+    mova [r4+64* 4], m4
+    mova [r4+64* 5], m5
+    mova [r4+64* 6], m6
+    mova [r4+64* 7], m7
+    mova [r4+64* 8], m8
+    mova [r4+64* 9], m9
+    mova [r4+64*10], m10
+    mova [r4+64*11], m11
+    ret
+
+%endif

_______________________________________________
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".