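Add an AVX2 version of float_to_fixed24() that converts 32 floats per loop iteration using 256-bit registers. Because it uses aligned 256-bit accesses, raise the documented dst/src alignment from 16 to 32 bytes, align fixed_cpl_coords to 32 bytes, and widen pf_1_24 to eight entries in a 32-byte aligned read-only section. Signed-off-by: James Almer <jamr...@gmail.com> --- [Review note: cpl_coords in apply_channel_coupling() is still declared LOCAL_ALIGNED_16; if it is the src passed to float_to_fixed24(), it presumably needs LOCAL_ALIGNED_32 as well to meet the new 32-byte src constraint - to be confirmed against the call site.] libavcodec/ac3dsp.h | 4 ++-- libavcodec/ac3enc_template.c | 2 +- libavcodec/x86/ac3dsp.asm | 28 ++++++++++++++++++++++++++-- libavcodec/x86/ac3dsp_init.c | 4 ++++ 4 files changed, 33 insertions(+), 5 deletions(-)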
diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h index a01bff3d11..25341f3396 100644 --- a/libavcodec/ac3dsp.h +++ b/libavcodec/ac3dsp.h @@ -47,9 +47,9 @@ typedef struct AC3DSPContext { * [-(1<<24),(1<<24)] * * @param dst destination array of int32_t. - * constraints: 16-byte aligned + * constraints: 32-byte aligned * @param src source array of float. - * constraints: 16-byte aligned + * constraints: 32-byte aligned * @param len number of elements to convert. * constraints: multiple of 32 greater than zero */ diff --git a/libavcodec/ac3enc_template.c b/libavcodec/ac3enc_template.c index be4ecebc9c..a16faea681 100644 --- a/libavcodec/ac3enc_template.c +++ b/libavcodec/ac3enc_template.c @@ -112,7 +112,7 @@ static void apply_channel_coupling(AC3EncodeContext *s) { LOCAL_ALIGNED_16(CoefType, cpl_coords, [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]); #if AC3ENC_FLOAT - LOCAL_ALIGNED_16(int32_t, fixed_cpl_coords, [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]); + LOCAL_ALIGNED_32(int32_t, fixed_cpl_coords, [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]); #else int32_t (*fixed_cpl_coords)[AC3_MAX_CHANNELS][16] = cpl_coords; #endif diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm index 42c8310462..e31c58e1c1 100644 --- a/libavcodec/x86/ac3dsp.asm +++ b/libavcodec/x86/ac3dsp.asm @@ -21,10 +21,10 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 ; 16777216.0f - used in ff_float_to_fixed24() -pf_1_24: times 4 dd 0x4B800000 +pf_1_24: times 8 dd 0x4B800000 ; used in ff_ac3_compute_mantissa_size() cextern ac3_bap_bits @@ -128,6 +128,30 @@ cglobal float_to_fixed24, 3, 3, 9, dst, src, len jl .loop RET +INIT_YMM avx2 +cglobal float_to_fixed24, 3, 3, 5, dst, src, len + movaps m0, [pf_1_24] + shl lenq, 2 + add srcq, lenq + add dstq, lenq + neg lenq +.loop: + mulps m1, m0, [srcq+lenq+mmsize*0] + mulps m2, m0, [srcq+lenq+mmsize*1] + mulps m3, m0, [srcq+lenq+mmsize*2] + mulps m4, m0, [srcq+lenq+mmsize*3] + cvtps2dq m1, m1 + cvtps2dq m2, m2 + cvtps2dq m3, 
m3 + cvtps2dq m4, m4 + movdqa [dstq+lenq+mmsize*0], m1 + movdqa [dstq+lenq+mmsize*1], m2 + movdqa [dstq+lenq+mmsize*2], m3 + movdqa [dstq+lenq+mmsize*3], m4 + add lenq, mmsize*4 + jl .loop + RET + ;------------------------------------------------------------------------------ ; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16]) ;------------------------------------------------------------------------------ diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c index 43b3b4ac85..106121b5b9 100644 --- a/libavcodec/x86/ac3dsp_init.c +++ b/libavcodec/x86/ac3dsp_init.c @@ -27,6 +27,7 @@ void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs); void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len); +void ff_float_to_fixed24_avx2 (int32_t *dst, const float *src, unsigned int len); int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]); @@ -48,6 +49,9 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c) if (!(cpu_flags & AV_CPU_FLAG_ATOM)) c->extract_exponents = ff_ac3_extract_exponents_ssse3; } + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + c->float_to_fixed24 = ff_float_to_fixed24_avx2; + } } #define DOWNMIX_FUNC_OPT(ch, opt) \ -- 2.42.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".