The add_dirac_obmc8_mmx function was the only MMX function left. This patch migrates it to SSE2.
Here are the checkasm benchmark results: diracdsp.add_dirac_obmc_8_c: 2299.1 ( 1.00x) diracdsp.add_dirac_obmc_8_mmx: 237.6 ( 9.68x) diracdsp.add_dirac_obmc_8_sse2: 109.1 (21.07x) Signed-off-by: Kyosuke Kawakami <kawakami150...@gmail.com> --- libavcodec/x86/diracdsp.asm | 23 +++++++++++++++++++---- libavcodec/x86/diracdsp_init.c | 10 +++------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm index e5e2b11846..e708400b66 100644 --- a/libavcodec/x86/diracdsp.asm +++ b/libavcodec/x86/diracdsp.asm @@ -227,7 +227,7 @@ cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen punpckhbw m1, m4 mova m2, [obmcq+i] mova m3, m2 - punpcklbw m2, m4 + punpcklbw m2, m4 punpckhbw m3, m4 pmullw m0, m2 pmullw m1, m3 @@ -247,9 +247,6 @@ cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen RET %endm -INIT_MMX -ADD_OBMC 8, mmx - INIT_XMM PUT_RECT sse2 ADD_RECT sse2 @@ -258,6 +255,24 @@ HPEL_FILTER sse2 ADD_OBMC 32, sse2 ADD_OBMC 16, sse2 +cglobal add_dirac_obmc8_sse2, 6,6,5, dst, src, stride, obmc, yblen + pxor m4, m4 +.loop: + movh m0, [srcq] + punpcklbw m0, m4 + movh m1, [obmcq] + punpcklbw m1, m4 + pmullw m0, m1 + movu m1, [dstq] + paddw m0, m1 + movu [dstq], m0 + lea srcq, [srcq+strideq] + lea dstq, [dstq+2*strideq] + add obmcq, 32 + sub yblend, 1 + jg .loop + RET + INIT_XMM sse4 ; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h) diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c index f678759dc0..08247133e1 100644 --- a/libavcodec/x86/diracdsp_init.c +++ b/libavcodec/x86/diracdsp_init.c @@ -24,8 +24,7 @@ void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int); -void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); - +void ff_add_dirac_obmc8_sse2(uint16_t *dst, const uint8_t *src, int stride, const 
uint8_t *obmc_weight, int yblen); void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); @@ -94,15 +93,12 @@ void ff_diracdsp_init_x86(DiracDSPContext* c) #if HAVE_X86ASM int mm_flags = av_get_cpu_flags(); - if (EXTERNAL_MMX(mm_flags)) { - c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx; - } - if (EXTERNAL_SSE2(mm_flags)) { c->dirac_hpel_filter = dirac_hpel_filter_sse2; c->add_rect_clamped = ff_add_rect_clamped_sse2; c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2; + c->add_dirac_obmc[0] = ff_add_dirac_obmc8_sse2; c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2; c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2; @@ -116,5 +112,5 @@ void ff_diracdsp_init_x86(DiracDSPContext* c) c->dequant_subband[1] = ff_dequant_subband_32_sse4; c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4; } -#endif +#endif // HAVE_X86ASM } -- 2.47.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".