This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 154bcd10540f15a1c62cfefff1364a18a7ec4272 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Thu Feb 26 02:37:48 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Sun Mar 1 12:04:14 2026 +0100 avcodec/x86/huffyuvencdsp: Add AVX2 sub_hfyu_median_pred_int16 This version can also process 16bpp. Benchmarks: sub_hfyu_median_pred_int16_9bpp_c: 12667.7 ( 1.00x) sub_hfyu_median_pred_int16_9bpp_mmxext: 1966.5 ( 6.44x) sub_hfyu_median_pred_int16_9bpp_sse2: 997.6 (12.70x) sub_hfyu_median_pred_int16_9bpp_avx2: 474.8 (26.68x) sub_hfyu_median_pred_int16_9bpp_aligned_c: 12604.6 ( 1.00x) sub_hfyu_median_pred_int16_9bpp_aligned_mmxext: 1964.6 ( 6.42x) sub_hfyu_median_pred_int16_9bpp_aligned_sse2: 981.9 (12.84x) sub_hfyu_median_pred_int16_9bpp_aligned_avx2: 462.6 (27.25x) sub_hfyu_median_pred_int16_16bpp_c: 12592.5 ( 1.00x) sub_hfyu_median_pred_int16_16bpp_avx2: 465.6 (27.04x) sub_hfyu_median_pred_int16_16bpp_aligned_c: 12587.5 ( 1.00x) sub_hfyu_median_pred_int16_16bpp_aligned_avx2: 462.5 (27.22x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/huffyuvencdsp.asm | 50 +++++++++++++++++++++++++------------ libavcodec/x86/huffyuvencdsp_init.c | 4 +++ 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm index 3d38931893..11f4b8c01f 100644 --- a/libavcodec/x86/huffyuvencdsp.asm +++ b/libavcodec/x86/huffyuvencdsp.asm @@ -95,23 +95,32 @@ cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_ mov [leftq], maskd RET -INIT_XMM sse2 +%macro SUB_HFYU_MEDIAN_PRED_INT16 1 ; u,s for pmaxuw vs pmaxsw cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_top - movd m5, maskd + movd xm5, maskd lea wd, [wd+wd-(mmsize-1)] - movu m0, [src1q] - movu m2, [src2q] - SPLATW m5, m5 + movu xm0, [src1q] + movu xm2, [src2q] + SPLATW m5, xm5 add dstq, wq - movd m1, [left_topq] + movd xm1, [left_topq] neg wq - movd m3, [leftq] + movd xm3, [leftq] +%if mmsize >= 32 + movu xm4, [src1q+14] +%endif sub src1q, wq + pslldq xm0, 2 + pslldq xm2, 2 + por xm0, xm1 +%if mmsize >= 32 + vinserti128 m0, xm4, 1 +%endif + por xm2, xm3 +%if mmsize >= 32 + vinserti128 m2, [src2q+14], 1 +%endif sub src2q, wq - pslldq m0, 2 - pslldq m2, 2 - por m0, m1 - por m2, m3 jmp .init .loop: @@ -121,16 +130,16 @@ cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_ movu m1, [src1q + wq] ; t movu m3, [src2q + wq] psubw m4, m2, m0 ; l - lt - pmaxsw m0, m1, m2 + pmax%1w m0, m1, m2 paddw m4, m1 ; l - lt + t - pminsw m2, m1 + pmin%1w m2, m1 pand m4, m5 ; (l - lt + t)&mask - pminsw m4, m0 - pmaxsw m4, m2 ; pred + pmin%1w m4, m0 + pmax%1w m4, m2 ; pred psubw m3, m4 ; l - pred pand m3, m5 movu [dstq + wq], m3 - add wq, 16 + add wq, mmsize js .loop cmp wd, mmsize-1 @@ -144,3 +153,12 @@ cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_ .tail: mov wq, -1 jmp .loop +%endmacro + +INIT_XMM sse2 +SUB_HFYU_MEDIAN_PRED_INT16 s + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +SUB_HFYU_MEDIAN_PRED_INT16 u +%endif diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c index e32b7ea19d..7289e94bc7 100644 --- a/libavcodec/x86/huffyuvencdsp_init.c +++ b/libavcodec/x86/huffyuvencdsp_init.c @@ -35,6 +35,8 @@ void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, c unsigned mask, int w, int *left, int *left_top); void ff_sub_hfyu_median_pred_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top); +void ff_sub_hfyu_median_pred_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, + unsigned mask, int w, int *left, int *left_top); av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int width) { @@ -52,5 +54,7 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int wid if (EXTERNAL_AVX2_FAST(cpu_flags)) { c->diff_int16 = ff_diff_int16_avx2; + if (width >= 16) + c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_avx2; } } _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
