This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit e575c2d496efde0b552f3be37f8bd6f84fac030f
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Feb 26 00:43:09 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Mar 1 12:03:55 2026 +0100

    avcodec/x86/huffyuvencdsp: Add SSE2 sub_hfyu_median_pred_int16

    Contrary to the MMXEXT version this version does not overread at all
    (the MMXEXT version processes the input of 2*w bytes in eight byte chunks
    and overreads by a further six bytes, because it loads the next left
    and left top values at the end of the loop, i.e. it reads FFALIGN(2*w,8)+6
    bytes instead of 2*w).

    Benchmarks:
    sub_hfyu_median_pred_int16_9bpp_c:                   12673.6 ( 1.00x)
    sub_hfyu_median_pred_int16_9bpp_mmxext:               1947.7 ( 6.51x)
    sub_hfyu_median_pred_int16_9bpp_sse2:                  993.9 (12.75x)
    sub_hfyu_median_pred_int16_9bpp_aligned_c:           12596.1 ( 1.00x)
    sub_hfyu_median_pred_int16_9bpp_aligned_mmxext:       1956.1 ( 6.44x)
    sub_hfyu_median_pred_int16_9bpp_aligned_sse2:          989.4 (12.73x)

    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/huffyuvencdsp.asm    | 50 +++++++++++++++++++++++++++++++++++++
 libavcodec/x86/huffyuvencdsp_init.c |  4 +++
 2 files changed, 54 insertions(+)

diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
index 8bfd0face0..3d38931893 100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -94,3 +94,53 @@ cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_
     movzx maskd, word [src2q + wq - 2]
     mov [leftq], maskd
     RET
+
+INIT_XMM sse2
+cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_top
+    movd        m5, maskd
+    lea         wd, [wd+wd-(mmsize-1)]
+    movu        m0, [src1q]
+    movu        m2, [src2q]
+    SPLATW      m5, m5
+    add         dstq, wq
+    movd        m1, [left_topq]
+    neg         wq
+    movd        m3, [leftq]
+    sub         src1q, wq
+    sub         src2q, wq
+    pslldq      m0, 2
+    pslldq      m2, 2
+    por         m0, m1
+    por         m2, m3
+    jmp         .init
+
+.loop:
+    movu        m0, [src1q + wq - 2] ; lt
+    movu        m2, [src2q + wq - 2] ; l
+.init:
+    movu        m1, [src1q + wq] ; t
+    movu        m3, [src2q + wq]
+    psubw       m4, m2, m0 ; l - lt
+    pmaxsw      m0, m1, m2
+    paddw       m4, m1 ; l - lt + t
+    pminsw      m2, m1
+    pand        m4, m5 ; (l - lt + t)&mask
+    pminsw      m4, m0
+    pmaxsw      m4, m2 ; pred
+    psubw       m3, m4 ; l - pred
+    pand        m3, m5
+    movu        [dstq + wq], m3
+    add         wq, 16
+    js          .loop
+
+    cmp         wd, mmsize-1
+    jne         .tail
+
+    movzx       src1d, word [src1q + (mmsize-1) - 2]
+    movzx       src2d, word [src2q + (mmsize-1) - 2]
+    mov         [left_topq], src1d
+    mov         [leftq], src2d
+    RET
+.tail:
+    mov         wq, -1
+    jmp         .loop
diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c
index 153edabf02..e32b7ea19d 100644
--- a/libavcodec/x86/huffyuvencdsp_init.c
+++ b/libavcodec/x86/huffyuvencdsp_init.c
@@ -33,6 +33,8 @@ void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src
                         unsigned mask, int w);
 void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
                                           unsigned mask, int w, int *left, int *left_top);
+void ff_sub_hfyu_median_pred_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+                                        unsigned mask, int w, int *left, int *left_top);
 
 av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int width)
 {
@@ -44,6 +46,8 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int wid
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->diff_int16 = ff_diff_int16_sse2;
+        if (bpp < 16 && width >= 8)
+            c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_sse2;
     }
 
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]
