huffyuvencdsp: Add SSE2 sub_hfyu_median_pred_int16

Andreas Rheinhardt via ffmpeg-cvslog Sun, 01 Mar 2026 03:47:25 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit e575c2d496efde0b552f3be37f8bd6f84fac030f
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Feb 26 00:43:09 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Mar 1 12:03:55 2026 +0100

    avcodec/x86/huffyuvencdsp: Add SSE2 sub_hfyu_median_pred_int16
    
    Contrary to the MMXEXT version, this version does not overread at all.
    (The MMXEXT version processes the input of 2*w bytes in eight-byte
    chunks and overreads by a further six bytes, because it loads
    the next left and left-top values at the end of the loop,
    i.e. it reads FFALIGN(2*w,8)+6 bytes instead of 2*w.)
    
    Benchmarks:
    sub_hfyu_median_pred_int16_9bpp_c:                   12673.6 ( 1.00x)
    sub_hfyu_median_pred_int16_9bpp_mmxext:               1947.7 ( 6.51x)
    sub_hfyu_median_pred_int16_9bpp_sse2:                  993.9 (12.75x)
    sub_hfyu_median_pred_int16_9bpp_aligned_c:           12596.1 ( 1.00x)
    sub_hfyu_median_pred_int16_9bpp_aligned_mmxext:       1956.1 ( 6.44x)
    sub_hfyu_median_pred_int16_9bpp_aligned_sse2:          989.4 (12.73x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/huffyuvencdsp.asm    | 50 +++++++++++++++++++++++++++++++++++++
 libavcodec/x86/huffyuvencdsp_init.c |  4 +++
 2 files changed, 54 insertions(+)

diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
index 8bfd0face0..3d38931893 100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -94,3 +94,53 @@ cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, 
mask, w, left, left_
     movzx maskd, word [src2q + wq - 2]
     mov [leftq], maskd
     RET
+
+INIT_XMM sse2
+cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, 
left_top
+    movd         m5, maskd              ; m5 = mask in the low dword
+    lea          wd, [wd+wd-(mmsize-1)] ; wd = 2*w - (mmsize-1): byte length, biased
+    movu         m0, [src1q]            ; first vector of src1 (unaligned load)
+    movu         m2, [src2q]            ; first vector of src2 (unaligned load)
+    SPLATW       m5, m5                 ; presumably broadcasts the mask word to every lane (macro defined elsewhere)
+    add        dstq, wq                 ; dst += 2*w - (mmsize-1)
+    movd         m1, [left_topq]        ; m1 = *left_top (32-bit load)
+    neg          wq                     ; wq = (mmsize-1) - 2*w, the running offset
+    movd         m3, [leftq]            ; m3 = *left (32-bit load)
+    sub       src1q, wq                 ; src1 += 2*w - (mmsize-1); [src1q + wq] is now the original src1
+    sub       src2q, wq                 ; same biasing for src2
+    pslldq       m0, 2                  ; move src1 words up one lane, freeing lane 0
+    pslldq       m2, 2                  ; move src2 words up one lane, freeing lane 0
+    por          m0, m1                 ; lt for the first vector; relies on the upper 16 bits of *left_top being zero
+    por          m2, m3                 ; l for the first vector; relies on the upper 16 bits of *left being zero
+    jmp       .init                     ; first pass uses the lt/l built above instead of reloading
+
+.loop:
+    movu         m0, [src1q + wq - 2]   ; lt
+    movu         m2, [src2q + wq - 2]   ; l
+.init:
+    movu         m1, [src1q + wq]       ; t
+    movu         m3, [src2q + wq]       ; current src2 words
+    psubw        m4, m2, m0             ; l - lt
+    pmaxsw       m0, m1, m2             ; max(t, l), signed word compare
+    paddw        m4, m1                 ; l - lt + t
+    pminsw       m2, m1                 ; min(l, t), signed word compare
+    pand         m4, m5                 ; (l - lt + t)&mask
+    pminsw       m4, m0                 ; clamp from above by max(t, l)
+    pmaxsw       m4, m2                 ; pred
+    psubw        m3, m4                 ; current src2 - pred
+    pand         m3, m5                 ; (current src2 - pred)&mask
+    movu [dstq + wq], m3                ; store one vector of residuals
+    add          wq, 16                 ; advance the offset by 16 bytes
+    js        .loop                     ; keep going while the offset is still negative
+
+    cmp          wd, mmsize-1           ; offset == mmsize-1 iff the last vector ended exactly at the end of the input
+    jne       .tail                     ; otherwise some trailing words are still unprocessed
+
+    movzx     src1d, word [src1q + (mmsize-1) - 2] ; last word of src1 (src1q was biased by -(mmsize-1) from its end)
+    movzx     src2d, word [src2q + (mmsize-1) - 2] ; last word of src2
+    mov [left_topq], src1d              ; *left_top = last src1 word, zero-extended
+    mov     [leftq], src2d              ; *left = last src2 word, zero-extended
+    RET
+.tail:
+    mov          wq, -1                 ; offset -1 makes [src1q + wq] start one full vector before the end of the input
+    jmp       .loop                     ; run one final vector that overlaps already-processed words
diff --git a/libavcodec/x86/huffyuvencdsp_init.c 
b/libavcodec/x86/huffyuvencdsp_init.c
index 153edabf02..e32b7ea19d 100644
--- a/libavcodec/x86/huffyuvencdsp_init.c
+++ b/libavcodec/x86/huffyuvencdsp_init.c
@@ -33,6 +33,8 @@ void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, 
const uint16_t *src
                         unsigned mask, int w);
 void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, 
const uint16_t *src2,
                                           unsigned mask, int w, int *left, int 
*left_top);
+void ff_sub_hfyu_median_pred_int16_sse2(uint16_t *dst, const uint16_t *src1, 
const uint16_t *src2,
+                                        unsigned mask, int w, int *left, int 
*left_top);
 
 av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int 
width)
 {
@@ -44,6 +46,8 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext 
*c, int bpp, int wid
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->diff_int16 = ff_diff_int16_sse2;
+        if (bpp < 16 && width >= 8)
+            c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_sse2;
     }
 
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 07/10: avcodec/x86/huffyuvencdsp: Add SSE2 sub_hfyu_median_pred_int16

Reply via email to