This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 154bcd10540f15a1c62cfefff1364a18a7ec4272
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Feb 26 02:37:48 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Mar 1 12:04:14 2026 +0100

    avcodec/x86/huffyuvencdsp: Add AVX2 sub_hfyu_median_pred_int16
    
    This version can also process 16bpp.
    
    Benchmarks:
    sub_hfyu_median_pred_int16_9bpp_c:                   12667.7 ( 1.00x)
    sub_hfyu_median_pred_int16_9bpp_mmxext:               1966.5 ( 6.44x)
    sub_hfyu_median_pred_int16_9bpp_sse2:                  997.6 (12.70x)
    sub_hfyu_median_pred_int16_9bpp_avx2:                  474.8 (26.68x)
    sub_hfyu_median_pred_int16_9bpp_aligned_c:           12604.6 ( 1.00x)
    sub_hfyu_median_pred_int16_9bpp_aligned_mmxext:       1964.6 ( 6.42x)
    sub_hfyu_median_pred_int16_9bpp_aligned_sse2:          981.9 (12.84x)
    sub_hfyu_median_pred_int16_9bpp_aligned_avx2:          462.6 (27.25x)
    sub_hfyu_median_pred_int16_16bpp_c:                  12592.5 ( 1.00x)
    sub_hfyu_median_pred_int16_16bpp_avx2:                 465.6 (27.04x)
    sub_hfyu_median_pred_int16_16bpp_aligned_c:          12587.5 ( 1.00x)
    sub_hfyu_median_pred_int16_16bpp_aligned_avx2:         462.5 (27.22x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/huffyuvencdsp.asm    | 50 +++++++++++++++++++++++++------------
 libavcodec/x86/huffyuvencdsp_init.c |  4 +++
 2 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
index 3d38931893..11f4b8c01f 100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -95,23 +95,32 @@ cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_
     mov [leftq], maskd
     RET
 
-INIT_XMM sse2
+%macro SUB_HFYU_MEDIAN_PRED_INT16 1 ; u,s for pmaxuw vs pmaxsw
 cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_top
-    movd         m5, maskd
+    movd        xm5, maskd
     lea          wd, [wd+wd-(mmsize-1)]
-    movu         m0, [src1q]
-    movu         m2, [src2q]
-    SPLATW       m5, m5
+    movu        xm0, [src1q]
+    movu        xm2, [src2q]
+    SPLATW       m5, xm5
     add        dstq, wq
-    movd         m1, [left_topq]
+    movd        xm1, [left_topq]
     neg          wq
-    movd         m3, [leftq]
+    movd        xm3, [leftq]
+%if mmsize >= 32
+    movu        xm4, [src1q+14]
+%endif
     sub       src1q, wq
+    pslldq      xm0, 2
+    pslldq      xm2, 2
+    por         xm0, xm1
+%if mmsize >= 32
+    vinserti128  m0, xm4, 1
+%endif
+    por         xm2, xm3
+%if mmsize >= 32
+    vinserti128  m2, [src2q+14], 1
+%endif
     sub       src2q, wq
-    pslldq       m0, 2
-    pslldq       m2, 2
-    por          m0, m1
-    por          m2, m3
     jmp       .init
 
 .loop:
@@ -121,16 +130,16 @@ cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_
     movu         m1, [src1q + wq]       ; t
     movu         m3, [src2q + wq]
     psubw        m4, m2, m0             ; l - lt
-    pmaxsw       m0, m1, m2
+    pmax%1w      m0, m1, m2
     paddw        m4, m1                 ; l - lt + t
-    pminsw       m2, m1
+    pmin%1w      m2, m1
     pand         m4, m5                 ; (l - lt + t)&mask
-    pminsw       m4, m0
-    pmaxsw       m4, m2                 ; pred
+    pmin%1w      m4, m0
+    pmax%1w      m4, m2                 ; pred
     psubw        m3, m4                 ; l - pred
     pand         m3, m5
     movu [dstq + wq], m3
-    add          wq, 16
+    add          wq, mmsize
     js        .loop
 
     cmp          wd, mmsize-1
@@ -144,3 +153,12 @@ cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_
 .tail:
     mov          wq, -1
     jmp       .loop
+%endmacro
+
+INIT_XMM sse2
+SUB_HFYU_MEDIAN_PRED_INT16 s
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+SUB_HFYU_MEDIAN_PRED_INT16 u
+%endif
diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c
index e32b7ea19d..7289e94bc7 100644
--- a/libavcodec/x86/huffyuvencdsp_init.c
+++ b/libavcodec/x86/huffyuvencdsp_init.c
@@ -35,6 +35,8 @@ void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, c
                                           unsigned mask, int w, int *left, int *left_top);
 void ff_sub_hfyu_median_pred_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
                                         unsigned mask, int w, int *left, int *left_top);
+void ff_sub_hfyu_median_pred_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+                                        unsigned mask, int w, int *left, int *left_top);
 
 av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int width)
 {
@@ -52,5 +54,7 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int wid
 
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         c->diff_int16 = ff_diff_int16_avx2;
+        if (width >= 16)
+            c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_avx2;
     }
 }

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to