hpeldsp: Improve ff_{avg,put}_pixels8_xy2_ssse3()

ffmpeg-git--- via ffmpeg-cvslog Sat, 18 Oct 2025 13:52:26 -0700

The branch, master has been updated
       via  36f92206bb90d6f0268749bd6fe6aa57974442db (commit)
       via  4c55724da86ddc5ef10966f287a3d50fe1a1cbbe (commit)
       via  f84e06026ac4546fcc5242813de506f997d2b6fa (commit)
      from  ce9d1814449d6ff6323dd1030fb4c8d1093c6744 (commit)



- Log -----------------------------------------------------------------
commit 36f92206bb90d6f0268749bd6fe6aa57974442db
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Fri Oct 3 09:29:24 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Oct 12 02:45:37 2025 +0200

    avcodec/x86/hpeldsp: Improve ff_{avg,put}_pixels8_xy2_ssse3()
    
    This SSSE3 function uses MMX registers (of course without emms
    at the end) and processes eight bytes of input by unpacking
    it into two MMX registers. This is very suboptimal given
    that one can just use XMM registers to process eight words.
    This commit switches them to using XMM registers.
    
    Old benchmarks:
    avg_pixels_tab[1][3]_c:                                114.5 ( 1.00x)
    avg_pixels_tab[1][3]_ssse3:                             43.6 ( 2.62x)
    put_pixels_tab[1][3]_c:                                 83.6 ( 1.00x)
    put_pixels_tab[1][3]_ssse3:                             34.0 ( 2.46x)
    
    New benchmarks:
    avg_pixels_tab[1][3]_c:                                115.3 ( 1.00x)
    avg_pixels_tab[1][3]_ssse3:                             24.6 ( 4.69x)
    put_pixels_tab[1][3]_c:                                 83.8 ( 1.00x)
    put_pixels_tab[1][3]_ssse3:                             19.7 ( 4.24x)
    
    Reviewed-by: Kieran Kunhya <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index fc51d72d17..2587e3c315 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -32,7 +32,6 @@ cextern pb_1
 cextern pw_1
 cextern pw_2
 pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
-pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7
 
 cextern pw_8192
 
@@ -423,9 +422,14 @@ cglobal %1%3_pixels8_xy2, 4,5,5
     movh        m1, [r1+r4+1]
     punpcklbw   m0, m1
     pmaddubsw   m0, m4
+%ifidn %3, _no_rnd
     paddusw     m2, m3
     paddusw     m2, m0
     psrlw       m2, 2
+%else
+    paddusw     m2, m0
+    pmulhrsw    m2, [pw_8192]
+%endif
 %ifidn %1, avg
     movh        m1, [r0+r4]
     packuswb    m2, m2
@@ -440,9 +444,14 @@ cglobal %1%3_pixels8_xy2, 4,5,5
     movh        m2, [r1+r4+1]
     punpcklbw   m2, m1
     pmaddubsw   m2, m4
+%ifidn %3, _no_rnd
     paddusw     m0, m3
     paddusw     m0, m2
     psrlw       m0, 2
+%else
+    paddusw     m0, m2
+    pmulhrsw    m0, [pw_8192]
+%endif
 %ifidn %1, avg
     movh        m1, [r0+r4]
     packuswb    m0, m0
@@ -459,6 +468,8 @@ cglobal %1%3_pixels8_xy2, 4,5,5
 
 INIT_XMM ssse3
 SET_PIXELS8_XY2 put, pw_1, _no_rnd
+SET_PIXELS8_XY2 avg, pw_8192
+SET_PIXELS8_XY2 put, pw_8192
 
 
 ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -542,13 +553,8 @@ SET_PIXELS_XY2 put, pw_1, _no_rnd
 SET_PIXELS_XY2 avg, pw_1, _no_rnd
 
 %macro SSSE3_PIXELS_XY2 1-2
-%if %0 == 2 ; sse2
 cglobal %1_pixels16_xy2, 4,5,%2
     mova        m4, [pb_interleave16]
-%else
-cglobal %1_pixels8_xy2, 4,5
-    mova        m4, [pb_interleave8]
-%endif
     mova        m5, [pb_1]
     movu        m0, [r1]
     movu        m1, [r1+1]
@@ -601,9 +607,6 @@ cglobal %1_pixels8_xy2, 4,5
     RET
 %endmacro
 
-INIT_MMX ssse3
-SSSE3_PIXELS_XY2 put
-SSSE3_PIXELS_XY2 avg
 INIT_XMM ssse3
 SSSE3_PIXELS_XY2 put, 6
 SSSE3_PIXELS_XY2 avg, 7

commit 4c55724da86ddc5ef10966f287a3d50fe1a1cbbe
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Fri Oct 3 05:20:32 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Oct 12 02:45:25 2025 +0200

    avcodec/x86/hpeldsp: Add ff_put_no_rnd_pixels8_xy2_ssse3()
    
    Given that one has to deal with 16 byte intermediates it is
    unsurprising that SSE2 wins against MMX; the MMX version has
    therefore been removed (as well as the now unused inline_asm.h).
    The new function is even 32B smaller than the old MMX one.
    
    Old benchmarks:
    put_no_rnd_pixels_tab[1][3]_c:                          84.1 ( 1.00x)
    put_no_rnd_pixels_tab[1][3]_mmx:                        41.1 ( 2.05x)
    
    New benchmarks:
    put_no_rnd_pixels_tab[1][3]_c:                          84.0 ( 1.00x)
    put_no_rnd_pixels_tab[1][3]_ssse3:                      22.1 ( 3.80x)
    
    Reviewed-by: Kieran Kunhya <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 4c19da3e2a..fc51d72d17 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -407,6 +407,60 @@ INIT_XMM sse2
 AVG_PIXELS8_Y2
 
 
+; void ff_put_no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro SET_PIXELS8_XY2 2-3
+cglobal %1%3_pixels8_xy2, 4,5,5
+    mova        m4, [pb_1]
+    mova        m3, [%2]
+    movh        m0, [r1]
+    movh        m2, [r1+1]
+    punpcklbw   m2, m0
+    pmaddubsw   m2, m4
+    xor         r4, r4
+    add         r1, r2
+.loop:
+    movh        m0, [r1+r4]
+    movh        m1, [r1+r4+1]
+    punpcklbw   m0, m1
+    pmaddubsw   m0, m4
+    paddusw     m2, m3
+    paddusw     m2, m0
+    psrlw       m2, 2
+%ifidn %1, avg
+    movh        m1, [r0+r4]
+    packuswb    m2, m2
+    pavgb       m2, m1
+%else
+    packuswb    m2, m2
+%endif
+    movh   [r0+r4], m2
+    add         r4, r2
+
+    movh        m1, [r1+r4]
+    movh        m2, [r1+r4+1]
+    punpcklbw   m2, m1
+    pmaddubsw   m2, m4
+    paddusw     m0, m3
+    paddusw     m0, m2
+    psrlw       m0, 2
+%ifidn %1, avg
+    movh        m1, [r0+r4]
+    packuswb    m0, m0
+    pavgb       m0, m1
+%else
+    packuswb    m0, m0
+%endif
+    movh   [r0+r4], m0
+    add         r4, r2
+    sub        r3d, 2
+    jnz .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+SET_PIXELS8_XY2 put, pw_1, _no_rnd
+
+
 ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro SET_PIXELS_XY2 2-3
 cglobal %1%3_pixels16_xy2, 4,5,8
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 44e44f0975..f4b123ce03 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -32,7 +32,6 @@
 #include "libavcodec/hpeldsp.h"
 #include "fpel.h"
 #include "hpeldsp.h"
-#include "inline_asm.h"
 
 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
@@ -64,6 +63,8 @@ void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
 void ff_avg_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h);
 void ff_avg_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
@@ -73,86 +74,8 @@ void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 
-#if HAVE_INLINE_ASM
-
-/***********************************/
-/* MMX no rounding */
-
-// put_pixels
-static void put_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
-                                       ptrdiff_t line_size, int h)
-{
-    MOVQ_ZERO(mm7);
-    MOVQ_WONE(mm6); // =1 for no_rnd version
-    __asm__ volatile(
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm4            \n\t"
-        "movq   %%mm0, %%mm1            \n\t"
-        "movq   %%mm4, %%mm5            \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpcklbw %%mm7, %%mm4         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpckhbw %%mm7, %%mm5         \n\t"
-        "paddusw %%mm0, %%mm4           \n\t"
-        "paddusw %%mm1, %%mm5           \n\t"
-        "xor    %%"FF_REG_a", %%"FF_REG_a" \n\t"
-        "add    %3, %1                  \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %%"FF_REG_a"), %%mm0  \n\t"
-        "movq   1(%1, %%"FF_REG_a"), %%mm2 \n\t"
-        "movq   %%mm0, %%mm1            \n\t"
-        "movq   %%mm2, %%mm3            \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "paddusw %%mm2, %%mm0           \n\t"
-        "paddusw %%mm3, %%mm1           \n\t"
-        "paddusw %%mm6, %%mm4           \n\t"
-        "paddusw %%mm6, %%mm5           \n\t"
-        "paddusw %%mm0, %%mm4           \n\t"
-        "paddusw %%mm1, %%mm5           \n\t"
-        "psrlw  $2, %%mm4               \n\t"
-        "psrlw  $2, %%mm5               \n\t"
-        "packuswb  %%mm5, %%mm4         \n\t"
-        "movq   %%mm4, (%2, %%"FF_REG_a")  \n\t"
-        "add    %3, %%"FF_REG_a"           \n\t"
-
-        "movq   (%1, %%"FF_REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
-        "movq   1(%1, %%"FF_REG_a"), %%mm4 \n\t"
-        "movq   %%mm2, %%mm3            \n\t"
-        "movq   %%mm4, %%mm5            \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpcklbw %%mm7, %%mm4         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "punpckhbw %%mm7, %%mm5         \n\t"
-        "paddusw %%mm2, %%mm4           \n\t"
-        "paddusw %%mm3, %%mm5           \n\t"
-        "paddusw %%mm6, %%mm0           \n\t"
-        "paddusw %%mm6, %%mm1           \n\t"
-        "paddusw %%mm4, %%mm0           \n\t"
-        "paddusw %%mm5, %%mm1           \n\t"
-        "psrlw  $2, %%mm0               \n\t"
-        "psrlw  $2, %%mm1               \n\t"
-        "packuswb  %%mm1, %%mm0         \n\t"
-        "movq   %%mm0, (%2, %%"FF_REG_a")  \n\t"
-        "add    %3, %%"FF_REG_a"        \n\t"
-
-        "subl   $2, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels)
-        :"D"(block), "r"((x86_reg)line_size)
-        :FF_REG_a, "memory");
-}
-
-#endif /* HAVE_INLINE_ASM */
-
 static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
 {
-#if HAVE_MMX_INLINE
-    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
-#endif
 #if HAVE_MMX_EXTERNAL
     c->put_no_rnd_pixels_tab[1][0] =
     c->put_pixels_tab[1][0] = ff_put_pixels8_mmx;
@@ -211,6 +134,8 @@ static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags)
     c->avg_pixels_tab[0][3]            = ff_avg_pixels16_xy2_ssse3;
     c->put_pixels_tab[1][3]            = ff_put_pixels8_xy2_ssse3;
     c->avg_pixels_tab[1][3]            = ff_avg_pixels8_xy2_ssse3;
+
+    c->put_no_rnd_pixels_tab[1][3]     = ff_put_no_rnd_pixels8_xy2_ssse3;
 #endif
 }
 
diff --git a/libavcodec/x86/inline_asm.h b/libavcodec/x86/inline_asm.h
deleted file mode 100644
index 0198746719..0000000000
--- a/libavcodec/x86/inline_asm.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * inline assembly helper macros
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_INLINE_ASM_H
-#define AVCODEC_X86_INLINE_ASM_H
-
-#include "constants.h"
-
-#define MOVQ_WONE(regd) \
-    __asm__ volatile ( \
-    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
-    "psrlw $15, %%" #regd ::)
-
-#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
-#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
-
-#define MOVQ_BFE(regd)                                  \
-    __asm__ volatile (                                  \
-        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
-        "paddb   %%"#regd", %%"#regd"   \n\t" ::)
-
-#ifndef PIC
-#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2))
-#else
-// for shared library it's better to use this way for accessing constants
-// pcmpeqd -> -1
-#define MOVQ_WTWO(regd)                                 \
-    __asm__ volatile (                                  \
-        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
-        "psrlw         $15, %%"#regd"   \n\t"           \
-        "psllw          $1, %%"#regd"   \n\t"::)
-
-#endif
-
-// using regr as temporary and for the output result
-// first argument is unmodified and second is trashed
-// regfe is supposed to contain 0xfefefefefefefefe
-#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)                \
-    "movq   "#rega", "#regr"            \n\t"                    \
-    "pand   "#regb", "#regr"            \n\t"                    \
-    "pxor   "#rega", "#regb"            \n\t"                    \
-    "pand  "#regfe", "#regb"            \n\t"                    \
-    "psrlq       $1, "#regb"            \n\t"                    \
-    "paddb  "#regb", "#regr"            \n\t"
-
-#define PAVGB_MMX(rega, regb, regr, regfe)                       \
-    "movq   "#rega", "#regr"            \n\t"                    \
-    "por    "#regb", "#regr"            \n\t"                    \
-    "pxor   "#rega", "#regb"            \n\t"                    \
-    "pand  "#regfe", "#regb"            \n\t"                    \
-    "psrlq       $1, "#regb"            \n\t"                    \
-    "psubb  "#regb", "#regr"            \n\t"
-
-// mm6 is supposed to contain 0xfefefefefefefefe
-#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp)   \
-    "movq  "#rega", "#regr"             \n\t"                    \
-    "movq  "#regc", "#regp"             \n\t"                    \
-    "pand  "#regb", "#regr"             \n\t"                    \
-    "pand  "#regd", "#regp"             \n\t"                    \
-    "pxor  "#rega", "#regb"             \n\t"                    \
-    "pxor  "#regc", "#regd"             \n\t"                    \
-    "pand    %%mm6, "#regb"             \n\t"                    \
-    "pand    %%mm6, "#regd"             \n\t"                    \
-    "psrlq      $1, "#regb"             \n\t"                    \
-    "psrlq      $1, "#regd"             \n\t"                    \
-    "paddb "#regb", "#regr"             \n\t"                    \
-    "paddb "#regd", "#regp"             \n\t"
-
-#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)           \
-    "movq  "#rega", "#regr"             \n\t"                    \
-    "movq  "#regc", "#regp"             \n\t"                    \
-    "por   "#regb", "#regr"             \n\t"                    \
-    "por   "#regd", "#regp"             \n\t"                    \
-    "pxor  "#rega", "#regb"             \n\t"                    \
-    "pxor  "#regc", "#regd"             \n\t"                    \
-    "pand    %%mm6, "#regb"             \n\t"                    \
-    "pand    %%mm6, "#regd"             \n\t"                    \
-    "psrlq      $1, "#regd"             \n\t"                    \
-    "psrlq      $1, "#regb"             \n\t"                    \
-    "psubb "#regb", "#regr"             \n\t"                    \
-    "psubb "#regd", "#regp"             \n\t"
-
-#endif /* AVCODEC_X86_INLINE_ASM_H */

commit f84e06026ac4546fcc5242813de506f997d2b6fa
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Fri Oct 3 04:14:59 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Oct 12 02:43:29 2025 +0200

    avcodec/x86/hpeldsp: Add SSE2 of {avg,put} no_rnd xy2 with blocksize 16
    
    Also remove the now superseded MMX versions (the new functions have the
    exact same codesize as the removed ones).
    
    Old benchmarks:
    avg_no_rnd_pixels_tab[0][3]_c:                         233.7 ( 1.00x)
    avg_no_rnd_pixels_tab[0][3]_mmx:                       121.5 ( 1.92x)
    put_no_rnd_pixels_tab[0][3]_c:                         171.4 ( 1.00x)
    put_no_rnd_pixels_tab[0][3]_mmx:                        82.6 ( 2.08x)
    
    New benchmarks:
    avg_no_rnd_pixels_tab[0][3]_c:                         233.3 ( 1.00x)
    avg_no_rnd_pixels_tab[0][3]_sse2:                       45.0 ( 5.18x)
    put_no_rnd_pixels_tab[0][3]_c:                         172.1 ( 1.00x)
    put_no_rnd_pixels_tab[0][3]_sse2:                       40.9 ( 4.21x)
    
    Reviewed-by: Kieran Kunhya <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index e9f988f7b5..4c19da3e2a 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -29,6 +29,7 @@
 
 SECTION_RODATA
 cextern pb_1
+cextern pw_1
 cextern pw_2
 pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
 pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7
@@ -407,10 +408,10 @@ AVG_PIXELS8_Y2
 
 
 ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro SET_PIXELS_XY2 1
-cglobal %1_pixels16_xy2, 4,5,8
+%macro SET_PIXELS_XY2 2-3
+cglobal %1%3_pixels16_xy2, 4,5,8
     pxor        m7, m7
-    mova        m6, [pw_2]
+    mova        m6, [%2]
     movu        m0, [r1]
     movu        m4, [r1+1]
     mova        m1, m0
@@ -481,8 +482,10 @@ cglobal %1_pixels16_xy2, 4,5,8
 %endmacro
 
 INIT_XMM sse2
-SET_PIXELS_XY2 put
-SET_PIXELS_XY2 avg
+SET_PIXELS_XY2 put, pw_2
+SET_PIXELS_XY2 avg, pw_2
+SET_PIXELS_XY2 put, pw_1, _no_rnd
+SET_PIXELS_XY2 avg, pw_1, _no_rnd
 
 %macro SSSE3_PIXELS_XY2 1-2
 %if %0 == 2 ; sse2
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index cb47cb7752..44e44f0975 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -30,7 +30,6 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/hpeldsp.h"
-#include "libavcodec/pixels.h"
 #include "fpel.h"
 #include "hpeldsp.h"
 #include "inline_asm.h"
@@ -65,6 +64,10 @@ void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
 void ff_avg_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+void ff_avg_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
@@ -143,94 +146,12 @@ static void put_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
         :FF_REG_a, "memory");
 }
 
-// this routine is 'slightly' suboptimal but mostly unused
-static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
-                                       ptrdiff_t line_size, int h)
-{
-    MOVQ_ZERO(mm7);
-    MOVQ_WONE(mm6); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ volatile(
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm4            \n\t"
-        "movq   %%mm0, %%mm1            \n\t"
-        "movq   %%mm4, %%mm5            \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpcklbw %%mm7, %%mm4         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpckhbw %%mm7, %%mm5         \n\t"
-        "paddusw %%mm0, %%mm4           \n\t"
-        "paddusw %%mm1, %%mm5           \n\t"
-        "xor    %%"FF_REG_a", %%"FF_REG_a" \n\t"
-        "add    %3, %1                  \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %%"FF_REG_a"), %%mm0  \n\t"
-        "movq   1(%1, %%"FF_REG_a"), %%mm2 \n\t"
-        "movq   %%mm0, %%mm1            \n\t"
-        "movq   %%mm2, %%mm3            \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "paddusw %%mm2, %%mm0           \n\t"
-        "paddusw %%mm3, %%mm1           \n\t"
-        "paddusw %%mm6, %%mm4           \n\t"
-        "paddusw %%mm6, %%mm5           \n\t"
-        "paddusw %%mm0, %%mm4           \n\t"
-        "paddusw %%mm1, %%mm5           \n\t"
-        "psrlw  $2, %%mm4               \n\t"
-        "psrlw  $2, %%mm5               \n\t"
-                "movq   (%2, %%"FF_REG_a"), %%mm3  \n\t"
-        "packuswb  %%mm5, %%mm4         \n\t"
-                "pcmpeqd %%mm2, %%mm2   \n\t"
-                "paddb %%mm2, %%mm2     \n\t"
-                PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
-                "movq   %%mm5, (%2, %%"FF_REG_a")  \n\t"
-        "add    %3, %%"FF_REG_a"        \n\t"
-
-        "movq   (%1, %%"FF_REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
-        "movq   1(%1, %%"FF_REG_a"), %%mm4 \n\t"
-        "movq   %%mm2, %%mm3            \n\t"
-        "movq   %%mm4, %%mm5            \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpcklbw %%mm7, %%mm4         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "punpckhbw %%mm7, %%mm5         \n\t"
-        "paddusw %%mm2, %%mm4           \n\t"
-        "paddusw %%mm3, %%mm5           \n\t"
-        "paddusw %%mm6, %%mm0           \n\t"
-        "paddusw %%mm6, %%mm1           \n\t"
-        "paddusw %%mm4, %%mm0           \n\t"
-        "paddusw %%mm5, %%mm1           \n\t"
-        "psrlw  $2, %%mm0               \n\t"
-        "psrlw  $2, %%mm1               \n\t"
-                "movq   (%2, %%"FF_REG_a"), %%mm3  \n\t"
-        "packuswb  %%mm1, %%mm0         \n\t"
-                "pcmpeqd %%mm2, %%mm2   \n\t"
-                "paddb %%mm2, %%mm2     \n\t"
-                PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
-                "movq   %%mm1, (%2, %%"FF_REG_a")  \n\t"
-        "add    %3, %%"FF_REG_a"           \n\t"
-
-        "subl   $2, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels)
-        :"D"(block), "r"((x86_reg)line_size)
-        :FF_REG_a, "memory");
-}
-
-#if HAVE_MMX
-CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
-CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
-#endif
 #endif /* HAVE_INLINE_ASM */
 
 static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
 {
 #if HAVE_MMX_INLINE
-    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
     c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
-    c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx;
 #endif
 #if HAVE_MMX_EXTERNAL
     c->put_no_rnd_pixels_tab[1][0] =
@@ -269,6 +190,7 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags)
     c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
     c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_sse2;
     c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_sse2;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_sse2;
 
     c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
     c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_sse2;
@@ -278,6 +200,7 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags)
     c->avg_no_rnd_pixels_tab[0]    = ff_avg_pixels16_sse2;
     c->avg_no_rnd_pixels_tab[1]    = ff_avg_no_rnd_pixels16_x2_sse2;
     c->avg_no_rnd_pixels_tab[2]    = ff_avg_no_rnd_pixels16_y2_sse2;
+    c->avg_no_rnd_pixels_tab[3]    = ff_avg_no_rnd_pixels16_xy2_sse2;
 #endif /* HAVE_SSE2_EXTERNAL */
 }
 

-----------------------------------------------------------------------

Summary of changes:
 libavcodec/x86/hpeldsp.asm    |  88 +++++++++++++++++----
 libavcodec/x86/hpeldsp_init.c | 172 +++---------------------------------------
 libavcodec/x86/inline_asm.h   | 100 ------------------------
 3 files changed, 84 insertions(+), 276 deletions(-)
 delete mode 100644 libavcodec/x86/inline_asm.h


hooks/post-receive
-- 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] branch master updated. 36f92206bb avcodec/x86/hpeldsp: Improve ff_{avg,put}_pixels8_xy2_ssse3()

Reply via email to