The branch, master has been updated
via 5bf57a925ca57ba94538f64a22c7d14234794c7d (commit)
via 99209c287687705fe1eee775cb4f7f1d0aa94a1e (commit)
via b890cd0f73750f0ca526a0b848f3daa48ae6eca5 (commit)
via aeb138679a8f97f6c4716ccd91fac3adbe7bb4d1 (commit)
via 0d3a88e55fc443640ed3c57c9fc906b1ed8a33b8 (commit)
via 1c00e094274b8571ea326311ff0425ba2dac0fd0 (commit)
via d633fa0433de093c9a1257aed519b806b1054f21 (commit)
via 2cfef7031ca4620e4744534527fe1674963bfdda (commit)
via 503afa40f7d6227ec25d42d40275f810940b0959 (commit)
from 00ef656a85f245a400b0cd83a0732c892703a7ae (commit)
- Log -----------------------------------------------------------------
commit 5bf57a925ca57ba94538f64a22c7d14234794c7d
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Nov 16 12:10:22 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:13 2025 +0100
avutil/x86/asm: Remove wrong comment, rename FF_REG_sp
Before FFmpeg commit 531b0a316b24f00965cd8a88efdbea2c6d63147f,
FFmpeg used REG_SP as macro for the stack pointer, yet this
clashed with a REG_SP define in Solaris system headers, so it
was changed to REG_sp and a comment was added for this.
Libav fixed it by adding an FF_ prefix to the macros in
1e9c5bf4c136fe9e010cc8a7e7270bba0d1bf45e. FFmpeg switched
to using these prefixes in 9eb3da2f9942cf1b1148d242bccfc383f666feb6,
using FF_REG_sp instead of Libav's FF_REG_SP. In said commit
the comment was changed to claim that Solaris system headers
define FF_REG_SP, but this is (most likely) wrong.
This commit removes the wrong comment and renames the (actually unused)
macro to FF_REG_SP to make it consistent with FF_REG_BP.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavutil/x86/asm.h b/libavutil/x86/asm.h
index 9bff42d628..f06ea25035 100644
--- a/libavutil/x86/asm.h
+++ b/libavutil/x86/asm.h
@@ -38,8 +38,7 @@ typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg;
# define FF_PTR_SIZE "8"
typedef int64_t x86_reg;
-/* FF_REG_SP is defined in Solaris sys headers, so use FF_REG_sp */
-# define FF_REG_sp "rsp"
+# define FF_REG_SP "rsp"
# define FF_REG_BP "rbp"
# define FF_REGBP rbp
# define FF_REGa rax
@@ -60,7 +59,7 @@ typedef int64_t x86_reg;
# define FF_PTR_SIZE "4"
typedef int32_t x86_reg;
-# define FF_REG_sp "esp"
+# define FF_REG_SP "esp"
# define FF_REG_BP "ebp"
# define FF_REGBP ebp
# define FF_REGa eax
commit 99209c287687705fe1eee775cb4f7f1d0aa94a1e
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Nov 16 11:10:07 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:13 2025 +0100
avcodec/x86/mpegvideoenc_template: Reduce number of registers used
qmat and bias always have a constant offset, so one can use one register
to address both of them. This allows to remove the check for HAVE_6REGS
(untested on a system where HAVE_6REGS is false).
Also avoid FF_REG_a while at it.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index c667dcd2a2..24dd049200 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -39,8 +39,6 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
36, 37, 49, 50, 58, 59, 63, 64,
};
-#if HAVE_6REGS
-
#if HAVE_SSE2_INLINE
#define COMPILE_TEMPLATE_SSSE3 0
#define RENAME(a) a ## _sse2
@@ -55,8 +53,6 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSSE3_INLINE */
-#endif /* HAVE_6REGS */
-
av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
{
const int dct_algo = s->c.avctx->dct_algo;
@@ -65,11 +61,9 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
#if HAVE_SSE2_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_SSE2(cpu_flags)) {
-#if HAVE_6REGS
s->dct_quantize = dct_quantize_sse2;
-#endif
}
-#if HAVE_6REGS && HAVE_SSSE3_INLINE
+#if HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags))
s->dct_quantize = dct_quantize_ssse3;
#endif
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index b5417f6d32..e6ce791347 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -70,7 +70,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
{
x86_reg last_non_zero_p1;
int level=0, q; //=0 is because gcc says uninitialized ...
- const uint16_t *qmat, *bias;
+ const uint16_t *qmat;
LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
//s->fdct (block);
@@ -86,11 +86,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
int dummy;
if (n < 4){
q = s->c.y_dc_scale;
- bias = s->q_intra_matrix16[qscale][1];
qmat = s->q_intra_matrix16[qscale][0];
}else{
q = s->c.c_dc_scale;
- bias = s->q_chroma_intra_matrix16[qscale][1];
qmat = s->q_chroma_intra_matrix16[qscale][0];
}
/* note: block[0] is assumed to be positive */
@@ -109,7 +107,6 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
last_non_zero_p1 = 1;
} else {
last_non_zero_p1 = 0;
- bias = s->q_inter_matrix16[qscale][1];
qmat = s->q_inter_matrix16[qscale][0];
}
@@ -121,7 +118,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
"pxor %%xmm4, %%xmm4 \n\t" // 0
"movdqa (%2), %%xmm5 \n\t" // qmat[0]
"pxor %%xmm6, %%xmm6 \n\t"
- "psubw (%3), %%xmm6 \n\t" // -bias[0]
+ "psubw 128(%2), %%xmm6 \n\t" // -bias[0]
"mov $-128, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
@@ -131,9 +128,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
"pmulhw %%xmm5, %%xmm0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
"por %%xmm0, %%xmm4 \n\t"
RESTORE_SIGN("%%xmm1", "%%xmm0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
- "movdqa %%xmm0, (%5, %%"FF_REG_a") \n\t"
+ "movdqa %%xmm0, (%4, %0) \n\t"
"pcmpeqw %%xmm2, %%xmm0 \n\t" // out==0 ? 0xFF : 0x00
- "movdqa (%4, %%"FF_REG_a"), %%xmm1 \n\t"
+ "movdqa (%3, %0), %%xmm1 \n\t"
"movdqa %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
"pandn %%xmm1, %%xmm0 \n\t"
"pmaxsw %%xmm0, %%xmm3 \n\t"
@@ -143,7 +140,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
"movd %%xmm3, %%"FF_REG_a" \n\t"
"movzbl %%al, %%eax \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
- : "r" (block+64), "r" (qmat), "r" (bias),
+ : "r" (block+64), "r" (qmat),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6")
@@ -159,15 +156,15 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
"1: \n\t"
"movdqa (%1, %%"FF_REG_a"), %%xmm0 \n\t" // block[i]
SAVE_SIGN("%%xmm1", "%%xmm0") // ABS(block[i])
- "movdqa (%3, %%"FF_REG_a"), %%xmm6 \n\t" // bias[0]
+ "movdqa 128(%2, %0), %%xmm6 \n\t" // bias[i]
"paddusw %%xmm6, %%xmm0 \n\t" // ABS(block[i]) + bias[0]
"movdqa (%2, %%"FF_REG_a"), %%xmm5 \n\t" // qmat[i]
"pmulhw %%xmm5, %%xmm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
"por %%xmm0, %%xmm4 \n\t"
RESTORE_SIGN("%%xmm1", "%%xmm0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
- "movdqa %%xmm0, (%5, %%"FF_REG_a") \n\t"
+ "movdqa %%xmm0, (%4, %0) \n\t"
"pcmpeqw %%xmm2, %%xmm0 \n\t" // out==0 ? 0xFF : 0x00
- "movdqa (%4, %%"FF_REG_a"), %%xmm1 \n\t"
+ "movdqa (%3, %0), %%xmm1 \n\t"
"movdqa %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
"pandn %%xmm1, %%xmm0 \n\t"
"pmaxsw %%xmm0, %%xmm3 \n\t"
@@ -177,7 +174,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
"movd %%xmm3, %%"FF_REG_a" \n\t"
"movzbl %%al, %%eax \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
- : "r" (block+64), "r" (qmat+64), "r" (bias+64),
+ : "r" (block+64), "r" (qmat+64),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6")
commit b890cd0f73750f0ca526a0b848f3daa48ae6eca5
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 15 19:56:23 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:13 2025 +0100
avcodec/x86/mpegvideoenc_template: Avoid touching nonvolatile register
xmm7 is nonvolatile on x64 Windows.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index 14e993de2b..b5417f6d32 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -117,7 +117,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
__asm__ volatile(
"movd %%"FF_REG_a", %%xmm3 \n\t" // last_non_zero_p1
SPREADW("%%xmm3")
- "pxor %%xmm7, %%xmm7 \n\t" // 0
+ "pxor %%xmm2, %%xmm2 \n\t" // 0
"pxor %%xmm4, %%xmm4 \n\t" // 0
"movdqa (%2), %%xmm5 \n\t" // qmat[0]
"pxor %%xmm6, %%xmm6 \n\t"
@@ -132,9 +132,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
"por %%xmm0, %%xmm4 \n\t"
RESTORE_SIGN("%%xmm1", "%%xmm0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
"movdqa %%xmm0, (%5, %%"FF_REG_a") \n\t"
- "pcmpeqw %%xmm7, %%xmm0 \n\t" // out==0 ? 0xFF : 0x00
+ "pcmpeqw %%xmm2, %%xmm0 \n\t" // out==0 ? 0xFF : 0x00
"movdqa (%4, %%"FF_REG_a"), %%xmm1 \n\t"
- "movdqa %%xmm7, (%1, %%"FF_REG_a") \n\t" // 0
+ "movdqa %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
"pandn %%xmm1, %%xmm0 \n\t"
"pmaxsw %%xmm0, %%xmm3 \n\t"
"add $16, %%"FF_REG_a" \n\t"
@@ -146,13 +146,13 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
: "r" (block+64), "r" (qmat), "r" (bias),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+ "%xmm4", "%xmm5", "%xmm6")
);
}else{ // FMT_H263
__asm__ volatile(
"movd %%"FF_REG_a", %%xmm3 \n\t" // last_non_zero_p1
SPREADW("%%xmm3")
- "pxor %%xmm7, %%xmm7 \n\t" // 0
+ "pxor %%xmm2, %%xmm2 \n\t" // 0
"pxor %%xmm4, %%xmm4 \n\t" // 0
"mov $-128, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
@@ -166,9 +166,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
"por %%xmm0, %%xmm4 \n\t"
RESTORE_SIGN("%%xmm1", "%%xmm0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
"movdqa %%xmm0, (%5, %%"FF_REG_a") \n\t"
- "pcmpeqw %%xmm7, %%xmm0 \n\t" // out==0 ? 0xFF : 0x00
+ "pcmpeqw %%xmm2, %%xmm0 \n\t" // out==0 ? 0xFF : 0x00
"movdqa (%4, %%"FF_REG_a"), %%xmm1 \n\t"
- "movdqa %%xmm7, (%1, %%"FF_REG_a") \n\t" // 0
+ "movdqa %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
"pandn %%xmm1, %%xmm0 \n\t"
"pmaxsw %%xmm0, %%xmm3 \n\t"
"add $16, %%"FF_REG_a" \n\t"
@@ -180,7 +180,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
: "r" (block+64), "r" (qmat+64), "r" (bias+64),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+ "%xmm4", "%xmm5", "%xmm6")
);
}
__asm__ volatile(
commit aeb138679a8f97f6c4716ccd91fac3adbe7bb4d1
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 15 19:44:02 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:12 2025 +0100
avcodec/x86/mpegvideoencdsp: Port add_8x8basis_ssse3() to ASM
Both GCC and Clang completely unroll the unlikely loop at -O3,
leading to codesize bloat; their code is also suboptimal, as they
don't make use of pmulhrsw (even with -mssse3). This commit
therefore ports the whole function to external assembly. The new
function occupies 176B here vs 1406B for GCC.
Benchmarks for a testcase with huge qscale (notice that the C version
is unrolled just like the unlikely loop in the SSSE3 version):
add_8x8basis_c: 43.4 ( 1.00x)
add_8x8basis_ssse3 (old): 43.6 ( 1.00x)
add_8x8basis_ssse3 (new): 11.9 ( 3.63x)
Reviewed-by: Kieran Kunhya <[email protected]>
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index 0e86a5304c..300f98b438 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -25,6 +25,58 @@
SECTION .text
+; void ff_add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale)
+INIT_XMM ssse3
+cglobal add_8x8basis, 3, 3+ARCH_X86_64, 4, rem, basis, scale
+ movd m0, scaled
+ add scaled, 1024
+ add basisq, 128
+ add remq, 128
+%if ARCH_X86_64
+%define OFF r3q
+ mov r3q, -128
+ cmp scaled, 2047
+%else
+%define OFF r2q
+ cmp scaled, 2047
+ mov r2q, -128
+%endif
+ ja .huge_scale
+
+ punpcklwd m0, m0
+ pshufd m0, m0, 0x0
+ psllw m0, 5
+.loop1:
+ mova m1, [basisq+OFF]
+ mova m2, [basisq+OFF+16]
+ pmulhrsw m1, m0
+ pmulhrsw m2, m0
+ paddw m1, [remq+OFF]
+ paddw m2, [remq+OFF+16]
+ mova [remq+OFF], m1
+ mova [remq+OFF+16], m2
+ add OFF, 32
+ js .loop1
+ RET
+
+.huge_scale:
+ pslld m0, 6
+ punpcklwd m0, m0
+ pshufd m1, m0, 0x55
+ psrlw m0, 1
+ pshufd m0, m0, 0x0
+.loop2:
+ mova m2, [basisq+OFF]
+ pmulhrsw m3, m2, m0
+ pmullw m2, m1
+ paddw m2, m3
+ paddw m2, [remq+OFF]
+ mova [remq+OFF], m2
+ add OFF, 16
+ js .loop2
+ RET
+
+
INIT_XMM sse2
cglobal mpv_denoise_dct, 3, 4, 7, block, sum, offset
pxor m6, m6
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index f6169b5399..220c75785a 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -32,6 +32,7 @@ void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
int ff_pix_sum16_sse2(const uint8_t *pix, ptrdiff_t line_size);
int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size);
int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
+void ff_add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale);
#if HAVE_INLINE_ASM
#if HAVE_SSSE3_INLINE
@@ -83,41 +84,6 @@ static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], c
);
return i;
}
-
-static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale)
-{
- x86_reg i=0;
-
- if (FFABS(scale) < 1024) {
- scale *= 1 << (16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT);
- __asm__ volatile(
- "movd %3, %%xmm2 \n\t"
- "punpcklwd %%xmm2, %%xmm2 \n\t"
- "pshufd $0, %%xmm2, %%xmm2 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movdqa (%1, %0), %%xmm0 \n\t"
- "movdqa 16(%1, %0), %%xmm1 \n\t"
- "pmulhrsw %%xmm2, %%xmm0 \n\t"
- "pmulhrsw %%xmm2, %%xmm1 \n\t"
- "paddw (%2, %0), %%xmm0 \n\t"
- "paddw 16(%2, %0), %%xmm1 \n\t"
- "movdqa %%xmm0, (%2, %0) \n\t"
- "movdqa %%xmm1, 16(%2, %0) \n\t"
- "add $32, %0 \n\t"
- "cmp $128, %0 \n\t" // FIXME optimize & bench
- " jb 1b \n\t"
- : "+r" (i)
- : "r"(basis), "r"(rem), "g"(scale)
- XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2")
- );
- } else {
- for (i=0; i<8*8; i++) {
- rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
- }
- }
-}
-
#endif /* HAVE_SSSE3_INLINE */
/* Draw the edges of width 'w' of an image of size width, height */
@@ -227,15 +193,17 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
c->draw_edges = draw_edges_mmx;
}
}
+#endif /* HAVE_INLINE_ASM */
+ if (X86_SSSE3(cpu_flags)) {
#if HAVE_SSSE3_INLINE
- if (INLINE_SSSE3(cpu_flags)) {
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->try_8x8basis = try_8x8basis_ssse3;
}
- c->add_8x8basis = add_8x8basis_ssse3;
- }
#endif /* HAVE_SSSE3_INLINE */
+#if HAVE_SSSE3_EXTERNAL
+ c->add_8x8basis = ff_add_8x8basis_ssse3;
+#endif
+ }
-#endif /* HAVE_INLINE_ASM */
}
commit 0d3a88e55fc443640ed3c57c9fc906b1ed8a33b8
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 15 19:06:14 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:12 2025 +0100
tests/checkasm/mpegvideoencdsp: Test denoise_dct
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/tests/checkasm/mpegvideoencdsp.c b/tests/checkasm/mpegvideoencdsp.c
index a4a4fa6f5c..955cd9f5b7 100644
--- a/tests/checkasm/mpegvideoencdsp.c
+++ b/tests/checkasm/mpegvideoencdsp.c
@@ -37,6 +37,37 @@
buf[j] = rnd() % (max - min + 1) + min; \
} while (0)
+static void check_denoise_dct(MpegvideoEncDSPContext *c)
+{
+ declare_func(void, int16_t block[64], int dct_error_sum[64],
+ const uint16_t dct_offset[64]);
+
+ if (check_func(c->denoise_dct, "denoise_dct")) {
+ DECLARE_ALIGNED(16, int16_t, block_ref)[64];
+ DECLARE_ALIGNED(16, int16_t, block_new)[64];
+ DECLARE_ALIGNED(16, int, dct_error_sum_ref)[64];
+ DECLARE_ALIGNED(16, int, dct_error_sum_new)[64];
+ DECLARE_ALIGNED(16, uint16_t, dct_offset)[64];
+
+ for (size_t i = 0; i < FF_ARRAY_ELEMS(block_ref); ++i) {
+ unsigned random = rnd();
+ block_ref[i] = random & (1 << 16) ? random : 0;
+ }
+ randomize_buffers(dct_offset, sizeof(dct_offset));
+ randomize_buffer_clipped(dct_error_sum_ref, 0, (1 << 24) - 1);
+ memcpy(block_new, block_ref, sizeof(block_new));
+ memcpy(dct_error_sum_new, dct_error_sum_ref, sizeof(dct_error_sum_ref));
+
+ call_ref(block_ref, dct_error_sum_ref, dct_offset);
+ call_new(block_new, dct_error_sum_new, dct_offset);
+ if (memcmp(block_ref, block_new, sizeof(block_ref)) ||
+ memcmp(dct_error_sum_new, dct_error_sum_ref, sizeof(dct_error_sum_new)))
+ fail();
+
+ bench_new(block_new, dct_error_sum_new, dct_offset);
+ }
+}
+
static void check_add_8x8basis(MpegvideoEncDSPContext *c)
{
declare_func(void, int16_t rem[64], const int16_t basis[64], int scale);
@@ -166,6 +197,8 @@ void checkasm_check_mpegvideoencdsp(void)
ff_mpegvideoencdsp_init(&c, &avctx);
+ check_denoise_dct(&c);
+ report("denoise_dct");
check_pix_sum(&c);
report("pix_sum");
check_pix_norm1(&c);
commit 1c00e094274b8571ea326311ff0425ba2dac0fd0
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 15 18:24:18 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:12 2025 +0100
avcodec/mpegvideo_enc: Port denoise_dct to MpegvideoEncDSPContext
It is very simple to remove the MPVEncContext from it.
Notice that this also fixes a bug in x86/mpegvideoenc.c: It only
used the SSE2 version of denoise_dct when dct_algo was auto or mmx
(and it was therefore unused during FATE).
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 4bbc2f00ea..1d777293d0 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -54,7 +54,6 @@ OBJS-$(CONFIG_BLOCKDSP) += mips/blockdsp_init_mips.o
OBJS-$(CONFIG_PIXBLOCKDSP) += mips/pixblockdsp_init_mips.o
OBJS-$(CONFIG_IDCTDSP) += mips/idctdsp_init_mips.o
OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_init_mips.o
-OBJS-$(CONFIG_MPEGVIDEOENC) += mips/mpegvideoenc_init_mips.o
OBJS-$(CONFIG_MPEGVIDEOENCDSP) += mips/mpegvideoencdsp_init_mips.o
OBJS-$(CONFIG_ME_CMP) += mips/me_cmp_init_mips.o
OBJS-$(CONFIG_MPEG4_DECODER) += mips/xvididct_init_mips.o
@@ -100,7 +99,7 @@ MMI-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o
MMI-OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_mmi.o
MMI-OBJS-$(CONFIG_H264PRED) += mips/h264pred_mmi.o
MMI-OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_mmi.o
-MMI-OBJS-$(CONFIG_MPEGVIDEOENC) += mips/mpegvideoenc_mmi.o
+MMI-OBJS-$(CONFIG_MPEGVIDEOENCDSP) += mips/mpegvideoenc_mmi.o
MMI-OBJS-$(CONFIG_IDCTDSP) += mips/idctdsp_mmi.o \
mips/simple_idct_mmi.o
MMI-OBJS-$(CONFIG_MPEG4_DECODER) += mips/xvid_idct_mmi.o
diff --git a/libavcodec/mips/mpegvideo_mips.h b/libavcodec/mips/mpegvideo_mips.h
index 72ffed6985..2a9ea4006e 100644
--- a/libavcodec/mips/mpegvideo_mips.h
+++ b/libavcodec/mips/mpegvideo_mips.h
@@ -22,7 +22,6 @@
#define AVCODEC_MIPS_MPEGVIDEO_MIPS_H
#include "libavcodec/mpegvideo.h"
-#include "libavcodec/mpegvideoenc.h"
void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
int n, int qscale);
@@ -34,6 +33,6 @@ void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
int n, int qscale);
void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
int n, int qscale);
-void ff_denoise_dct_mmi(MPVEncContext *s, int16_t *block);
+void ff_denoise_dct_mmi(int16_t block[64], int sum[64], const uint16_t offset[64]);
#endif /* AVCODEC_MIPS_MPEGVIDEO_MIPS_H */
diff --git a/libavcodec/mips/mpegvideoenc_init_mips.c b/libavcodec/mips/mpegvideoenc_init_mips.c
deleted file mode 100644
index 7831973eb8..0000000000
--- a/libavcodec/mips/mpegvideoenc_init_mips.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2015 Manojkumar Bhosale ([email protected])
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/mips/cpu.h"
-#include "libavcodec/mpegvideoenc.h"
-#include "mpegvideo_mips.h"
-
-av_cold void ff_mpvenc_dct_init_mips(MPVEncContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_mmi(cpu_flags)) {
- s->denoise_dct = ff_denoise_dct_mmi;
- }
-}
diff --git a/libavcodec/mips/mpegvideoencdsp_init_mips.c b/libavcodec/mips/mpegvideoencdsp_init_mips.c
index 24a17b91db..df916282a2 100644
--- a/libavcodec/mips/mpegvideoencdsp_init_mips.c
+++ b/libavcodec/mips/mpegvideoencdsp_init_mips.c
@@ -23,12 +23,17 @@
#include "libavcodec/bit_depth_template.c"
#include "libavcodec/mpegvideoencdsp.h"
#include "h263dsp_mips.h"
+#include "mpegvideo_mips.h"
av_cold void ff_mpegvideoencdsp_init_mips(MpegvideoEncDSPContext *c,
AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
+ if (have_mmi(cpu_flags)) {
+ c->denoise_dct = ff_denoise_dct_mmi;
+ }
+
if (have_msa(cpu_flags)) {
#if BIT_DEPTH == 8
c->pix_sum = ff_pix_sum_msa;
diff --git a/libavcodec/mips/mpegvideoenc_mmi.c b/libavcodec/mips/mpegvideoencdsp_mmi.c
similarity index 95%
rename from libavcodec/mips/mpegvideoenc_mmi.c
rename to libavcodec/mips/mpegvideoencdsp_mmi.c
index 085be3b0ec..2239a05978 100644
--- a/libavcodec/mips/mpegvideoenc_mmi.c
+++ b/libavcodec/mips/mpegvideoencdsp_mmi.c
@@ -25,17 +25,12 @@
#include "mpegvideo_mips.h"
#include "libavutil/mips/mmiutils.h"
-void ff_denoise_dct_mmi(MPVEncContext *s, int16_t *block)
+void ff_denoise_dct_mmi(int16_t block[64], int sum[64], const uint16_t offset[64])
{
- const int intra = s->c.mb_intra;
- int *sum = s->dct_error_sum[intra];
- uint16_t *offset = s->dct_offset[intra];
double ftmp[8];
mips_reg addr[1];
DECLARE_VAR_ALL64;
- s->dct_count[intra]++;
-
__asm__ volatile(
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"1: \n\t"
diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index ce0ee4bb68..9e83026b51 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -86,7 +86,6 @@
static int encode_picture(MPVMainEncContext *const s, const AVPacket *pkt);
static int dct_quantize_refine(MPVEncContext *const s, int16_t *block, int16_t *weight, int16_t *orig, int n, int qscale);
static int sse_mb(MPVEncContext *const s);
-static void denoise_dct_c(MPVEncContext *const s, int16_t *block);
static int dct_quantize_c(MPVEncContext *const s,
int16_t *block, int n,
int qscale, int *overflow);
@@ -300,11 +299,8 @@ static av_cold void mpv_encode_defaults(MPVMainEncContext *const m)
av_cold void ff_dct_encode_init(MPVEncContext *const s)
{
s->dct_quantize = dct_quantize_c;
- s->denoise_dct = denoise_dct_c;
-#if ARCH_MIPS
- ff_mpvenc_dct_init_mips(s);
-#elif ARCH_X86
+#if ARCH_X86
ff_dct_encode_init_x86(s);
#endif
@@ -3955,29 +3951,14 @@ static int encode_picture(MPVMainEncContext *const m, const AVPacket *pkt)
return 0;
}
-static void denoise_dct_c(MPVEncContext *const s, int16_t *block)
+static inline void denoise_dct(MPVEncContext *const s, int16_t block[])
{
- const int intra = s->c.mb_intra;
- int i;
+ if (!s->dct_error_sum)
+ return;
+ const int intra = s->c.mb_intra;
s->dct_count[intra]++;
-
- for(i=0; i<64; i++){
- int level= block[i];
-
- if(level){
- if(level>0){
- s->dct_error_sum[intra][i] += level;
- level -= s->dct_offset[intra][i];
- if(level<0) level=0;
- }else{
- s->dct_error_sum[intra][i] -= level;
- level += s->dct_offset[intra][i];
- if(level>0) level=0;
- }
- block[i]= level;
- }
- }
+ s->mpvencdsp.denoise_dct(block, s->dct_error_sum[intra], s->dct_offset[intra]);
}
static int dct_quantize_trellis_c(MPVEncContext *const s,
@@ -4009,8 +3990,8 @@ static int dct_quantize_trellis_c(MPVEncContext *const s,
s->fdsp.fdct(block);
- if(s->dct_error_sum)
- s->denoise_dct(s, block);
+ denoise_dct(s, block);
+
qmul= qscale*16;
qadd= ((qscale-1)|1)*8;
@@ -4678,8 +4659,7 @@ static int dct_quantize_c(MPVEncContext *const s,
s->fdsp.fdct(block);
- if(s->dct_error_sum)
- s->denoise_dct(s, block);
+ denoise_dct(s, block);
if (s->c.mb_intra) {
scantable = s->c.intra_scantable.scantable;
diff --git a/libavcodec/mpegvideoenc.h b/libavcodec/mpegvideoenc.h
index ee115c3611..131908c10a 100644
--- a/libavcodec/mpegvideoenc.h
+++ b/libavcodec/mpegvideoenc.h
@@ -123,7 +123,6 @@ typedef struct MPVEncContext {
uint16_t (*q_inter_matrix16)[2][64];
/* noise reduction */
- void (*denoise_dct)(struct MPVEncContext *s, int16_t *block);
int (*dct_error_sum)[64];
int dct_count[2];
uint16_t (*dct_offset)[64];
@@ -397,7 +396,6 @@ int ff_mpv_reallocate_putbitbuffer(MPVEncContext *s, size_t threshold, size_t si
void ff_write_quant_matrix(PutBitContext *pb, uint16_t *matrix);
void ff_dct_encode_init(MPVEncContext *s);
-void ff_mpvenc_dct_init_mips(MPVEncContext *s);
void ff_dct_encode_init_x86(MPVEncContext *s);
void ff_convert_matrix(MPVEncContext *s, int (*qmat)[64], uint16_t (*qmat16)[2][64],
diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c
index b4fd2af915..3b4a57d58a 100644
--- a/libavcodec/mpegvideoencdsp.c
+++ b/libavcodec/mpegvideoencdsp.c
@@ -28,6 +28,29 @@
#include "mathops.h"
#include "mpegvideoencdsp.h"
+static void denoise_dct_c(int16_t block[64], int dct_error_sum[64],
+ const uint16_t dct_offset[64])
+{
+ for (int i = 0; i < 64; ++i) {
+ int level = block[i];
+
+ if (level) {
+ if (level > 0) {
+ dct_error_sum[i] += level;
+ level -= dct_offset[i];
+ if (level < 0)
+ level = 0;
+ } else {
+ dct_error_sum[i] -= level;
+ level += dct_offset[i];
+ if (level > 0)
+ level = 0;
+ }
+ block[i] = level;
+ }
+ }
+}
+
static int try_8x8basis_c(const int16_t rem[64], const int16_t weight[64],
const int16_t basis[64], int scale)
{
@@ -253,6 +276,8 @@ static void shrink88(uint8_t *dst, ptrdiff_t dst_wrap,
av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
AVCodecContext *avctx)
{
+ c->denoise_dct = denoise_dct_c;
+
c->try_8x8basis = try_8x8basis_c;
c->add_8x8basis = add_8x8basis_c;
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h
index 6ec665677b..989503f25f 100644
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/mpegvideoencdsp.h
@@ -30,6 +30,9 @@
#define EDGE_BOTTOM 2
typedef struct MpegvideoEncDSPContext {
+ void (*denoise_dct)(int16_t block[64], int dct_error_sum[64],
+ const uint16_t dct_offset[64]);
+
int (*try_8x8basis)(const int16_t rem[64], const int16_t weight[64],
const int16_t basis[64], int scale);
void (*add_8x8basis)(int16_t rem[64], const int16_t basis[64], int scale);
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index e5665ac781..c667dcd2a2 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -57,22 +57,6 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
#endif /* HAVE_6REGS */
-#if HAVE_SSE2_EXTERNAL
-void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
- const uint16_t dct_offset[64]);
-
-static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
-{
- const int intra = s->c.mb_intra;
- int *sum= s->dct_error_sum[intra];
- uint16_t *offset= s->dct_offset[intra];
-
- s->dct_count[intra]++;
-
- ff_mpv_denoise_dct_sse2(block, sum, offset);
-}
-#endif /* HAVE_SSE2_EXTERNAL */
-
av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
{
const int dct_algo = s->c.avctx->dct_algo;
@@ -83,9 +67,6 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
if (INLINE_SSE2(cpu_flags)) {
#if HAVE_6REGS
s->dct_quantize = dct_quantize_sse2;
-#endif
-#if HAVE_SSE2_EXTERNAL
- s->denoise_dct = denoise_dct_sse2;
#endif
}
#if HAVE_6REGS && HAVE_SSSE3_INLINE
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index f0b95c1621..14e993de2b 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -76,8 +76,11 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
//s->fdct (block);
ff_fdct_sse2(block); // cannot be anything else ...
- if(s->dct_error_sum)
- s->denoise_dct(s, block);
+ if (s->dct_error_sum) {
+ const int intra = s->c.mb_intra;
+ s->dct_count[intra]++;
+ s->mpvencdsp.denoise_dct(block, s->dct_error_sum[intra], s->dct_offset[intra]);
+ }
if (s->c.mb_intra) {
int dummy;
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index bf5b722016..f6169b5399 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -27,6 +27,8 @@
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideoencdsp.h"
+void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
+ const uint16_t dct_offset[64]);
int ff_pix_sum16_sse2(const uint8_t *pix, ptrdiff_t line_size);
int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size);
int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
@@ -209,6 +211,7 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
+ c->denoise_dct = ff_mpv_denoise_dct_sse2;
c->pix_sum = ff_pix_sum16_sse2;
c->pix_norm1 = ff_pix_norm1_sse2;
}
commit d633fa0433de093c9a1257aed519b806b1054f21
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 15 17:32:29 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:12 2025 +0100
avcodec/x86/mpegvideoenc: Port denoise_dct_sse2 to external assembly
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index 2ca05f69ea..e5665ac781 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -57,8 +57,10 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
#endif /* HAVE_6REGS */
-#if HAVE_INLINE_ASM
-#if HAVE_SSE2_INLINE
+#if HAVE_SSE2_EXTERNAL
+void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
+ const uint16_t dct_offset[64]);
+
static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
{
const int intra = s->c.mb_intra;
@@ -67,56 +69,9 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
s->dct_count[intra]++;
- __asm__ volatile(
- "pxor %%xmm6, %%xmm6 \n\t"
- "1: \n\t"
- "pxor %%xmm0, %%xmm0 \n\t"
- "pxor %%xmm1, %%xmm1 \n\t"
- "movdqa (%0), %%xmm2 \n\t"
- "movdqa 16(%0), %%xmm3 \n\t"
- "pcmpgtw %%xmm2, %%xmm0 \n\t"
- "pcmpgtw %%xmm3, %%xmm1 \n\t"
- "pxor %%xmm0, %%xmm2 \n\t"
- "pxor %%xmm1, %%xmm3 \n\t"
- "psubw %%xmm0, %%xmm2 \n\t"
- "psubw %%xmm1, %%xmm3 \n\t"
- "movdqa %%xmm2, %%xmm4 \n\t"
- "movdqa %%xmm3, %%xmm5 \n\t"
- "psubusw (%2), %%xmm2 \n\t"
- "psubusw 16(%2), %%xmm3 \n\t"
- "pxor %%xmm0, %%xmm2 \n\t"
- "pxor %%xmm1, %%xmm3 \n\t"
- "psubw %%xmm0, %%xmm2 \n\t"
- "psubw %%xmm1, %%xmm3 \n\t"
- "movdqa %%xmm2, (%0) \n\t"
- "movdqa %%xmm3, 16(%0) \n\t"
- "movdqa %%xmm4, %%xmm2 \n\t"
- "movdqa %%xmm5, %%xmm0 \n\t"
- "punpcklwd %%xmm6, %%xmm4 \n\t"
- "punpckhwd %%xmm6, %%xmm2 \n\t"
- "punpcklwd %%xmm6, %%xmm5 \n\t"
- "punpckhwd %%xmm6, %%xmm0 \n\t"
- "paddd (%1), %%xmm4 \n\t"
- "paddd 16(%1), %%xmm2 \n\t"
- "paddd 32(%1), %%xmm5 \n\t"
- "paddd 48(%1), %%xmm0 \n\t"
- "movdqa %%xmm4, (%1) \n\t"
- "movdqa %%xmm2, 16(%1) \n\t"
- "movdqa %%xmm5, 32(%1) \n\t"
- "movdqa %%xmm0, 48(%1) \n\t"
- "add $32, %0 \n\t"
- "add $64, %1 \n\t"
- "add $32, %2 \n\t"
- "cmp %3, %0 \n\t"
- " jb 1b \n\t"
- : "+r" (block), "+r" (sum), "+r" (offset)
- : "r"(block+64)
- XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6")
- );
+ ff_mpv_denoise_dct_sse2(block, sum, offset);
}
-#endif /* HAVE_SSE2_INLINE */
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_SSE2_EXTERNAL */
av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
{
@@ -129,7 +84,9 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
#if HAVE_6REGS
s->dct_quantize = dct_quantize_sse2;
#endif
+#if HAVE_SSE2_EXTERNAL
s->denoise_dct = denoise_dct_sse2;
+#endif
}
#if HAVE_6REGS && HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags))
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index d12646ae54..0e86a5304c 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -24,6 +24,52 @@
%include "libavutil/x86/x86util.asm"
SECTION .text
+
+INIT_XMM sse2
+cglobal mpv_denoise_dct, 3, 4, 7, block, sum, offset
+ pxor m6, m6
+ lea r3, [sumq+256]
+.loop:
+ mova m2, [blockq]
+ mova m3, [blockq+16]
+ mova m0, m6
+ mova m1, m6
+ pcmpgtw m0, m2
+ pcmpgtw m1, m3
+ pxor m2, m0
+ pxor m3, m1
+ psubw m2, m0
+ psubw m3, m1
+ psubusw m4, m2, [offsetq]
+ psubusw m5, m3, [offsetq+16]
+ pxor m4, m0
+ pxor m5, m1
+ add offsetq, 32
+ psubw m4, m0
+ psubw m5, m1
+ mova [blockq], m4
+ mova [blockq+16], m5
+ mova m0, m2
+ mova m1, m3
+ add blockq, 32
+ punpcklwd m0, m6
+ punpckhwd m2, m6
+ punpcklwd m1, m6
+ punpckhwd m3, m6
+ paddd m0, [sumq]
+ paddd m2, [sumq+16]
+ paddd m1, [sumq+32]
+ paddd m3, [sumq+48]
+ mova [sumq], m0
+ mova [sumq+16], m2
+ mova [sumq+32], m1
+ mova [sumq+48], m3
+ add sumq, 64
+ cmp sumq, r3
+ jb .loop
+ RET
+
+
; int ff_pix_sum16(const uint8_t *pix, ptrdiff_t line_size)
; %1 = number of loops
; %2 = number of GPRs used
commit 2cfef7031ca4620e4744534527fe1674963bfdda
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 15 16:46:18 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:12 2025 +0100
avcodec/x86/mpegvideoenc: Reduce number of registers used
Avoids a push+pop on x64 Windows.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index bb1d2cc319..2ca05f69ea 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -68,7 +68,7 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
s->dct_count[intra]++;
__asm__ volatile(
- "pxor %%xmm7, %%xmm7 \n\t"
+ "pxor %%xmm6, %%xmm6 \n\t"
"1: \n\t"
"pxor %%xmm0, %%xmm0 \n\t"
"pxor %%xmm1, %%xmm1 \n\t"
@@ -90,18 +90,18 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
"psubw %%xmm1, %%xmm3 \n\t"
"movdqa %%xmm2, (%0) \n\t"
"movdqa %%xmm3, 16(%0) \n\t"
- "movdqa %%xmm4, %%xmm6 \n\t"
+ "movdqa %%xmm4, %%xmm2 \n\t"
"movdqa %%xmm5, %%xmm0 \n\t"
- "punpcklwd %%xmm7, %%xmm4 \n\t"
- "punpckhwd %%xmm7, %%xmm6 \n\t"
- "punpcklwd %%xmm7, %%xmm5 \n\t"
- "punpckhwd %%xmm7, %%xmm0 \n\t"
+ "punpcklwd %%xmm6, %%xmm4 \n\t"
+ "punpckhwd %%xmm6, %%xmm2 \n\t"
+ "punpcklwd %%xmm6, %%xmm5 \n\t"
+ "punpckhwd %%xmm6, %%xmm0 \n\t"
"paddd (%1), %%xmm4 \n\t"
- "paddd 16(%1), %%xmm6 \n\t"
+ "paddd 16(%1), %%xmm2 \n\t"
"paddd 32(%1), %%xmm5 \n\t"
"paddd 48(%1), %%xmm0 \n\t"
"movdqa %%xmm4, (%1) \n\t"
- "movdqa %%xmm6, 16(%1) \n\t"
+ "movdqa %%xmm2, 16(%1) \n\t"
"movdqa %%xmm5, 32(%1) \n\t"
"movdqa %%xmm0, 48(%1) \n\t"
"add $32, %0 \n\t"
@@ -112,7 +112,7 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
: "+r" (block), "+r" (sum), "+r" (offset)
: "r"(block+64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+ "%xmm4", "%xmm5", "%xmm6")
);
}
#endif /* HAVE_SSE2_INLINE */
commit 503afa40f7d6227ec25d42d40275f810940b0959
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 15 16:18:16 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:12 2025 +0100
avcodec/x86/mpegvideoenc: Remove check for MMX
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index eac9947590..bb1d2cc319 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -123,16 +123,14 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
const int dct_algo = s->c.avctx->dct_algo;
if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) {
-#if HAVE_MMX_INLINE
- int cpu_flags = av_get_cpu_flags();
#if HAVE_SSE2_INLINE
+ int cpu_flags = av_get_cpu_flags();
if (INLINE_SSE2(cpu_flags)) {
#if HAVE_6REGS
s->dct_quantize = dct_quantize_sse2;
#endif
s->denoise_dct = denoise_dct_sse2;
}
-#endif
#if HAVE_6REGS && HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags))
s->dct_quantize = dct_quantize_ssse3;
-----------------------------------------------------------------------
Summary of changes:
libavcodec/mips/Makefile | 3 +-
libavcodec/mips/mpegvideo_mips.h | 3 +-
libavcodec/mips/mpegvideoenc_init_mips.c | 33 --------
libavcodec/mips/mpegvideoencdsp_init_mips.c | 5 ++
.../{mpegvideoenc_mmi.c => mpegvideoencdsp_mmi.c} | 7 +-
libavcodec/mpegvideo_enc.c | 38 ++-------
libavcodec/mpegvideoenc.h | 2 -
libavcodec/mpegvideoencdsp.c | 25 ++++++
libavcodec/mpegvideoencdsp.h | 3 +
libavcodec/x86/mpegvideoenc.c | 74 +---------------
libavcodec/x86/mpegvideoenc_template.c | 44 +++++-----
libavcodec/x86/mpegvideoencdsp.asm | 98 ++++++++++++++++++++++
libavcodec/x86/mpegvideoencdsp_init.c | 49 +++--------
libavutil/x86/asm.h | 5 +-
tests/checkasm/mpegvideoencdsp.c | 33 ++++++++
15 files changed, 212 insertions(+), 210 deletions(-)
delete mode 100644 libavcodec/mips/mpegvideoenc_init_mips.c
rename libavcodec/mips/{mpegvideoenc_mmi.c => mpegvideoencdsp_mmi.c} (95%)
hooks/post-receive
--
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]