The branch, master has been updated
via c22c2c5e033e3f73df47d88e49df86206f298e46 (commit)
via 6e2153111d5ff3b21a5303b7c23dd29de8a3bda6 (commit)
via 60084b136916a4dcace41e75a3b873e77eebd648 (commit)
via 1cb987d25bf4c8214461e12b01864b23c9bae67c (commit)
via a9a23925dfcf781dedc9cb910dd3097dd6224104 (commit)
via 1fa8ffc1db2b62e475545bc6b117215704f9e1d8 (commit)
via 6d56807a06ce06712c65f8fcbf2a9a444bf59353 (commit)
via 0f7cc6aeeacba070d6d4b76a9f3a4d4036c3bb0b (commit)
via 357fc5243c32300bba91c096488e86558beed4c8 (commit)
via 581050a1755b335cb106ad1b6c8e5f6fa9c19bd0 (commit)
via e7a629049f7e9be397b0acabe75beb207ad9dc21 (commit)
via 5d41d3e21dff14058b283491480a7382daeb5da9 (commit)
via 011ef7fc65fcbf2141adcec9ca805874bb0a6a16 (commit)
via 358c569b05bc6f9a107a5caebcc8da56e8bf9799 (commit)
from f7551e7505d389fcc14a8e16bcd13ab770658990 (commit)
- Log -----------------------------------------------------------------
commit c22c2c5e033e3f73df47d88e49df86206f298e46
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 4 06:48:19 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100
avcodec/mpegvideo: Port dct_unquantize_mpeg2_intra_mmx to SSE2
Benefits from wider registers.
Benchmarks:
dct_unquantize_mpeg2_intra_c: 228.2 ( 1.00x)
dct_unquantize_mpeg2_intra_mmx: 28.2 ( 8.10x)
dct_unquantize_mpeg2_intra_sse2: 18.4 (12.37x)
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 576f8f320f..7c137cf75e 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -28,7 +28,7 @@
#include "libavcodec/mpegvideodata.h"
#include "libavcodec/mpegvideo_unquantize.h"
-#if HAVE_MMX_INLINE
+#if HAVE_SSE2_INLINE
#define SPLATW(reg) "punpcklwd %%" #reg ", %%" #reg "\n\t" \
"pshufd $0, %%" #reg ", %%" #reg "\n\t"
@@ -250,8 +250,8 @@ __asm__ volatile(
#endif /* HAVE_SSSE3_INLINE */
-static void dct_unquantize_mpeg2_intra_mmx(const MPVContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_intra_sse2(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
@@ -271,35 +271,35 @@ static void dct_unquantize_mpeg2_intra_mmx(const MPVContext *s,
quant_matrix = s->intra_matrix;
x86_reg offset = -2 * nCoeffs;
__asm__ volatile(
- "movd %3, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- "movq (%2, %0), %%mm4 \n\t"
- "movq 8(%2, %0), %%mm5 \n\t"
- "pmullw %%mm6, %%mm4 \n\t" //
q=qscale*quant_matrix[i]
- "pmullw %%mm6, %%mm5 \n\t" //
q=qscale*quant_matrix[i]
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm1, %%mm3 \n\t"
- "psrlw $12, %%mm2 \n\t" // block[i] < 0 ? 0xf : 0
- "psrlw $12, %%mm3 \n\t" // (block[i] is in the
-2048..2047 range)
- "pmullw %%mm4, %%mm0 \n\t" // block[i]*q
- "pmullw %%mm5, %%mm1 \n\t" // block[i]*q
- "paddw %%mm2, %%mm0 \n\t" // bias negative block[i]
- "paddw %%mm3, %%mm1 \n\t" // so that a right-shift
- "psraw $4, %%mm0 \n\t" // is equivalent to
divide
- "psraw $4, %%mm1 \n\t" // with rounding towards
zero
- "movq %%mm0, (%1, %0) \n\t"
- "movq %%mm1, 8(%1, %0) \n\t"
-
- "add $16, %0 \n\t"
- "jng 1b \n\t"
+ "movd %3, %%xmm6 \n\t"
+ SPLATW(xmm6)
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movdqa (%1, %0), %%xmm0 \n\t"
+ "movdqa 16(%1, %0), %%xmm1 \n\t"
+ "movdqa (%2, %0), %%xmm4 \n\t"
+ "movdqa 16(%2, %0), %%xmm5 \n\t"
+ "pmullw %%xmm6, %%xmm4 \n\t" //
q=qscale*quant_matrix[i]
+ "pmullw %%xmm6, %%xmm5 \n\t" //
q=qscale*quant_matrix[i]
+ "movdqa %%xmm0, %%xmm2 \n\t"
+ "movdqa %%xmm1, %%xmm3 \n\t"
+ "psrlw $12, %%xmm2 \n\t" // block[i] < 0 ? 0xf : 0
+ "psrlw $12, %%xmm3 \n\t" // (block[i] is in the
-2048..2047 range)
+ "pmullw %%xmm4, %%xmm0 \n\t" // block[i]*q
+ "pmullw %%xmm5, %%xmm1 \n\t" // block[i]*q
+ "paddw %%xmm2, %%xmm0 \n\t" // bias negative block[i]
+ "paddw %%xmm3, %%xmm1 \n\t" // so that a right-shift
+ "psraw $4, %%xmm0 \n\t" // is equivalent to divide
+ "psraw $4, %%xmm1 \n\t" // with rounding towards
zero
+ "movdqa %%xmm0, (%1, %0) \n\t"
+ "movdqa %%xmm1, 16(%1, %0) \n\t"
+
+ "add $32, %0 \n\t"
+ "jng 1b \n\t"
: "+r" (offset)
: "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
- : "memory"
+ : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",)
+ "memory"
);
block[0]= block0;
//Note, we do not do mismatch control for intra as errors cannot accumulate
@@ -371,16 +371,16 @@ __asm__ volatile(
}
#endif /* HAVE_SSSE3_INLINE */
-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_SSE2_INLINE */
av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
{
-#if HAVE_MMX_INLINE
+#if HAVE_SSE2_INLINE
int cpu_flags = av_get_cpu_flags();
- if (INLINE_MMX(cpu_flags)) {
+ if (INLINE_SSE2(cpu_flags)) {
if (!bitexact)
- s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
+ s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_sse2;
}
#if HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags)) {
@@ -391,5 +391,5 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_ssse3;
}
#endif /* HAVE_SSSE3_INLINE */
-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_SSE2_INLINE */
}
diff --git a/tests/checkasm/mpegvideo_unquantize.c b/tests/checkasm/mpegvideo_unquantize.c
index 837606e60e..220a743a96 100644
--- a/tests/checkasm/mpegvideo_unquantize.c
+++ b/tests/checkasm/mpegvideo_unquantize.c
@@ -215,7 +215,7 @@ void checkasm_check_mpegvideo_unquantize(void)
int q_scale_type = rnd() & 1;
ff_mpv_unquantize_init(&unquant_dsp_ctx, 1 /* bitexact */, q_scale_type);
- declare_func_emms(AV_CPU_FLAG_MMX, void, MPVContext *s, int16_t *block, int n, int qscale);
+ declare_func(void, MPVContext *s, int16_t *block, int n, int qscale);
for (size_t i = 0; i < FF_ARRAY_ELEMS(tests); ++i) {
void (*func)(MPVContext *s, int16_t *block, int n, int qscale) =
commit 6e2153111d5ff3b21a5303b7c23dd29de8a3bda6
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 4 06:45:12 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100
avcodec/x86/mpegvideo: Port dct_unquantize_mpeg2_inter_mmx to SSSE3
Benefits from wider registers, pabsw and psignw.
Benchmarks:
dct_unquantize_mpeg2_inter_c: 131.2 ( 1.00x)
dct_unquantize_mpeg2_inter_mmx: 50.2 ( 2.62x)
dct_unquantize_mpeg2_inter_ssse3: 20.5 ( 6.38x)
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 01048df47d..576f8f320f 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -305,8 +305,10 @@ __asm__ volatile(
//Note, we do not do mismatch control for intra as errors cannot accumulate
}
-static void dct_unquantize_mpeg2_inter_mmx(const MPVContext *s,
- int16_t *block, int n, int qscale)
+#if HAVE_SSSE3_INLINE
+
+static void dct_unquantize_mpeg2_inter_ssse3(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
av_assert2(s->block_last_index[n]>=0);
@@ -316,72 +318,59 @@ static void dct_unquantize_mpeg2_inter_mmx(const MPVContext *s,
__asm__ volatile(
- "movd %k1, %%mm6 \n\t"
+ "movd %k1, %%xmm6 \n\t"
"lea (%2, %0), %1 \n\t"
"neg %0 \n\t"
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlq $48, %%mm7 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- "movq (%3, %0), %%mm4 \n\t"
- "movq 8(%3, %0), %%mm5 \n\t"
- "pmullw %%mm6, %%mm4 \n\t" //
q=qscale*quant_matrix[i]
- "pmullw %%mm6, %%mm5 \n\t" //
q=qscale*quant_matrix[i]
- "pxor %%mm2, %%mm2 \n\t"
- "pxor %%mm3, %%mm3 \n\t"
- "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
- "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
- "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
- "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
- "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
- "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
- "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
- "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 +
1)*q
- "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 +
1)*q
- "pxor %%mm4, %%mm4 \n\t"
- "pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%1, %0), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
- "pcmpeqw 8(%1, %0), %%mm5 \n\t" // block[i] == 0 ? -1 : 0
- "psrlw $5, %%mm0 \n\t"
- "psrlw $5, %%mm1 \n\t"
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t"
- "psubw %%mm3, %%mm1 \n\t"
- "pandn %%mm0, %%mm4 \n\t"
- "pandn %%mm1, %%mm5 \n\t"
- "pxor %%mm4, %%mm7 \n\t"
- "pxor %%mm5, %%mm7 \n\t"
- "movq %%mm4, (%1, %0) \n\t"
- "movq %%mm5, 8(%1, %0) \n\t"
-
- "add $16, %0 \n\t"
- "jng 1b \n\t"
- "movd 124(%2), %%mm0 \n\t"
- "movq %%mm7, %%mm6 \n\t"
- "psrlq $32, %%mm7 \n\t"
- "pxor %%mm6, %%mm7 \n\t"
- "movq %%mm7, %%mm6 \n\t"
- "psrlq $16, %%mm7 \n\t"
- "pxor %%mm6, %%mm7 \n\t"
- "pslld $31, %%mm7 \n\t"
- "psrlq $15, %%mm7 \n\t"
- "pxor %%mm7, %%mm0 \n\t"
- "movd %%mm0, 124(%2) \n\t"
+ SPLATW(xmm6)
+ "pcmpeqw %%xmm7, %%xmm7 \n\t"
+ "psrldq $14, %%xmm7 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movdqa (%3, %0), %%xmm4 \n\t"
+ "movdqa 16(%3, %0), %%xmm5 \n\t"
+ "movdqa (%1, %0), %%xmm0 \n\t"
+ "movdqa 16(%1, %0), %%xmm1 \n\t"
+ "pmullw %%xmm6, %%xmm4 \n\t" //
q=qscale*quant_matrix[i]
+ "pmullw %%xmm6, %%xmm5 \n\t" //
q=qscale*quant_matrix[i]
+ "pabsw %%xmm0, %%xmm2 \n\t" // abs(block[i])
+ "pabsw %%xmm1, %%xmm3 \n\t" // abs(block[i])
+ "paddw %%xmm2, %%xmm2 \n\t" // abs(block[i])*2
+ "paddw %%xmm3, %%xmm3 \n\t" // abs(block[i])*2
+ "pmullw %%xmm4, %%xmm2 \n\t" // abs(block[i])*2*q
+ "pmullw %%xmm5, %%xmm3 \n\t" // abs(block[i])*2*q
+ "paddw %%xmm4, %%xmm2 \n\t" // (abs(block[i])*2 + 1)*q
+ "paddw %%xmm5, %%xmm3 \n\t" // (abs(block[i])*2 + 1)*q
+ "psrlw $5, %%xmm2 \n\t"
+ "psrlw $5, %%xmm3 \n\t"
+ "psignw %%xmm0, %%xmm2 \n\t"
+ "psignw %%xmm1, %%xmm3 \n\t"
+ "movdqa %%xmm2, (%1, %0) \n\t"
+ "movdqa %%xmm3, 16(%1, %0) \n\t"
+ "pxor %%xmm2, %%xmm7 \n\t"
+ "pxor %%xmm3, %%xmm7 \n\t"
+
+ "add $32, %0 \n\t"
+ "jng 1b \n\t"
+ "movd 124(%2), %%xmm0 \n\t"
+ "movhlps %%xmm7, %%xmm6 \n\t"
+ "pxor %%xmm6, %%xmm7 \n\t"
+ "pshufd $1, %%xmm7, %%xmm6 \n\t"
+ "pxor %%xmm6, %%xmm7 \n\t"
+ "pshuflw $1, %%xmm7, %%xmm6 \n\t"
+ "pxor %%xmm6, %%xmm7 \n\t"
+ "pslld $31, %%xmm7 \n\t"
+ "psrld $15, %%xmm7 \n\t"
+ "pxor %%xmm7, %%xmm0 \n\t"
+ "movd %%xmm0, 124(%2) \n\t"
: "+r"(offset), "+r" (qscale2)
: "r" (block), "r"(quant_matrix)
- : "memory"
+ : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
+ "memory"
);
}
+#endif /* HAVE_SSSE3_INLINE */
#endif /* HAVE_MMX_INLINE */
av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
@@ -392,7 +381,6 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
if (INLINE_MMX(cpu_flags)) {
if (!bitexact)
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
- s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
}
#if HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags)) {
@@ -400,6 +388,7 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_ssse3;
s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_ssse3;
s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_ssse3;
+ s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_ssse3;
}
#endif /* HAVE_SSSE3_INLINE */
#endif /* HAVE_MMX_INLINE */
commit 60084b136916a4dcace41e75a3b873e77eebd648
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Nov 3 19:45:49 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100
avcodec/x86/mpegvideo: Port MPEG-1 unquantize functions to SSSE3
Benefits from wider registers and pabsw, psignw.
Benchmarks:
dct_unquantize_mpeg1_inter_c: 343.0 ( 1.00x)
dct_unquantize_mpeg1_inter_mmx: 50.6 ( 6.78x)
dct_unquantize_mpeg1_inter_ssse3: 17.2 (19.94x)
dct_unquantize_mpeg1_intra_c: 352.1 ( 1.00x)
dct_unquantize_mpeg1_intra_mmx: 48.8 ( 7.22x)
dct_unquantize_mpeg1_intra_ssse3: 19.5 (18.03x)
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index 758bf57ab9..6aff5fbcd0 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -38,6 +38,8 @@
#include "qpeldsp.h"
#include "videodsp.h"
+#include "libavutil/mem_internal.h"
+
#define MAX_THREADS 32
/**
@@ -202,10 +204,10 @@ typedef struct MpegEncContext {
int *mb_index2xy; ///< mb_index -> mb_x + mb_y*mb_stride
/** matrix transmitted in the bitstream */
- uint16_t intra_matrix[64];
- uint16_t chroma_intra_matrix[64];
- uint16_t inter_matrix[64];
- uint16_t chroma_inter_matrix[64];
+ DECLARE_ALIGNED(16, uint16_t, intra_matrix)[64];
+ DECLARE_ALIGNED(16, uint16_t, chroma_intra_matrix)[64];
+ DECLARE_ALIGNED(16, uint16_t, inter_matrix)[64];
+ DECLARE_ALIGNED(16, uint16_t, chroma_inter_matrix)[64];
/* error concealment / resync */
int resync_mb_x; ///< x position of last resync marker
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 82a29d1bcf..01048df47d 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -138,10 +138,9 @@ __asm__ volatile(
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5",) "memory"
);
}
-#endif
-static void dct_unquantize_mpeg1_intra_mmx(const MPVContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_intra_ssse3(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
@@ -159,59 +158,45 @@ static void dct_unquantize_mpeg1_intra_mmx(const MPVContext *s,
quant_matrix = s->intra_matrix;
x86_reg offset = -2 * nCoeffs;
__asm__ volatile(
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $15, %%mm7 \n\t"
- "movd %3, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- "movq (%2, %0), %%mm4 \n\t"
- "movq 8(%2, %0), %%mm5 \n\t"
- "pmullw %%mm6, %%mm4 \n\t" //
q=qscale*quant_matrix[i]
- "pmullw %%mm6, %%mm5 \n\t" //
q=qscale*quant_matrix[i]
- "pxor %%mm2, %%mm2 \n\t"
- "pxor %%mm3, %%mm3 \n\t"
- "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
- "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
- "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
- "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
- "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
- "pxor %%mm4, %%mm4 \n\t"
- "pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%1, %0), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
- "pcmpeqw 8(%1, %0), %%mm5 \n\t" // block[i] == 0 ? -1 : 0
- "psraw $3, %%mm0 \n\t"
- "psraw $3, %%mm1 \n\t"
- "psubw %%mm7, %%mm0 \n\t"
- "psubw %%mm7, %%mm1 \n\t"
- "por %%mm7, %%mm0 \n\t"
- "por %%mm7, %%mm1 \n\t"
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t"
- "psubw %%mm3, %%mm1 \n\t"
- "pandn %%mm0, %%mm4 \n\t"
- "pandn %%mm1, %%mm5 \n\t"
- "movq %%mm4, (%1, %0) \n\t"
- "movq %%mm5, 8(%1, %0) \n\t"
+ "movd %3, %%xmm6 \n\t"
+ "pcmpeqw %%xmm7, %%xmm7 \n\t"
+ "psrlw $15, %%xmm7 \n\t"
+ SPLATW(xmm6)
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movdqa (%2, %0), %%xmm4 \n\t"
+ "movdqa 16(%2, %0), %%xmm5 \n\t"
+ "movdqa (%1, %0), %%xmm0 \n\t"
+ "movdqa 16(%1, %0), %%xmm1 \n\t"
+ "pmullw %%xmm6, %%xmm4 \n\t" //
q=qscale*quant_matrix[i]
+ "pmullw %%xmm6, %%xmm5 \n\t" //
q=qscale*quant_matrix[i]
+ "pabsw %%xmm0, %%xmm2 \n\t" // abs(block[i])
+ "pabsw %%xmm1, %%xmm3 \n\t" // abs(block[i])
+ "pmullw %%xmm4, %%xmm2 \n\t" // abs(block[i])*q
+ "pmullw %%xmm5, %%xmm3 \n\t" // abs(block[i])*q
+ "psraw $3, %%xmm2 \n\t"
+ "psraw $3, %%xmm3 \n\t"
+ "psubw %%xmm7, %%xmm2 \n\t"
+ "psubw %%xmm7, %%xmm3 \n\t"
+ "por %%xmm7, %%xmm2 \n\t"
+ "por %%xmm7, %%xmm3 \n\t"
+ "psignw %%xmm0, %%xmm2 \n\t"
+ "psignw %%xmm1, %%xmm3 \n\t"
+ "movdqa %%xmm2, (%1, %0) \n\t"
+ "movdqa %%xmm3, 16(%1, %0) \n\t"
- "add $16, %0 \n\t"
- "js 1b \n\t"
+ "add $32, %0 \n\t"
+ "js 1b \n\t"
: "+r" (offset)
: "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
- : "memory"
+ : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
+ "memory"
);
block[0]= block0;
}
-static void dct_unquantize_mpeg1_inter_mmx(const MPVContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_inter_ssse3(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
@@ -223,60 +208,48 @@ static void dct_unquantize_mpeg1_inter_mmx(const MPVContext *s,
quant_matrix = s->inter_matrix;
x86_reg offset = -2 * nCoeffs;
__asm__ volatile(
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $15, %%mm7 \n\t"
- "movd %3, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- "movq (%2, %0), %%mm4 \n\t"
- "movq 8(%2, %0), %%mm5 \n\t"
- "pmullw %%mm6, %%mm4 \n\t" //
q=qscale*quant_matrix[i]
- "pmullw %%mm6, %%mm5 \n\t" //
q=qscale*quant_matrix[i]
- "pxor %%mm2, %%mm2 \n\t"
- "pxor %%mm3, %%mm3 \n\t"
- "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
- "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
- "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
- "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
- "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
- "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
- "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
- "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 +
1)*q
- "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 +
1)*q
- "pxor %%mm4, %%mm4 \n\t"
- "pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%1, %0), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
- "pcmpeqw 8(%1, %0), %%mm5 \n\t" // block[i] == 0 ? -1 : 0
- "psraw $4, %%mm0 \n\t"
- "psraw $4, %%mm1 \n\t"
- "psubw %%mm7, %%mm0 \n\t"
- "psubw %%mm7, %%mm1 \n\t"
- "por %%mm7, %%mm0 \n\t"
- "por %%mm7, %%mm1 \n\t"
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t"
- "psubw %%mm3, %%mm1 \n\t"
- "pandn %%mm0, %%mm4 \n\t"
- "pandn %%mm1, %%mm5 \n\t"
- "movq %%mm4, (%1, %0) \n\t"
- "movq %%mm5, 8(%1, %0) \n\t"
+ "movd %3, %%xmm6 \n\t"
+ "pcmpeqw %%xmm7, %%xmm7 \n\t"
+ "psrlw $15, %%xmm7 \n\t"
+ SPLATW(xmm6)
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movdqa (%2, %0), %%xmm4 \n\t"
+ "movdqa 16(%2, %0), %%xmm5 \n\t"
+ "movdqa (%1, %0), %%xmm0 \n\t"
+ "movdqa 16(%1, %0), %%xmm1 \n\t"
+ "pmullw %%xmm6, %%xmm4 \n\t" //
q=qscale*quant_matrix[i]
+ "pmullw %%xmm6, %%xmm5 \n\t" //
q=qscale*quant_matrix[i]
+ "pabsw %%xmm0, %%xmm2 \n\t" // abs(block[i])
+ "pabsw %%xmm1, %%xmm3 \n\t" // abs(block[i])
+ "paddw %%xmm2, %%xmm2 \n\t" // abs(block[i])*2
+ "paddw %%xmm3, %%xmm3 \n\t" // abs(block[i])*2
+ "paddw %%xmm7, %%xmm2 \n\t" // abs(block[i])*2 + 1
+ "paddw %%xmm7, %%xmm3 \n\t" // abs(block[i])*2 + 1
+ "pmullw %%xmm4, %%xmm2 \n\t" // (abs(block[i])*2 + 1)*q
+ "pmullw %%xmm5, %%xmm3 \n\t" // (abs(block[i])*2 + 1)*q
+ "psraw $4, %%xmm2 \n\t"
+ "psraw $4, %%xmm3 \n\t"
+ "psubw %%xmm7, %%xmm2 \n\t"
+ "psubw %%xmm7, %%xmm3 \n\t"
+ "por %%xmm7, %%xmm2 \n\t"
+ "por %%xmm7, %%xmm3 \n\t"
+ "psignw %%xmm0, %%xmm2 \n\t"
+ "psignw %%xmm1, %%xmm3 \n\t"
+ "movdqa %%xmm2, (%1, %0) \n\t"
+ "movdqa %%xmm3, 16(%1, %0) \n\t"
- "add $16, %0 \n\t"
- "js 1b \n\t"
+ "add $32, %0 \n\t"
+ "js 1b \n\t"
: "+r" (offset)
: "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
- : "memory"
+ : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
+ "memory"
);
}
+#endif /* HAVE_SSSE3_INLINE */
+
static void dct_unquantize_mpeg2_intra_mmx(const MPVContext *s,
int16_t *block, int n, int qscale)
{
@@ -417,8 +390,6 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
- s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
- s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
if (!bitexact)
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
@@ -427,6 +398,8 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
if (INLINE_SSSE3(cpu_flags)) {
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_ssse3;
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_ssse3;
+ s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_ssse3;
+ s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_ssse3;
}
#endif /* HAVE_SSSE3_INLINE */
#endif /* HAVE_MMX_INLINE */
commit 1cb987d25bf4c8214461e12b01864b23c9bae67c
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 4 07:53:09 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100
avcodec/x86/mpegvideo: Port dct_unquantize_h263_{intra,inter}_mmx to SSSE3
It benefits from wider registers and psignw.
Benchmarks:
dct_unquantize_h263_inter_c: 88.3 ( 1.00x)
dct_unquantize_h263_inter_mmx: 24.7 ( 3.58x)
dct_unquantize_h263_inter_ssse3: 9.3 ( 9.47x)
dct_unquantize_h263_intra_c: 93.7 ( 1.00x)
dct_unquantize_h263_intra_mmx: 30.6 ( 3.06x)
dct_unquantize_h263_intra_ssse3: 16.5 ( 5.69x)
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index aa15e2b32a..82a29d1bcf 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -30,8 +30,13 @@
#if HAVE_MMX_INLINE
-static void dct_unquantize_h263_intra_mmx(const MPVContext *s,
- int16_t *block, int n, int qscale)
+#define SPLATW(reg) "punpcklwd %%" #reg ", %%" #reg "\n\t" \
+ "pshufd $0, %%" #reg ", %%" #reg "\n\t"
+
+#if HAVE_SSSE3_INLINE
+
+static void dct_unquantize_h263_intra_ssse3(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
x86_reg qmul = (unsigned)qscale << 1;
int level, qadd;
@@ -51,61 +56,45 @@ static void dct_unquantize_h263_intra_mmx(const MPVContext *s,
x86_reg offset = s->ac_pred ? 63 << 1 :
s->intra_scantable.raster_end[s->block_last_index[n]] << 1;
__asm__ volatile(
- "movd %k1, %%mm6 \n\t" //qmul
- "lea (%2, %0), %1 \n\t"
- "neg %0 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "movd %3, %%mm5 \n\t" //qadd
- "pxor %%mm7, %%mm7 \n\t"
- "packssdw %%mm5, %%mm5 \n\t"
- "packssdw %%mm5, %%mm5 \n\t"
- "psubw %%mm5, %%mm7 \n\t"
- "pxor %%mm4, %%mm4 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
-
- "pmullw %%mm6, %%mm0 \n\t"
- "pmullw %%mm6, %%mm1 \n\t"
+ "movd %k1, %%xmm0 \n\t" //qmul
+ "lea (%2, %0), %1 \n\t"
+ "neg %0 \n\t"
+ "movd %3, %%xmm1 \n\t" //qadd
+ SPLATW(xmm0)
+ SPLATW(xmm1)
- "movq (%1, %0), %%mm2 \n\t"
- "movq 8(%1, %0), %%mm3 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movdqa (%1, %0), %%xmm2 \n\t"
+ "movdqa 16(%1, %0), %%xmm3 \n\t"
- "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
- "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
+ "movdqa %%xmm1, %%xmm4 \n\t"
+ "movdqa %%xmm1, %%xmm5 \n\t"
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
+ "psignw %%xmm2, %%xmm4 \n\t" // sgn(block[i])*qadd
+ "psignw %%xmm3, %%xmm5 \n\t" // sgn(block[i])*qadd
- "paddw %%mm7, %%mm0 \n\t"
- "paddw %%mm7, %%mm1 \n\t"
+ "pmullw %%xmm0, %%xmm2 \n\t"
+ "pmullw %%xmm0, %%xmm3 \n\t"
- "pxor %%mm0, %%mm2 \n\t"
- "pxor %%mm1, %%mm3 \n\t"
+ "paddw %%xmm4, %%xmm2 \n\t"
+ "paddw %%xmm5, %%xmm3 \n\t"
- "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
- "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
+ "movdqa %%xmm2, (%1, %0) \n\t"
+ "movdqa %%xmm3, 16(%1, %0) \n\t"
- "pandn %%mm2, %%mm0 \n\t"
- "pandn %%mm3, %%mm1 \n\t"
-
- "movq %%mm0, (%1, %0) \n\t"
- "movq %%mm1, 8(%1, %0) \n\t"
-
- "add $16, %0 \n\t"
- "jng 1b \n\t"
+ "add $32, %0 \n\t"
+ "jng 1b \n\t"
: "+r"(offset), "+r"(qmul)
: "r" (block), "rm" (qadd)
- : "memory"
+ : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5",) "memory"
);
block[0]= level;
}
-static void dct_unquantize_h263_inter_mmx(const MPVContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_inter_ssse3(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
int qmul = qscale << 1;
int qadd = (qscale - 1) | 1;
@@ -115,56 +104,41 @@ static void dct_unquantize_h263_inter_mmx(const MPVContext *s,
x86_reg offset = s->inter_scantable.raster_end[s->block_last_index[n]] << 1;
__asm__ volatile(
- "movd %2, %%mm6 \n\t" //qmul
- "packssdw %%mm6, %%mm6 \n\t"
- "packssdw %%mm6, %%mm6 \n\t"
- "movd %3, %%mm5 \n\t" //qadd
- "add %1, %0 \n\t"
- "neg %1 \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- "packssdw %%mm5, %%mm5 \n\t"
- "packssdw %%mm5, %%mm5 \n\t"
- "psubw %%mm5, %%mm7 \n\t"
- "pxor %%mm4, %%mm4 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%0, %1), %%mm0 \n\t"
- "movq 8(%0, %1), %%mm1 \n\t"
+ "movd %2, %%xmm0 \n\t" //qmul
+ "movd %3, %%xmm1 \n\t" //qadd
+ "add %1, %0 \n\t"
+ "neg %1 \n\t"
+ SPLATW(xmm0)
+ SPLATW(xmm1)
- "pmullw %%mm6, %%mm0 \n\t"
- "pmullw %%mm6, %%mm1 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movdqa (%0, %1), %%xmm2 \n\t"
+ "movdqa 16(%0, %1), %%xmm3 \n\t"
- "movq (%0, %1), %%mm2 \n\t"
- "movq 8(%0, %1), %%mm3 \n\t"
+ "movdqa %%xmm1, %%xmm4 \n\t"
+ "movdqa %%xmm1, %%xmm5 \n\t"
- "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
- "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
+ "psignw %%xmm2, %%xmm4 \n\t" // sgn(block[i])*qadd
+ "psignw %%xmm3, %%xmm5 \n\t" // sgn(block[i])*qadd
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
-
- "paddw %%mm7, %%mm0 \n\t"
- "paddw %%mm7, %%mm1 \n\t"
-
- "pxor %%mm0, %%mm2 \n\t"
- "pxor %%mm1, %%mm3 \n\t"
-
- "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
- "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
+ "pmullw %%xmm0, %%xmm2 \n\t"
+ "pmullw %%xmm0, %%xmm3 \n\t"
- "pandn %%mm2, %%mm0 \n\t"
- "pandn %%mm3, %%mm1 \n\t"
+ "paddw %%xmm4, %%xmm2 \n\t"
+ "paddw %%xmm5, %%xmm3 \n\t"
- "movq %%mm0, (%0, %1) \n\t"
- "movq %%mm1, 8(%0, %1) \n\t"
+ "movdqa %%xmm2, (%0, %1) \n\t"
+ "movdqa %%xmm3, 16(%0, %1) \n\t"
- "add $16, %1 \n\t"
- "jng 1b \n\t"
+ "add $32, %1 \n\t"
+ "jng 1b \n\t"
: "+r" (block), "+r" (offset)
: "rm"(qmul), "rm" (qadd)
- : "memory"
+ : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5",) "memory"
);
}
+#endif
static void dct_unquantize_mpeg1_intra_mmx(const MPVContext *s,
int16_t *block, int n, int qscale)
@@ -443,13 +417,17 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
- s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
- s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
if (!bitexact)
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
}
+#if HAVE_SSSE3_INLINE
+ if (INLINE_SSSE3(cpu_flags)) {
+ s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_ssse3;
+ s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_ssse3;
+ }
+#endif /* HAVE_SSSE3_INLINE */
#endif /* HAVE_MMX_INLINE */
}
commit a9a23925dfcf781dedc9cb910dd3097dd6224104
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Nov 3 19:17:16 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100
avcodec/x86/mpegvideo: Don't duplicate register
Currently several inline ASM blocks use a value as
an input and rax as a clobber register. The input value
is just moved into the register, which then serves as the loop
counter. This is wasteful, as one can just use the value's
register directly as the loop counter.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index d1614eb1eb..aa15e2b32a 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -183,19 +183,19 @@ static void dct_unquantize_mpeg1_intra_mmx(const MPVContext *s,
block0 = block[0] * s->c_dc_scale;
/* XXX: only MPEG-1 */
quant_matrix = s->intra_matrix;
+ x86_reg offset = -2 * nCoeffs;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
- "movd %2, %%mm6 \n\t"
+ "movd %3, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
- "mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
- "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
- "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
- "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
- "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ "movq (%2, %0), %%mm4 \n\t"
+ "movq 8(%2, %0), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" //
q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" //
q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
@@ -210,8 +210,8 @@ __asm__ volatile(
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1
: 0
- "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1
: 0
+ "pcmpeqw (%1, %0), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw 8(%1, %0), %%mm5 \n\t" // block[i] == 0 ? -1 : 0
"psraw $3, %%mm0 \n\t"
"psraw $3, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t"
@@ -224,13 +224,14 @@ __asm__ volatile(
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
- "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
- "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
+ "movq %%mm4, (%1, %0) \n\t"
+ "movq %%mm5, 8(%1, %0) \n\t"
- "add $16, %%"FF_REG_a" \n\t"
+ "add $16, %0 \n\t"
"js 1b \n\t"
- ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm"
(qscale), "g" (-2*nCoeffs)
- : "%"FF_REG_a, "memory"
+ : "+r" (offset)
+ : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
+ : "memory"
);
block[0]= block0;
}
@@ -246,19 +247,19 @@ static void dct_unquantize_mpeg1_inter_mmx(const MPVContext *s,
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
quant_matrix = s->inter_matrix;
+ x86_reg offset = -2 * nCoeffs;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
- "movd %2, %%mm6 \n\t"
+ "movd %3, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
- "mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
- "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
- "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
- "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
- "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ "movq (%2, %0), %%mm4 \n\t"
+ "movq 8(%2, %0), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" //
q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" //
q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
@@ -277,8 +278,8 @@ __asm__ volatile(
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 +
1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1
: 0
- "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1
: 0
+ "pcmpeqw (%1, %0), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw 8(%1, %0), %%mm5 \n\t" // block[i] == 0 ? -1 : 0
"psraw $4, %%mm0 \n\t"
"psraw $4, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t"
@@ -291,13 +292,14 @@ __asm__ volatile(
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
- "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
- "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
+ "movq %%mm4, (%1, %0) \n\t"
+ "movq %%mm5, 8(%1, %0) \n\t"
- "add $16, %%"FF_REG_a" \n\t"
+ "add $16, %0 \n\t"
"js 1b \n\t"
- ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm"
(qscale), "g" (-2*nCoeffs)
- : "%"FF_REG_a, "memory"
+ : "+r" (offset)
+ : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
+ : "memory"
);
}
@@ -320,17 +322,17 @@ static void dct_unquantize_mpeg2_intra_mmx(const
MPVContext *s,
else
block0 = block[0] * s->c_dc_scale;
quant_matrix = s->intra_matrix;
+ x86_reg offset = -2 * nCoeffs;
__asm__ volatile(
- "movd %2, %%mm6 \n\t"
+ "movd %3, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
- "mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
- "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
- "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
- "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
- "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ "movq (%2, %0), %%mm4 \n\t"
+ "movq 8(%2, %0), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" //
q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" //
q=qscale*quant_matrix[i]
"movq %%mm0, %%mm2 \n\t"
@@ -343,13 +345,14 @@ __asm__ volatile(
"paddw %%mm3, %%mm1 \n\t" // so that a right-shift
"psraw $4, %%mm0 \n\t" // is equivalent to
divide
"psraw $4, %%mm1 \n\t" // with rounding towards
zero
- "movq %%mm0, (%0, %%"FF_REG_a") \n\t"
- "movq %%mm1, 8(%0, %%"FF_REG_a")\n\t"
+ "movq %%mm0, (%1, %0) \n\t"
+ "movq %%mm1, 8(%1, %0) \n\t"
- "add $16, %%"FF_REG_a" \n\t"
+ "add $16, %0 \n\t"
"jng 1b \n\t"
- ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm"
(qscale), "g" (-2*nCoeffs)
- : "%"FF_REG_a, "memory"
+ : "+r" (offset)
+ : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
+ : "memory"
);
block[0]= block0;
//Note, we do not do mismatch control for intra as errors cannot
accumulate
@@ -358,30 +361,27 @@ __asm__ volatile(
static void dct_unquantize_mpeg2_inter_mmx(const MPVContext *s,
int16_t *block, int n, int qscale)
{
- x86_reg nCoeffs;
- const uint16_t *quant_matrix;
-
av_assert2(s->block_last_index[n]>=0);
- if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
- else qscale <<= 1;
+ x86_reg qscale2 = s->q_scale_type ? ff_mpeg2_non_linear_qscale[qscale] :
(unsigned)qscale << 1;
+ x86_reg offset = s->intra_scantable.raster_end[s->block_last_index[n]] <<
1;
+ const void *quant_matrix = (const char*)s->inter_matrix + offset;
- nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
- quant_matrix = s->inter_matrix;
__asm__ volatile(
+ "movd %k1, %%mm6 \n\t"
+ "lea (%2, %0), %1 \n\t"
+ "neg %0 \n\t"
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlq $48, %%mm7 \n\t"
- "movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
- "mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
- "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
- "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
- "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
- "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ "movq (%3, %0), %%mm4 \n\t"
+ "movq 8(%3, %0), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" //
q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" //
q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
@@ -400,8 +400,8 @@ __asm__ volatile(
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 +
1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1
: 0
- "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1
: 0
+ "pcmpeqw (%1, %0), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw 8(%1, %0), %%mm5 \n\t" // block[i] == 0 ? -1 : 0
"psrlw $5, %%mm0 \n\t"
"psrlw $5, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
@@ -412,12 +412,12 @@ __asm__ volatile(
"pandn %%mm1, %%mm5 \n\t"
"pxor %%mm4, %%mm7 \n\t"
"pxor %%mm5, %%mm7 \n\t"
- "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
- "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
+ "movq %%mm4, (%1, %0) \n\t"
+ "movq %%mm5, 8(%1, %0) \n\t"
- "add $16, %%"FF_REG_a" \n\t"
+ "add $16, %0 \n\t"
"jng 1b \n\t"
- "movd 124(%0, %3), %%mm0 \n\t"
+ "movd 124(%2), %%mm0 \n\t"
"movq %%mm7, %%mm6 \n\t"
"psrlq $32, %%mm7 \n\t"
"pxor %%mm6, %%mm7 \n\t"
@@ -427,10 +427,11 @@ __asm__ volatile(
"pslld $31, %%mm7 \n\t"
"psrlq $15, %%mm7 \n\t"
"pxor %%mm7, %%mm0 \n\t"
- "movd %%mm0, 124(%0, %3) \n\t"
+ "movd %%mm0, 124(%2) \n\t"
- ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm"
(qscale), "r" (-2*nCoeffs)
- : "%"FF_REG_a, "memory"
+ : "+r"(offset), "+r" (qscale2)
+ : "r" (block), "r"(quant_matrix)
+ : "memory"
);
}
commit 1fa8ffc1db2b62e475545bc6b117215704f9e1d8
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Oct 7 10:35:08 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100
avcodec/x86/mpegvideo: Improve unquantizing MPEG-2 intra blocks
Unquantizing involves calculating
(block[j] * qscale * quant_matrix[j]) / 16
where / rounds towards zero. Arithmetic right shifts
naturally round towards -inf, so the earlier code
calculated the absolute value first, then used a right-shift
and then negated the result if necessary.
This commit uses a different procedure: It biases the product
for negative values of block[j] by 0xf. The combination of
this and the arithmetic right shift is the same as rounding
towards zero.
Furthermore, a write-only store to mm7 has been removed.
Benchmarks:
dct_unquantize_mpeg2_intra_c: 214.3 ( 1.00x)
dct_unquantize_mpeg2_intra_mmx (old): 43.0 ( 4.98x)
dct_unquantize_mpeg2_intra_mmx (new): 28.4 ( 7.56x)
(The bitexact flag and the test for correctness have been removed
from checkasm for the benchmarks.)
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 38dcd8fc6e..d1614eb1eb 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -321,8 +321,6 @@ static void dct_unquantize_mpeg2_intra_mmx(const MPVContext
*s,
block0 = block[0] * s->c_dc_scale;
quant_matrix = s->intra_matrix;
__asm__ volatile(
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
@@ -335,30 +333,18 @@ __asm__ volatile(
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"pmullw %%mm6, %%mm4 \n\t" //
q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" //
q=qscale*quant_matrix[i]
- "pxor %%mm2, %%mm2 \n\t"
- "pxor %%mm3, %%mm3 \n\t"
- "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
- "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
- "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
- "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
- "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
- "pxor %%mm4, %%mm4 \n\t"
- "pxor %%mm5, %%mm5 \n\t" // FIXME slow
- "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1
: 0
- "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1
: 0
- "psraw $4, %%mm0 \n\t"
- "psraw $4, %%mm1 \n\t"
- "pxor %%mm2, %%mm0 \n\t"
- "pxor %%mm3, %%mm1 \n\t"
- "psubw %%mm2, %%mm0 \n\t"
- "psubw %%mm3, %%mm1 \n\t"
- "pandn %%mm0, %%mm4 \n\t"
- "pandn %%mm1, %%mm5 \n\t"
- "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
- "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm1, %%mm3 \n\t"
+ "psrlw $12, %%mm2 \n\t" // block[i] < 0 ? 0xf : 0
+ "psrlw $12, %%mm3 \n\t" // (block[i] is in the
-2048..2047 range)
+ "pmullw %%mm4, %%mm0 \n\t" // block[i]*q
+ "pmullw %%mm5, %%mm1 \n\t" // block[i]*q
+ "paddw %%mm2, %%mm0 \n\t" // bias negative block[i]
+ "paddw %%mm3, %%mm1 \n\t" // so that a right-shift
+ "psraw $4, %%mm0 \n\t" // is equivalent to
divide
+ "psraw $4, %%mm1 \n\t" // with rounding towards
zero
+ "movq %%mm0, (%0, %%"FF_REG_a") \n\t"
+ "movq %%mm1, 8(%0, %%"FF_REG_a")\n\t"
"add $16, %%"FF_REG_a" \n\t"
"jng 1b \n\t"
commit 6d56807a06ce06712c65f8fcbf2a9a444bf59353
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 29 22:23:50 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100
avcodec/x86/mpegvideo: Use correct inline assembly constraints
The H.263 unquantize functions modified an input parameter.
(And they did so since this code was added in
7f3f5ec87bcbf244fce49ffdb476d4ae6e523af6. I am surprised
that this didn't cause issues, particularly with the intra function.)
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 4c3299362e..38dcd8fc6e 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -33,9 +33,8 @@
static void dct_unquantize_h263_intra_mmx(const MPVContext *s,
int16_t *block, int n, int qscale)
{
- x86_reg level, qmul, qadd, nCoeffs;
-
- qmul = qscale << 1;
+ x86_reg qmul = (unsigned)qscale << 1;
+ int level, qadd;
av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
@@ -49,16 +48,15 @@ static void dct_unquantize_h263_intra_mmx(const MPVContext
*s,
qadd = 0;
level= block[0];
}
- if(s->ac_pred)
- nCoeffs=63;
- else
- nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
+ x86_reg offset = s->ac_pred ? 63 << 1 :
s->intra_scantable.raster_end[s->block_last_index[n]] << 1;
__asm__ volatile(
- "movd %1, %%mm6 \n\t" //qmul
+ "movd %k1, %%mm6 \n\t" //qmul
+ "lea (%2, %0), %1 \n\t"
+ "neg %0 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
- "movd %2, %%mm5 \n\t" //qadd
+ "movd %3, %%mm5 \n\t" //qadd
"pxor %%mm7, %%mm7 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
@@ -66,14 +64,14 @@ __asm__ volatile(
"pxor %%mm4, %%mm4 \n\t"
".p2align 4 \n\t"
"1: \n\t"
- "movq (%0, %3), %%mm0 \n\t"
- "movq 8(%0, %3), %%mm1 \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
"pmullw %%mm6, %%mm0 \n\t"
"pmullw %%mm6, %%mm1 \n\t"
- "movq (%0, %3), %%mm2 \n\t"
- "movq 8(%0, %3), %%mm3 \n\t"
+ "movq (%1, %0), %%mm2 \n\t"
+ "movq 8(%1, %0), %%mm3 \n\t"
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
@@ -93,12 +91,13 @@ __asm__ volatile(
"pandn %%mm2, %%mm0 \n\t"
"pandn %%mm3, %%mm1 \n\t"
- "movq %%mm0, (%0, %3) \n\t"
- "movq %%mm1, 8(%0, %3) \n\t"
+ "movq %%mm0, (%1, %0) \n\t"
+ "movq %%mm1, 8(%1, %0) \n\t"
- "add $16, %3 \n\t"
+ "add $16, %0 \n\t"
"jng 1b \n\t"
- ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r"
(2*(-nCoeffs))
+ : "+r"(offset), "+r"(qmul)
+ : "r" (block), "rm" (qadd)
: "memory"
);
block[0]= level;
@@ -108,20 +107,20 @@ __asm__ volatile(
static void dct_unquantize_h263_inter_mmx(const MPVContext *s,
int16_t *block, int n, int qscale)
{
- x86_reg qmul, qadd, nCoeffs;
-
- qmul = qscale << 1;
- qadd = (qscale - 1) | 1;
+ int qmul = qscale << 1;
+ int qadd = (qscale - 1) | 1;
av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
- nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+ x86_reg offset = s->inter_scantable.raster_end[s->block_last_index[n]] <<
1;
__asm__ volatile(
- "movd %1, %%mm6 \n\t" //qmul
+ "movd %2, %%mm6 \n\t" //qmul
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
- "movd %2, %%mm5 \n\t" //qadd
+ "movd %3, %%mm5 \n\t" //qadd
+ "add %1, %0 \n\t"
+ "neg %1 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
@@ -129,14 +128,14 @@ __asm__ volatile(
"pxor %%mm4, %%mm4 \n\t"
".p2align 4 \n\t"
"1: \n\t"
- "movq (%0, %3), %%mm0 \n\t"
- "movq 8(%0, %3), %%mm1 \n\t"
+ "movq (%0, %1), %%mm0 \n\t"
+ "movq 8(%0, %1), %%mm1 \n\t"
"pmullw %%mm6, %%mm0 \n\t"
"pmullw %%mm6, %%mm1 \n\t"
- "movq (%0, %3), %%mm2 \n\t"
- "movq 8(%0, %3), %%mm3 \n\t"
+ "movq (%0, %1), %%mm2 \n\t"
+ "movq 8(%0, %1), %%mm3 \n\t"
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
@@ -156,12 +155,13 @@ __asm__ volatile(
"pandn %%mm2, %%mm0 \n\t"
"pandn %%mm3, %%mm1 \n\t"
- "movq %%mm0, (%0, %3) \n\t"
- "movq %%mm1, 8(%0, %3) \n\t"
+ "movq %%mm0, (%0, %1) \n\t"
+ "movq %%mm1, 8(%0, %1) \n\t"
- "add $16, %3 \n\t"
+ "add $16, %1 \n\t"
"jng 1b \n\t"
- ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r"
(2*(-nCoeffs))
+ : "+r" (block), "+r" (offset)
+ : "rm"(qmul), "rm" (qadd)
: "memory"
);
}
commit 0f7cc6aeeacba070d6d4b76a9f3a4d4036c3bb0b
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 29 01:17:08 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100
avcodec/mpegvideo: Move ff_init_scantable() to mpegvideo_unquantize.c
This is necessary so that the mpegvideo_unquantize checkasm test
does not pull mpegvideo.o and then all of libavcodec into checkasm.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
index a137fe31db..7ca2c8f701 100644
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -42,7 +42,6 @@
#include "mpegutils.h"
#include "mpegvideo.h"
#include "mpegvideodata.h"
-#include "mpegvideo_unquantize.h"
#include "libavutil/refstruct.h"
@@ -79,20 +78,6 @@ static av_cold void dsp_init(MpegEncContext *s)
}
}
-av_cold void ff_init_scantable(const uint8_t *permutation, ScanTable *st,
- const uint8_t *src_scantable)
-{
- st->scantable = src_scantable;
-
- for (int i = 0, end = -1; i < 64; i++) {
- int j = src_scantable[i];
- st->permutated[i] = permutation[j];
- if (permutation[j] > end)
- end = permutation[j];
- st->raster_end[i] = end;
- }
-}
-
av_cold void ff_mpv_idct_init(MpegEncContext *s)
{
if (s->codec_id == AV_CODEC_ID_MPEG4)
diff --git a/libavcodec/mpegvideo_unquantize.c
b/libavcodec/mpegvideo_unquantize.c
index 06c29d0753..9297c80b47 100644
--- a/libavcodec/mpegvideo_unquantize.c
+++ b/libavcodec/mpegvideo_unquantize.c
@@ -33,6 +33,20 @@
#include "mpegvideodata.h"
#include "mpegvideo_unquantize.h"
+av_cold void ff_init_scantable(const uint8_t *permutation, ScanTable *st,
+ const uint8_t *src_scantable)
+{
+ st->scantable = src_scantable;
+
+ for (int i = 0, end = -1; i < 64; i++) {
+ int j = src_scantable[i];
+ st->permutated[i] = permutation[j];
+ if (permutation[j] > end)
+ end = permutation[j];
+ st->raster_end[i] = end;
+ }
+}
+
static void dct_unquantize_mpeg1_intra_c(const MPVContext *s,
int16_t *block, int n, int qscale)
{
commit 357fc5243c32300bba91c096488e86558beed4c8
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 29 01:05:51 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:39 2025 +0100
avcodec/{arm,neon}/mpegvideo: Fix h263 unquantize functions
These functions currently operate on the assumption that the number
of coefficients to process is always of the form 16k+m with m<=4 or >8.
Yet this is not true when the IDCT permutation is of type
FF_IDCT_PERM_LIBMPEG2
(i.e. when FF_IDCT_INT is in use).
Reviewed-by: Martin Storsjö <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/arm/mpegvideo_neon.S b/libavcodec/arm/mpegvideo_neon.S
index c7a35ea267..7e42bdf6c5 100644
--- a/libavcodec/arm/mpegvideo_neon.S
+++ b/libavcodec/arm/mpegvideo_neon.S
@@ -36,7 +36,7 @@ function ff_dct_unquantize_h263_neon, export=1
vdup.16 q15, r0 @ qmul
vdup.16 q14, r2 @ qadd
vneg.s16 q13, q14
- cmp r3, #4
+ cmp r3, #8
mov r0, r1
ble 2f
1:
@@ -62,14 +62,14 @@ function ff_dct_unquantize_h263_neon, export=1
cmp r3, #8
bgt 1b
2:
- vld1.16 {d0}, [r0,:64]
- vclt.s16 d3, d0, #0
- vceq.s16 d1, d0, #0
- vmul.s16 d2, d0, d30
- vbsl d3, d26, d28
- vadd.s16 d2, d2, d3
- vbif d0, d2, d1
- vst1.16 {d0}, [r1,:64]
+ vld1.16 {q0}, [r0,:128]
+ vclt.s16 q3, q0, #0
+ vceq.s16 q1, q0, #0
+ vmul.s16 q2, q0, q15
+ vbsl q3, q13, q14
+ vadd.s16 q2, q2, q3
+ vbif q0, q2, q1
+ vst1.16 {q0}, [r1,:128]
bx lr
endfunc
diff --git a/libavcodec/neon/mpegvideo.c b/libavcodec/neon/mpegvideo.c
index 3427dbe427..44e9b70303 100644
--- a/libavcodec/neon/mpegvideo.c
+++ b/libavcodec/neon/mpegvideo.c
@@ -39,12 +39,7 @@ static void inline ff_dct_unquantize_h263_neon(int qscale,
int qadd, int nCoeffs
{
int16x8_t q0s16, q2s16, q3s16, q8s16, q10s16, q11s16, q13s16;
int16x8_t q14s16, q15s16, qzs16;
- int16x4_t d0s16, d2s16, d3s16, dzs16;
uint16x8_t q1u16, q9u16;
- uint16x4_t d1u16;
-
- dzs16 = vdup_n_s16(0);
- qzs16 = vdupq_n_s16(0);
q15s16 = vdupq_n_s16(qscale << 1);
q14s16 = vdupq_n_s16(qadd);
@@ -73,15 +68,14 @@ static void inline ff_dct_unquantize_h263_neon(int qscale,
int qadd, int nCoeffs
if (nCoeffs <= 0)
return;
- d0s16 = vld1_s16(block);
- d3s16 = vreinterpret_s16_u16(vclt_s16(d0s16, dzs16));
- d1u16 = vceq_s16(d0s16, dzs16);
- d2s16 = vmul_s16(d0s16, vget_high_s16(q15s16));
- d3s16 = vbsl_s16(vreinterpret_u16_s16(d3s16),
- vget_high_s16(q13s16), vget_high_s16(q14s16));
- d2s16 = vadd_s16(d2s16, d3s16);
- d0s16 = vbsl_s16(d1u16, d0s16, d2s16);
- vst1_s16(block, d0s16);
+ q0s16 = vld1q_s16(block);
+ q3s16 = vreinterpretq_s16_u16(vcltq_s16(q0s16, qzs16));
+ q1u16 = vceqq_s16(q0s16, qzs16);
+ q2s16 = vmulq_s16(q0s16, q15s16);
+ q3s16 = vbslq_s16(vreinterpretq_u16_s16(q3s16), q13s16, q14s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ q0s16 = vbslq_s16(q1u16, q0s16, q2s16);
+ vst1q_s16(block, q0s16);
}
static void dct_unquantize_h263_inter_neon(const MPVContext *s, int16_t *block,
commit 581050a1755b335cb106ad1b6c8e5f6fa9c19bd0
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Fri Nov 28 22:25:39 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:39 2025 +0100
tests/checkasm: Add mpegvideo unquantize test
This adds a test for the mpegvideo unquantize functions.
It has been written in order to be able to easily bench
these functions. It should be noted that the random input
fed to the tested functions is not necessarily representative
of the stuff actually occurring in the wild. So benchmarks should
be taken with a grain of salt; but comparisons between two functions
that do not depend on branch predictions are valid (the use case
for this is to port the x86 mmx functions to use xmm registers).
During testing I have found a bug in the arm/aarch64 neon optimizations
when using the LIBMPEG2 permutation (used by FF_IDCT_INT): The code
seems to be based on the presumption that the remainder of the number
of coefficients to process is always <= 4 mod 16. The test therefore
sometimes fails for these arches.
Hint: I am not certain that 16 bits are enough for the intermediate
values of all the computations involved; e.g. both FLV and MPEG-4
escape values can go beyond that after the corresponding
multiplications. The input in this test is nevertheless designed
to fit into 16 bits.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 3762c0d83b..b9c8adb21f 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -19,6 +19,7 @@ AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o
AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o
AVCODECOBJS-$(CONFIG_LPC) += lpc.o
AVCODECOBJS-$(CONFIG_ME_CMP) += motion.o
+AVCODECOBJS-$(CONFIG_MPEGVIDEO) += mpegvideo_unquantize.o
AVCODECOBJS-$(CONFIG_MPEGVIDEOENCDSP) += mpegvideoencdsp.o
AVCODECOBJS-$(CONFIG_QPELDSP) += qpeldsp.o
AVCODECOBJS-$(CONFIG_VC1DSP) += vc1dsp.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 8c64684fa3..a899967937 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -215,6 +215,9 @@ static const struct {
#if CONFIG_ME_CMP
{ "motion", checkasm_check_motion },
#endif
+ #if CONFIG_MPEGVIDEO
+ { "mpegvideo_unquantize", checkasm_check_mpegvideo_unquantize },
+ #endif
#if CONFIG_MPEGVIDEOENCDSP
{ "mpegvideoencdsp", checkasm_check_mpegvideoencdsp },
#endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 05f74ca16b..ec075c4763 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -123,6 +123,7 @@ void checkasm_check_llviddsp(void);
void checkasm_check_llviddspenc(void);
void checkasm_check_lpc(void);
void checkasm_check_motion(void);
+void checkasm_check_mpegvideo_unquantize(void);
void checkasm_check_mpegvideoencdsp(void);
void checkasm_check_nlmeans(void);
void checkasm_check_opusdsp(void);
diff --git a/tests/checkasm/mpegvideo_unquantize.c
b/tests/checkasm/mpegvideo_unquantize.c
new file mode 100644
index 0000000000..837606e60e
--- /dev/null
+++ b/tests/checkasm/mpegvideo_unquantize.c
@@ -0,0 +1,273 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <assert.h>
+#include <stddef.h>
+
+#include "config.h"
+
+#include "checkasm.h"
+
+#include "libavcodec/idctdsp.h"
+#include "libavcodec/mathops.h"
+#include "libavcodec/mpegvideo.h"
+#include "libavcodec/mpegvideodata.h"
+#include "libavcodec/mpegvideo_unquantize.h"
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#define randomize_struct(TYPE, s) do { \
+ static_assert(!(_Alignof(TYPE) % 4), \
+ "can't use aligned stores"); \
+ unsigned char *ptr = (unsigned char*)s; \
+ for (size_t i = 0; i < sizeof(*s) & ~3; i += 4) \
+ AV_WN32A(ptr + i, rnd()); \
+ for (size_t i = sizeof(*s) & ~3; i < sizeof(*s); ++i) \
+ ptr[i] = rnd(); \
+ } while (0)
+
+enum TestType {
+ H263,
+ MPEG1,
+ MPEG2,
+};
+
+static void init_idct_scantable(MPVContext *const s, int intra_scantable)
+{
+ static const enum idct_permutation_type permutation_types[] = {
+ FF_IDCT_PERM_NONE,
+ FF_IDCT_PERM_LIBMPEG2,
+#if ARCH_X86_32 && HAVE_X86ASM
+ FF_IDCT_PERM_SIMPLE,
+#endif
+#if ARCH_PPC || ARCH_X86
+ FF_IDCT_PERM_TRANSPOSE,
+#endif
+#if ARCH_ARM || ARCH_AARCH64
+ FF_IDCT_PERM_PARTTRANS,
+#endif
+#if ARCH_X86 && HAVE_X86ASM
+ FF_IDCT_PERM_SSE2,
+#endif
+ };
+ // Copied here to avoid #ifs.
+ static const uint8_t ff_wmv1_scantable[][64] = {
+ { 0x00, 0x08, 0x01, 0x02, 0x09, 0x10, 0x18, 0x11,
+ 0x0A, 0x03, 0x04, 0x0B, 0x12, 0x19, 0x20, 0x28,
+ 0x30, 0x38, 0x29, 0x21, 0x1A, 0x13, 0x0C, 0x05,
+ 0x06, 0x0D, 0x14, 0x1B, 0x22, 0x31, 0x39, 0x3A,
+ 0x32, 0x2A, 0x23, 0x1C, 0x15, 0x0E, 0x07, 0x0F,
+ 0x16, 0x1D, 0x24, 0x2B, 0x33, 0x3B, 0x3C, 0x34,
+ 0x2C, 0x25, 0x1E, 0x17, 0x1F, 0x26, 0x2D, 0x35,
+ 0x3D, 0x3E, 0x36, 0x2E, 0x27, 0x2F, 0x37, 0x3F, },
+ { 0x00, 0x08, 0x01, 0x02, 0x09, 0x10, 0x18, 0x11,
+ 0x0A, 0x03, 0x04, 0x0B, 0x12, 0x19, 0x20, 0x28,
+ 0x21, 0x30, 0x1A, 0x13, 0x0C, 0x05, 0x06, 0x0D,
+ 0x14, 0x1B, 0x22, 0x29, 0x38, 0x31, 0x39, 0x2A,
+ 0x23, 0x1C, 0x15, 0x0E, 0x07, 0x0F, 0x16, 0x1D,
+ 0x24, 0x2B, 0x32, 0x3A, 0x33, 0x3B, 0x2C, 0x25,
+ 0x1E, 0x17, 0x1F, 0x26, 0x2D, 0x34, 0x3C, 0x35,
+ 0x3D, 0x2E, 0x27, 0x2F, 0x36, 0x3E, 0x37, 0x3F, },
+ { 0x00, 0x01, 0x08, 0x02, 0x03, 0x09, 0x10, 0x18,
+ 0x11, 0x0A, 0x04, 0x05, 0x0B, 0x12, 0x19, 0x20,
+ 0x28, 0x30, 0x21, 0x1A, 0x13, 0x0C, 0x06, 0x07,
+ 0x0D, 0x14, 0x1B, 0x22, 0x29, 0x38, 0x31, 0x39,
+ 0x2A, 0x23, 0x1C, 0x15, 0x0E, 0x0F, 0x16, 0x1D,
+ 0x24, 0x2B, 0x32, 0x3A, 0x33, 0x2C, 0x25, 0x1E,
+ 0x17, 0x1F, 0x26, 0x2D, 0x34, 0x3B, 0x3C, 0x35,
+ 0x2E, 0x27, 0x2F, 0x36, 0x3D, 0x3E, 0x37, 0x3F, },
+ { 0x00, 0x08, 0x10, 0x01, 0x18, 0x20, 0x28, 0x09,
+ 0x02, 0x03, 0x0A, 0x11, 0x19, 0x30, 0x38, 0x29,
+ 0x21, 0x1A, 0x12, 0x0B, 0x04, 0x05, 0x0C, 0x13,
+ 0x1B, 0x22, 0x31, 0x39, 0x32, 0x2A, 0x23, 0x1C,
+ 0x14, 0x0D, 0x06, 0x07, 0x0E, 0x15, 0x1D, 0x24,
+ 0x2B, 0x33, 0x3A, 0x3B, 0x34, 0x2C, 0x25, 0x1E,
+ 0x16, 0x0F, 0x17, 0x1F, 0x26, 0x2D, 0x3C, 0x35,
+ 0x2E, 0x27, 0x2F, 0x36, 0x3D, 0x3E, 0x37, 0x3F, }
+ };
+
+ static const uint8_t *const scantables[] = {
+ ff_alternate_vertical_scan,
+ ff_alternate_horizontal_scan,
+ ff_zigzag_direct,
+ ff_wmv1_scantable[0],
+ ff_wmv1_scantable[1],
+ ff_wmv1_scantable[2],
+ ff_wmv1_scantable[3],
+ };
+ static const uint8_t *scantable = NULL;
+ static enum idct_permutation_type idct_permutation;
+
+ if (!scantable) {
+ scantable = scantables[rnd() % FF_ARRAY_ELEMS(scantables)];
+ idct_permutation = permutation_types[rnd() %
FF_ARRAY_ELEMS(permutation_types)];
+ }
+ ff_init_scantable_permutation(s->idsp.idct_permutation, idct_permutation);
+ ff_init_scantable(s->idsp.idct_permutation,
+ intra_scantable ? &s->intra_scantable :
&s->inter_scantable,
+ scantable);
+}
+
+static void init_h263_test(MPVContext *const s, int16_t block[64],
+ int last_nonzero_coeff, int qscale, int intra)
+{
+ const uint8_t *permutation = s->inter_scantable.permutated;
+ if (intra) {
+ permutation = s->intra_scantable.permutated;
+ block[0] = rnd() & 511;
+ static int h263_aic = -1, ac_pred;
+ if (h263_aic < 0) {
+ h263_aic = rnd() & 1;
+ ac_pred = rnd() & 1;
+ }
+ s->h263_aic = h263_aic;
+ s->ac_pred = ac_pred;
+ if (s->ac_pred)
+ last_nonzero_coeff = 63;
+ }
+ for (int i = intra; i <= last_nonzero_coeff; ++i) {
+ int random = rnd();
+ if (random & 1)
+ continue;
+ random >>= 1;
+ // Select level so that the multiplication fits into 16 bits.
+ // FIXME: The FLV and MPEG-4 decoders can have escape values exceeding
this.
+ block[permutation[i]] = sign_extend(random, 10);
+ }
+}
+
+static void init_mpeg12_test(MPVContext *const s, int16_t block[64],
+ int last_nonzero_coeff, int qscale, int intra,
+ enum TestType type)
+{
+ uint16_t *matrix = intra ? s->intra_matrix : s->inter_matrix;
+
+ if (type == MPEG2)
+ qscale = s->q_scale_type ? ff_mpeg2_non_linear_qscale[qscale] : qscale
<< 1;
+
+ for (int i = 0; i < 64; ++i)
+ matrix[i] = 1 + rnd() % 254;
+
+ const uint8_t *permutation = s->intra_scantable.permutated;
+ if (intra) {
+ block[0] = (int8_t)rnd();
+ for (int i = 1; i <= last_nonzero_coeff; ++i) {
+ int j = permutation[i];
+ unsigned random = rnd();
+ if (random & 1)
+ continue;
+ random >>= 1;
+ // Select level so that the multiplication does not overflow
+ // an int16_t and so that it is within the possible range
+ // (-2048..2047). FIXME: It seems that this need not be fulfilled
+ // in practice for the MPEG-4 decoder at least.
+ int limit = FFMIN(INT16_MAX / (qscale * matrix[j]), 2047);
+ block[j] = random % (2 * limit + 1) - limit;
+ }
+ } else {
+ for (int i = 0; i <= last_nonzero_coeff; ++i) {
+ int j = permutation[i];
+ unsigned random = rnd();
+ if (random & 1)
+ continue;
+ random >>= 1;
+ int limit = FFMIN((INT16_MAX / (qscale * matrix[j]) - 1) / 2,
2047);
+ block[j] = random % (2 * limit + 1) - limit;
+ }
+ }
+}
+
+void checkasm_check_mpegvideo_unquantize(void)
+{
+ static const struct {
+ const char *name;
+ size_t offset;
+ int intra, intra_scantable;
+ enum TestType type;
+ } tests[] = {
+#define TEST(NAME, INTRA, INTRA_SCANTABLE, TYPE) \
+ { .name = #NAME, .offset = offsetof(MPVUnquantDSPContext, NAME), \
+ .intra = INTRA, .intra_scantable = INTRA_SCANTABLE, .type = TYPE }
+ TEST(dct_unquantize_mpeg1_intra, 1, 1, MPEG1),
+ TEST(dct_unquantize_mpeg1_inter, 0, 1, MPEG1),
+ TEST(dct_unquantize_mpeg2_intra, 1, 1, MPEG2),
+ TEST(dct_unquantize_mpeg2_inter, 0, 1, MPEG2),
+ TEST(dct_unquantize_h263_intra, 1, 1, H263),
+ TEST(dct_unquantize_h263_inter, 0, 0, H263),
+ };
+ MPVUnquantDSPContext unquant_dsp_ctx;
+ int q_scale_type = rnd() & 1;
+
+ ff_mpv_unquantize_init(&unquant_dsp_ctx, 1 /* bitexact */, q_scale_type);
+ declare_func_emms(AV_CPU_FLAG_MMX, void, MPVContext *s, int16_t *block,
int n, int qscale);
+
+ for (size_t i = 0; i < FF_ARRAY_ELEMS(tests); ++i) {
+ void (*func)(MPVContext *s, int16_t *block, int n, int qscale) =
+ *(void (**)(MPVContext *, int16_t *, int,
int))((char*)&unquant_dsp_ctx + tests[i].offset);
+ if (check_func(func, "%s", tests[i].name)) {
+ MPVContext new, ref;
+ DECLARE_ALIGNED(16, int16_t, block_new)[64];
+ DECLARE_ALIGNED(16, int16_t, block_ref)[64];
+ static int block_last_index = -1;
+
+ randomize_struct(MPVContext, &ref);
+
+ ref.q_scale_type = q_scale_type;
+
+ init_idct_scantable(&ref, tests[i].intra_scantable);
+
+ if (block_last_index < 0)
+ block_last_index = rnd() % 64;
+
+ memset(block_ref, 0, sizeof(block_ref));
+
+ if (tests[i].intra) {
+ // Less restricted than real dc_scale values
+ ref.y_dc_scale = 1 + rnd() % 64;
+ ref.c_dc_scale = 1 + rnd() % 64;
+ }
+
+ static int qscale = 0;
+
+ if (qscale == 0)
+ qscale = 1 + rnd() % 31;
+
+ if (tests[i].type == H263)
+ init_h263_test(&ref, block_ref, block_last_index, qscale,
+ tests[i].intra);
+ else
+ init_mpeg12_test(&ref, block_ref, block_last_index, qscale,
+ tests[i].intra, tests[i].type);
+
+ int n = rnd() % 6;
+ ref.block_last_index[n] = block_last_index;
+
+ memcpy(&new, &ref, sizeof(new));
+ memcpy(block_new, block_ref, sizeof(block_new));
+
+ call_ref(&ref, block_ref, n, qscale);
+ call_new(&new, block_new, n, qscale);
+
+ if (memcmp(&ref, &new, sizeof(new)) || memcmp(block_new,
block_ref, sizeof(block_new)))
+ fail();
+
+ bench_new(&new, block_new, n, qscale);
+ }
+ }
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index f182efde46..48edd17bf2 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -39,6 +39,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp
\
fate-checkasm-llviddspenc \
fate-checkasm-lpc \
fate-checkasm-motion \
+ fate-checkasm-mpegvideo_unquantize \
fate-checkasm-mpegvideoencdsp \
fate-checkasm-opusdsp \
fate-checkasm-pixblockdsp \
commit e7a629049f7e9be397b0acabe75beb207ad9dc21
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Fri Nov 28 16:58:44 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:20:42 2025 +0100
avcodec/{arm,neon}/mpegvideo: Use intra scantable to unquant H263 intra
Forgotten in 70a7df049c411d9247eb6075720c84196c3e55e8.
Using the wrong scantable matters for codecs for which both scantables
can differ, namely the MPEG-4 decoder and the WMV1/2 codecs.
For WMV1 it can lead to wrong output in case the IDCT permutation
is FF_IDCT_PERM_PARTTRANS, because in this case the entries
of the intra scantable's raster end are not always <= the corresponding
entries of the inter scantable's raster end when the former is
initialized via ff_wmv1_scantable[1] and the latter via
ff_wmv1_scantable[0].
FF_IDCT_PERM_PARTTRANS is used iff the Neon IDCT is used (for both arm
and aarch64).* Said IDCT is not used during FATE, so that this issue
went unnoticed.
WMV2 uses the same scantables, but uses a custom IDCT
which always uses FF_IDCT_PERM_NONE, for which using the inter_scantable
is harmless, so that the output is always correct for it.
The scantable for MPEG-4 can change mid-stream (for the decoder),
but since c41818dc5dc14eb944761204e7b0ac179a6dcd1a only the intra
scantable is updated, so that both scantables can get out of sync.
In such a case the unquantize intra functions could unquantize
an incorrect number of coefficients.
Using raster_end of the wrong scantable can also lead to an
unnecessarily large number of coefficients being unquantized.
*: FF_IDCT_PERM_SIMPLE and FF_IDCT_PERM_TRANSPOSE would also not work,
but they are not used at all by arm and aarch64.
Reviewed-by: Martin Storsjö <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/arm/asm-offsets.h b/libavcodec/arm/asm-offsets.h
index a2174b0a08..67e1f2ff6d 100644
--- a/libavcodec/arm/asm-offsets.h
+++ b/libavcodec/arm/asm-offsets.h
@@ -28,5 +28,6 @@
#define BLOCK_LAST_INDEX 0x10
#define H263_AIC 0x40
#define INTER_SCANTAB_RASTER_END 0x88
+#define INTRA_SCANTAB_RASTER_END 0x10c
#endif /* AVCODEC_ARM_ASM_OFFSETS_H */
diff --git a/libavcodec/arm/mpegvideo_arm.c b/libavcodec/arm/mpegvideo_arm.c
index cb109cd832..593e998181 100644
--- a/libavcodec/arm/mpegvideo_arm.c
+++ b/libavcodec/arm/mpegvideo_arm.c
@@ -38,6 +38,8 @@ CHECK_OFFSET(MpegEncContext, ac_pred, AC_PRED);
CHECK_OFFSET(MpegEncContext, block_last_index, BLOCK_LAST_INDEX);
CHECK_OFFSET(MpegEncContext, inter_scantable.raster_end,
INTER_SCANTAB_RASTER_END);
+CHECK_OFFSET(MpegEncContext, intra_scantable.raster_end,
+ INTRA_SCANTAB_RASTER_END);
CHECK_OFFSET(MpegEncContext, h263_aic, H263_AIC);
#endif
diff --git a/libavcodec/arm/mpegvideo_armv5te.c
b/libavcodec/arm/mpegvideo_armv5te.c
index 3a6d015767..b2790b48fe 100644
--- a/libavcodec/arm/mpegvideo_armv5te.c
+++ b/libavcodec/arm/mpegvideo_armv5te.c
@@ -73,7 +73,7 @@ static void dct_unquantize_h263_intra_armv5te(const
MPVContext *s,
if(s->ac_pred)
nCoeffs=63;
else
- nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+ nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
block[0] = level;
diff --git a/libavcodec/arm/mpegvideo_neon.S b/libavcodec/arm/mpegvideo_neon.S
index 1889d7a912..c7a35ea267 100644
--- a/libavcodec/arm/mpegvideo_neon.S
+++ b/libavcodec/arm/mpegvideo_neon.S
@@ -77,7 +77,7 @@ function ff_dct_unquantize_h263_intra_neon, export=1
push {r4-r6,lr}
add r12, r0, #BLOCK_LAST_INDEX
ldr r6, [r0, #AC_PRED]
- add lr, r0, #INTER_SCANTAB_RASTER_END
+ add lr, r0, #INTRA_SCANTAB_RASTER_END
cmp r6, #0
it ne
movne r12, #63
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index e21ce5164d..758bf57ab9 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -72,11 +72,11 @@ typedef struct MpegEncContext {
/* scantables */
ScanTable inter_scantable; ///< if inter == intra then intra should be
used to reduce the cache usage
+ ScanTable intra_scantable;
/* WARNING: changes above this line require updates to hardcoded
* offsets used in ASM. */
- ScanTable intra_scantable;
uint8_t permutated_intra_h_scantable[64];
uint8_t permutated_intra_v_scantable[64];
diff --git a/libavcodec/neon/mpegvideo.c b/libavcodec/neon/mpegvideo.c
index fdc57d3876..3427dbe427 100644
--- a/libavcodec/neon/mpegvideo.c
+++ b/libavcodec/neon/mpegvideo.c
@@ -112,7 +112,7 @@ static void dct_unquantize_h263_intra_neon(const MPVContext
*s, int16_t *block,
if (s->ac_pred) {
nCoeffs = 63;
} else {
- nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+ nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
if (nCoeffs <= 0)
return;
}
commit 5d41d3e21dff14058b283491480a7382daeb5da9
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 19 12:00:08 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:20:42 2025 +0100
avcodec/ppc/mpegvideo_altivec: Reindent after the previous commit
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/ppc/mpegvideo_altivec.c
b/libavcodec/ppc/mpegvideo_altivec.c
index 7b54de3d91..71894e760b 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -43,30 +43,30 @@
static av_always_inline
void dct_unquantize_h263_altivec(int16_t *block, int nb_coeffs, int qadd, int
qmul)
{
- register const vector signed short vczero = (const vector signed
short)vec_splat_s16(0);
- DECLARE_ALIGNED(16, short, qmul8) = qmul;
- DECLARE_ALIGNED(16, short, qadd8) = qadd;
- register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
- register vector bool short blockv_null, blockv_neg;
+ register const vector signed short vczero = (const vector signed
short)vec_splat_s16(0);
+ DECLARE_ALIGNED(16, short, qmul8) = qmul;
+ DECLARE_ALIGNED(16, short, qadd8) = qadd;
+ register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
+ register vector bool short blockv_null, blockv_neg;
- qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0);
- qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0);
- nqaddv = vec_sub(vczero, qaddv);
+ qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0);
+ qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0);
+ nqaddv = vec_sub(vczero, qaddv);
- // vectorize all the 16 bytes-aligned blocks
- // of 8 elements
- for (register int j = 0; j <= nb_coeffs; j += 8) {
- blockv = vec_ld(j << 1, block);
- blockv_neg = vec_cmplt(blockv, vczero);
- blockv_null = vec_cmpeq(blockv, vczero);
- // choose between +qadd or -qadd as the third operand
- temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
- // multiply & add (block{i,i+7} * qmul [+-] qadd)
- temp1 = vec_mladd(blockv, qmulv, temp1);
- // put 0 where block[{i,i+7} used to have 0
- blockv = vec_sel(temp1, blockv, blockv_null);
- vec_st(blockv, j << 1, block);
- }
+ // vectorize all the 16 bytes-aligned blocks
+ // of 8 elements
+ for (register int j = 0; j <= nb_coeffs; j += 8) {
+ blockv = vec_ld(j << 1, block);
+ blockv_neg = vec_cmplt(blockv, vczero);
+ blockv_null = vec_cmpeq(blockv, vczero);
+ // choose between +qadd or -qadd as the third operand
+ temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
+ // multiply & add (block{i,i+7} * qmul [+-] qadd)
+ temp1 = vec_mladd(blockv, qmulv, temp1);
+ // put 0 where block[{i,i+7} used to have 0
+ blockv = vec_sel(temp1, blockv, blockv_null);
+ vec_st(blockv, j << 1, block);
+ }
}
static void dct_unquantize_h263_intra_altivec(const MPVContext *s,
commit 011ef7fc65fcbf2141adcec9ca805874bb0a6a16
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 19 11:51:03 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:20:42 2025 +0100
avcodec/ppc/mpegvideo_altivec: Split intra/inter unquantizing
Don't use a single function that checks mb_intra. Forgotten
in d50635cd247e17fe16c63219b9ae80d45a8185b1.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/ppc/mpegvideo_altivec.c
b/libavcodec/ppc/mpegvideo_altivec.c
index ad3a783a87..7b54de3d91 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -40,41 +40,14 @@
/* AltiVec version of dct_unquantize_h263
this code assumes `block' is 16 bytes-aligned */
-static void dct_unquantize_h263_altivec(const MPVContext *s,
- int16_t *block, int n, int qscale)
+static av_always_inline
+void dct_unquantize_h263_altivec(int16_t *block, int nb_coeffs, int qadd, int
qmul)
{
- int i, qmul, qadd;
- int nCoeffs;
-
- qadd = (qscale - 1) | 1;
- qmul = qscale << 1;
-
- if (s->mb_intra) {
- if (!s->h263_aic) {
- if (n < 4)
- block[0] = block[0] * s->y_dc_scale;
- else
- block[0] = block[0] * s->c_dc_scale;
- }else
- qadd = 0;
- i = 1;
- if (s->ac_pred)
- nCoeffs = 63;
- else
- nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
- } else {
- i = 0;
- av_assert2(s->block_last_index[n]>=0);
- nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
- }
-
- {
register const vector signed short vczero = (const vector signed
short)vec_splat_s16(0);
DECLARE_ALIGNED(16, short, qmul8) = qmul;
DECLARE_ALIGNED(16, short, qadd8) = qadd;
register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
register vector bool short blockv_null, blockv_neg;
- register short backup_0 = block[0];
qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0);
qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0);
@@ -82,7 +55,7 @@ static void dct_unquantize_h263_altivec(const MPVContext *s,
// vectorize all the 16 bytes-aligned blocks
// of 8 elements
- for (register int j = 0; j <= nCoeffs ; j += 8) {
+ for (register int j = 0; j <= nb_coeffs; j += 8) {
blockv = vec_ld(j << 1, block);
blockv_neg = vec_cmplt(blockv, vczero);
blockv_null = vec_cmpeq(blockv, vczero);
@@ -94,14 +67,36 @@ static void dct_unquantize_h263_altivec(const MPVContext *s,
blockv = vec_sel(temp1, blockv, blockv_null);
vec_st(blockv, j << 1, block);
}
+}
- if (i == 1) {
- // cheat. this avoid special-casing the first iteration
- block[0] = backup_0;
- }
- }
+static void dct_unquantize_h263_intra_altivec(const MPVContext *s,
+ int16_t *block, int n, int
qscale)
+{
+ int qadd = (qscale - 1) | 1;
+ int qmul = qscale << 1;
+ int block0 = block[0];
+ if (!s->h263_aic) {
+ block0 *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
+ } else
+ qadd = 0;
+ int nb_coeffs = s->ac_pred ? 63 :
s->intra_scantable.raster_end[s->block_last_index[n]];
+
+ dct_unquantize_h263_altivec(block, nb_coeffs, qadd, qmul);
+
+ // cheat. this avoid special-casing the first iteration
+ block[0] = block0;
}
+static void dct_unquantize_h263_inter_altivec(const MPVContext *s,
+ int16_t *block, int n, int
qscale)
+{
+ int qadd = (qscale - 1) | 1;
+ int qmul = qscale << 1;
+ av_assert2(s->block_last_index[n]>=0);
+ int nb_coeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+
+ dct_unquantize_h263_altivec(block, nb_coeffs, qadd, qmul);
+}
#endif /* HAVE_ALTIVEC */
av_cold void ff_mpv_unquantize_init_ppc(MPVUnquantDSPContext *s, int bitexact)
@@ -110,7 +105,7 @@ av_cold void
ff_mpv_unquantize_init_ppc(MPVUnquantDSPContext *s, int bitexact)
if (!PPC_ALTIVEC(av_get_cpu_flags()))
return;
- s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec;
- s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec;
+ s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_altivec;
+ s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_altivec;
#endif /* HAVE_ALTIVEC */
}
commit 358c569b05bc6f9a107a5caebcc8da56e8bf9799
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Fri Nov 14 11:24:45 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:20:41 2025 +0100
avcodec/mpegvideo_unquantize: Constify MPVContext pointee
Also use MPVContext instead of MpegEncContext.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/arm/mpegvideo_arm.c b/libavcodec/arm/mpegvideo_arm.c
index 5c96c9df2c..cb109cd832 100644
--- a/libavcodec/arm/mpegvideo_arm.c
+++ b/libavcodec/arm/mpegvideo_arm.c
@@ -41,9 +41,9 @@ CHECK_OFFSET(MpegEncContext, inter_scantable.raster_end,
CHECK_OFFSET(MpegEncContext, h263_aic, H263_AIC);
#endif
-void ff_dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block,
+void ff_dct_unquantize_h263_inter_neon(const MPVContext *s, int16_t *block,
int n, int qscale);
-void ff_dct_unquantize_h263_intra_neon(MpegEncContext *s, int16_t *block,
+void ff_dct_unquantize_h263_intra_neon(const MPVContext *s, int16_t *block,
int n, int qscale);
av_cold void ff_mpv_unquantize_init_arm(MPVUnquantDSPContext *s, int bitexact)
diff --git a/libavcodec/arm/mpegvideo_armv5te.c
b/libavcodec/arm/mpegvideo_armv5te.c
index 2737f68643..3a6d015767 100644
--- a/libavcodec/arm/mpegvideo_armv5te.c
+++ b/libavcodec/arm/mpegvideo_armv5te.c
@@ -50,8 +50,8 @@ static inline void dct_unquantize_h263_helper_c(int16_t
*block, int qmul, int qa
}
#endif
-static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_intra_armv5te(const MPVContext *s,
+ int16_t *block, int n, int
qscale)
{
int level, qmul, qadd;
int nCoeffs;
@@ -79,8 +79,8 @@ static void dct_unquantize_h263_intra_armv5te(MpegEncContext
*s,
block[0] = level;
}
-static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_inter_armv5te(const MPVContext *s,
+ int16_t *block, int n, int
qscale)
{
int qmul, qadd;
int nCoeffs;
diff --git a/libavcodec/mips/h263dsp_mips.h b/libavcodec/mips/h263dsp_mips.h
index d4de2233a7..5ea9fcbb88 100644
--- a/libavcodec/mips/h263dsp_mips.h
+++ b/libavcodec/mips/h263dsp_mips.h
@@ -25,11 +25,11 @@
void ff_h263_h_loop_filter_msa(uint8_t *src, int stride, int q_scale);
void ff_h263_v_loop_filter_msa(uint8_t *src, int stride, int q_scale);
-void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s, int16_t *block,
+void ff_dct_unquantize_mpeg2_inter_msa(const MPVContext *s, int16_t *block,
int32_t index, int32_t q_scale);
-void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s, int16_t *block,
+void ff_dct_unquantize_h263_inter_msa(const MPVContext *s, int16_t *block,
int32_t index, int32_t q_scale);
-void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s, int16_t *block,
+void ff_dct_unquantize_h263_intra_msa(const MPVContext *s, int16_t *block,
int32_t index, int32_t q_scale);
int ff_pix_sum_msa(const uint8_t *pix, ptrdiff_t line_size);
diff --git a/libavcodec/mips/mpegvideo_mips.h b/libavcodec/mips/mpegvideo_mips.h
index 2a9ea4006e..2544279ac5 100644
--- a/libavcodec/mips/mpegvideo_mips.h
+++ b/libavcodec/mips/mpegvideo_mips.h
@@ -23,16 +23,16 @@
#include "libavcodec/mpegvideo.h"
-void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
- int n, int qscale);
-void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
- int n, int qscale);
-void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
- int n, int qscale);
-void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
- int n, int qscale);
-void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
- int n, int qscale);
+void ff_dct_unquantize_h263_intra_mmi(const MPVContext *s, int16_t *block,
+ int n, int qscale);
+void ff_dct_unquantize_h263_inter_mmi(const MPVContext *s, int16_t *block,
+ int n, int qscale);
+void ff_dct_unquantize_mpeg1_intra_mmi(const MPVContext *s, int16_t *block,
+ int n, int qscale);
+void ff_dct_unquantize_mpeg1_inter_mmi(const MPVContext *s, int16_t *block,
+ int n, int qscale);
+void ff_dct_unquantize_mpeg2_intra_mmi(const MPVContext *s, int16_t *block,
+ int n, int qscale);
void ff_denoise_dct_mmi(int16_t block[64], int sum[64], const uint16_t
offset[64]);
#endif /* AVCODEC_MIPS_MPEGVIDEO_MIPS_H */
diff --git a/libavcodec/mips/mpegvideo_mmi.c b/libavcodec/mips/mpegvideo_mmi.c
index 87d4aafd8c..90bd90c147 100644
--- a/libavcodec/mips/mpegvideo_mmi.c
+++ b/libavcodec/mips/mpegvideo_mmi.c
@@ -25,8 +25,8 @@
#include "mpegvideo_mips.h"
#include "libavutil/mips/mmiutils.h"
-void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
- int n, int qscale)
+void ff_dct_unquantize_h263_intra_mmi(const MPVContext *s, int16_t *block,
+ int n, int qscale)
{
int64_t level, nCoeffs;
double ftmp[6];
@@ -101,8 +101,8 @@ void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s,
int16_t *block,
block[0] = level;
}
-void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
- int n, int qscale)
+void ff_dct_unquantize_h263_inter_mmi(const MPVContext *s, int16_t *block,
+ int n, int qscale)
{
int64_t nCoeffs;
double ftmp[6];
@@ -160,8 +160,8 @@ void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s,
int16_t *block,
);
}
-void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
- int n, int qscale)
+void ff_dct_unquantize_mpeg1_intra_mmi(const MPVContext *s, int16_t *block,
+ int n, int qscale)
{
int64_t nCoeffs;
const uint16_t *quant_matrix;
@@ -254,8 +254,8 @@ void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s,
int16_t *block,
block[0] = block0;
}
-void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
- int n, int qscale)
+void ff_dct_unquantize_mpeg1_inter_mmi(const MPVContext *s, int16_t *block,
+ int n, int qscale)
{
int64_t nCoeffs;
const uint16_t *quant_matrix;
@@ -342,8 +342,8 @@ void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s,
int16_t *block,
);
}
-void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
- int n, int qscale)
+void ff_dct_unquantize_mpeg2_intra_mmi(const MPVContext *s, int16_t *block,
+ int n, int qscale)
{
uint64_t nCoeffs;
const uint16_t *quant_matrix;
diff --git a/libavcodec/mips/mpegvideo_msa.c b/libavcodec/mips/mpegvideo_msa.c
index cd4adc0f77..a870a2cd79 100644
--- a/libavcodec/mips/mpegvideo_msa.c
+++ b/libavcodec/mips/mpegvideo_msa.c
@@ -194,7 +194,7 @@ static int32_t mpeg2_dct_unquantize_inter_msa(int16_t
*block,
return sum_res;
}
-void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s,
+void ff_dct_unquantize_h263_intra_msa(const MPVContext *s,
int16_t *block, int32_t index,
int32_t qscale)
{
@@ -219,7 +219,7 @@ void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s,
h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 1);
}
-void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s,
+void ff_dct_unquantize_h263_inter_msa(const MPVContext *s,
int16_t *block, int32_t index,
int32_t qscale)
{
@@ -236,7 +236,7 @@ void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s,
h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 0);
}
-void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s,
+void ff_dct_unquantize_mpeg2_inter_msa(const MPVContext *s,
int16_t *block, int32_t index,
int32_t qscale)
{
diff --git a/libavcodec/mpeg4videodec.h b/libavcodec/mpeg4videodec.h
index aafde454ea..2eafa1ef8b 100644
--- a/libavcodec/mpeg4videodec.h
+++ b/libavcodec/mpeg4videodec.h
@@ -93,11 +93,11 @@ typedef struct Mpeg4DecContext {
Mpeg4VideoDSPContext mdsp;
- void (*dct_unquantize_mpeg2_inter)(MpegEncContext *s,
+ void (*dct_unquantize_mpeg2_inter)(const MPVContext *s,
int16_t *block, int n, int qscale);
- void (*dct_unquantize_mpeg2_intra)(MpegEncContext *s,
+ void (*dct_unquantize_mpeg2_intra)(const MPVContext *s,
int16_t *block, int n, int qscale);
- void (*dct_unquantize_h263_intra)(MpegEncContext *s,
+ void (*dct_unquantize_h263_intra)(const MPVContext *s,
int16_t *block, int n, int qscale);
union {
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index cb4b99acd3..e21ce5164d 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -57,6 +57,8 @@ enum OutputFormat {
FMT_SPEEDHQ,
};
+typedef struct MpegEncContext MPVContext;
+
/**
* MpegEncContext.
*/
@@ -271,10 +273,10 @@ typedef struct MpegEncContext {
int interlaced_dct;
int first_field; ///< is 1 for the first field of a field picture
0 otherwise
- void (*dct_unquantize_intra)(struct MpegEncContext *s, // unquantizer to
use (MPEG-4 can use both)
- int16_t *block/*align 16*/, int n, int qscale);
- void (*dct_unquantize_inter)(struct MpegEncContext *s, // unquantizer to
use (MPEG-4 can use both)
- int16_t *block/*align 16*/, int n, int qscale);
+ void (*dct_unquantize_intra)(const MPVContext *s, // unquantizer to use
(MPEG-4 can use both)
+ int16_t *block/*align 16*/, int n, int
qscale);
+ void (*dct_unquantize_inter)(const MPVContext *s, // unquantizer to use
(MPEG-4 can use both)
+ int16_t *block/*align 16*/, int n, int
qscale);
/* flag to indicate a reinitialization is required, e.g. after
* a frame size change */
@@ -286,8 +288,6 @@ typedef struct MpegEncContext {
ERContext er;
} MpegEncContext;
-typedef MpegEncContext MPVContext;
-
/**
* Set the given MpegEncContext to common defaults (same for encoding
* and decoding). The changed fields will not depend upon the prior
diff --git a/libavcodec/mpegvideo_unquantize.c
b/libavcodec/mpegvideo_unquantize.c
index 213e37a514..06c29d0753 100644
--- a/libavcodec/mpegvideo_unquantize.c
+++ b/libavcodec/mpegvideo_unquantize.c
@@ -33,8 +33,8 @@
#include "mpegvideodata.h"
#include "mpegvideo_unquantize.h"
-static void dct_unquantize_mpeg1_intra_c(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_intra_c(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
int i, level, nCoeffs;
const uint16_t *quant_matrix;
@@ -62,8 +62,8 @@ static void dct_unquantize_mpeg1_intra_c(MpegEncContext *s,
}
}
-static void dct_unquantize_mpeg1_inter_c(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_inter_c(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
int i, level, nCoeffs;
const uint16_t *quant_matrix;
@@ -91,8 +91,8 @@ static void dct_unquantize_mpeg1_inter_c(MpegEncContext *s,
}
}
-static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_intra_c(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
int i, level, nCoeffs;
const uint16_t *quant_matrix;
@@ -120,8 +120,8 @@ static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
}
}
-static void dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_intra_bitexact(const MPVContext *s,
+ int16_t *block, int n, int
qscale)
{
int i, level, nCoeffs;
const uint16_t *quant_matrix;
@@ -153,8 +153,8 @@ static void
dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
block[63]^=sum&1;
}
-static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_inter_c(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
int i, level, nCoeffs;
const uint16_t *quant_matrix;
@@ -186,8 +186,8 @@ static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
block[63]^=sum&1;
}
-static void dct_unquantize_h263_intra_c(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_intra_c(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
int i, level, qmul, qadd;
int nCoeffs;
@@ -220,8 +220,8 @@ static void dct_unquantize_h263_intra_c(MpegEncContext *s,
}
}
-static void dct_unquantize_h263_inter_c(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_inter_c(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
int i, level, qmul, qadd;
int nCoeffs;
diff --git a/libavcodec/mpegvideo_unquantize.h
b/libavcodec/mpegvideo_unquantize.h
index 3e6d8aedf7..1a43f467c6 100644
--- a/libavcodec/mpegvideo_unquantize.h
+++ b/libavcodec/mpegvideo_unquantize.h
@@ -29,21 +29,21 @@
#include "config.h"
-typedef struct MpegEncContext MpegEncContext;
+typedef struct MpegEncContext MPVContext;
typedef struct MPVUnquantDSPContext {
- void (*dct_unquantize_mpeg1_intra)(struct MpegEncContext *s,
- int16_t *block/*align 16*/, int n, int qscale);
- void (*dct_unquantize_mpeg1_inter)(struct MpegEncContext *s,
- int16_t *block/*align 16*/, int n, int qscale);
- void (*dct_unquantize_mpeg2_intra)(struct MpegEncContext *s,
- int16_t *block/*align 16*/, int n, int qscale);
- void (*dct_unquantize_mpeg2_inter)(struct MpegEncContext *s,
- int16_t *block/*align 16*/, int n, int qscale);
- void (*dct_unquantize_h263_intra)(struct MpegEncContext *s,
- int16_t *block/*align 16*/, int n, int qscale);
- void (*dct_unquantize_h263_inter)(struct MpegEncContext *s,
- int16_t *block/*align 16*/, int n, int qscale);
+ void (*dct_unquantize_mpeg1_intra)(const MPVContext *s,
+ int16_t *block/*align 16*/, int n, int
qscale);
+ void (*dct_unquantize_mpeg1_inter)(const MPVContext *s,
+ int16_t *block/*align 16*/, int n, int
qscale);
+ void (*dct_unquantize_mpeg2_intra)(const MPVContext *s,
+ int16_t *block/*align 16*/, int n, int
qscale);
+ void (*dct_unquantize_mpeg2_inter)(const MPVContext *s,
+ int16_t *block/*align 16*/, int n, int
qscale);
+ void (*dct_unquantize_h263_intra)(const MPVContext *s,
+ int16_t *block/*align 16*/, int n, int
qscale);
+ void (*dct_unquantize_h263_inter)(const MPVContext *s,
+ int16_t *block/*align 16*/, int n, int
qscale);
} MPVUnquantDSPContext;
#if !ARCH_MIPS
diff --git a/libavcodec/neon/mpegvideo.c b/libavcodec/neon/mpegvideo.c
index a0276ad808..fdc57d3876 100644
--- a/libavcodec/neon/mpegvideo.c
+++ b/libavcodec/neon/mpegvideo.c
@@ -84,7 +84,7 @@ static void inline ff_dct_unquantize_h263_neon(int qscale,
int qadd, int nCoeffs
vst1_s16(block, d0s16);
}
-static void dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block,
+static void dct_unquantize_h263_inter_neon(const MPVContext *s, int16_t *block,
int n, int qscale)
{
int nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
@@ -93,7 +93,7 @@ static void dct_unquantize_h263_inter_neon(MpegEncContext *s,
int16_t *block,
ff_dct_unquantize_h263_neon(qscale, qadd, nCoeffs + 1, block);
}
-static void dct_unquantize_h263_intra_neon(MpegEncContext *s, int16_t *block,
+static void dct_unquantize_h263_intra_neon(const MPVContext *s, int16_t *block,
int n, int qscale)
{
int qadd;
diff --git a/libavcodec/ppc/mpegvideo_altivec.c
b/libavcodec/ppc/mpegvideo_altivec.c
index 26e98acfb8..ad3a783a87 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -40,8 +40,8 @@
/* AltiVec version of dct_unquantize_h263
this code assumes `block' is 16 bytes-aligned */
-static void dct_unquantize_h263_altivec(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_altivec(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
int i, qmul, qadd;
int nCoeffs;
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 8632acd412..4c3299362e 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -30,8 +30,8 @@
#if HAVE_MMX_INLINE
-static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_intra_mmx(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
x86_reg level, qmul, qadd, nCoeffs;
@@ -105,8 +105,8 @@ __asm__ volatile(
}
-static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_inter_mmx(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
x86_reg qmul, qadd, nCoeffs;
@@ -166,8 +166,8 @@ __asm__ volatile(
);
}
-static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_intra_mmx(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
@@ -235,8 +235,8 @@ __asm__ volatile(
block[0]= block0;
}
-static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_inter_mmx(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
@@ -301,8 +301,8 @@ __asm__ volatile(
);
}
-static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_intra_mmx(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
@@ -369,8 +369,8 @@ __asm__ volatile(
//Note, we do not do mismatch control for intra as errors cannot
accumulate
}
-static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
- int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_inter_mmx(const MPVContext *s,
+ int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
-----------------------------------------------------------------------
Summary of changes:
libavcodec/arm/asm-offsets.h | 1 +
libavcodec/arm/mpegvideo_arm.c | 6 +-
libavcodec/arm/mpegvideo_armv5te.c | 10 +-
libavcodec/arm/mpegvideo_neon.S | 20 +-
libavcodec/mips/h263dsp_mips.h | 6 +-
libavcodec/mips/mpegvideo_mips.h | 20 +-
libavcodec/mips/mpegvideo_mmi.c | 20 +-
libavcodec/mips/mpegvideo_msa.c | 6 +-
libavcodec/mpeg4videodec.h | 6 +-
libavcodec/mpegvideo.c | 15 -
libavcodec/mpegvideo.h | 24 +-
libavcodec/mpegvideo_unquantize.c | 42 ++-
libavcodec/mpegvideo_unquantize.h | 26 +-
libavcodec/neon/mpegvideo.c | 28 +-
libavcodec/ppc/mpegvideo_altivec.c | 105 +++---
libavcodec/x86/mpegvideo.c | 595 +++++++++++++++-------------------
tests/checkasm/Makefile | 1 +
tests/checkasm/checkasm.c | 3 +
tests/checkasm/checkasm.h | 1 +
tests/checkasm/mpegvideo_unquantize.c | 273 ++++++++++++++++
tests/fate/checkasm.mak | 1 +
21 files changed, 704 insertions(+), 505 deletions(-)
create mode 100644 tests/checkasm/mpegvideo_unquantize.c
hooks/post-receive
--
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]