The branch, master has been updated
via 89f984e3d1d8f1f009c616e0c6425094395fdbdc (commit)
via e6ae2802a39b24324bd6c394c073a26900a39f23 (commit)
via ada0a81577b8e6724545514024ae41f48633af46 (commit)
via 012c25bac4b65ff1064ac7a52e2ecce49770f351 (commit)
via b9cbbd9074974234a319f042a21875b69b18c6f2 (commit)
via 0ec9c1b68d2f0a0e30ec161659b90ad86fb1e386 (commit)
via 01ff05e4bc0b8f80da8db3a664270bf7976d81f0 (commit)
via b51cbd4116f3e612d148abffa47471acccf19a72 (commit)
via 18019f177e660b38725c4fb91334a6c25801ebdd (commit)
from 9cd4be6d7ce1c226f9b92c0de2ac8add7a90020f (commit)
- Log -----------------------------------------------------------------
commit 89f984e3d1d8f1f009c616e0c6425094395fdbdc
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 26 20:15:55 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sun Nov 30 00:15:43 2025 +0100
avcodec/x86/h264_idct: Fix ff_h264_luma_dc_dequant_idct_sse2 checkasm
failures
ff_h264_luma_dc_dequant_idct_sse2() does not pass checkasm for certain
seeds, because the input to packssdw no longer fits into an int16_t,
leading to saturation, where the C code just truncates. I don't know
whether the spec contains provisions that ensure that valid input
must not exceed 16 bit or whether such inputs (even if invalid)
can be triggered by the actual code and not only the test.
This commit adapts the behavior of the function to the C reference code
to fix the test. packssdw is avoided, instead the lower words are
directly transferred to GPRs to be written out. This has unfortunately
led to a slight performance regression here (14.5 vs 15.1 cycles).
Fixes issue #20835.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index d35d583ce7..47e4116f42 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -592,36 +592,58 @@ RET
psrad m1, %1
psrad m2, %1
psrad m3, %1
- packssdw m0, m1
- packssdw m2, m3
%endmacro
-%macro STORE_WORDS 9
- movd t0d, %1
- psrldq %1, 4
- movd t1d, %1
- psrldq %1, 4
- mov [t2+%2*32], t0w
- mov [t2+%4*32], t1w
- shr t0d, 16
- shr t1d, 16
+%macro STORE_WORDS 10
+%if ARCH_X86_64
+ movq t0, %1
+ movq t1, %2
+ psrldq %1, 8
+ psrldq %2, 8
mov [t2+%3*32], t0w
- mov [t2+%5*32], t1w
- movd t0d, %1
- psrldq %1, 4
- movd t1d, %1
+ mov [t2+%7*32], t1w
+ shr t0, 32
+ shr t1, 32
+ mov [t2+%4*32], t0w
+ mov [t2+%8*32], t1w
+ movq t0, %1
+ movq t1, %2
+ mov [t2+%5*32], t0w
+ mov [t2+%9*32], t1w
+ shr t0, 32
+ shr t1, 32
mov [t2+%6*32], t0w
+ mov [t2+%10*32], t1w
+%else
+ movd t0d, %1
+ movd t1d, %2
+ psrldq %1, 4
+ psrldq %2, 4
+ mov [t2+%3*32], t0w
+ mov [t2+%7*32], t1w
+ movd t0d, %1
+ movd t1d, %2
+ psrldq %1, 4
+ psrldq %2, 4
+ mov [t2+%4*32], t0w
mov [t2+%8*32], t1w
- shr t0d, 16
- shr t1d, 16
- mov [t2+%7*32], t0w
+ movd t0d, %1
+ movd t1d, %2
+ psrldq %1, 4
+ psrldq %2, 4
+ mov [t2+%5*32], t0w
mov [t2+%9*32], t1w
+ movd t0d, %1
+ movd t1d, %2
+ mov [t2+%6*32], t0w
+ mov [t2+%10*32], t1w
+%endif
%endmacro
%macro DEQUANT_STORE 1
DEQUANT %1
- STORE_WORDS m0, 0, 1, 4, 5, 2, 3, 6, 7
- STORE_WORDS m2, 8, 9, 12, 13, 10, 11, 14, 15
+ STORE_WORDS m0, m1, 0, 1, 4, 5, 2, 3, 6, 7
+ STORE_WORDS m2, m3, 8, 9, 12, 13, 10, 11, 14, 15
%endmacro
INIT_XMM sse2
commit e6ae2802a39b24324bd6c394c073a26900a39f23
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 26 17:48:01 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sun Nov 30 00:15:43 2025 +0100
avcodec/x86/h264_idct: Deduplicate generating constant
pw_1 is currently loaded in both codepaths. Generate it earlier instead.
Gives tiny speedups (15 vs 14.5 cycles) and reduces codesize.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index fe46107867..d35d583ce7 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -51,7 +51,6 @@ scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
%endif
cextern pw_32
-cextern pw_1
SECTION .text
@@ -577,9 +576,9 @@ RET
SWAP %1, %4, %3
%endmacro
+; requires m5 to contain pw_1
%macro DEQUANT 1
movd m4, t3d
- movq m5, [pw_1]
pshufd m4, m4, 0
punpcklwd m0, m5
punpcklwd m1, m5
@@ -635,6 +634,7 @@ cglobal h264_luma_dc_dequant_idct, 3, 4, 7
punpcklwd m0, m1
punpcklwd m2, m3
mova m4, m0
+ pcmpeqw m5, m5
punpckldq m0, m2
punpckhdq m4, m2
movhlps m1, m0
@@ -652,6 +652,7 @@ cglobal h264_luma_dc_dequant_idct, 3, 4, 7
%else
DECLARE_REG_TMP 1,3,0,2
%endif
+ psrlw m5, 15
cmp t3d, 32767
jg .big_qmul
commit ada0a81577b8e6724545514024ae41f48633af46
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 26 17:26:47 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sun Nov 30 00:15:43 2025 +0100
avcodec/x86/h264_idct: Don't use MMX registers in
ff_h264_luma_dc_dequant_idct_sse2
It is ABI compliant and gives a tiny speedup here (and is 16B smaller).
Old benchmarks:
h264_luma_dc_dequant_idct_8_c: 33.2 ( 1.00x)
h264_luma_dc_dequant_idct_8_sse2: 16.0 ( 2.07x)
New benchmarks:
h264_luma_dc_dequant_idct_8_c: 33.0 ( 1.00x)
h264_luma_dc_dequant_idct_8_sse2: 15.0 ( 2.20x)
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 50647f2454..fe46107867 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -578,27 +578,23 @@ RET
%endmacro
%macro DEQUANT 1
- movd xmm4, t3d
- movq xmm5, [pw_1]
- pshufd xmm4, xmm4, 0
- movq2dq xmm0, m0
- movq2dq xmm1, m1
- movq2dq xmm2, m2
- movq2dq xmm3, m3
- punpcklwd xmm0, xmm5
- punpcklwd xmm1, xmm5
- punpcklwd xmm2, xmm5
- punpcklwd xmm3, xmm5
- pmaddwd xmm0, xmm4
- pmaddwd xmm1, xmm4
- pmaddwd xmm2, xmm4
- pmaddwd xmm3, xmm4
- psrad xmm0, %1
- psrad xmm1, %1
- psrad xmm2, %1
- psrad xmm3, %1
- packssdw xmm0, xmm1
- packssdw xmm2, xmm3
+ movd m4, t3d
+ movq m5, [pw_1]
+ pshufd m4, m4, 0
+ punpcklwd m0, m5
+ punpcklwd m1, m5
+ punpcklwd m2, m5
+ punpcklwd m3, m5
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+ pmaddwd m3, m4
+ psrad m0, %1
+ psrad m1, %1
+ psrad m2, %1
+ psrad m3, %1
+ packssdw m0, m1
+ packssdw m2, m3
%endmacro
%macro STORE_WORDS 9
@@ -625,19 +621,25 @@ RET
%macro DEQUANT_STORE 1
DEQUANT %1
- STORE_WORDS xmm0, 0, 1, 4, 5, 2, 3, 6, 7
- STORE_WORDS xmm2, 8, 9, 12, 13, 10, 11, 14, 15
+ STORE_WORDS m0, 0, 1, 4, 5, 2, 3, 6, 7
+ STORE_WORDS m2, 8, 9, 12, 13, 10, 11, 14, 15
%endmacro
INIT_XMM sse2
cglobal h264_luma_dc_dequant_idct, 3, 4, 7
-INIT_MMX cpuname
movq m3, [r1+24]
movq m2, [r1+16]
movq m1, [r1+ 8]
movq m0, [r1+ 0]
WALSH4_1D 0,1,2,3,4
- TRANSPOSE4x4W 0,1,2,3,4
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ mova m4, m0
+ punpckldq m0, m2
+ punpckhdq m4, m2
+ movhlps m1, m0
+ movhlps m3, m4
+ SWAP 2, 4
WALSH4_1D 0,1,2,3,4
; shift, tmp, output, qmul
@@ -665,8 +667,8 @@ INIT_MMX cpuname
inc t1d
shr t3d, t0b
sub t1d, t0d
- movd xmm6, t1d
- DEQUANT_STORE xmm6
+ movd m6, t1d
+ DEQUANT_STORE m6
RET
%ifdef __NASM_VER__
diff --git a/tests/checkasm/h264dsp.c b/tests/checkasm/h264dsp.c
index f05ae419fc..acf4f61ebb 100644
--- a/tests/checkasm/h264dsp.c
+++ b/tests/checkasm/h264dsp.c
@@ -336,7 +336,7 @@ static void check_idct_dequant(void)
LOCAL_ALIGNED_16(int32_t, dst1_32, [16 * 16]);
H264DSPContext h;
int bit_depth, i, qmul;
- declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_SSE2, void, int16_t
*output, int16_t *input, int qmul);
+ declare_func(void, int16_t *output, int16_t *input, int qmul);
qmul = rnd() % 4096;
commit 012c25bac4b65ff1064ac7a52e2ecce49770f351
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 26 15:59:03 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sun Nov 30 00:15:43 2025 +0100
avcodec/x86/h264_idct: Zero with full-width stores
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 4b9efd6d6d..50647f2454 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -90,10 +90,15 @@ SECTION .text
paddw m0, m6
IDCT4_1D w, 0, 1, 2, 3, 4, 5
pxor m7, m7
- movq [%2+ 0], m7
- movq [%2+ 8], m7
- movq [%2+16], m7
- movq [%2+24], m7
+ %if mmsize == 16
+ mova [%2+ 0], m7
+ mova [%2+16], m7
+ %else
+ movq [%2+ 0], m7
+ movq [%2+ 8], m7
+ movq [%2+16], m7
+ movq [%2+24], m7
+ %endif
STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
lea %1, [%1+%3*2]
commit b9cbbd9074974234a319f042a21875b69b18c6f2
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 26 15:23:31 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sun Nov 30 00:15:43 2025 +0100
avcodec/x86/h264_idct: Use tail call where advantageous
It is possible on UNIX64.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 9405aa848a..4b9efd6d6d 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -55,6 +55,19 @@ cextern pw_1
SECTION .text
+; %1=callee, %2=dst to jump to if tail call is impossible (can be empty,
+; then no jmp is performed), %3=current iteration, %4=last iteration
+%macro TAIL_CALL_IF_LAST 4
+%if (%3 == %4) && !has_epilogue
+ jmp %1
+%else
+ call %1
+ %ifnempty %2
+ jmp %2
+ %endif
+%endif
+%endmacro
+
; %1=uint8_t *dst, %2=int16_t *block, %3=ptrdiff_t stride
%macro IDCT4_ADD 3
; Load dct coeffs
@@ -424,7 +437,7 @@ h264_add8x4_idct_sse2:
%else
add r0, r0m
%endif
- call h264_add8x4_idct_sse2
+ TAIL_CALL_IF_LAST h264_add8x4_idct_sse2, , %1, 7
.cycle%1end:
%if %1 < 7
add r2, 64
@@ -461,8 +474,7 @@ RET
%else
add r0, r0m
%endif
- call h264_add8x4_idct_sse2
- jmp .cycle%1end
+ TAIL_CALL_IF_LAST h264_add8x4_idct_sse2, .cycle%1end, %1, 7
.try%1dc:
movsx r0, word [r2 ]
or r0w, word [r2+32]
@@ -473,7 +485,7 @@ RET
%else
add r0, r0m
%endif
- call h264_idct_dc_add8_mmxext
+ TAIL_CALL_IF_LAST h264_idct_dc_add8_mmxext, , %1, 7
.cycle%1end:
%if %1 < 7
add r2, 64
@@ -510,8 +522,7 @@ RET
mov r0, [r0]
add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
- call h264_add8x4_idct_sse2
- jmp .cycle%1end
+ TAIL_CALL_IF_LAST h264_add8x4_idct_sse2, .cycle%1end, %1, 3
.try%1dc:
movsx r0, word [r2 ]
or r0w, word [r2+32]
@@ -524,7 +535,7 @@ RET
mov r0, [r0]
add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
- call h264_idct_dc_add8_mmxext
+ TAIL_CALL_IF_LAST h264_idct_dc_add8_mmxext, , %1, 3
.cycle%1end:
%if %1 == 1
add r2, 384+64
commit 0ec9c1b68d2f0a0e30ec161659b90ad86fb1e386
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 26 14:57:45 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sun Nov 30 00:15:43 2025 +0100
avutil/x86/x86inc: Use parentheses in has_epilogue
Prevents surprises.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index e61d924bc1..0e80ebed43 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -609,7 +609,7 @@ DECLARE_REG 14, R13, 120
RESET_STACK_STATE
%endmacro
-%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required ||
xmm_regs_used > 6+high_mm_regs
+%define has_epilogue (regs_used > 7 || stack_size > 0 || vzeroupper_required
|| xmm_regs_used > 6+high_mm_regs)
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL
@@ -658,7 +658,7 @@ DECLARE_REG 14, R13, 72
%endif
%endmacro
-%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
+%define has_epilogue (regs_used > 9 || stack_size > 0 || vzeroupper_required)
%macro RET 0
%if stack_size_padded > 0
@@ -722,7 +722,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%endif
%endmacro
-%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
+%define has_epilogue (regs_used > 3 || stack_size > 0 || vzeroupper_required)
%macro RET 0
%if stack_size_padded > 0
commit 01ff05e4bc0b8f80da8db3a664270bf7976d81f0
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 26 14:09:57 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sun Nov 30 00:15:43 2025 +0100
avcodec/x86/h264_idct: Avoid call where possible
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 6863dbcb4d..9405aa848a 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -316,29 +316,6 @@ INIT_XMM cpuname
RET
INIT_MMX mmx
-h264_idct_add8_mmx_plane:
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- or r6w, word [r2]
- test r6, r6
- jz .skipblock
-%if ARCH_X86_64
- mov r0d, dword [r1+r5*4]
- add r0, [dst2q]
-%else
- mov r0, r1m ; XXX r1m here is actually r0m of the calling func
- mov r0, [r0]
- add r0, dword [r1+r5*4]
-%endif
- IDCT4_ADD r0, r2, r3
-.skipblock:
- inc r5
- add r2, 32
- test r5, 3
- jnz .nextblock
- rep ret
-
cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block,
stride, nnzc, cntr, coeff, dst2, picreg
; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
movsxdifnidn r3, r3d
@@ -367,9 +344,31 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1,
block_offset, block, str
call h264_idct_add8_mmx_plane
add r5, 4
- call h264_idct_add8_mmx_plane
+ TAIL_CALL h264_idct_add8_mmx_plane, 0
+
+h264_idct_add8_mmx_plane:
+.nextblock:
+ movzx r6d, byte [scan8+r5]
+ movzx r6d, byte [r4+r6]
+ or r6w, word [r2]
+ test r6d, r6d
+ jz .skipblock
+%if ARCH_X86_64
+ mov r0d, dword [r1+r5*4]
+ add r0, [dst2q]
+%else
+ mov r0, r1m ; XXX r1m here is actually r0m of the calling func
+ mov r0, [r0]
+ add r0, dword [r1+r5*4]
+%endif
+ IDCT4_ADD r0, r2, r3
+.skipblock:
+ inc r5d
+ add r2, 32
+ test r5d, 3
+ jnz .nextblock
+ rep ret
- RET ; TODO: check rep ret after a function call
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = ptrdiff_t stride, r6=clobbered
h264_idct_dc_add8_mmxext:
commit b51cbd4116f3e612d148abffa47471acccf19a72
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 26 12:38:58 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sun Nov 30 00:15:43 2025 +0100
avcodec/x86/h264_idct: Remove redundant movsxdifnidn
Only exported (i.e. cglobal) functions need it; stride is already
sign-extended when it reaches any of the internal functions used here,
so don't sign-extend again.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 985955d96a..6863dbcb4d 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -55,7 +55,7 @@ cextern pw_1
SECTION .text
-; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
+; %1=uint8_t *dst, %2=int16_t *block, %3=ptrdiff_t stride
%macro IDCT4_ADD 3
; Load dct coeffs
movq m0, [%2]
@@ -145,7 +145,7 @@ SECTION .text
IDCT8_1D [%1], [%1+ 64]
%endmacro
-; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
+; %1=uint8_t *dst, %2=int16_t *block, %3=ptrdiff_t stride
%macro IDCT8_ADD_SSE 4
IDCT8_1D_FULL %2
%if ARCH_X86_64
@@ -317,7 +317,6 @@ INIT_XMM cpuname
INIT_MMX mmx
h264_idct_add8_mmx_plane:
- movsxdifnidn r3, r3d
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
@@ -372,9 +371,8 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1,
block_offset, block, str
RET ; TODO: check rep ret after a function call
-; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
+; r0 = uint8_t *dst, r2 = int16_t *block, r3 = ptrdiff_t stride, r6=clobbered
h264_idct_dc_add8_mmxext:
- movsxdifnidn r3, r3d
movd m0, [r2 ] ; 0 0 X D
mov word [r2+ 0], 0
punpcklwd m0, [r2+32] ; x X d D
@@ -393,9 +391,8 @@ h264_idct_dc_add8_mmxext:
ALIGN 16
INIT_XMM sse2
-; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
+; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = ptrdiff_t stride
h264_add8x4_idct_sse2:
- movsxdifnidn r3, r3d
movq m0, [r2+ 0]
movq m1, [r2+ 8]
movq m2, [r2+16]
commit 18019f177e660b38725c4fb91334a6c25801ebdd
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 26 12:26:37 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sun Nov 30 00:15:43 2025 +0100
avcodec/x86/h264idct: Remove dead MMX macros
Forgotten in 4618f36a2424a3a4d5760afabc2e9dd18d73f0a4.
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index d9c3c9c862..985955d96a 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -145,61 +145,6 @@ SECTION .text
IDCT8_1D [%1], [%1+ 64]
%endmacro
-; %1=int16_t *block, %2=int16_t *dstblock
-%macro IDCT8_ADD_MMX_START 2
- IDCT8_1D_FULL %1
- mova [%1], m7
- TRANSPOSE4x4W 0, 1, 2, 3, 7
- mova m7, [%1]
- mova [%2 ], m0
- mova [%2+16], m1
- mova [%2+32], m2
- mova [%2+48], m3
- TRANSPOSE4x4W 4, 5, 6, 7, 3
- mova [%2+ 8], m4
- mova [%2+24], m5
- mova [%2+40], m6
- mova [%2+56], m7
-%endmacro
-
-; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
-%macro IDCT8_ADD_MMX_END 3-4
- IDCT8_1D_FULL %2
- mova [%2 ], m5
- mova [%2+16], m6
- mova [%2+32], m7
-
- pxor m7, m7
-%if %0 == 4
- movq [%4+ 0], m7
- movq [%4+ 8], m7
- movq [%4+ 16], m7
- movq [%4+ 24], m7
- movq [%4+ 32], m7
- movq [%4+ 40], m7
- movq [%4+ 48], m7
- movq [%4+ 56], m7
- movq [%4+ 64], m7
- movq [%4+ 72], m7
- movq [%4+ 80], m7
- movq [%4+ 88], m7
- movq [%4+ 96], m7
- movq [%4+104], m7
- movq [%4+112], m7
- movq [%4+120], m7
-%endif
- STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
- lea %1, [%1+%3*2]
- STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
- mova m0, [%2 ]
- mova m1, [%2+16]
- mova m2, [%2+32]
- lea %1, [%1+%3*2]
- STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
- lea %1, [%1+%3*2]
- STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
-%endmacro
-
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
IDCT8_1D_FULL %2
@@ -612,7 +557,7 @@ cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
add8_sse2_cycle 3, 0x64
RET
-;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int
qmul)
+;void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int
qmul)
%macro WALSH4_1D 5
SUMSUB_BADC w, %4, %3, %2, %1, %5
@@ -620,8 +565,7 @@ RET
SWAP %1, %4, %3
%endmacro
-%macro DEQUANT 1-3
-%if cpuflag(sse2)
+%macro DEQUANT 1
movd xmm4, t3d
movq xmm5, [pw_1]
pshufd xmm4, xmm4, 0
@@ -643,31 +587,9 @@ RET
psrad xmm3, %1
packssdw xmm0, xmm1
packssdw xmm2, xmm3
-%else
- mova m7, [pw_1]
- mova m4, %1
- punpcklwd %1, m7
- punpckhwd m4, m7
- mova m5, %2
- punpcklwd %2, m7
- punpckhwd m5, m7
- movd m7, t3d
- punpckldq m7, m7
- pmaddwd %1, m7
- pmaddwd %2, m7
- pmaddwd m4, m7
- pmaddwd m5, m7
- psrad %1, %3
- psrad %2, %3
- psrad m4, %3
- psrad m5, %3
- packssdw %1, m4
- packssdw %2, m5
-%endif
%endmacro
-%macro STORE_WORDS 5-9
-%if cpuflag(sse)
+%macro STORE_WORDS 9
movd t0d, %1
psrldq %1, 4
movd t1d, %1
@@ -687,33 +609,12 @@ RET
shr t1d, 16
mov [t2+%7*32], t0w
mov [t2+%9*32], t1w
-%else
- movd t0d, %1
- psrlq %1, 32
- movd t1d, %1
- mov [t2+%2*32], t0w
- mov [t2+%4*32], t1w
- shr t0d, 16
- shr t1d, 16
- mov [t2+%3*32], t0w
- mov [t2+%5*32], t1w
-%endif
%endmacro
%macro DEQUANT_STORE 1
-%if cpuflag(sse2)
DEQUANT %1
STORE_WORDS xmm0, 0, 1, 4, 5, 2, 3, 6, 7
STORE_WORDS xmm2, 8, 9, 12, 13, 10, 11, 14, 15
-%else
- DEQUANT m0, m1, %1
- STORE_WORDS m0, 0, 1, 4, 5
- STORE_WORDS m1, 2, 3, 6, 7
-
- DEQUANT m2, m3, %1
- STORE_WORDS m2, 8, 9, 12, 13
- STORE_WORDS m3, 10, 11, 14, 15
-%endif
%endmacro
INIT_XMM sse2
-----------------------------------------------------------------------
Summary of changes:
libavcodec/x86/h264_idct.asm | 310 +++++++++++++++++--------------------------
libavutil/x86/x86inc.asm | 6 +-
tests/checkasm/h264dsp.c | 2 +-
3 files changed, 128 insertions(+), 190 deletions(-)
hooks/post-receive
--
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]