The branch, master has been updated
via a66a260ae9da037b1f14ad4d110de3bceb5b42e9 (commit)
from 9a34ddc345ddbc4810799fad65c7fa9d65b15eb1 (commit)
- Log -----------------------------------------------------------------
commit a66a260ae9da037b1f14ad4d110de3bceb5b42e9
Author: Henrik Gramner <[email protected]>
AuthorDate: Mon Sep 1 02:03:00 2025 +0200
Commit: Henrik Gramner <[email protected]>
CommitDate: Mon Sep 1 13:54:52 2025 +0000
vp9: Add 8bpc intra prediction AVX2 asm
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 9836b3321c..bbabcf38c3 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -207,11 +207,8 @@ ipred_dir_tm_h_funcs(8, avx);
ipred_dir_tm_h_funcs(16, avx);
ipred_dir_tm_h_funcs(32, avx);
-ipred_func(32, v, avx);
-
-ipred_dc_funcs(32, avx2);
-ipred_func(32, h, avx2);
-ipred_func(32, tm, avx2);
+ipred_all_funcs(32, avx2);
+ipred_func(32, v, avx2);
#undef ipred_func
#undef ipred_dir_tm_h_funcs
@@ -388,7 +385,6 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
if (EXTERNAL_AVX_FAST(cpu_flags)) {
init_fpel_func(1, 0, 32, put, , avx);
init_fpel_func(0, 0, 64, put, , avx);
- init_ipred(32, avx, v, VERT);
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
@@ -408,9 +404,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
init_subpel3_32_64(1, avg, 8, avx2);
#endif
}
- init_dc_ipred(32, avx2);
- init_ipred(32, avx2, h, HOR);
- init_ipred(32, avx2, tm, TM_VP8);
+ init_all_ipred(32, avx2);
+ init_ipred(32, avx2, v, VERT);
}
#if ARCH_X86_64
diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm
index 31f7d449fd..b67addd7e3 100644
--- a/libavcodec/x86/vp9intrapred.asm
+++ b/libavcodec/x86/vp9intrapred.asm
@@ -2,6 +2,7 @@
;* VP9 Intra prediction SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;* Copyright (c) 2025 Two Orioles, LLC
;*
;* Parts based on:
;* H.264 intra prediction asm optimizations
@@ -230,40 +231,6 @@ DC_16to32_FUNCS
INIT_XMM ssse3
DC_16to32_FUNCS
-%if HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
-cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
- mova m0, [lq]
- mova m1, [aq]
- DEFINE_ARGS dst, stride, stride3, cnt
- lea stride3q, [strideq*3]
- pxor m2, m2
- psadbw m0, m2
- psadbw m1, m2
- paddw m0, m1
- vextracti128 xm1, m0, 1
- paddw xm0, xm1
- movhlps xm1, xm0
- paddw xm0, xm1
- pmulhrsw xm0, [pw_512]
- vpbroadcastb m0, xm0
- mov cntd, 4
-.loop:
- mova [dstq+strideq*0], m0
- mova [dstq+strideq*1], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- mova [dstq+strideq*0], m0
- mova [dstq+strideq*1], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- dec cntd
- jg .loop
- RET
-%endif
-
; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
%macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l)
@@ -395,44 +362,6 @@ INIT_XMM ssse3
DC_1D_16to32_FUNCS top, a
DC_1D_16to32_FUNCS left, l
-%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l)
-%if HAVE_AVX2_EXTERNAL
-cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
- mova m0, [%2q]
- DEFINE_ARGS dst, stride, stride3, cnt
- lea stride3q, [strideq*3]
- pxor m2, m2
- psadbw m0, m2
- vextracti128 xm1, m0, 1
- paddw xm0, xm1
- movhlps xm1, xm0
- paddw xm0, xm1
- pmulhrsw xm0, [pw_1024]
- vpbroadcastb m0, xm0
- mov cntd, 4
-.loop:
- mova [dstq+strideq*0], m0
- mova [dstq+strideq*1], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- mova [dstq+strideq*0], m0
- mova [dstq+strideq*1], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- dec cntd
- jg .loop
- RET
-%endif
-%endmacro
-
-INIT_YMM avx2
-DC_1D_AVX2_FUNCS top, a
-DC_1D_AVX2_FUNCS left, l
-
-; v
-
INIT_MMX mmx
cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
movq m0, [aq]
@@ -486,29 +415,6 @@ cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
jg .loop
RET
-INIT_YMM avx
-cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
- mova m0, [aq]
- DEFINE_ARGS dst, stride, stride3, cnt
- lea stride3q, [strideq*3]
- mov cntd, 4
-.loop:
- mova [dstq+strideq*0], m0
- mova [dstq+strideq*1], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- mova [dstq+strideq*0], m0
- mova [dstq+strideq*1], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- dec cntd
- jg .loop
- RET
-
-; h
-
%macro H_XMM_FUNCS 2
%if notcpuflag(avx)
cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
@@ -642,34 +548,6 @@ H_XMM_FUNCS 4, 8
INIT_XMM avx
H_XMM_FUNCS 4, 8
-%if HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
-cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
- mova m5, [pb_1]
- mova m6, [pb_2]
- mova m7, [pb_3]
- pxor m4, m4
- lea stride3q, [strideq*3]
- mov cntq, 7
-.loop:
- movd xm3, [lq+cntq*4]
- vinserti128 m3, m3, xm3, 1
- pshufb m0, m3, m7
- pshufb m1, m3, m6
- mova [dstq+strideq*0], m0
- mova [dstq+strideq*1], m1
- pshufb m2, m3, m5
- pshufb m3, m4
- mova [dstq+strideq*2], m2
- mova [dstq+stride3q ], m3
- lea dstq, [dstq+strideq*4]
- dec cntq
- jge .loop
- RET
-%endif
-
-; tm
-
%macro TM_MMX_FUNCS 0
cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
pxor m1, m1
@@ -898,46 +776,9 @@ TM_XMM_FUNCS
INIT_XMM avx
TM_XMM_FUNCS
-%if HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
-cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
- pxor m3, m3
- pinsrw xm2, [aq-1], 0
- vinserti128 m2, m2, xm2, 1
- mova m0, [aq]
- DEFINE_ARGS dst, stride, l, cnt
- mova m4, [pw_m256]
- mova m5, [pw_m255]
- pshufb m2, m4
- punpckhbw m1, m0, m3
- punpcklbw m0, m3
- psubw m1, m2
- psubw m0, m2
- mov cntq, 15
-.loop:
- pinsrw xm7, [lq+cntq*2], 0
- vinserti128 m7, m7, xm7, 1
- pshufb m3, m7, m5
- pshufb m7, m4
- paddw m2, m3, m0
- paddw m3, m1
- paddw m6, m7, m0
- paddw m7, m1
- packuswb m2, m3
- packuswb m6, m7
- mova [dstq+strideq*0], m2
- mova [dstq+strideq*1], m6
- lea dstq, [dstq+strideq*2]
- dec cntq
- jge .loop
- RET
-%endif
-
-; dl
-
-%macro LOWPASS 4 ; left [dst], center, right, tmp
+%macro LOWPASS 4-5 [pb_1] ; left [dst], center, right, tmp, pb_1
pxor m%4, m%1, m%3
- pand m%4, [pb_1]
+ pand m%4, %5
pavgb m%1, m%3
psubusb m%1, m%4
pavgb m%1, m%2
@@ -2041,4 +1882,306 @@ HU_XMM_FUNCS 7
INIT_XMM avx
HU_XMM_FUNCS 7
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a ; DC predictor: every pixel = scaled sum of the 32 bytes at l and the 32 bytes at a
+ pxor m1, m1 ; zero operand, so the psadbw below yields plain byte sums
+ psadbw m0, m1, [lq] ; m0 = four partial sums (one per 8 bytes) of l[0..31]
+ psadbw m1, [aq] ; m1 = four partial sums of a[0..31]
+ movd xm2, [pw_512] ; pmulhrsw multiplier; presumably the word 512, i.e. (sum + 32) >> 6 - confirm against the constant's definition
+ paddw m0, m1
+ vextracti128 xm1, m0, 1 ; upper lane's partial sums, added to the lower lane at .main
+.main: ; shared reduction, also jumped to by dc_left: expects partial sums in xm0/xm1 and the multiplier in xm2
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0 ; bring the high qword's partial sum down to the low qword
+ paddw xm0, xm1 ; low word of xm0 = total of all summed bytes
+ pmulhrsw xm0, xm2 ; (total * multiplier + 0x4000) >> 15
+ vpbroadcastb m0, xm0 ; copy the low byte of the result into all 32 bytes
+.main2: ; shared fill loop, also jumped to by the v predictor: stores m0 to 32 rows
+ lea r2, [strideq*3]
+ mov r3d, 8 ; 8 iterations of 4 rows each
+.loop:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+r2 ], m0
+ lea dstq, [dstq+strideq*4]
+ dec r3d
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dc_top_32x32, 0, 4, 3, dst, stride, l, a ; DC from the bytes at a only: points l at a and reuses the dc_left code
+ mov lq, amp ; l = a
+%if ARCH_X86_32
+ jmp mangle(private_prefix %+ _vp9_ipred_dc_left_32x32 %+ SUFFIX).main ; enter dc_left after its own load of l
+%endif
+
+%assign function_align 1 ; NOTE(review): presumably removes entry padding so the preceding function can run straight into this one - confirm against x86inc
+cglobal vp9_ipred_dc_left_32x32, 0, 4, 3, dst, stride, l, a ; DC from the 32 bytes at l only
+ movifnidn lq, lmp ; fetch l unless it is already in its register
+.main: ; entry used by dc_top on x86-32, with lq already set
+ movifnidn dstq, dstmp
+ movifnidn strideq, stridemp
+ pxor xm1, xm1 ; zero operand for the byte sums
+ psadbw xm0, xm1, [lq] ; partial sums of l[0..15]
+ psadbw xm1, [lq+16] ; partial sums of l[16..31]
+ movd xm2, [pw_1024] ; pmulhrsw multiplier; presumably the word 1024, i.e. (sum + 16) >> 5 - confirm against the constant's definition
+ jmp mangle(private_prefix %+ _vp9_ipred_dc_32x32 %+ SUFFIX).main ; reduce, broadcast and store in the shared dc tail
+
+cglobal vp9_ipred_v_32x32, 2, 4, 3, dst, stride, l, a ; vertical predictor: every row is a copy of the 32 bytes at a
+ movifnidn aq, amp ; fetch a unless it is already in its register
+ mova m0, [aq] ; m0 = a[0..31], loaded before the shared loop reuses r3 as its counter
+ jmp mangle(private_prefix %+ _vp9_ipred_dc_32x32 %+ SUFFIX).main2 ; reuse the dc function's 32-row fill loop
+
+%assign function_align 16
+cglobal vp9_ipred_h_32x32, 3, 5, 6, dst, stride, l ; horizontal predictor: each row is filled with a single byte taken from l
+ vpbroadcastd m2, [pb_3] ; m2..m5 are pshufb controls; pb_N presumably holds N in every byte, so each selects byte N of the group - confirm
+ mov r3d, 7 ; index of the 4-byte group of l, counting down from 7 to 0
+ vpbroadcastd m3, [pb_2]
+ pxor m5, m5 ; all-zero control: selects byte 0
+ vpbroadcastd m4, [pb_1]
+ lea r4, [strideq*3]
+.loop:
+ vpbroadcastd m1, [lq+r3*4] ; the current 4 bytes of l, replicated across the register
+ pshufb m0, m1, m2 ; first row of the group uses the m2 (pb_3) control
+ mova [dstq+strideq*0], m0
+ pshufb m0, m1, m3
+ mova [dstq+strideq*1], m0
+ pshufb m0, m1, m4
+ mova [dstq+strideq*2], m0
+ pshufb m1, m5 ; fourth row: byte 0 of the group
+ mova [dstq+r4 ], m1
+ lea dstq, [dstq+strideq*4]
+ dec r3d
+ jge .loop ; the r3 == 0 group is still processed, giving 8 groups = 32 rows
+ RET
+
+cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a ; TM predictor: pixel = clip_u8(l[row byte] + a[x] - a[-1])
+ vpbroadcastd m0, [aq-1] ; byte 0 of every dword = a[-1] (the byte just before a)
+ mova m7, [aq]
+ pxor m1, m1 ; zero for the byte->word unpack below
+ vpbroadcastd m4, [pw_m255] ; pshufb control; presumably "byte 1 into every word, high byte cleared" - confirm against the constant
+ mov r3d, 15 ; index of the byte pair of l, counting down from 15 to 0
+ vpbroadcastd m5, [pw_m256] ; pshufb control; presumably "byte 0 into every word, high byte cleared" - confirm against the constant
+ pshufb m0, m5 ; m0 = a[-1] as 16-bit words (under the assumption above)
+ punpcklbw m6, m7, m1 ; low 8 bytes of each lane of a, widened to words
+ punpckhbw m7, m1 ; high 8 bytes of each lane of a, widened to words
+ psubw m6, m0 ; a[x] - a[-1]
+ psubw m7, m0
+.loop:
+ vpbroadcastd m3, [lq+r3*2] ; 4-byte load at l + 2*r3; only its first two bytes are selected by the shuffles (under the assumption above)
+ pshufb m2, m3, m4 ; left byte for the first row of the pair
+ pshufb m3, m5 ; left byte for the second row of the pair
+ paddw m0, m2, m6 ; m0 is free to reuse: a[-1] is already folded into m6/m7
+ paddw m2, m7
+ paddw m1, m3, m6
+ paddw m3, m7
+ packuswb m0, m2 ; saturate to bytes; in-lane pack undoes the in-lane unpack order
+ packuswb m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ dec r3d
+ jge .loop ; 16 iterations of 2 rows each
+ RET
+
+cglobal vp9_ipred_dl_32x32, 2, 5, 6, dst, stride, l, a ; down-left predictor: row y is the low-pass filtered top edge advanced by y bytes, padded with a[31]
+ movifnidn aq, amp ; fetch a unless it is already in its register
+ vpbroadcastb m2, [aq+31] ; m2 = a[31] in every byte (padding past the end of a)
+ vinserti128 m3, m2, [aq+16], 0 ; m3 = a[16..31] followed by 16 padding bytes
+ mova m0, [aq+ 0] ; m0 = a[0..31]
+ vpbroadcastd m5, [pb_1] ; mask operand handed to LOWPASS
+ palignr m4, m3, m0, 2 ; m4 = a[i+2], padded
+ lea r3, [strideq*2]
+ palignr m3, m0, 1 ; m3 = a[i+1], padded
+ LOWPASS 0, 3, 4, 1, m5 ; m0 = LOWPASS(left a[i], center a[i+1], right a[i+2]); m1 is scratch
+ lea r4, [strideq*3]
+ vperm2i128 m1, m0, m2, 0x31 ; m1 = upper half of m0 followed by padding (= m0 advanced by 16 bytes)
+ mov r2d, 8 ; each iteration writes rows y, y+8, y+16 and y+24
+.loop:
+ shufpd m3, m0, m1, 0x05 ; m3 = m0 advanced by 8 bytes
+ mova [dstq+r3*0], m0 ; row y
+ punpckhqdq m4, m1, m2 ; m4 = m1 advanced by 8 bytes, refilled with padding
+ mova [dstq+r3*4], m3 ; row y+8 (r3*4 = stride*8)
+ palignr m0, m1, m0, 1 ; advance m0 by one byte for the next row
+ mova [dstq+r3*8], m1 ; row y+16
+ palignr m1, m2, m1, 1 ; advance m1 by one byte, pulling in padding
+ mova [dstq+r4*8], m4 ; row y+24 (r4*8 = stride*24)
+ add dstq, strideq
+ dec r2d
+ jg .loop
+ RET
+
+cglobal vp9_ipred_dr_32x32, 4, 5, 7, dst, stride, l, a ; down-right predictor: rows are 32-byte windows into a 64-byte filtered edge (filtered l followed by filtered a), moving back one byte per row
+ mova m3, [lq+ 0] ; m3 = l[0..31]
+ movu m1, [aq- 1] ; m1 = a[-1..30]
+ mova m0, [aq+ 0] ; m0 = a[0..31]
+ vpbroadcastd m6, [pb_1] ; mask operand handed to LOWPASS
+ vperm2i128 m2, m3, m1, 0x21 ; m2 = l[16..31] followed by a[-1..14]
+ lea r3, [strideq*2]
+ palignr m4, m1, m2, 15 ; m4 = l[31] followed by a[-1..29]
+ LOWPASS 0, 1, 4, 5, m6 ; m0 = filtered top edge (center operand is a[i-1]); m5 is scratch
+ pslldq xm4, xm3, 1 ; low lane: a zero byte followed by l[0..14]
+ palignr m2, m3, 1 ; m2 = l[1..31] followed by a[-1]
+ vinserti128 m4, [lq+15], 1 ; high lane: l[15..30], so m4 = l[i-1] (byte 0 is zero filler)
+ LOWPASS 2, 3, 4, 5, m6 ; m2 = filtered left edge (center operand is l[i])
+ lea r4, [strideq*3]
+ vperm2i128 m1, m2, m0, 0x21 ; m1 = upper half of m2 followed by lower half of m0 (the middle of the edge)
+ mov r2d, 8 ; each iteration writes rows y, y+8, y+16 and y+24
+.loop:
+ shufpd m3, m1, m0, 0x05 ; window 8 bytes before m0
+ mova [dstq+r3*0], m0 ; row y
+ shufpd m4, m2, m1, 0x05 ; window 8 bytes before m1
+ mova [dstq+r3*4], m3 ; row y+8 (r3*4 = stride*8)
+ palignr m0, m1, 15 ; move m0 back one byte, pulling bytes in from m1
+ mova [dstq+r3*8], m1 ; row y+16
+ palignr m1, m2, 15 ; move m1 back one byte, pulling bytes in from m2
+ mova [dstq+r4*8], m4 ; row y+24 (r4*8 = stride*24)
+ add dstq, strideq
+ pslldq m2, 1 ; move m2 back one byte per lane (zeros shift in at the low end)
+ dec r2d
+ jg .loop
+ RET
+
+cglobal vp9_ipred_hd_32x32, 4, 6, 7, dst, stride, l, a ; horizontal-down predictor: rows are 32-byte windows into interleaved (average, filtered) bytes of l followed by the filtered top edge, two bytes apart per row
+ movu m1, [aq-1] ; m1 = a[-1..30]
+ mova m0, [lq] ; m0 = l[0..31]
+ vpbroadcastd m6, [pb_1] ; mask operand handed to LOWPASS
+ vperm2i128 m4, m0, m1, 0x21 ; m4 = l[16..31] followed by a[-1..14]
+ palignr m3, m4, m0, 1 ; m3 = l[1..31] followed by a[-1]
+ palignr m4, m0, 2 ; m4 = l[2..31] followed by a[-1], a[0]
+ LOWPASS 4, 3, 0, 2, m6 ; m4 = filtered left edge (center operand is m3); m2 is scratch
+ pavgb m3, m0 ; m3 = rounded average of adjacent left-edge bytes
+ movu xm0, [aq+15] ; xm0 = a[15..30]; upper lane of m0 is zeroed
+ punpcklbw m2, m3, m4 ; interleave average/filtered bytes, low 8 of each lane
+ punpckhbw m3, m4 ; interleave average/filtered bytes, high 8 of each lane
+ palignr m4, m0, m1, 2 ; m4 = a[1..30] followed by two zero filler bytes
+ palignr m0, m1, 1 ; m0 = a[0..30] followed by one zero filler byte
+ LOWPASS 4, 0, 1, 5, m6 ; m4 = filtered top edge (center operand is a[i]); m5 is scratch
+ lea r2, [strideq*8] ; r2 = row offset inside each 8-row band, counted down in the loop
+ vinserti128 m0, m2, xm3, 1 ; m0 = interleaved bytes 0..31
+ lea r3, [dstq+r2*1] ; r3 = dst + 8 rows
+ vpblendd m1, m2, m3, 0x0f ; m1 = interleaved bytes 16..47
+ lea r4, [dstq+r2*2] ; r4 = dst + 16 rows
+ vperm2i128 m2, m3, 0x31 ; m2 = interleaved bytes 32..63
+ lea r5, [r3 +r2*2] ; r5 = dst + 24 rows
+ vperm2i128 m3, m4, 0x21 ; m3 = interleaved bytes 48..63 followed by the lower half of the filtered top edge
+.loop:
+ sub r2, strideq ; sets the flags tested by jg below; the vector instructions in between leave them untouched
+ mova [r5 +r2], m0 ; band of rows 24..31, written bottom-up
+ palignr m0, m1, m0, 2 ; advance every window by 2 bytes for the row above
+ mova [r4 +r2], m1 ; band of rows 16..23
+ palignr m1, m2, m1, 2
+ mova [r3 +r2], m2 ; band of rows 8..15
+ palignr m2, m3, m2, 2
+ mova [dstq+r2], m3 ; band of rows 0..7
+ palignr m3, m4, m3, 2
+ psrldq m4, 2
+ jg .loop ; loop until the offset reaches 0 (8 iterations for a positive stride)
+ RET
+
+cglobal vp9_ipred_hu_32x32, 3, 5, 6, dst, stride, l, a ; horizontal-up predictor: rows are 32-byte windows into interleaved (average, filtered) bytes of l, advancing two bytes per row
+ mova m0, [lq] ; m0 = l[0..31]
+ vpbroadcastb xm3, [lq+31] ; l[31] replicated, used as padding past the end of l
+ vpbroadcastd m1, [pb_1] ; mask operand handed to LOWPASS
+ vbroadcasti128 m4, [pb_2toE_3xF] ; pshufb control for the loop; by its name presumably "advance 2 bytes, repeat byte 15" - confirm
+ vperm2i128 m3, m0, 0x03 ; m3 = l[16..31] followed by the l[31] padding
+ palignr m5, m3, m0, 2 ; m5 = l[i+2], padded
+ palignr m3, m0, 1 ; m3 = l[i+1], padded
+ LOWPASS 5, 3, 0, 2, m1 ; m5 = filtered left edge (center operand is l[i+1]); m2 is scratch
+ vpbroadcastd m1, [pb_15] ; m1 is reused as a pshufb control; presumably selects byte 15 - confirm
+ pavgb m3, m0 ; m3 = rounded average of l[i] and l[i+1]
+ punpcklbw m2, m3, m5 ; interleave average/filtered bytes, low 8 of each lane
+ punpckhbw m3, m5 ; interleave average/filtered bytes, high 8 of each lane
+ vinserti128 m0, m2, xm3, 1 ; m0 = interleaved bytes 0..31
+ pshufb m5, m1 ; m5 = padding taken from the filtered vector via the m1 control
+ vperm2i128 m1, m2, m3, 0x12 ; m1 = interleaved bytes 16..47
+ lea r3, [strideq*2]
+ vperm2i128 m2, m3, 0x31 ; m2 = interleaved bytes 32..63
+ lea r4, [strideq*3]
+ vperm2i128 m3, m5, 0x31 ; m3 = interleaved bytes 48..63 followed by padding
+ mov r2d, 8 ; each iteration writes rows y, y+8, y+16 and y+24
+.loop:
+ mova [dstq+r3*0], m0 ; row y
+ palignr m0, m1, m0, 2 ; advance by 2 bytes for the next row
+ mova [dstq+r3*4], m1 ; row y+8 (r3*4 = stride*8)
+ palignr m1, m2, m1, 2
+ mova [dstq+r3*8], m2 ; row y+16
+ palignr m2, m3, m2, 2
+ mova [dstq+r4*8], m3 ; row y+24 (r4*8 = stride*24)
+ pshufb m3, m4 ; advance the last window using the m4 control
+ add dstq, strideq
+ dec r2d
+ jg .loop
+ RET
+
+cglobal vp9_ipred_vl_32x32, 2, 5, 6, dst, stride, l, a ; vertical-left predictor: even rows come from averaged top-edge bytes, odd rows from filtered ones, advancing one byte every two rows
+ movifnidn aq, amp ; fetch a unless it is already in its register
+ vpbroadcastb m4, [aq+31] ; m4 = a[31] in every byte (padding past the end of a)
+ vinserti128 m0, m4, [aq+16], 0 ; m0 = a[16..31] followed by 16 padding bytes
+ mova m1, [aq+ 0] ; m1 = a[0..31]
+ vpbroadcastd m5, [pb_1] ; mask operand handed to LOWPASS
+ palignr m2, m0, m1, 2 ; m2 = a[i+2], padded
+ palignr m0, m1, 1 ; m0 = a[i+1], padded
+ LOWPASS 2, 0, 1, 3, m5 ; m2 = filtered top edge (center operand is a[i+1]); m3 is scratch
+ pavgb m0, m1 ; m0 = rounded average of a[i] and a[i+1]
+ lea r3, [strideq*2]
+ vperm2i128 m1, m0, m4, 0x31 ; m1 = upper half of the averages followed by padding
+ lea r4, [strideq+r3*8] ; r4 = stride*17
+ vperm2i128 m3, m2, m4, 0x31 ; m3 = upper half of the filtered bytes followed by padding
+ mov r2d, 8 ; each iteration writes rows y, y+1, y+16 and y+17, then steps two rows
+.loop:
+ shufpd m4, m0, m1, 0x05 ; m4 = averages advanced by 8 bytes (m4/m5 are free to reuse here)
+ mova [dstq+strideq*0], m0 ; row y: averages
+ shufpd m5, m2, m3, 0x05 ; m5 = filtered bytes advanced by 8 bytes
+ mova [dstq+strideq*1], m2 ; row y+1: filtered
+ palignr m0, m1, m0, 1 ; advance the averages by one byte
+ mova [dstq+r3*8 ], m4 ; row y+16 (r3*8 = stride*16)
+ psrldq m1, 1
+ mova [dstq+r4 ], m5 ; row y+17
+ palignr m2, m3, m2, 1 ; advance the filtered bytes by one byte
+ add dstq, r3 ; dst += 2 rows
+ psrldq m3, 1
+ dec r2d
+ jg .loop
+ RET
+
+cglobal vp9_ipred_vr_32x32, 4, 5, 7, dst, stride, l, a ; vertical-right predictor: even rows come from averaged top-edge bytes, odd rows from filtered ones, moving back one byte every two rows and pulling in filtered left-edge bytes
+ mova m4, [lq+ 0] ; m4 = l[0..31]
+ movu m0, [aq- 1] ; m0 = a[-1..30]
+ vpbroadcastd m6, [pb_1] ; mask operand handed to LOWPASS
+ vperm2i128 m2, m4, m0, 0x21 ; m2 = l[16..31] followed by a[-1..14]
+ pslldq xm5, xm4, 1 ; low lane: a zero byte followed by l[0..14]
+ palignr m3, m2, m4, 1 ; m3 = l[1..31] followed by a[-1]
+ vinserti128 m5, [lq+15], 1 ; high lane: l[15..30], so m5 = l[i-1] (byte 0 is zero filler)
+ LOWPASS 3, 4, 5, 1, m6 ; m3 = filtered left edge (center operand is l[i]); m1 is scratch
+ mova m1, [aq+ 0] ; m1 = a[0..31]
+ vbroadcasti128 m4, [pb_02468ACE_13579BDF] ; pshufb control; by its name presumably gathers even bytes, then odd bytes, within each lane - confirm
+ palignr m2, m0, m2, 15 ; m2 = l[31] followed by a[-1..29]
+ LOWPASS 2, 0, 1, 5, m6 ; m2 = filtered top edge (center operand is a[i-1]); m5 is scratch
+ pshufb m3, m4 ; split the filtered left edge by byte parity (under the assumption above)
+ lea r3, [strideq*2]
+ vpermq m3, m3, q2031 ; regroup the qwords so each 128-bit lane holds one parity class; presumably odd bytes end up in the low lane - TODO confirm
+ pavgb m0, m1 ; m0 = rounded average of a[i-1] and a[i]
+ vinserti128 m1, m3, xm0, 1 ; m1 = low lane of m3 followed by the lower half of the averages
+ lea r4, [strideq+r3*8] ; r4 = stride*17
+ vperm2i128 m3, m2, 0x21 ; m3 = high lane of m3 followed by the lower half of the filtered top edge
+ mov r2d, 8 ; each iteration writes rows y, y+1, y+16 and y+17, then steps two rows
+.loop:
+ shufpd m4, m1, m0, 0x05 ; window 8 bytes before m0
+ mova [dstq+strideq*0], m0 ; row y: averages
+ shufpd m5, m3, m2, 0x05 ; window 8 bytes before m2
+ mova [dstq+strideq*1], m2 ; row y+1: filtered
+ palignr m0, m1, 15 ; move m0 back one byte, pulling bytes in from m1
+ mova [dstq+r3*8 ], m4 ; row y+16 (r3*8 = stride*16)
+ pslldq m1, 1
+ mova [dstq+r4 ], m5 ; row y+17
+ palignr m2, m3, 15 ; move m2 back one byte, pulling bytes in from m3
+ add dstq, r3 ; dst += 2 rows
+ pslldq m3, 1
+ dec r2d
+ jg .loop
+ RET
+%endif
+
; FIXME 127, 128, 129 ?
-----------------------------------------------------------------------
Summary of changes:
libavcodec/x86/vp9dsp_init.c | 13 +-
libavcodec/x86/vp9intrapred.asm | 467 ++++++++++++++++++++++++++--------------
2 files changed, 309 insertions(+), 171 deletions(-)
hooks/post-receive
--
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]