On Sun, Oct 24, 2021 at 9:10 PM <mindm...@gmail.com> wrote: > From: Mark Reid <mindm...@gmail.com> > > yuv2gbrp_full_X_4_512_c: 12096.6 > yuv2gbrp_full_X_4_512_sse2: 10782.6 > yuv2gbrp_full_X_4_512_sse4: 5143.6 > yuv2gbrp_full_X_4_512_avx2: 3000.1 > yuv2gbrap_full_X_4_512_c: 15463.1 > yuv2gbrap_full_X_4_512_sse2: 14296.6 > yuv2gbrap_full_X_4_512_sse4: 6319.1 > yuv2gbrap_full_X_4_512_avx2: 3554.1 > yuv2gbrp9be_full_X_4_512_c: 14281.6 > yuv2gbrp9be_full_X_4_512_sse2: 11206.1 > yuv2gbrp9be_full_X_4_512_sse4: 5033.6 > yuv2gbrp9be_full_X_4_512_avx2: 3012.6 > yuv2gbrp9le_full_X_4_512_c: 12688.6 > yuv2gbrp9le_full_X_4_512_sse2: 10914.1 > yuv2gbrp9le_full_X_4_512_sse4: 5144.6 > yuv2gbrp9le_full_X_4_512_avx2: 3014.6 > yuv2gbrp10be_full_X_4_512_c: 14257.6 > yuv2gbrp10be_full_X_4_512_sse2: 11089.6 > yuv2gbrp10be_full_X_4_512_sse4: 5039.1 > yuv2gbrp10be_full_X_4_512_avx2: 3001.1 > yuv2gbrp10le_full_X_4_512_c: 12098.6 > yuv2gbrp10le_full_X_4_512_sse2: 10884.1 > yuv2gbrp10le_full_X_4_512_sse4: 5138.1 > yuv2gbrp10le_full_X_4_512_avx2: 2999.6 > yuv2gbrap10be_full_X_4_512_c: 18549.6 > yuv2gbrap10be_full_X_4_512_sse2: 14538.6 > yuv2gbrap10be_full_X_4_512_sse4: 6292.6 > yuv2gbrap10be_full_X_4_512_avx2: 3583.6 > yuv2gbrap10le_full_X_4_512_c: 16631.1 > yuv2gbrap10le_full_X_4_512_sse2: 14190.6 > yuv2gbrap10le_full_X_4_512_sse4: 6348.1 > yuv2gbrap10le_full_X_4_512_avx2: 3554.6 > yuv2gbrp12be_full_X_4_512_c: 13555.1 > yuv2gbrp12be_full_X_4_512_sse2: 10952.1 > yuv2gbrp12be_full_X_4_512_sse4: 5137.6 > yuv2gbrp12be_full_X_4_512_avx2: 3009.6 > yuv2gbrp12le_full_X_4_512_c: 12082.6 > yuv2gbrp12le_full_X_4_512_sse2: 10891.1 > yuv2gbrp12le_full_X_4_512_sse4: 5184.1 > yuv2gbrp12le_full_X_4_512_avx2: 3011.1 > yuv2gbrap12be_full_X_4_512_c: 18689.6 > yuv2gbrap12be_full_X_4_512_sse2: 14522.6 > yuv2gbrap12be_full_X_4_512_sse4: 6237.6 > yuv2gbrap12be_full_X_4_512_avx2: 3585.6 > yuv2gbrap12le_full_X_4_512_c: 16760.6 > yuv2gbrap12le_full_X_4_512_sse2: 14202.1 > yuv2gbrap12le_full_X_4_512_sse4: 6252.1 > 
yuv2gbrap12le_full_X_4_512_avx2: 3591.1 > yuv2gbrp14be_full_X_4_512_c: 13555.6 > yuv2gbrp14be_full_X_4_512_sse2: 10949.1 > yuv2gbrp14be_full_X_4_512_sse4: 5185.1 > yuv2gbrp14be_full_X_4_512_avx2: 3012.1 > yuv2gbrp14le_full_X_4_512_c: 12068.1 > yuv2gbrp14le_full_X_4_512_sse2: 10883.6 > yuv2gbrp14le_full_X_4_512_sse4: 5145.1 > yuv2gbrp14le_full_X_4_512_avx2: 3007.1 > yuv2gbrp16be_full_X_4_512_c: 12383.6 > yuv2gbrp16be_full_X_4_512_sse2: 8230.6 > yuv2gbrp16be_full_X_4_512_sse4: 4765.6 > yuv2gbrp16be_full_X_4_512_avx2: 2742.6 > yuv2gbrp16le_full_X_4_512_c: 10906.1 > yuv2gbrp16le_full_X_4_512_sse2: 28732.1 > yuv2gbrp16le_full_X_4_512_sse4: 4709.6 > yuv2gbrp16le_full_X_4_512_avx2: 2753.1 > yuv2gbrap16be_full_X_4_512_c: 15472.6 > yuv2gbrap16be_full_X_4_512_sse2: 11021.6 > yuv2gbrap16be_full_X_4_512_sse4: 5487.6 > yuv2gbrap16be_full_X_4_512_avx2: 3143.6 > yuv2gbrap16le_full_X_4_512_c: 13668.6 > yuv2gbrap16le_full_X_4_512_sse2: 10562.1 > yuv2gbrap16le_full_X_4_512_sse4: 5506.6 > yuv2gbrap16le_full_X_4_512_avx2: 3149.6 > yuv2gbrpf32be_full_X_4_512_c: 15471.1 > yuv2gbrpf32be_full_X_4_512_sse2: 8524.6 > yuv2gbrpf32be_full_X_4_512_sse4: 4559.1 > yuv2gbrpf32be_full_X_4_512_avx2: 2388.1 > yuv2gbrpf32le_full_X_4_512_c: 14247.6 > yuv2gbrpf32le_full_X_4_512_sse2: 7600.6 > yuv2gbrpf32le_full_X_4_512_sse4: 4385.6 > yuv2gbrpf32le_full_X_4_512_avx2: 2258.6 > yuv2gbrapf32be_full_X_4_512_c: 18412.1 > yuv2gbrapf32be_full_X_4_512_sse2: 11353.6 > yuv2gbrapf32be_full_X_4_512_sse4: 5807.1 > yuv2gbrapf32be_full_X_4_512_avx2: 2928.1 > yuv2gbrapf32le_full_X_4_512_c: 16485.1 > yuv2gbrapf32le_full_X_4_512_sse2: 10202.1 > yuv2gbrapf32le_full_X_4_512_sse4: 5571.6 > yuv2gbrapf32le_full_X_4_512_avx2: 2847.6 > > > --- > libswscale/x86/output.asm | 440 +++++++++++++++++++++++++++++++++++++- > libswscale/x86/swscale.c | 99 +++++++++ > tests/checkasm/Makefile | 2 +- > tests/checkasm/checkasm.c | 1 + > tests/checkasm/checkasm.h | 1 + > tests/checkasm/sw_gbrp.c | 198 +++++++++++++++++ > 
tests/fate/checkasm.mak | 1 + > 7 files changed, 740 insertions(+), 2 deletions(-) > create mode 100644 tests/checkasm/sw_gbrp.c > > diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm > index 52cf9f2c2e..e80b6256b4 100644 > --- a/libswscale/x86/output.asm > +++ b/libswscale/x86/output.asm > @@ -38,7 +38,49 @@ pw_32: times 8 dw 32 > pd_255: times 8 dd 255 > pw_512: times 8 dw 512 > pw_1024: times 8 dw 1024 > - > +pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0 > +pd_yuv2gbrp16_start: times 8 dd -0x40000000 > +pd_yuv2gbrp_y_start: times 8 dd (1 << 9) > +pd_yuv2gbrp_uv_start: times 8 dd ((1 << 9) - (128 << 19)) > +pd_yuv2gbrp_a_start: times 8 dd (1 << 18) > +pd_yuv2gbrp16_offset: times 8 dd 0x10000 ;(1 << 16) > +pd_yuv2gbrp16_round13: times 8 dd 0x02000 ;(1 << 13) > +pd_yuv2gbrp16_a_offset: times 8 dd 0x20002000 > +pd_yuv2gbrp16_upper30: times 8 dd 0x3FFFFFFF ;(1<<30) - 1 > +pd_yuv2gbrp16_upper27: times 8 dd 0x07FFFFFF ;(1<<27) - 1 > +pd_yuv2gbrp16_upperC: times 8 dd 0xC0000000 > +pb_lo_pack_shuffle8: db 0, 4, 8, 12, \ > + -1, -1, -1, -1, \ > + -1, -1, -1, -1, \ > + -1, -1, -1, -1 > +pb_hi_pack_shuffle8: db -1, -1, -1, -1, \ > + 0, 4, 8, 12, \ > + -1, -1, -1, -1, \ > + -1, -1, -1, -1 > +pb_lo_pack_shuffle16le: db 0, 1, 4, 5, \ > + 8, 9, 12, 13, \ > + -1, -1, -1, -1, \ > + -1, -1, -1, -1 > +pb_lo_pack_shuffle16be: db 1, 0, 5, 4, \ > + 9, 8, 13, 12, \ > + -1, -1, -1, -1, \ > + -1, -1, -1, -1 > +pb_hi_pack_shuffle16le: db -1, -1, -1, -1, \ > + -1, -1, -1, -1, \ > + 0, 1, 4, 5, \ > + 8, 9, 12, 13 > +pb_hi_pack_shuffle16be: db -1, -1, -1, -1, \ > + -1, -1, -1, -1, \ > + 1, 0, 5, 4, \ > + 9, 8, 13, 12 > +pb_shuffle32be db 3, 2, 1, 0, \ > + 7, 6, 5, 4, \ > + 11, 10, 9, 8, \ > + 15, 14, 13, 12, \ > + 3, 2, 1, 0, \ > + 7, 6, 5, 4, \ > + 11, 10, 9, 8, \ > + 15, 14, 13, 12 > yuv2nv12_shuffle_mask: times 2 db 0, 4, 8, 12, \ > -1, -1, -1, -1, \ > -1, -1, -1, -1, \ > @@ -549,3 +591,399 @@ yuv2nv12cX_fn yuv2nv12 > yuv2nv12cX_fn yuv2nv21 > %endif > %endif ; 
ARCH_X86_64 > + > > +;----------------------------------------------------------------------------- > +; planar grb yuv2anyX functions > +; void ff_yuv2<gbr_format>_full_X_<opt>(SwsContext *c, const int16_t > *lumFilter, > +; const int16_t **lumSrcx, int > lumFilterSize, > +; const int16_t *chrFilter, const > int16_t **chrUSrcx, > +; const int16_t **chrVSrcx, int > chrFilterSize, > +; const int16_t **alpSrcx, uint8_t > **dest, > +; int dstW, int y) > > +;----------------------------------------------------------------------------- > + > +%if ARCH_X86_64 > +struc SwsContext > + .padding: resb 40292 ; offsetof(SwsContext, > yuv2rgb_y_offset) > + .yuv2rgb_y_offset: resd 1 > + .yuv2rgb_y_coeff: resd 1 > + .yuv2rgb_v2r_coeff: resd 1 > + .yuv2rgb_v2g_coeff: resd 1 > + .yuv2rgb_u2g_coeff: resd 1 > + .yuv2rgb_u2b_coeff: resd 1 > +endstruc > + > +%define R m0 > +%define G m1 > +%define B m2 > +%define A m3 > + > +%define Y m4 > +%define U m5 > +%define V m6 > + > +; Clip a signed integer to an unsigned power of two range. > +; av_clip_uintp2 > +; 1 - dest > +; 2 - bit position to clip at > +%macro CLIPP2 2 > + ; (~a) >> 31 & ((1<<p) - 1); > + pcmpeqb m4, m4 > + pxor m4, %1 > + psrad m4, 31 > + movu m5, [pd_yuv2gbrp16_upper%2] > + pand m4, m5 > + > + ; (a & ~((1<<p) - 1)) == 0 > + pandn m5, %1 > + pxor m6, m6 > + pcmpeqd m5, m6 > +%if cpuflag(avx2) > + vpblendvb %1, m4, %1, m5 > +%else > + pxor %1, m4 > + pand %1, m5 > + pxor %1, m4 > +%endif > +%endmacro > + > +; 1 - dest > +; 2 - source > +%macro LOAD16 2 > + %if cpuflag(avx2) > + movu xm%1, %2 > + vpmovsxwd m%1, xm%1 > + %elif cpuflag(sse4) > + movsd xm%1, %2 > + vpmovsxwd m%1, xm%1
+ %else > + movsd xm%1, %2 > + pshufd xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0) > + pshuflw xm%1, xm%1, (1 << 6 | 1 << 4 | 0 << 2 | 0 << 0) > + pshufhw xm%1, xm%1, (1 << 6 | 1 << 4 | 0 << 2 | 0 << 0) > + psrad xm%1, 16 ; sign extend > + %endif > +%endmacro The sse4 path shouldn't use a VEX-prefixed instruction (vpmovsxwd), and I think I also have a way to do the sse2 load with fewer instructions. I'll submit a new version of this patch. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".