Hello,
Excuse me, how about using FMADD (fused multiply-add) instructions on AVX2 platforms? For example: + mulps m7, m7, m14 + addps m0, m0, m7 ==> vfmadd231ps m0, m7, m14 Regards, Min Chen 2021-09-29 09:18:05,mindm...@gmail.com >From: Mark Reid <mindm...@gmail.com> > >Only supports float and 16-bit planar formats at the moment. >Mainly focused on AVX and AVX2 optimizations, but SSE2 does seem to offer some >speed gains. > >f32 1920x1080 1 thread with prelut >c impl >1389936500 UNITS in lut3d->interp, 1 runs, 0 skips >1425800240 UNITS in lut3d->interp, 2 runs, 0 skips >1433312777 UNITS in lut3d->interp, 4 runs, 0 skips >1443346798 UNITS in lut3d->interp, 8 runs, 0 skips > >sse2 >948662320 UNITS in lut3d->interp, 1 runs, 0 skips >1101247540 UNITS in lut3d->interp, 2 runs, 0 skips >1050645695 UNITS in lut3d->interp, 4 runs, 0 skips >1041102937 UNITS in lut3d->interp, 8 runs, 0 skips > >avx >633837000 UNITS in lut3d->interp, 1 runs, 0 skips >669452850 UNITS in lut3d->interp, 2 runs, 0 skips >650716580 UNITS in lut3d->interp, 4 runs, 0 skips >644698550 UNITS in lut3d->interp, 8 runs, 0 skips > >avx2 >354940020 UNITS in lut3d->interp, 1 runs, 0 skips >362384340 UNITS in lut3d->interp, 2 runs, 0 skips >356799020 UNITS in lut3d->interp, 4 runs, 0 skips >357276815 UNITS in lut3d->interp, 8 runs, 0 skips > >gbrap16 1920x1080 1 thread with prelut >c impl >1445071160 UNITS in lut3d->interp, 1 runs, 0 skips >1477959120 UNITS in lut3d->interp, 2 runs, 0 skips >1472102670 UNITS in lut3d->interp, 4 runs, 0 skips >1462579330 UNITS in lut3d->interp, 8 runs, 0 skips > >sse2 >1035437580 UNITS in lut3d->interp, 1 runs, 0 skips >1050139710 UNITS in lut3d->interp, 2 runs, 0 skips >1070147205 UNITS in lut3d->interp, 4 runs, 0 skips >1064583037 UNITS in lut3d->interp, 8 runs, 0 skips > >avx >678089880 UNITS in lut3d->interp, 1 runs, 0 skips >679112485 UNITS in lut3d->interp, 2 runs, 0 skips >695527212 UNITS in lut3d->interp, 4 runs, 0 skips >691300053 UNITS in lut3d->interp, 8 runs, 0 skips > >avx2 >372671340 UNITS in lut3d->interp, 1 runs, 
0 skips >373449870 UNITS in lut3d->interp, 2 runs, 0 skips >383725625 UNITS in lut3d->interp, 4 runs, 0 skips >382860848 UNITS in lut3d->interp, 8 runs, 0 skips > >--- > libavfilter/lut3d.h | 83 ++++ > libavfilter/vf_lut3d.c | 61 +-- > libavfilter/x86/Makefile | 2 + > libavfilter/x86/vf_lut3d.asm | 757 ++++++++++++++++++++++++++++++++ > libavfilter/x86/vf_lut3d_init.c | 88 ++++ > 5 files changed, 935 insertions(+), 56 deletions(-) > create mode 100644 libavfilter/lut3d.h > create mode 100644 libavfilter/x86/vf_lut3d.asm > create mode 100644 libavfilter/x86/vf_lut3d_init.c > >diff --git a/libavfilter/lut3d.h b/libavfilter/lut3d.h >new file mode 100644 >index 0000000000..ded2a036a5 >--- /dev/null >+++ b/libavfilter/lut3d.h >@@ -0,0 +1,83 @@ >+/* >+ * Copyright (c) 2013 Clément Bœsch >+ * Copyright (c) 2018 Paul B Mahol >+ * >+ * This file is part of FFmpeg. >+ * >+ * FFmpeg is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU Lesser General Public >+ * License as published by the Free Software Foundation; either >+ * version 2.1 of the License, or (at your option) any later version. >+ * >+ * FFmpeg is distributed in the hope that it will be useful, >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >+ * Lesser General Public License for more details. 
>+ * >+ * You should have received a copy of the GNU Lesser General Public >+ * License along with FFmpeg; if not, write to the Free Software >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 >USA >+ */ >+#ifndef AVFILTER_LUT3D_H >+#define AVFILTER_LUT3D_H >+ >+#include "libavutil/pixdesc.h" >+#include "framesync.h" >+#include "avfilter.h" >+ >+enum interp_mode { >+ INTERPOLATE_NEAREST, >+ INTERPOLATE_TRILINEAR, >+ INTERPOLATE_TETRAHEDRAL, >+ INTERPOLATE_PYRAMID, >+ INTERPOLATE_PRISM, >+ NB_INTERP_MODE >+}; >+ >+struct rgbvec { >+ float r, g, b; >+}; >+ >+/* 3D LUT don't often go up to level 32, but it is common to have a Hald CLUT >+ * of 512x512 (64x64x64) */ >+#define MAX_LEVEL 256 >+#define PRELUT_SIZE 65536 >+ >+typedef struct Lut3DPreLut { >+ int size; >+ float min[3]; >+ float max[3]; >+ float scale[3]; >+ float* lut[3]; >+} Lut3DPreLut; >+ >+typedef struct LUT3DContext { >+ const AVClass *class; >+ struct rgbvec *lut; >+ int lutsize; >+ int lutsize2; >+ struct rgbvec scale; >+ int interpolation; ///<interp_mode >+ char *file; >+ uint8_t rgba_map[4]; >+ int step; >+ avfilter_action_func *interp; >+ Lut3DPreLut prelut; >+#if CONFIG_HALDCLUT_FILTER >+ uint8_t clut_rgba_map[4]; >+ int clut_step; >+ int clut_bits; >+ int clut_planar; >+ int clut_float; >+ int clut_width; >+ FFFrameSync fs; >+#endif >+} LUT3DContext; >+ >+typedef struct ThreadData { >+ AVFrame *in, *out; >+} ThreadData; >+ >+void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc); >+ >+#endif /* AVFILTER_LUT3D_H */ >\ No newline at end of file >diff --git a/libavfilter/vf_lut3d.c b/libavfilter/vf_lut3d.c >index 9fbda833b9..1fd0af06db 100644 >--- a/libavfilter/vf_lut3d.c >+++ b/libavfilter/vf_lut3d.c >@@ -31,73 +31,18 @@ > #include "libavutil/intreadwrite.h" > #include "libavutil/intfloat.h" > #include "libavutil/avassert.h" >-#include "libavutil/pixdesc.h" > #include "libavutil/avstring.h" >-#include "avfilter.h" > #include "drawutils.h" > #include 
"formats.h" >-#include "framesync.h" > #include "internal.h" > #include "video.h" >+#include "lut3d.h" > > #define R 0 > #define G 1 > #define B 2 > #define A 3 > >-enum interp_mode { >- INTERPOLATE_NEAREST, >- INTERPOLATE_TRILINEAR, >- INTERPOLATE_TETRAHEDRAL, >- INTERPOLATE_PYRAMID, >- INTERPOLATE_PRISM, >- NB_INTERP_MODE >-}; >- >-struct rgbvec { >- float r, g, b; >-}; >- >-/* 3D LUT don't often go up to level 32, but it is common to have a Hald CLUT >- * of 512x512 (64x64x64) */ >-#define MAX_LEVEL 256 >-#define PRELUT_SIZE 65536 >- >-typedef struct Lut3DPreLut { >- int size; >- float min[3]; >- float max[3]; >- float scale[3]; >- float* lut[3]; >-} Lut3DPreLut; >- >-typedef struct LUT3DContext { >- const AVClass *class; >- int interpolation; ///<interp_mode >- char *file; >- uint8_t rgba_map[4]; >- int step; >- avfilter_action_func *interp; >- struct rgbvec scale; >- struct rgbvec *lut; >- int lutsize; >- int lutsize2; >- Lut3DPreLut prelut; >-#if CONFIG_HALDCLUT_FILTER >- uint8_t clut_rgba_map[4]; >- int clut_step; >- int clut_bits; >- int clut_planar; >- int clut_float; >- int clut_width; >- FFFrameSync fs; >-#endif >-} LUT3DContext; >- >-typedef struct ThreadData { >- AVFrame *in, *out; >-} ThreadData; >- > #define OFFSET(x) offsetof(LUT3DContext, x) > #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM > #define TFLAGS > AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM >@@ -1207,6 +1152,10 @@ static int config_input(AVFilterLink *inlink) > av_assert0(0); > } > >+ if (ARCH_X86) { >+ ff_lut3d_init_x86(lut3d, desc); >+ } >+ > return 0; > } > >diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile >index 016a5b3511..a29941eaeb 100644 >--- a/libavfilter/x86/Makefile >+++ b/libavfilter/x86/Makefile >@@ -17,6 +17,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER) += >x86/vf_hqdn3d_init.o > OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o > OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_tinterlace_init.o > 
OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter_init.o >+OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d_init.o > OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp_init.o > OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge_init.o > OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o >@@ -57,6 +58,7 @@ X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER) += >x86/vf_hqdn3d.o > X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o > X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o > X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o >+X86ASM-OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d.o > X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp.o > X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o > X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o >diff --git a/libavfilter/x86/vf_lut3d.asm b/libavfilter/x86/vf_lut3d.asm >new file mode 100644 >index 0000000000..b3d7c3962b >--- /dev/null >+++ b/libavfilter/x86/vf_lut3d.asm >@@ -0,0 +1,757 @@ >+;***************************************************************************** >+;* x86-optimized functions for lut3d filter >+;* >+;* Copyright (c) 2021 Mark Reid <mindm...@gmail.com> >+;* >+;* This file is part of FFmpeg. >+;* >+;* FFmpeg is free software; you can redistribute it and/or >+;* modify it under the terms of the GNU Lesser General Public >+;* License as published by the Free Software Foundation; either >+;* version 2.1 of the License, or (at your option) any later version. >+;* >+;* FFmpeg is distributed in the hope that it will be useful, >+;* but WITHOUT ANY WARRANTY; without even the implied warranty of >+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >+;* Lesser General Public License for more details. 
>+;* >+;* You should have received a copy of the GNU Lesser General Public >+;* License along with FFmpeg; if not, write to the Free Software >+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 >USA >+;****************************************************************************** >+ >+%include "libavutil/x86/x86util.asm" >+ >+SECTION_RODATA >+pd_1f: times 8 dd 1.0 >+pd_3f: times 8 dd 3.0 >+ >+; used to limit rshifts as they are more expensive in avx1 >+pd_001: times 8 dd 001b >+pd_010: times 8 dd 010b >+pd_100: times 8 dd 100b >+ >+pd_65535f: times 8 dd 65535.0 >+pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0 >+ >+pb_shuffle16: db 0, 1, 0x80, 0x80, \ >+ 2, 3, 0x80, 0x80, \ >+ 4, 5, 0x80, 0x80, \ >+ 6, 7, 0x80, 0x80 >+ >+pb_lo_pack_shuffle16: db 0, 1, 4, 5, \ >+ 8, 9, 12, 13, \ >+ 0x80, 0x80, 0x80, 0x80, \ >+ 0x80, 0x80, 0x80, 0x80 >+ >+pb_hi_pack_shuffle16: db 0x80, 0x80, 0x80, 0x80, \ >+ 0x80, 0x80, 0x80, 0x80, \ >+ 0, 1, 4, 5, \ >+ 8, 9, 12, 13 >+ >+; tetrahedral table -------------------------------------------- >+; name: x2| x1| x0| cxxb| cxxa >+; values: r 00| r 00| r 00| c011 011| c001 001 >+; g 01| g 01| g 01| c101 101| c010 010 >+; b 10| b 10| b 10| c110 110| c100 100 >+ >+; g>b b | g | r | >c110 | c100 >+pd_tetra_table0: times 8 dd (10b << 10) | (01b << 8) | (00b << 6) | (110b << >3) | 100b >+; r>b g | b | r | >c101 | c100 >+pd_tetra_table1: times 8 dd (01b << 10) | (10b << 8) | (00b << 6) | (101b << >3) | 100b >+; else g | r | b | >c101 | c001 >+pd_tetra_table2: times 8 dd (01b << 10) | (00b << 8) | (10b << 6) | (101b << >3) | 001b >+; b>g r | g | b | >c011 | c001 >+pd_tetra_table3: times 8 dd (00b << 10) | (01b << 8) | (10b << 6) | (011b << >3) | 001b >+; b>r r | b | g | >c011 | c010 >+pd_tetra_table4: times 8 dd (00b << 10) | (10b << 8) | (01b << 6) | (011b << >3) | 010b >+; else b | r | g | >c110 | c010 >+pd_tetra_table5: times 8 dd (10b << 10) | (00b << 8) | (01b << 6) | (110b << >3) | 010b >+ >+SECTION .text >+ >+struc 
Lut3DPreLut >+ .size: resd 1 >+ .min: resd 3 >+ .max: resd 3 >+ .scale: resd 3 >+ .lut: resq 3 >+endstruc >+ >+struc LUT3DContext >+ .class: resq 1 >+ .lut: resq 1 >+ .lutsize: resd 1 >+ .lutsize2: resd 1 >+ .scale: resd 3 >+endstruc >+ >+%define AV_NUM_DATA_POINTERS 8 >+ >+struc AVFrame >+ .data: resq AV_NUM_DATA_POINTERS >+ .linesize: resd AV_NUM_DATA_POINTERS >+ .extended_data: resq 1 >+ .width: resd 1 >+ .height: resd 1 >+endstruc >+ >+%define rm rsp >+%define gm rsp+mmsize >+%define bm rsp+(mmsize*2) >+ >+%define lut3dsizem [rsp+mmsize*3] >+%define lut3dsize2m [rsp+mmsize*4] >+%define lut3dmaxm [rsp+mmsize*5] >+%define prelutmaxm [rsp+mmsize*6] >+ >+%define scalerm [rsp+mmsize*7] >+%define scalegm [rsp+mmsize*8] >+%define scalebm [rsp+mmsize*9] >+ >+%define prelutminrm [rsp+mmsize*10] >+%define prelutmingm [rsp+mmsize*11] >+%define prelutminbm [rsp+mmsize*12] >+ >+%define prelutscalerm [rsp+mmsize*13] >+%define prelutscalegm [rsp+mmsize*14] >+%define prelutscalebm [rsp+mmsize*15] >+ >+; data pointers >+%define srcrm [rsp+mmsize*16 + 0] >+%define srcgm [rsp+mmsize*16 + 8] >+%define srcbm [rsp+mmsize*16 + 16] >+%define srcam [rsp+mmsize*16 + 24] >+ >+%define dstrm [rsp+mmsize*16 + 32] >+%define dstgm [rsp+mmsize*16 + 40] >+%define dstbm [rsp+mmsize*16 + 48] >+%define dstam [rsp+mmsize*16 + 56] >+ >+%macro FETCH_PRELUT_PN 3 >+ mov tmp2d, [rm + %3] >+ mov tmp3d, [gm + %3] >+ movss xm%1, [tmpq + tmp2q*4] >+ movss xm%2, [tmpq + tmp3q*4] >+ movss [rm + %3], xm%1 >+ movss [gm + %3], xm%2 >+%endmacro >+ >+; 1 - p >+; 2 - n >+; 3 - p indices >+; 4 - n indices >+%macro GATHER_PRELUT 4 >+ %if cpuflag(avx2) >+ vpcmpeqb m7, m7 >+ vgatherdps m%1, [tmpq + m%3*4], m7 ; p >+ vpcmpeqb m9, m9 >+ vgatherdps m%2, [tmpq + m%4*4], m9 ; n >+ %else >+ mova [rm], m%3 >+ mova [gm], m%4 >+ FETCH_PRELUT_PN %1, %2, 0 >+ FETCH_PRELUT_PN %1, %2, 4 >+ FETCH_PRELUT_PN %1, %2, 8 >+ FETCH_PRELUT_PN %1, %2, 12 >+ %if mmsize > 16 >+ FETCH_PRELUT_PN %1, %2, 16 >+ FETCH_PRELUT_PN %1, %2, 20 >+ 
FETCH_PRELUT_PN %1, %2, 24 >+ FETCH_PRELUT_PN %1, %2, 28 >+ %endif >+ movu m%1, [rm] >+ movu m%2, [gm] >+ %endif >+%endmacro >+ >+%macro FLOORPS 2 >+ %if mmsize > 16 >+ vroundps %1, %2, 0x01 >+ %else >+ cvttps2dq %1, %2 >+ cvtdq2ps %1, %1 >+ %endif >+%endmacro >+ >+; 1 - dst >+; 2 - index >+; 3 - min >+; 4 - scale >+; assumes lut max m13, m14 1.0f, zero m15 >+%macro APPLY_PRELUT 4 >+ ; scale >+ subps m5, m%1, %3 ; v - min >+ mulps m5, m5, %4 ; v * scale >+ ; clamp >+ maxps m5, m5, m15 ; max zero >+ minps m5, m5, m13 ; min lut max >+ >+ FLOORPS m3, m5 ; prev index >+ subps m5, m5, m3 ; d >+ addps m4, m3, m14 ; p+1 = n index >+ minps m4, m4, m13 ; clamp n idex >+ >+ mov tmpq, [prelutq + Lut3DPreLut.lut + %2*8] >+ cvttps2dq m6, m3 >+ cvttps2dq m10, m4 >+ GATHER_PRELUT 3, 4, 6, 10 >+ >+ ; lerp >+ subps m8, m4, m3 >+ mulps m8, m8, m5 >+ addps m%1, m8, m3 >+%endmacro >+ >+; 1 - dst >+; 2 - scale >+; assumes lut max m13, zero m15 >+%macro APPLY_SCALE 2 >+ mulps m%1, m%1, %2 >+ maxps m%1, m%1, m15 >+ minps m%1, m%1, m13 >+%endmacro >+ >+%macro BLEND 4 >+%if mmsize > 16 >+ vblendvps %1, %2, %3, %4 >+%else >+ %ifidni %1,%2 >+ %error operand 1 must not equal operand 2 >+ %endif >+ %ifidni %1,%3 >+ %error operand 1 must not equal operand 3 >+ %endif >+ mova %1, %2 >+ xorps %1, %3 >+ andps %1, %4 >+ xorps %1, %2 >+%endif >+%endmacro >+ >+; sets nans to zere, +inf -inf handled later by min/max clamps >+%macro SANITIZE_F 1 >+ cmpps m5, %1, %1, 0x0 ; nan == nan = False >+ %if mmsize <= 16 >+ mova m6, %1 >+ BLEND %1, m15, m6, m5 >+ %else >+ BLEND %1, m15, %1, m5 >+ %endif >+%endmacro >+ >+%macro ADD3 4 >+ addps %1, %2, %3 >+ addps %1, %1, %4 >+%endmacro >+ >+%macro CMP_EQUAL 3 >+%if cpuflag(avx2) >+ vpcmpeqd %1, %2, %3 >+%elif cpuflag(avx) >+ cmpps %1, %2, %3, 0x0 >+%else >+ pcmpeqd %1, %2, %3 >+%endif >+%endmacro >+ >+%macro SHIFT_RIGHT 2 >+%if mmsize <= 16 >+ psrld xm%1, %2 >+%elif cpuflag(avx2) >+ vpsrld m%1, m%1, %2 >+%else >+ vextractf128 xm15, m%1, 1 >+ psrld xm%1, %2 >+ 
psrld xm15, %2 >+ vinsertf128 m%1, m%1, xm15, 1 >+%endif >+%endmacro >+ >+%macro FETCH_LUT3D_RGB 4 >+ mov tmp2d, [rm + %4] >+ movss xm%1, [tmpq + tmp2q*4 + 0] >+ movss xm%2, [tmpq + tmp2q*4 + 4] >+ movss xm%3, [tmpq + tmp2q*4 + 8] >+ movss [rm + %4], xm%1 >+ movss [gm + %4], xm%2 >+ movss [bm + %4], xm%3 >+%endmacro >+ >+; 1 - dstr >+; 2 - dstg >+; 3 - dstb >+; 4 - indices >+%macro GATHER_LUT3D_INDICES 4 >+%if cpuflag(avx2) >+ vpcmpeqb m3, m3 >+ vgatherdps m%1, [tmpq + m%4*4 + 0], m3 >+ vpcmpeqb m14, m14 >+ vgatherdps m%2, [tmpq + m%4*4 + 4], m14 >+ vpcmpeqb m15, m15 >+ vgatherdps m%3, [tmpq + m%4*4 + 8], m15 >+%else >+ movu [rm], m%4 >+ FETCH_LUT3D_RGB %1, %2, %3, 0 >+ FETCH_LUT3D_RGB %1, %2, %3, 4 >+ FETCH_LUT3D_RGB %1, %2, %3, 8 >+ FETCH_LUT3D_RGB %1, %2, %3, 12 >+%if mmsize > 16 >+ FETCH_LUT3D_RGB %1, %2, %3, 16 >+ FETCH_LUT3D_RGB %1, %2, %3, 20 >+ FETCH_LUT3D_RGB %1, %2, %3, 24 >+ FETCH_LUT3D_RGB %1, %2, %3, 28 >+%endif >+ movu m%1, [rm] >+ movu m%2, [gm] >+ movu m%3, [bm] >+%endif >+%endmacro >+ >+%macro interp_tetrahedral 0 >+ %define d_r m0 >+ %define d_g m1 >+ %define d_b m2 >+ >+ %define prev_r m3 >+ %define prev_g m4 >+ %define prev_b m5 >+ >+ %define next_r m6 >+ %define next_g m7 >+ %define next_b m8 >+ >+ %define x0 m4 >+ %define x1 m5 >+ %define x2 m6 >+ >+ ; setup prev index >+ FLOORPS prev_r, m0 >+ FLOORPS prev_g, m1 >+ FLOORPS prev_b, m2 >+ >+ ; setup deltas >+ subps d_r, m0, prev_r >+ subps d_g, m1, prev_g >+ subps d_b, m2, prev_b >+ >+ ; calculate select mask m9 >+ movu m6, [pd_tetra_table2] >+ cmpps m7, d_r, d_b, 0x1E ; r > b CMP_GT_OQ >+ BLEND m10, m6, [pd_tetra_table1], m7 >+ cmpps m7, d_g, d_b, 0x1E ; g > b CMP_GT_OQ >+ BLEND m6, m10, [pd_tetra_table0], m7 >+ >+ movu m10, [pd_tetra_table5] >+ cmpps m7, d_b, d_r, 0x1E ; b > r CMP_GT_OQ >+ BLEND m9, m10, [pd_tetra_table4], m7 >+ cmpps m7, d_b, d_g, 0x1E ; b > g CMP_GT_OQ >+ BLEND m10, m9, [pd_tetra_table3], m7 >+ >+ cmpps m7, d_r, d_g, 0x1E ; r > g CMP_GT_OQ >+ BLEND m9, m10, m6, m7 >+ >+ ; 
setup next index >+ addps next_r, prev_r, m14 ; +1 >+ minps next_r, next_r, m13 ; clamp lutmax >+ >+ addps next_g, prev_g, m14 ; +1 >+ minps next_g, next_g, m13 ; clamp lutmax >+ >+ addps next_b, prev_b, m14 ; +1 >+ minps next_b, next_b, m13 ; clamp lutmax >+ >+ ; prescale indices >+ mulps prev_r, prev_r, lut3dsize2m >+ mulps next_r, next_r, lut3dsize2m >+ >+ mulps prev_g, prev_g, lut3dsizem >+ mulps next_g, next_g, lut3dsizem >+ >+ mulps prev_b, prev_b, [pd_3f] >+ mulps next_b, next_b, [pd_3f] >+ >+ movu m14, [pd_001] >+ >+ ; cxxa m10 >+ ; b >+ andps m15, m9, m14 >+ CMP_EQUAL m15, m15, m14 >+ BLEND m10, prev_b, next_b, m15 >+ >+ ; g >+ andps m15, m9, [pd_010] >+ CMP_EQUAL m15, m15, [pd_010] >+ BLEND m12, prev_g, next_g, m15 >+ >+ ; r >+ andps m15, m9, [pd_100] >+ CMP_EQUAL m15, m15, [pd_100] >+ BLEND m13, prev_r, next_r, m15 >+ >+ ADD3 m10, m10, m12, m13 >+ >+ SHIFT_RIGHT 9, 3 ; 3 >+ >+ ; cxxb m11; >+ ; b >+ andps m15, m9, m14 >+ CMP_EQUAL m15, m15, m14 >+ BLEND m11, prev_b, next_b, m15 >+ >+ ; g >+ andps m15, m9, [pd_010] >+ CMP_EQUAL m15, m15, [pd_010] >+ BLEND m12, prev_g, next_g, m15 >+ >+ ; r >+ andps m15, m9, [pd_100] >+ CMP_EQUAL m15, m15, [pd_100] >+ BLEND m13, prev_r, next_r, m15 >+ >+ ADD3 m11, m11, m12, m13 >+ >+ ; c000 m12; >+ ADD3 m12, prev_r, prev_g, prev_b >+ >+ ; c111 m13; >+ ADD3 m13, next_r, next_g, next_b >+ >+ SHIFT_RIGHT 9, 3 ; 6 >+ >+ ; x0, m4 >+ andps m15, m9, m14 >+ CMP_EQUAL m15, m15, m14 >+ BLEND m7, d_r, d_g, m15 ; r,g >+ >+ andps m15, m9, [pd_010] >+ CMP_EQUAL m15, m15, [pd_010] >+ BLEND x0, m7, d_b, m15 ; b >+ >+ ; x1, m5 >+ andps m15, m9, [pd_100] >+ CMP_EQUAL m15, m15, [pd_100] >+ BLEND m7, d_r, d_g, m15 ; r,g >+ >+ SHIFT_RIGHT 9, 3 ; 9 >+ >+ andps m15, m9, m14 >+ CMP_EQUAL m15, m15, m14 >+ BLEND x1, m7, d_b, m15 ; b >+ >+ ; x2, m6 >+ andps m15, m9, [pd_010] >+ CMP_EQUAL m15, m15, [pd_010] >+ BLEND m7, d_r, d_g, m15 ; r,g >+ >+ andps m15, m9, [pd_100] >+ CMP_EQUAL m15, m15, [pd_100] >+ BLEND x2, m7, d_b, m15 ; b >+ >+ ; convert 
indices to integer >+ cvttps2dq m12, m12 >+ cvttps2dq m10, m10 >+ cvttps2dq m11, m11 >+ cvttps2dq m13, m13 >+ >+ ; now the gathering festival >+ mov tmpq, [ctxq + LUT3DContext.lut] >+ >+ GATHER_LUT3D_INDICES 0, 1, 2, 12 >+ movu m14, [pd_1f] >+ subps m14, m14, x0; 1 - x0 >+ >+ mulps m0, m0, m14 >+ mulps m1, m1, m14 >+ mulps m2, m2, m14 >+ >+ GATHER_LUT3D_INDICES 7, 8, 9, 10 >+ subps m14, x0, x1; x0 - x1 >+ mulps m7, m7, m14 >+ addps m0, m0, m7 >+ >+ mulps m8, m8, m14 >+ addps m1, m1, m8 >+ >+ mulps m9, m9, m14 >+ addps m2, m2, m9 >+ >+ GATHER_LUT3D_INDICES 7, 8, 9, 11 >+ subps m14, x1, x2; x1 - x2 >+ >+ mulps m7, m7, m14 >+ addps m0, m0, m7 >+ >+ mulps m8, m8, m14 >+ addps m1, m1, m8 >+ >+ mulps m9, m9, m14 >+ addps m2, m2, m9 >+ >+ GATHER_LUT3D_INDICES 7, 8, 9, 13 >+ mulps m7, m7, x2 >+ addps m0, m0, m7 >+ >+ mulps m8, m8, x2 >+ addps m1, m1, m8 >+ >+ mulps m9, m9, x2 >+ addps m2, m2, m9 >+%endmacro >+ >+%macro INIT_DATA_PTR 3 >+ mov ptrq, [%2 + AVFrame.data + %3 * 8] >+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4] >+ imul tmpd, slice_startd >+ add ptrq, tmpq >+ mov %1, ptrq >+%endmacro >+ >+%macro INC_DATA_PTR 3 >+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4] >+ mov ptrq, %1 >+ add ptrq, tmpq >+ mov %1, ptrq >+%endmacro >+ >+%macro LOAD16 2 >+ mov ptrq, %2 >+ %if mmsize > 16 >+ movu xm%1, [ptrq + xq*2] >+ %else >+ movsd xm%1, [ptrq + xq*2] >+ %endif >+ %if cpuflag(avx2) >+ vpmovzxwd m%1, xm%1 >+ %else >+ %if mmsize > 16 >+ pshufd xm4, xm%1, (1 << 6 | 0 << 4 | 3 << 2 | 2 << 0) >+ pshufb xm%1, xm6 ; pb_shuffle16 >+ pshufb xm4, xm6 ; pb_shuffle16 >+ vinsertf128 m%1, m%1, xm4, 1 >+ %else >+ pshufd xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0) >+ pshuflw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >+ pshufhw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >+ %endif >+ %endif >+ cvtdq2ps m%1, m%1 >+ mulps m%1, m%1, m7 ; pd_65535_invf >+%endmacro >+ >+%macro STORE16 2 >+ mulps m%2, m%2, m5 ; [pd_65535f] >+ minps m%2, m%2, m5 ; [pd_65535f] >+ maxps m%2, m%2, m15 ; zero 
>+ cvttps2dq m%2, m%2 >+ %if mmsize > 16 >+ vextractf128 xm4, m%2, 1 >+ pshufb xm%2, xm6 ; [pb_lo_pack_shuffle16] >+ pshufb xm4, xm7 ; [pb_hi_pack_shuffle16] >+ por xm%2, xm4 >+ %else >+ pshuflw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >+ pshufhw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >+ pshufd xm%2, xm%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0) >+ %endif >+ mov ptrq, %1 >+ %if mmsize > 16 >+ movu [ptrq + xq*2], xm%2 >+ %else >+ movsd [ptrq + xq*2], xm%2 >+ %endif >+%endmacro >+ >+; 1 - interp method >+; 2 - format_name >+; 3 - depth >+; 4 - is float format >+%macro DEFINE_INTERP_FUNC 4 >+cglobal interp_%1_%2, 7, 13, 16, mmsize*16+(8*8), ctx, prelut, src_image, >dst_image, slice_start, slice_end, has_alpha, width, x, ptr, tmp, tmp2, tmp3 >+ ; store lut max and lutsize >+ mov tmpd, dword [ctxq + LUT3DContext.lutsize] >+ cvtsi2ss xm0, tmpd >+ mulss xm0, xm0, [pd_3f] >+ VBROADCASTSS m0, xm0 >+ mova lut3dsizem, m0 >+ sub tmpd, 1 >+ cvtsi2ss xm0, tmpd >+ VBROADCASTSS m0, xm0 >+ mova lut3dmaxm, m0 >+ >+ ; scale_r >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 0*4] >+ VBROADCASTSS m1, xm1 >+ mova scalerm, m1 >+ >+ ; scale_g >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 1*4] >+ VBROADCASTSS m1, xm1 >+ mova scalegm, m1 >+ >+ ; scale_b >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 2*4] >+ VBROADCASTSS m1, xm1 >+ mova scalebm, m1 >+ >+ ; store lutsize2 >+ cvtsi2ss xm0, dword [ctxq + LUT3DContext.lutsize2] >+ mulss xm0, xm0, [pd_3f] >+ VBROADCASTSS m0, xm0 >+ mova lut3dsize2m, m0 >+ >+ ; init prelut values >+ cmp prelutq, 0 >+ je %%skip_init_prelut >+ mov tmpd, dword [prelutq + Lut3DPreLut.size] >+ sub tmpd, 1 >+ cvtsi2ss xm0, tmpd >+ VBROADCASTSS m0, xm0 >+ mova prelutmaxm, m0 >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 0*4] >+ mova prelutminrm, m0 >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 1*4] >+ mova prelutmingm, m0 >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 2*4] >+ mova prelutminbm, m0 >+ VBROADCASTSS 
m0, dword [prelutq + Lut3DPreLut.scale + 0*4] >+ mova prelutscalerm, m0 >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 1*4] >+ mova prelutscalegm, m0 >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 2*4] >+ mova prelutscalebm, m0 >+ %%skip_init_prelut: >+ >+ mov widthd, [src_imageq + AVFrame.width] >+ >+ ; gbra pixel order >+ INIT_DATA_PTR srcrm, src_imageq, 2 >+ INIT_DATA_PTR srcgm, src_imageq, 0 >+ INIT_DATA_PTR srcbm, src_imageq, 1 >+ INIT_DATA_PTR srcam, src_imageq, 3 >+ >+ INIT_DATA_PTR dstrm, dst_imageq, 2 >+ INIT_DATA_PTR dstgm, dst_imageq, 0 >+ INIT_DATA_PTR dstbm, dst_imageq, 1 >+ INIT_DATA_PTR dstam, dst_imageq, 3 >+ >+ %%loop_y: >+ xor xq, xq >+ %%loop_x: >+ movu m14, [pd_1f] >+ xorps m15, m15, m15 >+ %if %4 ; float >+ mov ptrq, srcrm >+ movu m0, [ptrq + xq*4] >+ mov ptrq, srcgm >+ movu m1, [ptrq + xq*4] >+ mov ptrq, srcbm >+ movu m2, [ptrq + xq*4] >+ SANITIZE_F m0 >+ SANITIZE_F m1 >+ SANITIZE_F m2 >+ %else >+ ; constants for LOAD16 >+ movu m7, [pd_65535_invf] >+ %if notcpuflag(avx2) && mmsize >= 32 >+ movu xm6, [pb_shuffle16] >+ %endif >+ LOAD16 0, srcrm >+ LOAD16 1, srcgm >+ LOAD16 2, srcbm >+ %endif >+ >+ cmp prelutq, 0 >+ je %%skip_prelut >+ mova m13, prelutmaxm >+ APPLY_PRELUT 0, 0, prelutminrm, prelutscalerm >+ APPLY_PRELUT 1, 1, prelutmingm, prelutscalegm >+ APPLY_PRELUT 2, 2, prelutminbm, prelutscalebm >+ %%skip_prelut: >+ >+ mova m13, lut3dmaxm >+ APPLY_SCALE 0, scalerm >+ APPLY_SCALE 1, scalegm >+ APPLY_SCALE 2, scalebm >+ >+ interp_%1 >+ >+ %if %4 ; float >+ mov ptrq, dstrm >+ movu [ptrq + xq*4], m0 >+ mov ptrq, dstgm >+ movu [ptrq + xq*4], m1 >+ mov ptrq, dstbm >+ movu [ptrq + xq*4], m2 >+ cmp has_alphad, 0 >+ je %%skip_alphaf >+ mov ptrq, srcam >+ movu m0, [ptrq + xq*4] >+ mov ptrq, dstam >+ movu [ptrq + xq*4], m0 >+ %%skip_alphaf: >+ %else >+ ; constants for STORE16 >+ movu m5, [pd_65535f] >+ %if mmsize > 16 >+ movu xm6, [pb_lo_pack_shuffle16] >+ movu xm7, [pb_hi_pack_shuffle16] >+ %endif >+ >+ xorps m15, m15, m15 >+ 
STORE16 dstrm, 0 >+ STORE16 dstgm, 1 >+ STORE16 dstbm, 2 >+ >+ cmp has_alphad, 0 >+ je %%skip_alpha >+ %if mmsize > 16 >+ mov ptrq, srcam >+ movu xm0, [ptrq + xq*2] >+ mov ptrq, dstam >+ movu [ptrq + xq*2], xm0 >+ %else >+ mov ptrq, srcam >+ movsd xm0, [ptrq + xq*2] >+ mov ptrq, dstam >+ movsd [ptrq + xq*2], xm0 >+ %endif >+ >+ %%skip_alpha: >+ %endif >+ >+ add xq, mmsize/4 >+ cmp xd, widthd >+ jl %%loop_x >+ >+ INC_DATA_PTR srcrm, src_imageq, 2 >+ INC_DATA_PTR srcgm, src_imageq, 0 >+ INC_DATA_PTR srcbm, src_imageq, 1 >+ INC_DATA_PTR srcam, src_imageq, 3 >+ >+ INC_DATA_PTR dstrm, dst_imageq, 2 >+ INC_DATA_PTR dstgm, dst_imageq, 0 >+ INC_DATA_PTR dstbm, dst_imageq, 1 >+ INC_DATA_PTR dstam, dst_imageq, 3 >+ >+ inc slice_startd >+ cmp slice_startd, slice_endd >+ jl %%loop_y >+ >+ RET >+%endmacro >+%if ARCH_X86_64 >+ %if HAVE_AVX2_EXTERNAL >+ INIT_YMM avx2 >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 >+ %endif >+ %if HAVE_AVX_EXTERNAL >+ INIT_YMM avx >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 >+ %endif >+ INIT_XMM sse2 >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 >+%endif >\ No newline at end of file >diff --git a/libavfilter/x86/vf_lut3d_init.c b/libavfilter/x86/vf_lut3d_init.c >new file mode 100644 >index 0000000000..9b9b36e4af >--- /dev/null >+++ b/libavfilter/x86/vf_lut3d_init.c >@@ -0,0 +1,88 @@ >+/* >+ * Copyright (c) 2021 Mark Reid <mindm...@gmail.com> >+ * >+ * This file is part of FFmpeg. >+ * >+ * FFmpeg is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU Lesser General Public >+ * License as published by the Free Software Foundation; either >+ * version 2.1 of the License, or (at your option) any later version. 
>+ * >+ * FFmpeg is distributed in the hope that it will be useful, >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >+ * Lesser General Public License for more details. >+ * >+ * You should have received a copy of the GNU Lesser General Public >+ * License along with FFmpeg; if not, write to the Free Software >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 >USA >+ */ >+ >+#include "libavutil/attributes.h" >+#include "libavutil/cpu.h" >+#include "libavutil/x86/cpu.h" >+#include "libavfilter/lut3d.h" >+ >+#define DEFINE_INTERP_FUNC(name, format, opt) > \ >+void ff_interp_##name##_##format##_##opt(LUT3DContext *lut3d, Lut3DPreLut >*prelut, AVFrame *src, AVFrame *dst, int slice_start, int slice_end, int >has_alpha); \ >+static int interp_##name##_##format##_##opt(AVFilterContext *ctx, void *arg, >int jobnr, int nb_jobs) \ >+{ > \ >+ LUT3DContext *lut3d = ctx->priv; > \ >+ Lut3DPreLut *prelut = lut3d->prelut.size > 0? 
&lut3d->prelut: NULL; > \ >+ ThreadData *td = arg; > \ >+ AVFrame *in = td->in; > \ >+ AVFrame *out = td->out; > \ >+ int has_alpha = in->linesize[3] && out != in; > \ >+ int slice_start = (in->height * jobnr ) / nb_jobs; > \ >+ int slice_end = (in->height * (jobnr+1)) / nb_jobs; > \ >+ ff_interp_##name##_##format##_##opt(lut3d, prelut, in, out, slice_start, >slice_end, has_alpha); \ >+ return 0; > \ >+} >+ >+#if ARCH_X86_64 >+#if HAVE_AVX2_EXTERNAL >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, avx2) >+ DEFINE_INTERP_FUNC(tetrahedral, p16, avx2) >+#endif >+#if HAVE_AVX_EXTERNAL >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, avx) >+ DEFINE_INTERP_FUNC(tetrahedral, p16, avx) >+#endif >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, sse2) >+ DEFINE_INTERP_FUNC(tetrahedral, p16, sse2) >+#endif >+ >+ >+av_cold void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor >*desc) >+{ >+ int cpu_flags = av_get_cpu_flags(); >+ int planar = desc->flags & AV_PIX_FMT_FLAG_PLANAR; >+ int isfloat = desc->flags & AV_PIX_FMT_FLAG_FLOAT; >+ int depth = desc->comp[0].depth; >+ >+#if ARCH_X86_64 >+ if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interpolation == >INTERPOLATE_TETRAHEDRAL && planar) { >+#if HAVE_AVX2_EXTERNAL >+ if (isfloat && planar) { >+ s->interp = interp_tetrahedral_pf32_avx2; >+ } else if (depth == 16) { >+ s->interp = interp_tetrahedral_p16_avx2; >+ } >+#endif >+ } else if (EXTERNAL_AVX_FAST(cpu_flags) && s->interpolation == >INTERPOLATE_TETRAHEDRAL && planar) { >+#if HAVE_AVX_EXTERNAL >+ if (isfloat) { >+ s->interp = interp_tetrahedral_pf32_avx; >+ } else if (depth == 16) { >+ s->interp = interp_tetrahedral_p16_avx; >+ } >+#endif >+ } else if (EXTERNAL_SSE2(cpu_flags) && s->interpolation == >INTERPOLATE_TETRAHEDRAL && planar) { >+ if (isfloat) { >+ s->interp = interp_tetrahedral_pf32_sse2; >+ } else if (depth == 16) { >+ s->interp = interp_tetrahedral_p16_sse2; >+ } >+ } >+#endif >+} >-- >2.31.1.windows.1 > >_______________________________________________ >ffmpeg-devel mailing list 
>ffmpeg-devel@ffmpeg.org >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > >To unsubscribe, visit link above, or email >ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".