On Wed, Sep 29, 2021 at 10:27 AM Mark Reid <mindm...@gmail.com> wrote:
> > > On Tue, Sep 28, 2021 at 6:38 PM chen <chenm...@163.com> wrote: > >> Hello, >> >> >> Excuse me, how about FMADD on AVX2 platform? >> >> >> For example >> + mulps m7, m7, m14 >> + addps m0, m0, m7 >> >> ==> >> >> >> fmadd231ps m0,m7,m14 >> >> > Interesting, does having AVX2 guarantee having FMA instructions? > > I'm still not 100% certain all AVX2 CPUs have FMA instructions, so I'll add a cpuflags check for FMA too. I also came up with a faster way to calculate x0,x1,x2 without the lookup table. I will send a new patch. > >> Regards, >> Min Chen >> >> >> 2021-09-29 09:18:05,mindm...@gmail.com >> >From: Mark Reid <mindm...@gmail.com> >> > >> >Only supports float and 16-bit planar formats at the moment. >> >Mainly focused on AVX and AVX2 optimizations, but SSE2 does seem to offer >> some >> >speed gains. >> > >> >f32 1920x1080 1 thread with prelut >> >c impl >> >1389936500 UNITS in lut3d->interp, 1 runs, 0 skips >> >1425800240 UNITS in lut3d->interp, 2 runs, 0 skips >> >1433312777 UNITS in lut3d->interp, 4 runs, 0 skips >> >1443346798 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >sse2 >> >948662320 UNITS in lut3d->interp, 1 runs, 0 skips >> >1101247540 UNITS in lut3d->interp, 2 runs, 0 skips >> >1050645695 UNITS in lut3d->interp, 4 runs, 0 skips >> >1041102937 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >avx >> >633837000 UNITS in lut3d->interp, 1 runs, 0 skips >> >669452850 UNITS in lut3d->interp, 2 runs, 0 skips >> >650716580 UNITS in lut3d->interp, 4 runs, 0 skips >> >644698550 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >avx2 >> >354940020 UNITS in lut3d->interp, 1 runs, 0 skips >> >362384340 UNITS in lut3d->interp, 2 runs, 0 skips >> >356799020 UNITS in lut3d->interp, 4 runs, 0 skips >> >357276815 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >gbrap16 1920x1080 1 thread with prelut >> >c impl >> >1445071160 UNITS in lut3d->interp, 1 runs, 0 skips >> >1477959120 UNITS in lut3d->interp, 2 runs, 0 skips >> >1472102670 UNITS in lut3d->interp, 4 runs, 0 
skips >> >1462579330 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >sse2 >> >1035437580 UNITS in lut3d->interp, 1 runs, 0 skips >> >1050139710 UNITS in lut3d->interp, 2 runs, 0 skips >> >1070147205 UNITS in lut3d->interp, 4 runs, 0 skips >> >1064583037 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >avx >> >678089880 UNITS in lut3d->interp, 1 runs, 0 skips >> >679112485 UNITS in lut3d->interp, 2 runs, 0 skips >> >695527212 UNITS in lut3d->interp, 4 runs, 0 skips >> >691300053 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >avx2 >> >372671340 UNITS in lut3d->interp, 1 runs, 0 skips >> >373449870 UNITS in lut3d->interp, 2 runs, 0 skips >> >383725625 UNITS in lut3d->interp, 4 runs, 0 skips >> >382860848 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >--- >> > libavfilter/lut3d.h | 83 ++++ >> > libavfilter/vf_lut3d.c | 61 +-- >> > libavfilter/x86/Makefile | 2 + >> > libavfilter/x86/vf_lut3d.asm | 757 ++++++++++++++++++++++++++++++++ >> > libavfilter/x86/vf_lut3d_init.c | 88 ++++ >> > 5 files changed, 935 insertions(+), 56 deletions(-) >> > create mode 100644 libavfilter/lut3d.h >> > create mode 100644 libavfilter/x86/vf_lut3d.asm >> > create mode 100644 libavfilter/x86/vf_lut3d_init.c >> > >> >diff --git a/libavfilter/lut3d.h b/libavfilter/lut3d.h >> >new file mode 100644 >> >index 0000000000..ded2a036a5 >> >--- /dev/null >> >+++ b/libavfilter/lut3d.h >> >@@ -0,0 +1,83 @@ >> >+/* >> >+ * Copyright (c) 2013 Clément Bœsch >> >+ * Copyright (c) 2018 Paul B Mahol >> >+ * >> >+ * This file is part of FFmpeg. >> >+ * >> >+ * FFmpeg is free software; you can redistribute it and/or >> >+ * modify it under the terms of the GNU Lesser General Public >> >+ * License as published by the Free Software Foundation; either >> >+ * version 2.1 of the License, or (at your option) any later version. 
>> >+ * >> >+ * FFmpeg is distributed in the hope that it will be useful, >> >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of >> >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >+ * Lesser General Public License for more details. >> >+ * >> >+ * You should have received a copy of the GNU Lesser General Public >> >+ * License along with FFmpeg; if not, write to the Free Software >> >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA >> 02110-1301 USA >> >+ */ >> >+#ifndef AVFILTER_LUT3D_H >> >+#define AVFILTER_LUT3D_H >> >+ >> >+#include "libavutil/pixdesc.h" >> >+#include "framesync.h" >> >+#include "avfilter.h" >> >+ >> >+enum interp_mode { >> >+ INTERPOLATE_NEAREST, >> >+ INTERPOLATE_TRILINEAR, >> >+ INTERPOLATE_TETRAHEDRAL, >> >+ INTERPOLATE_PYRAMID, >> >+ INTERPOLATE_PRISM, >> >+ NB_INTERP_MODE >> >+}; >> >+ >> >+struct rgbvec { >> >+ float r, g, b; >> >+}; >> >+ >> >+/* 3D LUT don't often go up to level 32, but it is common to have a >> Hald CLUT >> >+ * of 512x512 (64x64x64) */ >> >+#define MAX_LEVEL 256 >> >+#define PRELUT_SIZE 65536 >> >+ >> >+typedef struct Lut3DPreLut { >> >+ int size; >> >+ float min[3]; >> >+ float max[3]; >> >+ float scale[3]; >> >+ float* lut[3]; >> >+} Lut3DPreLut; >> >+ >> >+typedef struct LUT3DContext { >> >+ const AVClass *class; >> >+ struct rgbvec *lut; >> >+ int lutsize; >> >+ int lutsize2; >> >+ struct rgbvec scale; >> >+ int interpolation; ///<interp_mode >> >+ char *file; >> >+ uint8_t rgba_map[4]; >> >+ int step; >> >+ avfilter_action_func *interp; >> >+ Lut3DPreLut prelut; >> >+#if CONFIG_HALDCLUT_FILTER >> >+ uint8_t clut_rgba_map[4]; >> >+ int clut_step; >> >+ int clut_bits; >> >+ int clut_planar; >> >+ int clut_float; >> >+ int clut_width; >> >+ FFFrameSync fs; >> >+#endif >> >+} LUT3DContext; >> >+ >> >+typedef struct ThreadData { >> >+ AVFrame *in, *out; >> >+} ThreadData; >> >+ >> >+void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc); >> 
>+ >> >+#endif /* AVFILTER_LUT3D_H */ >> >\ No newline at end of file >> >diff --git a/libavfilter/vf_lut3d.c b/libavfilter/vf_lut3d.c >> >index 9fbda833b9..1fd0af06db 100644 >> >--- a/libavfilter/vf_lut3d.c >> >+++ b/libavfilter/vf_lut3d.c >> >@@ -31,73 +31,18 @@ >> > #include "libavutil/intreadwrite.h" >> > #include "libavutil/intfloat.h" >> > #include "libavutil/avassert.h" >> >-#include "libavutil/pixdesc.h" >> > #include "libavutil/avstring.h" >> >-#include "avfilter.h" >> > #include "drawutils.h" >> > #include "formats.h" >> >-#include "framesync.h" >> > #include "internal.h" >> > #include "video.h" >> >+#include "lut3d.h" >> > >> > #define R 0 >> > #define G 1 >> > #define B 2 >> > #define A 3 >> > >> >-enum interp_mode { >> >- INTERPOLATE_NEAREST, >> >- INTERPOLATE_TRILINEAR, >> >- INTERPOLATE_TETRAHEDRAL, >> >- INTERPOLATE_PYRAMID, >> >- INTERPOLATE_PRISM, >> >- NB_INTERP_MODE >> >-}; >> >- >> >-struct rgbvec { >> >- float r, g, b; >> >-}; >> >- >> >-/* 3D LUT don't often go up to level 32, but it is common to have a >> Hald CLUT >> >- * of 512x512 (64x64x64) */ >> >-#define MAX_LEVEL 256 >> >-#define PRELUT_SIZE 65536 >> >- >> >-typedef struct Lut3DPreLut { >> >- int size; >> >- float min[3]; >> >- float max[3]; >> >- float scale[3]; >> >- float* lut[3]; >> >-} Lut3DPreLut; >> >- >> >-typedef struct LUT3DContext { >> >- const AVClass *class; >> >- int interpolation; ///<interp_mode >> >- char *file; >> >- uint8_t rgba_map[4]; >> >- int step; >> >- avfilter_action_func *interp; >> >- struct rgbvec scale; >> >- struct rgbvec *lut; >> >- int lutsize; >> >- int lutsize2; >> >- Lut3DPreLut prelut; >> >-#if CONFIG_HALDCLUT_FILTER >> >- uint8_t clut_rgba_map[4]; >> >- int clut_step; >> >- int clut_bits; >> >- int clut_planar; >> >- int clut_float; >> >- int clut_width; >> >- FFFrameSync fs; >> >-#endif >> >-} LUT3DContext; >> >- >> >-typedef struct ThreadData { >> >- AVFrame *in, *out; >> >-} ThreadData; >> >- >> > #define OFFSET(x) offsetof(LUT3DContext, x) >> 
> #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM >> > #define TFLAGS >> AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM >> >@@ -1207,6 +1152,10 @@ static int config_input(AVFilterLink *inlink) >> > av_assert0(0); >> > } >> > >> >+ if (ARCH_X86) { >> >+ ff_lut3d_init_x86(lut3d, desc); >> >+ } >> >+ >> > return 0; >> > } >> > >> >diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile >> >index 016a5b3511..a29941eaeb 100644 >> >--- a/libavfilter/x86/Makefile >> >+++ b/libavfilter/x86/Makefile >> >@@ -17,6 +17,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER) += >> x86/vf_hqdn3d_init.o >> > OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o >> > OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_tinterlace_init.o >> > OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter_init.o >> >+OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d_init.o >> > OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += >> x86/vf_maskedclamp_init.o >> > OBJS-$(CONFIG_MASKEDMERGE_FILTER) += >> x86/vf_maskedmerge_init.o >> > OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o >> >@@ -57,6 +58,7 @@ X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER) += >> x86/vf_hqdn3d.o >> > X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o >> > X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o >> > X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o >> >+X86ASM-OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d.o >> > X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp.o >> > X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o >> > X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o >> >diff --git a/libavfilter/x86/vf_lut3d.asm b/libavfilter/x86/vf_lut3d.asm >> >new file mode 100644 >> >index 0000000000..b3d7c3962b >> >--- /dev/null >> >+++ b/libavfilter/x86/vf_lut3d.asm >> >@@ -0,0 +1,757 @@ >> >> >+;***************************************************************************** >> >+;* x86-optimized functions for lut3d filter >> >+;* >> >+;* Copyright (c) 2021 Mark Reid <mindm...@gmail.com> 
>> >+;* >> >+;* This file is part of FFmpeg. >> >+;* >> >+;* FFmpeg is free software; you can redistribute it and/or >> >+;* modify it under the terms of the GNU Lesser General Public >> >+;* License as published by the Free Software Foundation; either >> >+;* version 2.1 of the License, or (at your option) any later version. >> >+;* >> >+;* FFmpeg is distributed in the hope that it will be useful, >> >+;* but WITHOUT ANY WARRANTY; without even the implied warranty of >> >+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >+;* Lesser General Public License for more details. >> >+;* >> >+;* You should have received a copy of the GNU Lesser General Public >> >+;* License along with FFmpeg; if not, write to the Free Software >> >+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA >> 02110-1301 USA >> >> >+;****************************************************************************** >> >+ >> >+%include "libavutil/x86/x86util.asm" >> >+ >> >+SECTION_RODATA >> >+pd_1f: times 8 dd 1.0 >> >+pd_3f: times 8 dd 3.0 >> >+ >> >+; used to limit rshifts as they are more expensive in avx1 >> >+pd_001: times 8 dd 001b >> >+pd_010: times 8 dd 010b >> >+pd_100: times 8 dd 100b >> >+ >> >+pd_65535f: times 8 dd 65535.0 >> >+pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0 >> >+ >> >+pb_shuffle16: db 0, 1, 0x80, 0x80, \ >> >+ 2, 3, 0x80, 0x80, \ >> >+ 4, 5, 0x80, 0x80, \ >> >+ 6, 7, 0x80, 0x80 >> >+ >> >+pb_lo_pack_shuffle16: db 0, 1, 4, 5, \ >> >+ 8, 9, 12, 13, \ >> >+ 0x80, 0x80, 0x80, 0x80, \ >> >+ 0x80, 0x80, 0x80, 0x80 >> >+ >> >+pb_hi_pack_shuffle16: db 0x80, 0x80, 0x80, 0x80, \ >> >+ 0x80, 0x80, 0x80, 0x80, \ >> >+ 0, 1, 4, 5, \ >> >+ 8, 9, 12, 13 >> >+ >> >+; tetrahedral table -------------------------------------------- >> >+; name: x2| x1| x0| cxxb| cxxa >> >+; values: r 00| r 00| r 00| c011 011| c001 001 >> >+; g 01| g 01| g 01| c101 101| c010 010 >> >+; b 10| b 10| b 10| c110 110| c100 100 >> >+ >> >+; g>b b | g | r | >> c110 | c100 >> 
>+pd_tetra_table0: times 8 dd (10b << 10) | (01b << 8) | (00b << 6) | >> (110b << 3) | 100b >> >+; r>b g | b | r | >> c101 | c100 >> >+pd_tetra_table1: times 8 dd (01b << 10) | (10b << 8) | (00b << 6) | >> (101b << 3) | 100b >> >+; else g | r | b | >> c101 | c001 >> >+pd_tetra_table2: times 8 dd (01b << 10) | (00b << 8) | (10b << 6) | >> (101b << 3) | 001b >> >+; b>g r | g | b | >> c011 | c001 >> >+pd_tetra_table3: times 8 dd (00b << 10) | (01b << 8) | (10b << 6) | >> (011b << 3) | 001b >> >+; b>r r | b | g | >> c011 | c010 >> >+pd_tetra_table4: times 8 dd (00b << 10) | (10b << 8) | (01b << 6) | >> (011b << 3) | 010b >> >+; else b | r | g | >> c110 | c010 >> >+pd_tetra_table5: times 8 dd (10b << 10) | (00b << 8) | (01b << 6) | >> (110b << 3) | 010b >> >+ >> >+SECTION .text >> >+ >> >+struc Lut3DPreLut >> >+ .size: resd 1 >> >+ .min: resd 3 >> >+ .max: resd 3 >> >+ .scale: resd 3 >> >+ .lut: resq 3 >> >+endstruc >> >+ >> >+struc LUT3DContext >> >+ .class: resq 1 >> >+ .lut: resq 1 >> >+ .lutsize: resd 1 >> >+ .lutsize2: resd 1 >> >+ .scale: resd 3 >> >+endstruc >> >+ >> >+%define AV_NUM_DATA_POINTERS 8 >> >+ >> >+struc AVFrame >> >+ .data: resq AV_NUM_DATA_POINTERS >> >+ .linesize: resd AV_NUM_DATA_POINTERS >> >+ .extended_data: resq 1 >> >+ .width: resd 1 >> >+ .height: resd 1 >> >+endstruc >> >+ >> >+%define rm rsp >> >+%define gm rsp+mmsize >> >+%define bm rsp+(mmsize*2) >> >+ >> >+%define lut3dsizem [rsp+mmsize*3] >> >+%define lut3dsize2m [rsp+mmsize*4] >> >+%define lut3dmaxm [rsp+mmsize*5] >> >+%define prelutmaxm [rsp+mmsize*6] >> >+ >> >+%define scalerm [rsp+mmsize*7] >> >+%define scalegm [rsp+mmsize*8] >> >+%define scalebm [rsp+mmsize*9] >> >+ >> >+%define prelutminrm [rsp+mmsize*10] >> >+%define prelutmingm [rsp+mmsize*11] >> >+%define prelutminbm [rsp+mmsize*12] >> >+ >> >+%define prelutscalerm [rsp+mmsize*13] >> >+%define prelutscalegm [rsp+mmsize*14] >> >+%define prelutscalebm [rsp+mmsize*15] >> >+ >> >+; data pointers >> >+%define srcrm [rsp+mmsize*16 + 
0] >> >+%define srcgm [rsp+mmsize*16 + 8] >> >+%define srcbm [rsp+mmsize*16 + 16] >> >+%define srcam [rsp+mmsize*16 + 24] >> >+ >> >+%define dstrm [rsp+mmsize*16 + 32] >> >+%define dstgm [rsp+mmsize*16 + 40] >> >+%define dstbm [rsp+mmsize*16 + 48] >> >+%define dstam [rsp+mmsize*16 + 56] >> >+ >> >+%macro FETCH_PRELUT_PN 3 >> >+ mov tmp2d, [rm + %3] >> >+ mov tmp3d, [gm + %3] >> >+ movss xm%1, [tmpq + tmp2q*4] >> >+ movss xm%2, [tmpq + tmp3q*4] >> >+ movss [rm + %3], xm%1 >> >+ movss [gm + %3], xm%2 >> >+%endmacro >> >+ >> >+; 1 - p >> >+; 2 - n >> >+; 3 - p indices >> >+; 4 - n indices >> >+%macro GATHER_PRELUT 4 >> >+ %if cpuflag(avx2) >> >+ vpcmpeqb m7, m7 >> >+ vgatherdps m%1, [tmpq + m%3*4], m7 ; p >> >+ vpcmpeqb m9, m9 >> >+ vgatherdps m%2, [tmpq + m%4*4], m9 ; n >> >+ %else >> >+ mova [rm], m%3 >> >+ mova [gm], m%4 >> >+ FETCH_PRELUT_PN %1, %2, 0 >> >+ FETCH_PRELUT_PN %1, %2, 4 >> >+ FETCH_PRELUT_PN %1, %2, 8 >> >+ FETCH_PRELUT_PN %1, %2, 12 >> >+ %if mmsize > 16 >> >+ FETCH_PRELUT_PN %1, %2, 16 >> >+ FETCH_PRELUT_PN %1, %2, 20 >> >+ FETCH_PRELUT_PN %1, %2, 24 >> >+ FETCH_PRELUT_PN %1, %2, 28 >> >+ %endif >> >+ movu m%1, [rm] >> >+ movu m%2, [gm] >> >+ %endif >> >+%endmacro >> >+ >> >+%macro FLOORPS 2 >> >+ %if mmsize > 16 >> >+ vroundps %1, %2, 0x01 >> >+ %else >> >+ cvttps2dq %1, %2 >> >+ cvtdq2ps %1, %1 >> >+ %endif >> >+%endmacro >> >+ >> >+; 1 - dst >> >+; 2 - index >> >+; 3 - min >> >+; 4 - scale >> >+; assumes lut max m13, m14 1.0f, zero m15 >> >+%macro APPLY_PRELUT 4 >> >+ ; scale >> >+ subps m5, m%1, %3 ; v - min >> >+ mulps m5, m5, %4 ; v * scale >> >+ ; clamp >> >+ maxps m5, m5, m15 ; max zero >> >+ minps m5, m5, m13 ; min lut max >> >+ >> >+ FLOORPS m3, m5 ; prev index >> >+ subps m5, m5, m3 ; d >> >+ addps m4, m3, m14 ; p+1 = n index >> >+ minps m4, m4, m13 ; clamp n idex >> >+ >> >+ mov tmpq, [prelutq + Lut3DPreLut.lut + %2*8] >> >+ cvttps2dq m6, m3 >> >+ cvttps2dq m10, m4 >> >+ GATHER_PRELUT 3, 4, 6, 10 >> >+ >> >+ ; lerp >> >+ subps m8, m4, m3 
>> >+ mulps m8, m8, m5 >> >+ addps m%1, m8, m3 >> >+%endmacro >> >+ >> >+; 1 - dst >> >+; 2 - scale >> >+; assumes lut max m13, zero m15 >> >+%macro APPLY_SCALE 2 >> >+ mulps m%1, m%1, %2 >> >+ maxps m%1, m%1, m15 >> >+ minps m%1, m%1, m13 >> >+%endmacro >> >+ >> >+%macro BLEND 4 >> >+%if mmsize > 16 >> >+ vblendvps %1, %2, %3, %4 >> >+%else >> >+ %ifidni %1,%2 >> >+ %error operand 1 must not equal operand 2 >> >+ %endif >> >+ %ifidni %1,%3 >> >+ %error operand 1 must not equal operand 3 >> >+ %endif >> >+ mova %1, %2 >> >+ xorps %1, %3 >> >+ andps %1, %4 >> >+ xorps %1, %2 >> >+%endif >> >+%endmacro >> >+ >> >+; sets nans to zere, +inf -inf handled later by min/max clamps >> >+%macro SANITIZE_F 1 >> >+ cmpps m5, %1, %1, 0x0 ; nan == nan = False >> >+ %if mmsize <= 16 >> >+ mova m6, %1 >> >+ BLEND %1, m15, m6, m5 >> >+ %else >> >+ BLEND %1, m15, %1, m5 >> >+ %endif >> >+%endmacro >> >+ >> >+%macro ADD3 4 >> >+ addps %1, %2, %3 >> >+ addps %1, %1, %4 >> >+%endmacro >> >+ >> >+%macro CMP_EQUAL 3 >> >+%if cpuflag(avx2) >> >+ vpcmpeqd %1, %2, %3 >> >+%elif cpuflag(avx) >> >+ cmpps %1, %2, %3, 0x0 >> >+%else >> >+ pcmpeqd %1, %2, %3 >> >+%endif >> >+%endmacro >> >+ >> >+%macro SHIFT_RIGHT 2 >> >+%if mmsize <= 16 >> >+ psrld xm%1, %2 >> >+%elif cpuflag(avx2) >> >+ vpsrld m%1, m%1, %2 >> >+%else >> >+ vextractf128 xm15, m%1, 1 >> >+ psrld xm%1, %2 >> >+ psrld xm15, %2 >> >+ vinsertf128 m%1, m%1, xm15, 1 >> >+%endif >> >+%endmacro >> >+ >> >+%macro FETCH_LUT3D_RGB 4 >> >+ mov tmp2d, [rm + %4] >> >+ movss xm%1, [tmpq + tmp2q*4 + 0] >> >+ movss xm%2, [tmpq + tmp2q*4 + 4] >> >+ movss xm%3, [tmpq + tmp2q*4 + 8] >> >+ movss [rm + %4], xm%1 >> >+ movss [gm + %4], xm%2 >> >+ movss [bm + %4], xm%3 >> >+%endmacro >> >+ >> >+; 1 - dstr >> >+; 2 - dstg >> >+; 3 - dstb >> >+; 4 - indices >> >+%macro GATHER_LUT3D_INDICES 4 >> >+%if cpuflag(avx2) >> >+ vpcmpeqb m3, m3 >> >+ vgatherdps m%1, [tmpq + m%4*4 + 0], m3 >> >+ vpcmpeqb m14, m14 >> >+ vgatherdps m%2, [tmpq + m%4*4 + 4], m14 >> >+ 
vpcmpeqb m15, m15 >> >+ vgatherdps m%3, [tmpq + m%4*4 + 8], m15 >> >+%else >> >+ movu [rm], m%4 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 0 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 4 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 8 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 12 >> >+%if mmsize > 16 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 16 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 20 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 24 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 28 >> >+%endif >> >+ movu m%1, [rm] >> >+ movu m%2, [gm] >> >+ movu m%3, [bm] >> >+%endif >> >+%endmacro >> >+ >> >+%macro interp_tetrahedral 0 >> >+ %define d_r m0 >> >+ %define d_g m1 >> >+ %define d_b m2 >> >+ >> >+ %define prev_r m3 >> >+ %define prev_g m4 >> >+ %define prev_b m5 >> >+ >> >+ %define next_r m6 >> >+ %define next_g m7 >> >+ %define next_b m8 >> >+ >> >+ %define x0 m4 >> >+ %define x1 m5 >> >+ %define x2 m6 >> >+ >> >+ ; setup prev index >> >+ FLOORPS prev_r, m0 >> >+ FLOORPS prev_g, m1 >> >+ FLOORPS prev_b, m2 >> >+ >> >+ ; setup deltas >> >+ subps d_r, m0, prev_r >> >+ subps d_g, m1, prev_g >> >+ subps d_b, m2, prev_b >> >+ >> >+ ; calculate select mask m9 >> >+ movu m6, [pd_tetra_table2] >> >+ cmpps m7, d_r, d_b, 0x1E ; r > b CMP_GT_OQ >> >+ BLEND m10, m6, [pd_tetra_table1], m7 >> >+ cmpps m7, d_g, d_b, 0x1E ; g > b CMP_GT_OQ >> >+ BLEND m6, m10, [pd_tetra_table0], m7 >> >+ >> >+ movu m10, [pd_tetra_table5] >> >+ cmpps m7, d_b, d_r, 0x1E ; b > r CMP_GT_OQ >> >+ BLEND m9, m10, [pd_tetra_table4], m7 >> >+ cmpps m7, d_b, d_g, 0x1E ; b > g CMP_GT_OQ >> >+ BLEND m10, m9, [pd_tetra_table3], m7 >> >+ >> >+ cmpps m7, d_r, d_g, 0x1E ; r > g CMP_GT_OQ >> >+ BLEND m9, m10, m6, m7 >> >+ >> >+ ; setup next index >> >+ addps next_r, prev_r, m14 ; +1 >> >+ minps next_r, next_r, m13 ; clamp lutmax >> >+ >> >+ addps next_g, prev_g, m14 ; +1 >> >+ minps next_g, next_g, m13 ; clamp lutmax >> >+ >> >+ addps next_b, prev_b, m14 ; +1 >> >+ minps next_b, next_b, m13 ; clamp lutmax >> >+ >> >+ ; prescale indices >> >+ mulps prev_r, prev_r, lut3dsize2m >> >+ mulps 
next_r, next_r, lut3dsize2m >> >+ >> >+ mulps prev_g, prev_g, lut3dsizem >> >+ mulps next_g, next_g, lut3dsizem >> >+ >> >+ mulps prev_b, prev_b, [pd_3f] >> >+ mulps next_b, next_b, [pd_3f] >> >+ >> >+ movu m14, [pd_001] >> >+ >> >+ ; cxxa m10 >> >+ ; b >> >+ andps m15, m9, m14 >> >+ CMP_EQUAL m15, m15, m14 >> >+ BLEND m10, prev_b, next_b, m15 >> >+ >> >+ ; g >> >+ andps m15, m9, [pd_010] >> >+ CMP_EQUAL m15, m15, [pd_010] >> >+ BLEND m12, prev_g, next_g, m15 >> >+ >> >+ ; r >> >+ andps m15, m9, [pd_100] >> >+ CMP_EQUAL m15, m15, [pd_100] >> >+ BLEND m13, prev_r, next_r, m15 >> >+ >> >+ ADD3 m10, m10, m12, m13 >> >+ >> >+ SHIFT_RIGHT 9, 3 ; 3 >> >+ >> >+ ; cxxb m11; >> >+ ; b >> >+ andps m15, m9, m14 >> >+ CMP_EQUAL m15, m15, m14 >> >+ BLEND m11, prev_b, next_b, m15 >> >+ >> >+ ; g >> >+ andps m15, m9, [pd_010] >> >+ CMP_EQUAL m15, m15, [pd_010] >> >+ BLEND m12, prev_g, next_g, m15 >> >+ >> >+ ; r >> >+ andps m15, m9, [pd_100] >> >+ CMP_EQUAL m15, m15, [pd_100] >> >+ BLEND m13, prev_r, next_r, m15 >> >+ >> >+ ADD3 m11, m11, m12, m13 >> >+ >> >+ ; c000 m12; >> >+ ADD3 m12, prev_r, prev_g, prev_b >> >+ >> >+ ; c111 m13; >> >+ ADD3 m13, next_r, next_g, next_b >> >+ >> >+ SHIFT_RIGHT 9, 3 ; 6 >> >+ >> >+ ; x0, m4 >> >+ andps m15, m9, m14 >> >+ CMP_EQUAL m15, m15, m14 >> >+ BLEND m7, d_r, d_g, m15 ; r,g >> >+ >> >+ andps m15, m9, [pd_010] >> >+ CMP_EQUAL m15, m15, [pd_010] >> >+ BLEND x0, m7, d_b, m15 ; b >> >+ >> >+ ; x1, m5 >> >+ andps m15, m9, [pd_100] >> >+ CMP_EQUAL m15, m15, [pd_100] >> >+ BLEND m7, d_r, d_g, m15 ; r,g >> >+ >> >+ SHIFT_RIGHT 9, 3 ; 9 >> >+ >> >+ andps m15, m9, m14 >> >+ CMP_EQUAL m15, m15, m14 >> >+ BLEND x1, m7, d_b, m15 ; b >> >+ >> >+ ; x2, m6 >> >+ andps m15, m9, [pd_010] >> >+ CMP_EQUAL m15, m15, [pd_010] >> >+ BLEND m7, d_r, d_g, m15 ; r,g >> >+ >> >+ andps m15, m9, [pd_100] >> >+ CMP_EQUAL m15, m15, [pd_100] >> >+ BLEND x2, m7, d_b, m15 ; b >> >+ >> >+ ; convert indices to integer >> >+ cvttps2dq m12, m12 >> >+ cvttps2dq m10, m10 >> >+ 
cvttps2dq m11, m11 >> >+ cvttps2dq m13, m13 >> >+ >> >+ ; now the gathering festival >> >+ mov tmpq, [ctxq + LUT3DContext.lut] >> >+ >> >+ GATHER_LUT3D_INDICES 0, 1, 2, 12 >> >+ movu m14, [pd_1f] >> >+ subps m14, m14, x0; 1 - x0 >> >+ >> >+ mulps m0, m0, m14 >> >+ mulps m1, m1, m14 >> >+ mulps m2, m2, m14 >> >+ >> >+ GATHER_LUT3D_INDICES 7, 8, 9, 10 >> >+ subps m14, x0, x1; x0 - x1 >> >+ mulps m7, m7, m14 >> >+ addps m0, m0, m7 >> >+ >> >+ mulps m8, m8, m14 >> >+ addps m1, m1, m8 >> >+ >> >+ mulps m9, m9, m14 >> >+ addps m2, m2, m9 >> >+ >> >+ GATHER_LUT3D_INDICES 7, 8, 9, 11 >> >+ subps m14, x1, x2; x1 - x2 >> >+ >> >+ mulps m7, m7, m14 >> >+ addps m0, m0, m7 >> >+ >> >+ mulps m8, m8, m14 >> >+ addps m1, m1, m8 >> >+ >> >+ mulps m9, m9, m14 >> >+ addps m2, m2, m9 >> >+ >> >+ GATHER_LUT3D_INDICES 7, 8, 9, 13 >> >+ mulps m7, m7, x2 >> >+ addps m0, m0, m7 >> >+ >> >+ mulps m8, m8, x2 >> >+ addps m1, m1, m8 >> >+ >> >+ mulps m9, m9, x2 >> >+ addps m2, m2, m9 >> >+%endmacro >> >+ >> >+%macro INIT_DATA_PTR 3 >> >+ mov ptrq, [%2 + AVFrame.data + %3 * 8] >> >+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4] >> >+ imul tmpd, slice_startd >> >+ add ptrq, tmpq >> >+ mov %1, ptrq >> >+%endmacro >> >+ >> >+%macro INC_DATA_PTR 3 >> >+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4] >> >+ mov ptrq, %1 >> >+ add ptrq, tmpq >> >+ mov %1, ptrq >> >+%endmacro >> >+ >> >+%macro LOAD16 2 >> >+ mov ptrq, %2 >> >+ %if mmsize > 16 >> >+ movu xm%1, [ptrq + xq*2] >> >+ %else >> >+ movsd xm%1, [ptrq + xq*2] >> >+ %endif >> >+ %if cpuflag(avx2) >> >+ vpmovzxwd m%1, xm%1 >> >+ %else >> >+ %if mmsize > 16 >> >+ pshufd xm4, xm%1, (1 << 6 | 0 << 4 | 3 << 2 | 2 << 0) >> >+ pshufb xm%1, xm6 ; pb_shuffle16 >> >+ pshufb xm4, xm6 ; pb_shuffle16 >> >+ vinsertf128 m%1, m%1, xm4, 1 >> >+ %else >> >+ pshufd xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0) >> >+ pshuflw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >> >+ pshufhw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >> >+ %endif >> >+ %endif >> >+ 
cvtdq2ps m%1, m%1 >> >+ mulps m%1, m%1, m7 ; pd_65535_invf >> >+%endmacro >> >+ >> >+%macro STORE16 2 >> >+ mulps m%2, m%2, m5 ; [pd_65535f] >> >+ minps m%2, m%2, m5 ; [pd_65535f] >> >+ maxps m%2, m%2, m15 ; zero >> >+ cvttps2dq m%2, m%2 >> >+ %if mmsize > 16 >> >+ vextractf128 xm4, m%2, 1 >> >+ pshufb xm%2, xm6 ; [pb_lo_pack_shuffle16] >> >+ pshufb xm4, xm7 ; [pb_hi_pack_shuffle16] >> >+ por xm%2, xm4 >> >+ %else >> >+ pshuflw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >> >+ pshufhw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >> >+ pshufd xm%2, xm%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0) >> >+ %endif >> >+ mov ptrq, %1 >> >+ %if mmsize > 16 >> >+ movu [ptrq + xq*2], xm%2 >> >+ %else >> >+ movsd [ptrq + xq*2], xm%2 >> >+ %endif >> >+%endmacro >> >+ >> >+; 1 - interp method >> >+; 2 - format_name >> >+; 3 - depth >> >+; 4 - is float format >> >+%macro DEFINE_INTERP_FUNC 4 >> >+cglobal interp_%1_%2, 7, 13, 16, mmsize*16+(8*8), ctx, prelut, >> src_image, dst_image, slice_start, slice_end, has_alpha, width, x, ptr, >> tmp, tmp2, tmp3 >> >+ ; store lut max and lutsize >> >+ mov tmpd, dword [ctxq + LUT3DContext.lutsize] >> >+ cvtsi2ss xm0, tmpd >> >+ mulss xm0, xm0, [pd_3f] >> >+ VBROADCASTSS m0, xm0 >> >+ mova lut3dsizem, m0 >> >+ sub tmpd, 1 >> >+ cvtsi2ss xm0, tmpd >> >+ VBROADCASTSS m0, xm0 >> >+ mova lut3dmaxm, m0 >> >+ >> >+ ; scale_r >> >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 0*4] >> >+ VBROADCASTSS m1, xm1 >> >+ mova scalerm, m1 >> >+ >> >+ ; scale_g >> >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 1*4] >> >+ VBROADCASTSS m1, xm1 >> >+ mova scalegm, m1 >> >+ >> >+ ; scale_b >> >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 2*4] >> >+ VBROADCASTSS m1, xm1 >> >+ mova scalebm, m1 >> >+ >> >+ ; store lutsize2 >> >+ cvtsi2ss xm0, dword [ctxq + LUT3DContext.lutsize2] >> >+ mulss xm0, xm0, [pd_3f] >> >+ VBROADCASTSS m0, xm0 >> >+ mova lut3dsize2m, m0 >> >+ >> >+ ; init prelut values >> >+ cmp prelutq, 0 >> >+ je %%skip_init_prelut >> 
>+ mov tmpd, dword [prelutq + Lut3DPreLut.size] >> >+ sub tmpd, 1 >> >+ cvtsi2ss xm0, tmpd >> >+ VBROADCASTSS m0, xm0 >> >+ mova prelutmaxm, m0 >> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 0*4] >> >+ mova prelutminrm, m0 >> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 1*4] >> >+ mova prelutmingm, m0 >> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 2*4] >> >+ mova prelutminbm, m0 >> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 0*4] >> >+ mova prelutscalerm, m0 >> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 1*4] >> >+ mova prelutscalegm, m0 >> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 2*4] >> >+ mova prelutscalebm, m0 >> >+ %%skip_init_prelut: >> >+ >> >+ mov widthd, [src_imageq + AVFrame.width] >> >+ >> >+ ; gbra pixel order >> >+ INIT_DATA_PTR srcrm, src_imageq, 2 >> >+ INIT_DATA_PTR srcgm, src_imageq, 0 >> >+ INIT_DATA_PTR srcbm, src_imageq, 1 >> >+ INIT_DATA_PTR srcam, src_imageq, 3 >> >+ >> >+ INIT_DATA_PTR dstrm, dst_imageq, 2 >> >+ INIT_DATA_PTR dstgm, dst_imageq, 0 >> >+ INIT_DATA_PTR dstbm, dst_imageq, 1 >> >+ INIT_DATA_PTR dstam, dst_imageq, 3 >> >+ >> >+ %%loop_y: >> >+ xor xq, xq >> >+ %%loop_x: >> >+ movu m14, [pd_1f] >> >+ xorps m15, m15, m15 >> >+ %if %4 ; float >> >+ mov ptrq, srcrm >> >+ movu m0, [ptrq + xq*4] >> >+ mov ptrq, srcgm >> >+ movu m1, [ptrq + xq*4] >> >+ mov ptrq, srcbm >> >+ movu m2, [ptrq + xq*4] >> >+ SANITIZE_F m0 >> >+ SANITIZE_F m1 >> >+ SANITIZE_F m2 >> >+ %else >> >+ ; constants for LOAD16 >> >+ movu m7, [pd_65535_invf] >> >+ %if notcpuflag(avx2) && mmsize >= 32 >> >+ movu xm6, [pb_shuffle16] >> >+ %endif >> >+ LOAD16 0, srcrm >> >+ LOAD16 1, srcgm >> >+ LOAD16 2, srcbm >> >+ %endif >> >+ >> >+ cmp prelutq, 0 >> >+ je %%skip_prelut >> >+ mova m13, prelutmaxm >> >+ APPLY_PRELUT 0, 0, prelutminrm, prelutscalerm >> >+ APPLY_PRELUT 1, 1, prelutmingm, prelutscalegm >> >+ APPLY_PRELUT 2, 2, prelutminbm, prelutscalebm >> >+ %%skip_prelut: >> >+ >> >+ mova m13, 
lut3dmaxm >> >+ APPLY_SCALE 0, scalerm >> >+ APPLY_SCALE 1, scalegm >> >+ APPLY_SCALE 2, scalebm >> >+ >> >+ interp_%1 >> >+ >> >+ %if %4 ; float >> >+ mov ptrq, dstrm >> >+ movu [ptrq + xq*4], m0 >> >+ mov ptrq, dstgm >> >+ movu [ptrq + xq*4], m1 >> >+ mov ptrq, dstbm >> >+ movu [ptrq + xq*4], m2 >> >+ cmp has_alphad, 0 >> >+ je %%skip_alphaf >> >+ mov ptrq, srcam >> >+ movu m0, [ptrq + xq*4] >> >+ mov ptrq, dstam >> >+ movu [ptrq + xq*4], m0 >> >+ %%skip_alphaf: >> >+ %else >> >+ ; constants for STORE16 >> >+ movu m5, [pd_65535f] >> >+ %if mmsize > 16 >> >+ movu xm6, [pb_lo_pack_shuffle16] >> >+ movu xm7, [pb_hi_pack_shuffle16] >> >+ %endif >> >+ >> >+ xorps m15, m15, m15 >> >+ STORE16 dstrm, 0 >> >+ STORE16 dstgm, 1 >> >+ STORE16 dstbm, 2 >> >+ >> >+ cmp has_alphad, 0 >> >+ je %%skip_alpha >> >+ %if mmsize > 16 >> >+ mov ptrq, srcam >> >+ movu xm0, [ptrq + xq*2] >> >+ mov ptrq, dstam >> >+ movu [ptrq + xq*2], xm0 >> >+ %else >> >+ mov ptrq, srcam >> >+ movsd xm0, [ptrq + xq*2] >> >+ mov ptrq, dstam >> >+ movsd [ptrq + xq*2], xm0 >> >+ %endif >> >+ >> >+ %%skip_alpha: >> >+ %endif >> >+ >> >+ add xq, mmsize/4 >> >+ cmp xd, widthd >> >+ jl %%loop_x >> >+ >> >+ INC_DATA_PTR srcrm, src_imageq, 2 >> >+ INC_DATA_PTR srcgm, src_imageq, 0 >> >+ INC_DATA_PTR srcbm, src_imageq, 1 >> >+ INC_DATA_PTR srcam, src_imageq, 3 >> >+ >> >+ INC_DATA_PTR dstrm, dst_imageq, 2 >> >+ INC_DATA_PTR dstgm, dst_imageq, 0 >> >+ INC_DATA_PTR dstbm, dst_imageq, 1 >> >+ INC_DATA_PTR dstam, dst_imageq, 3 >> >+ >> >+ inc slice_startd >> >+ cmp slice_startd, slice_endd >> >+ jl %%loop_y >> >+ >> >+ RET >> >+%endmacro >> >+%if ARCH_X86_64 >> >+ %if HAVE_AVX2_EXTERNAL >> >+ INIT_YMM avx2 >> >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 >> >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 >> >+ %endif >> >+ %if HAVE_AVX_EXTERNAL >> >+ INIT_YMM avx >> >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 >> >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 >> >+ %endif >> >+ INIT_XMM sse2 >> >+ DEFINE_INTERP_FUNC 
tetrahedral, pf32, 32, 1 >> >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 >> >+%endif >> >\ No newline at end of file >> >diff --git a/libavfilter/x86/vf_lut3d_init.c >> b/libavfilter/x86/vf_lut3d_init.c >> >new file mode 100644 >> >index 0000000000..9b9b36e4af >> >--- /dev/null >> >+++ b/libavfilter/x86/vf_lut3d_init.c >> >@@ -0,0 +1,88 @@ >> >+/* >> >+ * Copyright (c) 2021 Mark Reid <mindm...@gmail.com> >> >+ * >> >+ * This file is part of FFmpeg. >> >+ * >> >+ * FFmpeg is free software; you can redistribute it and/or >> >+ * modify it under the terms of the GNU Lesser General Public >> >+ * License as published by the Free Software Foundation; either >> >+ * version 2.1 of the License, or (at your option) any later version. >> >+ * >> >+ * FFmpeg is distributed in the hope that it will be useful, >> >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of >> >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >+ * Lesser General Public License for more details. >> >+ * >> >+ * You should have received a copy of the GNU Lesser General Public >> >+ * License along with FFmpeg; if not, write to the Free Software >> >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA >> 02110-1301 USA >> >+ */ >> >+ >> >+#include "libavutil/attributes.h" >> >+#include "libavutil/cpu.h" >> >+#include "libavutil/x86/cpu.h" >> >+#include "libavfilter/lut3d.h" >> >+ >> >+#define DEFINE_INTERP_FUNC(name, format, opt) >> >> \ >> >+void ff_interp_##name##_##format##_##opt(LUT3DContext *lut3d, >> Lut3DPreLut *prelut, AVFrame *src, AVFrame *dst, int slice_start, int >> slice_end, int has_alpha); \ >> >+static int interp_##name##_##format##_##opt(AVFilterContext *ctx, void >> *arg, int jobnr, int nb_jobs) >> \ >> >+{ >> >> \ >> >+ LUT3DContext *lut3d = ctx->priv; >> >> \ >> >+ Lut3DPreLut *prelut = lut3d->prelut.size > 0? 
&lut3d->prelut: >> NULL; >> \ >> >+ ThreadData *td = arg; >> >> \ >> >+ AVFrame *in = td->in; >> >> \ >> >+ AVFrame *out = td->out; >> >> \ >> >+ int has_alpha = in->linesize[3] && out != in; >> >> \ >> >+ int slice_start = (in->height * jobnr ) / nb_jobs; >> >> \ >> >+ int slice_end = (in->height * (jobnr+1)) / nb_jobs; >> >> \ >> >+ ff_interp_##name##_##format##_##opt(lut3d, prelut, in, out, >> slice_start, slice_end, has_alpha); >> \ >> >+ return 0; >> >> \ >> >+} >> >+ >> >+#if ARCH_X86_64 >> >+#if HAVE_AVX2_EXTERNAL >> >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, avx2) >> >+ DEFINE_INTERP_FUNC(tetrahedral, p16, avx2) >> >+#endif >> >+#if HAVE_AVX_EXTERNAL >> >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, avx) >> >+ DEFINE_INTERP_FUNC(tetrahedral, p16, avx) >> >+#endif >> >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, sse2) >> >+ DEFINE_INTERP_FUNC(tetrahedral, p16, sse2) >> >+#endif >> >+ >> >+ >> >+av_cold void ff_lut3d_init_x86(LUT3DContext *s, const >> AVPixFmtDescriptor *desc) >> >+{ >> >+ int cpu_flags = av_get_cpu_flags(); >> >+ int planar = desc->flags & AV_PIX_FMT_FLAG_PLANAR; >> >+ int isfloat = desc->flags & AV_PIX_FMT_FLAG_FLOAT; >> >+ int depth = desc->comp[0].depth; >> >+ >> >+#if ARCH_X86_64 >> >+ if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interpolation == >> INTERPOLATE_TETRAHEDRAL && planar) { >> >+#if HAVE_AVX2_EXTERNAL >> >+ if (isfloat && planar) { >> >+ s->interp = interp_tetrahedral_pf32_avx2; >> >+ } else if (depth == 16) { >> >+ s->interp = interp_tetrahedral_p16_avx2; >> >+ } >> >+#endif >> >+ } else if (EXTERNAL_AVX_FAST(cpu_flags) && s->interpolation == >> INTERPOLATE_TETRAHEDRAL && planar) { >> >+#if HAVE_AVX_EXTERNAL >> >+ if (isfloat) { >> >+ s->interp = interp_tetrahedral_pf32_avx; >> >+ } else if (depth == 16) { >> >+ s->interp = interp_tetrahedral_p16_avx; >> >+ } >> >+#endif >> >+ } else if (EXTERNAL_SSE2(cpu_flags) && s->interpolation == >> INTERPOLATE_TETRAHEDRAL && planar) { >> >+ if (isfloat) { >> >+ s->interp = interp_tetrahedral_pf32_sse2; 
>> >+ } else if (depth == 16) { >> >+ s->interp = interp_tetrahedral_p16_sse2; >> >+ } >> >+ } >> >+#endif >> >+} >> >-- >> >2.31.1.windows.1 >> > >> >_______________________________________________ >> >ffmpeg-devel mailing list >> >ffmpeg-devel@ffmpeg.org >> >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> > >> >To unsubscribe, visit link above, or email >> >ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". >> _______________________________________________ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> >> To unsubscribe, visit link above, or email >> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". >> > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".