From: Niklas Haas <g...@haasn.dev>

This covers most 8-bit and 16-bit ops, some 32-bit ops, and all
floating-point operations. While this is not yet 100% coverage, it is
good enough for the vast majority of formats out there.
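For context, the packed shuffle fast path mentioned below reduces a whole
packed-format conversion to one byte permute per register. A minimal
standalone sketch of the same idea using SSE intrinsics follows; it is not
part of this patch, and the helper name and constants are purely
illustrative (the real code derives the mask via ff_sws_solve_shuffle() and
scales it up to AVX2/AVX-512 register widths):

    /* RGB24 -> RGBX32 for 4 pixels per 16-byte block (build with -mssse3). */
    #include <stdint.h>
    #include <stdio.h>
    #include <tmmintrin.h> /* SSSE3: _mm_shuffle_epi8 (pshufb) */

    static void rgb24_to_rgbx32_4px(const uint8_t *src, uint8_t *dst)
    {
        /* 0x80 (-128) in the mask zeroes the destination byte, per pshufb
         * semantics; this matches the 0x80 clear value passed to the
         * shuffle solver. */
        const __m128i mask = _mm_setr_epi8( 0,  1,  2, -128,
                                            3,  4,  5, -128,
                                            6,  7,  8, -128,
                                            9, 10, 11, -128);
        __m128i in = _mm_loadu_si128((const __m128i *) src); /* loads 16 bytes, uses 12 */
        _mm_storeu_si128((__m128i *) dst, _mm_shuffle_epi8(in, mask));
    }

    int main(void)
    {
        const uint8_t rgb[16] = { 1,2,3, 4,5,6, 7,8,9, 10,11,12 };
        uint8_t out[16];
        rgb24_to_rgbx32_4px(rgb, out);
        for (int i = 0; i < 16; i++)
            printf("%d ", out[i]);
        printf("\n"); /* 1 2 3 0 4 5 6 0 7 8 9 0 10 11 12 0 */
        return 0;
    }

The 12-byte-in/16-byte-out block above corresponds to the 12 -> 16 case
assigned in solve_shuffle() below; the deliberate over-read of the load is
why the compiled op reports a nonzero over_read.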
Of special note is the packed shuffle fast path, which uses pshufb at vector sizes up to AVX512. --- libswscale/ops.c | 4 + libswscale/x86/Makefile | 3 + libswscale/x86/ops.c | 706 ++++++++++++++++++++++ libswscale/x86/ops_common.asm | 187 ++++++ libswscale/x86/ops_float.asm | 386 ++++++++++++ libswscale/x86/ops_int.asm | 1050 +++++++++++++++++++++++++++++++++ 6 files changed, 2336 insertions(+) create mode 100644 libswscale/x86/ops.c create mode 100644 libswscale/x86/ops_common.asm create mode 100644 libswscale/x86/ops_float.asm create mode 100644 libswscale/x86/ops_int.asm diff --git a/libswscale/ops.c b/libswscale/ops.c index 6403eff324..8a27e70ef9 100644 --- a/libswscale/ops.c +++ b/libswscale/ops.c @@ -29,9 +29,13 @@ extern SwsOpBackend backend_c; extern SwsOpBackend backend_murder; +extern SwsOpBackend backend_x86; const SwsOpBackend * const ff_sws_op_backends[] = { &backend_murder, +#if ARCH_X86 + &backend_x86, +#endif &backend_c, NULL }; diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index f00154941d..a04bc8336f 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -10,6 +10,9 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o X86ASM-OBJS += x86/input.o \ x86/output.o \ + x86/ops_int.o \ + x86/ops_float.o \ + x86/ops.o \ x86/scale.o \ x86/scale_avx2.o \ x86/range_convert.o \ diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c new file mode 100644 index 0000000000..d5fd046d64 --- /dev/null +++ b/libswscale/x86/ops.c @@ -0,0 +1,706 @@ +/** + * Copyright (C) 2025 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <float.h> + +#include <libavutil/avassert.h> +#include <libavutil/mem.h> + +#include "../ops_chain.h" + +#define DECL_ENTRY(TYPE, NAME, ...) \ + static const SwsOpEntry op_##NAME = { \ + .op.type = SWS_PIXEL_##TYPE, \ + __VA_ARGS__ \ + } + +#define DECL_ASM(TYPE, NAME, ...) \ + void ff_##NAME(void); \ + DECL_ENTRY(TYPE, NAME, \ + .func = ff_##NAME, \ + __VA_ARGS__) + +#define DECL_PATTERN(TYPE, NAME, X, Y, Z, W, ...) \ + DECL_ASM(TYPE, p##X##Y##Z##W##_##NAME, \ + .op.comps.unused = { !X, !Y, !Z, !W }, \ + __VA_ARGS__ \ + ) + +#define REF_PATTERN(NAME, X, Y, Z, W) \ + &op_p##X##Y##Z##W##_##NAME + +#define DECL_COMMON_PATTERNS(TYPE, NAME, ...) 
\ + DECL_PATTERN(TYPE, NAME, 1, 0, 0, 0, __VA_ARGS__); \ + DECL_PATTERN(TYPE, NAME, 1, 0, 0, 1, __VA_ARGS__); \ + DECL_PATTERN(TYPE, NAME, 1, 1, 1, 0, __VA_ARGS__); \ + DECL_PATTERN(TYPE, NAME, 1, 1, 1, 1, __VA_ARGS__) \ + +#define REF_COMMON_PATTERNS(NAME) \ + REF_PATTERN(NAME, 1, 0, 0, 0), \ + REF_PATTERN(NAME, 1, 0, 0, 1), \ + REF_PATTERN(NAME, 1, 1, 1, 0), \ + REF_PATTERN(NAME, 1, 1, 1, 1) + +#define DECL_RW(EXT, TYPE, NAME, OP, ELEMS, PACKED, FRAC) \ + DECL_ASM(TYPE, NAME##ELEMS##EXT, \ + .op.op = SWS_OP_##OP, \ + .op.rw = { .elems = ELEMS, .packed = PACKED, .frac = FRAC }, \ + ); + +#define DECL_PACKED_RW(EXT, DEPTH) \ + DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 2, true, 0) \ + DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 3, true, 0) \ + DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 4, true, 0) \ + DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 2, true, 0) \ + DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 3, true, 0) \ + DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 4, true, 0) \ + +#define DECL_PACK_UNPACK(EXT, TYPE, X, Y, Z, W) \ + DECL_ASM(TYPE, pack_##X##Y##Z##W##EXT, \ + .op.op = SWS_OP_PACK, \ + .op.pack.pattern = {X, Y, Z, W}, \ + ); \ + \ + DECL_ASM(TYPE, unpack_##X##Y##Z##W##EXT, \ + .op.op = SWS_OP_UNPACK, \ + .op.pack.pattern = {X, Y, Z, W}, \ + ); \ + +static int setup_swap_bytes(const SwsOp *op, SwsOpPriv *out) +{ + const int mask = ff_sws_pixel_type_size(op->type) - 1; + for (int i = 0; i < 16; i++) + out->u8[i] = (i & ~mask) | (mask - (i & mask)); + return 0; +} + +#define DECL_SWAP_BYTES(EXT, TYPE, X, Y, Z, W) \ + DECL_PATTERN(TYPE, swap_bytes_##TYPE##EXT, X, Y, Z, W, \ + .func = ff_p##X##Y##Z##W##_shuffle##EXT, \ + .op.op = SWS_OP_SWAP_BYTES, \ + .setup = setup_swap_bytes, \ + ); + +#define DECL_CLEAR_ALPHA(EXT, IDX) \ + DECL_ASM(U8, clear_alpha##IDX##EXT, \ + .op.op = SWS_OP_CLEAR, \ + .op.c.q4[IDX] = { .num = -1, .den = 1 }, \ + .op.comps.unused[IDX] = true, \ + ); \ + +#define DECL_CLEAR_ZERO(EXT, IDX) \ + DECL_ASM(U8, clear_zero##IDX##EXT, \ + .op.op = SWS_OP_CLEAR, \ + .op.c.q4[IDX] = { .num = 0, .den = 1 }, \ + .op.comps.unused[IDX] = true, \ + ); + +static int setup_clear(const SwsOp *op, SwsOpPriv *out) +{ + for (int i = 0; i < 4; i++) + out->u32[i] = (uint32_t) op->c.q4[i].num; + return 0; +} + +#define DECL_CLEAR(EXT, X, Y, Z, W) \ + DECL_PATTERN(U8, clear##EXT, X, Y, Z, W, \ + .op.op = SWS_OP_CLEAR, \ + .setup = setup_clear, \ + .flexible = true, \ + ); + +#define DECL_SWIZZLE(EXT, X, Y, Z, W) \ + DECL_ASM(U8, swizzle_##X##Y##Z##W##EXT, \ + .op.op = SWS_OP_SWIZZLE, \ + .op.swizzle = SWS_SWIZZLE( X, Y, Z, W ), \ + ); + +#define DECL_CONVERT(EXT, FROM, TO) \ + DECL_COMMON_PATTERNS(FROM, convert_##FROM##_##TO##EXT, \ + .op.op = SWS_OP_CONVERT, \ + .op.convert.to = SWS_PIXEL_##TO, \ + ); + +#define DECL_EXPAND(EXT, FROM, TO) \ + DECL_COMMON_PATTERNS(FROM, expand_##FROM##_##TO##EXT, \ + .op.op = SWS_OP_CONVERT, \ + .op.convert.to = SWS_PIXEL_##TO, \ + .op.convert.expand = true, \ + ); + +static int setup_shift(const SwsOp *op, SwsOpPriv *out) +{ + out->u16[0] = op->c.u; + return 0; +} + +#define DECL_SHIFT16(EXT) \ + DECL_COMMON_PATTERNS(U16, lshift16##EXT, \ + .op.op = SWS_OP_LSHIFT, \ + .setup = setup_shift, \ + .flexible = true, \ + ); \ + \ + DECL_COMMON_PATTERNS(U16, rshift16##EXT, \ + .op.op = SWS_OP_RSHIFT, \ + .setup = setup_shift, \ + .flexible = true, \ + ); + +#define DECL_MIN_MAX(EXT) \ + DECL_COMMON_PATTERNS(F32, min##EXT, \ + .op.op = SWS_OP_MIN, \ + .setup = ff_sws_setup_q4, \ + .flexible = true, \ + ); \ + 
\ + DECL_COMMON_PATTERNS(F32, max##EXT, \ + .op.op = SWS_OP_MAX, \ + .setup = ff_sws_setup_q4, \ + .flexible = true, \ + ); + +#define DECL_SCALE(EXT) \ + DECL_COMMON_PATTERNS(F32, scale##EXT, \ + .op.op = SWS_OP_SCALE, \ + .setup = ff_sws_setup_q, \ + ); + +/* 2x2 matrix fits inside SwsOpPriv directly, save an indirect in this case */ +static_assert(sizeof(SwsOpPriv) >= sizeof(float[2][2]), "2x2 dither matrix too large"); +static int setup_dither(const SwsOp *op, SwsOpPriv *out) +{ + const int size = 1 << op->dither.size_log2; + float *matrix = out->f32; + if (size > 2) { + matrix = out->ptr = av_mallocz(size * size * sizeof(*matrix)); + if (!matrix) + return AVERROR(ENOMEM); + } + + for (int i = 0; i < size * size; i++) + matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den; + + return 0; +} + +#define DECL_DITHER(EXT, SIZE) \ + DECL_COMMON_PATTERNS(F32, dither##SIZE##EXT, \ + .op.op = SWS_OP_DITHER, \ + .op.dither.size_log2 = SIZE, \ + .setup = setup_dither, \ + .free = SIZE > 2 ? av_free : NULL, \ + ); + +static int setup_linear(const SwsOp *op, SwsOpPriv *out) +{ + float *matrix = out->ptr = av_mallocz(sizeof(float[4][5])); + if (!matrix) + return AVERROR(ENOMEM); + + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 5; x++) + matrix[y * 5 + x] = (float) op->lin.m[y][x].num / op->lin.m[y][x].den; + } + + return 0; +} + +#define DECL_LINEAR(EXT, NAME, MASK) \ + DECL_ASM(F32, NAME##EXT, \ + .op.op = SWS_OP_LINEAR, \ + .op.lin.mask = (MASK), \ + .setup = setup_linear, \ + .free = av_free, \ + ); + +#define DECL_FUNCS_8(SIZE, EXT, FLAG) \ + DECL_RW(EXT, U8, read_planar, READ, 1, false, 0) \ + DECL_RW(EXT, U8, read_planar, READ, 2, false, 0) \ + DECL_RW(EXT, U8, read_planar, READ, 3, false, 0) \ + DECL_RW(EXT, U8, read_planar, READ, 4, false, 0) \ + DECL_RW(EXT, U8, write_planar, WRITE, 1, false, 0) \ + DECL_RW(EXT, U8, write_planar, WRITE, 2, false, 0) \ + DECL_RW(EXT, U8, write_planar, WRITE, 3, false, 0) \ + DECL_RW(EXT, U8, write_planar, WRITE, 4, false, 0) \ + DECL_RW(EXT, U8, read_nibbles, READ, 1, false, 1) \ + DECL_RW(EXT, U8, read_bits, READ, 1, false, 3) \ + DECL_RW(EXT, U8, write_bits, WRITE, 1, false, 3) \ + DECL_PACKED_RW(EXT, 8) \ + DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0) \ + DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0) \ + DECL_PACK_UNPACK(EXT, U8, 2, 3, 3, 0) \ + void ff_p1000_shuffle##EXT(void); \ + void ff_p1001_shuffle##EXT(void); \ + void ff_p1110_shuffle##EXT(void); \ + void ff_p1111_shuffle##EXT(void); \ + DECL_SWIZZLE(EXT, 3, 0, 1, 2) \ + DECL_SWIZZLE(EXT, 3, 0, 2, 1) \ + DECL_SWIZZLE(EXT, 2, 1, 0, 3) \ + DECL_SWIZZLE(EXT, 3, 2, 1, 0) \ + DECL_SWIZZLE(EXT, 3, 1, 0, 2) \ + DECL_SWIZZLE(EXT, 3, 2, 0, 1) \ + DECL_SWIZZLE(EXT, 1, 2, 0, 3) \ + DECL_SWIZZLE(EXT, 1, 0, 2, 3) \ + DECL_SWIZZLE(EXT, 2, 0, 1, 3) \ + DECL_SWIZZLE(EXT, 2, 3, 1, 0) \ + DECL_SWIZZLE(EXT, 2, 1, 3, 0) \ + DECL_SWIZZLE(EXT, 1, 2, 3, 0) \ + DECL_SWIZZLE(EXT, 1, 3, 2, 0) \ + DECL_SWIZZLE(EXT, 0, 2, 1, 3) \ + DECL_SWIZZLE(EXT, 0, 2, 3, 1) \ + DECL_SWIZZLE(EXT, 0, 3, 1, 2) \ + DECL_SWIZZLE(EXT, 3, 1, 2, 0) \ + DECL_SWIZZLE(EXT, 0, 3, 2, 1) \ + DECL_SWIZZLE(EXT, 0, 0, 0, 3) \ + DECL_SWIZZLE(EXT, 3, 0, 0, 0) \ + DECL_SWIZZLE(EXT, 0, 0, 0, 1) \ + DECL_SWIZZLE(EXT, 1, 0, 0, 0) \ + DECL_CLEAR_ALPHA(EXT, 0) \ + DECL_CLEAR_ALPHA(EXT, 1) \ + DECL_CLEAR_ALPHA(EXT, 3) \ + DECL_CLEAR_ZERO(EXT, 0) \ + DECL_CLEAR_ZERO(EXT, 1) \ + DECL_CLEAR_ZERO(EXT, 3) \ + DECL_CLEAR(EXT, 1, 1, 1, 0) \ + DECL_CLEAR(EXT, 0, 1, 1, 1) \ + DECL_CLEAR(EXT, 0, 0, 1, 1) \ + DECL_CLEAR(EXT, 1, 0, 0, 1) \ + DECL_CLEAR(EXT, 1, 1, 
0, 0) \ + DECL_CLEAR(EXT, 0, 1, 0, 1) \ + DECL_CLEAR(EXT, 1, 0, 1, 0) \ + DECL_CLEAR(EXT, 1, 0, 0, 0) \ + DECL_CLEAR(EXT, 0, 1, 0, 0) \ + DECL_CLEAR(EXT, 0, 0, 1, 0) \ + \ +static const SwsOpTable ops8##EXT = { \ + .cpu_flags = AV_CPU_FLAG_##FLAG, \ + .block_size = SIZE, \ + .entries = { \ + &op_read_planar1##EXT, \ + &op_read_planar2##EXT, \ + &op_read_planar3##EXT, \ + &op_read_planar4##EXT, \ + &op_write_planar1##EXT, \ + &op_write_planar2##EXT, \ + &op_write_planar3##EXT, \ + &op_write_planar4##EXT, \ + &op_read8_packed2##EXT, \ + &op_read8_packed3##EXT, \ + &op_read8_packed4##EXT, \ + &op_write8_packed2##EXT, \ + &op_write8_packed3##EXT, \ + &op_write8_packed4##EXT, \ + &op_read_nibbles1##EXT, \ + &op_read_bits1##EXT, \ + &op_write_bits1##EXT, \ + &op_pack_1210##EXT, \ + &op_pack_3320##EXT, \ + &op_pack_2330##EXT, \ + &op_unpack_1210##EXT, \ + &op_unpack_3320##EXT, \ + &op_unpack_2330##EXT, \ + &op_swizzle_3012##EXT, \ + &op_swizzle_3021##EXT, \ + &op_swizzle_2103##EXT, \ + &op_swizzle_3210##EXT, \ + &op_swizzle_3102##EXT, \ + &op_swizzle_3201##EXT, \ + &op_swizzle_1203##EXT, \ + &op_swizzle_1023##EXT, \ + &op_swizzle_2013##EXT, \ + &op_swizzle_2310##EXT, \ + &op_swizzle_2130##EXT, \ + &op_swizzle_1230##EXT, \ + &op_swizzle_1320##EXT, \ + &op_swizzle_0213##EXT, \ + &op_swizzle_0231##EXT, \ + &op_swizzle_0312##EXT, \ + &op_swizzle_3120##EXT, \ + &op_swizzle_0321##EXT, \ + &op_swizzle_0003##EXT, \ + &op_swizzle_0001##EXT, \ + &op_swizzle_3000##EXT, \ + &op_swizzle_1000##EXT, \ + &op_clear_alpha0##EXT, \ + &op_clear_alpha1##EXT, \ + &op_clear_alpha3##EXT, \ + &op_clear_zero0##EXT, \ + &op_clear_zero1##EXT, \ + &op_clear_zero3##EXT, \ + REF_PATTERN(clear##EXT, 1, 1, 1, 0), \ + REF_PATTERN(clear##EXT, 0, 1, 1, 1), \ + REF_PATTERN(clear##EXT, 0, 0, 1, 1), \ + REF_PATTERN(clear##EXT, 1, 0, 0, 1), \ + REF_PATTERN(clear##EXT, 1, 1, 0, 0), \ + REF_PATTERN(clear##EXT, 0, 1, 0, 1), \ + REF_PATTERN(clear##EXT, 1, 0, 1, 0), \ + REF_PATTERN(clear##EXT, 1, 0, 0, 0), \ + REF_PATTERN(clear##EXT, 0, 1, 0, 0), \ + REF_PATTERN(clear##EXT, 0, 0, 1, 0), \ + NULL \ + }, \ +}; + +#define DECL_FUNCS_16(SIZE, EXT, FLAG) \ + DECL_PACKED_RW(EXT, 16) \ + DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0) \ + DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0) \ + DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0) \ + DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 0) \ + DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 1) \ + DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 0) \ + DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 1) \ + DECL_SHIFT16(EXT) \ + DECL_CONVERT(EXT, U8, U16) \ + DECL_CONVERT(EXT, U16, U8) \ + DECL_EXPAND(EXT, U8, U16) \ + \ +static const SwsOpTable ops16##EXT = { \ + .cpu_flags = AV_CPU_FLAG_##FLAG, \ + .block_size = SIZE, \ + .entries = { \ + &op_read16_packed2##EXT, \ + &op_read16_packed3##EXT, \ + &op_read16_packed4##EXT, \ + &op_write16_packed2##EXT, \ + &op_write16_packed3##EXT, \ + &op_write16_packed4##EXT, \ + &op_pack_4440##EXT, \ + &op_pack_5550##EXT, \ + &op_pack_5650##EXT, \ + &op_unpack_4440##EXT, \ + &op_unpack_5550##EXT, \ + &op_unpack_5650##EXT, \ + REF_COMMON_PATTERNS(swap_bytes_U16##EXT), \ + REF_COMMON_PATTERNS(convert_U8_U16##EXT), \ + REF_COMMON_PATTERNS(convert_U16_U8##EXT), \ + REF_COMMON_PATTERNS(expand_U8_U16##EXT), \ + REF_COMMON_PATTERNS(lshift16##EXT), \ + REF_COMMON_PATTERNS(rshift16##EXT), \ + NULL \ + }, \ +}; + +#define DECL_FUNCS_32(SIZE, EXT, FLAG) \ + DECL_PACKED_RW(_m2##EXT, 32) \ + DECL_PACK_UNPACK(_m2##EXT, U32, 10, 10, 10, 2) \ + DECL_PACK_UNPACK(_m2##EXT, U32, 2, 10, 10, 10) \ + DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 0) \ + 
DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 1) \ + DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 0) \ + DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 1) \ + DECL_CONVERT(EXT, U8, U32) \ + DECL_CONVERT(EXT, U32, U8) \ + DECL_CONVERT(EXT, U16, U32) \ + DECL_CONVERT(EXT, U32, U16) \ + DECL_CONVERT(EXT, U8, F32) \ + DECL_CONVERT(EXT, F32, U8) \ + DECL_CONVERT(EXT, U16, F32) \ + DECL_CONVERT(EXT, F32, U16) \ + DECL_EXPAND(EXT, U8, U32) \ + DECL_MIN_MAX(EXT) \ + DECL_SCALE(EXT) \ + DECL_DITHER(EXT, 0) \ + DECL_DITHER(EXT, 1) \ + DECL_DITHER(EXT, 2) \ + DECL_DITHER(EXT, 3) \ + DECL_DITHER(EXT, 4) \ + DECL_DITHER(EXT, 5) \ + DECL_DITHER(EXT, 6) \ + DECL_DITHER(EXT, 7) \ + DECL_DITHER(EXT, 8) \ + DECL_LINEAR(EXT, luma, SWS_MASK_LUMA) \ + DECL_LINEAR(EXT, alpha, SWS_MASK_ALPHA) \ + DECL_LINEAR(EXT, lumalpha, SWS_MASK_LUMA | SWS_MASK_ALPHA) \ + DECL_LINEAR(EXT, dot3, 0b111) \ + DECL_LINEAR(EXT, row0, SWS_MASK_ROW(0)) \ + DECL_LINEAR(EXT, row0a, SWS_MASK_ROW(0) | SWS_MASK_ALPHA) \ + DECL_LINEAR(EXT, diag3, SWS_MASK_DIAG3) \ + DECL_LINEAR(EXT, diag4, SWS_MASK_DIAG4) \ + DECL_LINEAR(EXT, diagoff3, SWS_MASK_DIAG3 | SWS_MASK_OFF3) \ + DECL_LINEAR(EXT, matrix3, SWS_MASK_MAT3) \ + DECL_LINEAR(EXT, affine3, SWS_MASK_MAT3 | SWS_MASK_OFF3) \ + DECL_LINEAR(EXT, affine3a, SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \ + DECL_LINEAR(EXT, matrix4, SWS_MASK_MAT4) \ + DECL_LINEAR(EXT, affine4, SWS_MASK_MAT4 | SWS_MASK_OFF4) \ + \ +static const SwsOpTable ops32##EXT = { \ + .cpu_flags = AV_CPU_FLAG_##FLAG, \ + .block_size = SIZE, \ + .entries = { \ + &op_read32_packed2_m2##EXT, \ + &op_read32_packed3_m2##EXT, \ + &op_read32_packed4_m2##EXT, \ + &op_write32_packed2_m2##EXT, \ + &op_write32_packed3_m2##EXT, \ + &op_write32_packed4_m2##EXT, \ + &op_pack_1010102_m2##EXT, \ + &op_pack_2101010_m2##EXT, \ + &op_unpack_1010102_m2##EXT, \ + &op_unpack_2101010_m2##EXT, \ + REF_COMMON_PATTERNS(swap_bytes_U32_m2##EXT), \ + REF_COMMON_PATTERNS(convert_U8_U32##EXT), \ + REF_COMMON_PATTERNS(convert_U32_U8##EXT), \ + REF_COMMON_PATTERNS(convert_U16_U32##EXT), \ + REF_COMMON_PATTERNS(convert_U32_U16##EXT), \ + REF_COMMON_PATTERNS(convert_U8_F32##EXT), \ + REF_COMMON_PATTERNS(convert_F32_U8##EXT), \ + REF_COMMON_PATTERNS(convert_U16_F32##EXT), \ + REF_COMMON_PATTERNS(convert_F32_U16##EXT), \ + REF_COMMON_PATTERNS(expand_U8_U32##EXT), \ + REF_COMMON_PATTERNS(min##EXT), \ + REF_COMMON_PATTERNS(max##EXT), \ + REF_COMMON_PATTERNS(scale##EXT), \ + REF_COMMON_PATTERNS(dither0##EXT), \ + REF_COMMON_PATTERNS(dither1##EXT), \ + REF_COMMON_PATTERNS(dither2##EXT), \ + REF_COMMON_PATTERNS(dither3##EXT), \ + REF_COMMON_PATTERNS(dither4##EXT), \ + REF_COMMON_PATTERNS(dither5##EXT), \ + REF_COMMON_PATTERNS(dither6##EXT), \ + REF_COMMON_PATTERNS(dither7##EXT), \ + REF_COMMON_PATTERNS(dither8##EXT), \ + &op_luma##EXT, \ + &op_alpha##EXT, \ + &op_lumalpha##EXT, \ + &op_dot3##EXT, \ + &op_row0##EXT, \ + &op_row0a##EXT, \ + &op_diag3##EXT, \ + &op_diag4##EXT, \ + &op_diagoff3##EXT, \ + &op_matrix3##EXT, \ + &op_affine3##EXT, \ + &op_affine3a##EXT, \ + &op_matrix4##EXT, \ + &op_affine4##EXT, \ + NULL \ + }, \ +}; + +DECL_FUNCS_8(16, _m1_sse4, SSE4) +DECL_FUNCS_8(32, _m1_avx2, AVX2) +DECL_FUNCS_8(32, _m2_sse4, SSE4) +DECL_FUNCS_8(64, _m2_avx2, AVX2) + +DECL_FUNCS_16(16, _m1_avx2, AVX2) +DECL_FUNCS_16(32, _m2_avx2, AVX2) + +DECL_FUNCS_32(16, _avx2, AVX2) + +static av_const int get_mmsize(const int cpu_flags) +{ + if (cpu_flags & AV_CPU_FLAG_AVX512) + return 64; + else if (cpu_flags & AV_CPU_FLAG_AVX2) + return 32; + else if (cpu_flags & AV_CPU_FLAG_SSE4) + return 16; + 
else + return AVERROR(ENOTSUP); +} + +/** + * Returns true if the operation's implementation only depends on the block + * size, and not the underlying pixel type + */ +static bool op_is_type_invariant(const SwsOp *op) +{ + switch (op->op) { + case SWS_OP_READ: + case SWS_OP_WRITE: + return !op->rw.packed && !op->rw.frac; + case SWS_OP_SWIZZLE: + case SWS_OP_CLEAR: + return true; + } + + return false; +} + +static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out) +{ + uint8_t shuffle[16]; + int read_bytes, write_bytes; + int pixels; + + pixels = ff_sws_solve_shuffle(ops, shuffle, 16, 0x80, &read_bytes, &write_bytes); + if (pixels < 0) + return pixels; + + if (read_bytes < 16 || write_bytes < 16) + mmsize = 16; /* avoid cross-lane shuffle */ + + const int num_lanes = mmsize / 16; + const int in_total = num_lanes * read_bytes; + const int out_total = num_lanes * write_bytes; + const int read_size = in_total <= 4 ? 4 : in_total <= 8 ? 8 : mmsize; + *out = (SwsCompiledOp) { + .priv = av_memdup(shuffle, sizeof(shuffle)), + .free = av_free, + .block_size = pixels * num_lanes, + .over_read = read_size - in_total, + .over_write = mmsize - out_total, + .cpu_flags = mmsize > 32 ? AV_CPU_FLAG_AVX512 : + mmsize > 16 ? AV_CPU_FLAG_AVX2 : + AV_CPU_FLAG_SSE4, + }; + + if (!out->priv) + return AVERROR(ENOMEM); + +#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT) \ +do { \ + SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT); \ + if (in_total == IN && out_total == OUT) \ + out->func = ff_packed_shuffle##IN##_##OUT##_##EXT; \ +} while (0) + + ASSIGN_SHUFFLE_FUNC( 5, 15, sse4); + ASSIGN_SHUFFLE_FUNC( 4, 16, sse4); + ASSIGN_SHUFFLE_FUNC( 2, 12, sse4); + ASSIGN_SHUFFLE_FUNC(10, 15, sse4); + ASSIGN_SHUFFLE_FUNC( 8, 16, sse4); + ASSIGN_SHUFFLE_FUNC( 4, 12, sse4); + ASSIGN_SHUFFLE_FUNC(15, 15, sse4); + ASSIGN_SHUFFLE_FUNC(12, 16, sse4); + ASSIGN_SHUFFLE_FUNC( 6, 12, sse4); + ASSIGN_SHUFFLE_FUNC(16, 12, sse4); + ASSIGN_SHUFFLE_FUNC(16, 16, sse4); + ASSIGN_SHUFFLE_FUNC( 8, 12, sse4); + ASSIGN_SHUFFLE_FUNC(12, 12, sse4); + ASSIGN_SHUFFLE_FUNC(32, 32, avx2); + ASSIGN_SHUFFLE_FUNC(64, 64, avx512); + av_assert1(out->func); + return 0; +} + +/* Normalize clear values into 32-bit integer constants */ +static void normalize_clear(SwsOp *op) +{ + static_assert(sizeof(uint32_t) == sizeof(int), "int size mismatch"); + SwsOpPriv priv; + union { + uint32_t u32; + int i; + } c; + + ff_sws_setup_q4(op, &priv); + for (int i = 0; i < 4; i++) { + if (!op->c.q4[i].den) + continue; + switch (ff_sws_pixel_type_size(op->type)) { + case 1: c.u32 = 0x1010101 * priv.u8[i]; break; + case 2: c.u32 = priv.u16[i] << 16 | priv.u16[i]; break; + case 4: c.u32 = priv.u32[i]; break; + } + + op->c.q4[i].num = c.i; + op->c.q4[i].den = 1; + } +} + +static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out) +{ + const int cpu_flags = av_get_cpu_flags(); + const int mmsize = get_mmsize(cpu_flags); + if (mmsize < 0) + return mmsize; + + av_assert1(ops->num_ops > 0); + const SwsOp read = ops->ops[0]; + const SwsOp write = ops->ops[ops->num_ops - 1]; + int ret; + + /* Special fast path for in-place packed shuffle */ + ret = solve_shuffle(ops, mmsize, out); + if (ret != AVERROR(ENOTSUP)) + return ret; + + SwsOpChain *chain = ff_sws_op_chain_alloc(); + if (!chain) + return AVERROR(ENOMEM); + + *out = (SwsCompiledOp) { + .priv = chain, + .free = (void (*)(void *)) ff_sws_op_chain_free, + + /* Use at most two full YMM regs during the widest precision section */ + .block_size = 2 * FFMIN(mmsize, 32) / ff_sws_op_list_max_size(ops), 
+ }; + + /* 3-component reads/writes process one extra garbage word */ + if (read.rw.packed && read.rw.elems == 3) + out->over_read = sizeof(uint32_t); + if (write.rw.packed && write.rw.elems == 3) + out->over_write = sizeof(uint32_t); + + static const SwsOpTable *const tables[] = { + &ops8_m1_sse4, + &ops8_m1_avx2, + &ops8_m2_sse4, + &ops8_m2_avx2, + &ops16_m1_avx2, + &ops16_m2_avx2, + &ops32_avx2, + }; + + do { + int op_block_size = out->block_size; + SwsOp *op = &ops->ops[0]; + + if (op_is_type_invariant(op)) { + if (op->op == SWS_OP_CLEAR) + normalize_clear(op); + op_block_size *= ff_sws_pixel_type_size(op->type); + op->type = SWS_PIXEL_U8; + } + + ret = ff_sws_op_compile_tables(tables, FF_ARRAY_ELEMS(tables), ops, + op_block_size, chain); + } while (ret == AVERROR(EAGAIN)); + if (ret < 0) { + ff_sws_op_chain_free(chain); + return ret; + } + + SWS_DECL_FUNC(ff_sws_process1_sse4); + SWS_DECL_FUNC(ff_sws_process2_sse4); + SWS_DECL_FUNC(ff_sws_process3_sse4); + SWS_DECL_FUNC(ff_sws_process4_sse4); + + const int read_planes = read.rw.packed ? 1 : read.rw.elems; + const int write_planes = write.rw.packed ? 1 : write.rw.elems; + switch (FFMAX(read_planes, write_planes)) { + case 1: out->func = ff_sws_process1_sse4; break; + case 2: out->func = ff_sws_process2_sse4; break; + case 3: out->func = ff_sws_process3_sse4; break; + case 4: out->func = ff_sws_process4_sse4; break; + } + + out->cpu_flags = chain->cpu_flags; + return ret; +} + +SwsOpBackend backend_x86 = { + .name = "x86", + .compile = compile, +}; diff --git a/libswscale/x86/ops_common.asm b/libswscale/x86/ops_common.asm new file mode 100644 index 0000000000..400bdfd3bf --- /dev/null +++ b/libswscale/x86/ops_common.asm @@ -0,0 +1,187 @@ +;****************************************************************************** +;* Copyright (c) 2025 Niklas Haas +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +struc SwsOpExec + .in0 resq 1 + .in1 resq 1 + .in2 resq 1 + .in3 resq 1 + .out0 resq 1 + .out1 resq 1 + .out2 resq 1 + .out3 resq 1 + .in_stride0 resq 1 + .in_stride1 resq 1 + .in_stride2 resq 1 + .in_stride3 resq 1 + .out_stride0 resq 1 + .out_stride1 resq 1 + .out_stride2 resq 1 + .out_stride3 resq 1 + .width resd 1 + .height resd 1 + .slice_y resd 1 + .slice_h resd 1 + .pixel_bits_in resd 1 + .pixel_bits_out resd 1 +endstruc + +struc SwsOpImpl + .cont resb 16 + .priv resb 16 + .next resb 0 +endstruc + +; common macros for declaring operations +%macro op 1 ; name + %ifdef X + %define ADD_PAT(name) p %+ X %+ Y %+ Z %+ W %+ _ %+ name + %else + %define ADD_PAT(name) name + %endif + + %ifdef V2 + %if V2 + %define ADD_MUL(name) name %+ _m2 + %else + %define ADD_MUL(name) name %+ _m1 + %endif + %else + %define ADD_MUL(name) name + %endif + + cglobal ADD_PAT(ADD_MUL(%1)), 0, 0, 0 ; already allocated by entry point + + %undef ADD_PAT + %undef ADD_MUL +%endmacro + +%macro decl_v2 2+ ; v2, func + %xdefine V2 %1 + %2 + %undef V2 +%endmacro + +%macro decl_pattern 5+ ; X, Y, Z, W, func + %xdefine X %1 + %xdefine Y %2 + %xdefine Z %3 + %xdefine W %4 + %5 + %undef X + %undef Y + %undef Z + %undef W +%endmacro + +%macro decl_common_patterns 1+ ; func + decl_pattern 1, 0, 0, 0, %1 ; y + decl_pattern 1, 0, 0, 1, %1 ; ya + decl_pattern 1, 1, 1, 0, %1 ; yuv + decl_pattern 1, 1, 1, 1, %1 ; yuva +%endmacro + +; common names for the internal calling convention +%define mx m0 +%define my m1 +%define mz m2 +%define mw m3 + +%define xmx xm0 +%define xmy xm1 +%define xmz xm2 +%define xmw xm3 + +%define ymx ym0 +%define ymy ym1 +%define ymz ym2 +%define ymw ym3 + +%define mx2 m4 +%define my2 m5 +%define mz2 m6 +%define mw2 m7 + +%define xmx2 xm4 +%define xmy2 xm5 +%define xmz2 xm6 +%define xmw2 xm7 + +%define ymx2 ym4 +%define ymy2 ym5 +%define ymz2 ym6 +%define ymw2 ym7 + +; from entry point signature +%define execq r0q +%define implq r1q +%define bxd r2d +%define yd r3d +%define bxendd r4d + +; extra registers for free use by kernels, not saved between ops +%define tmp0q r5q +%define tmp1q r6q + +%define tmp0d r5d +%define tmp1d r6d + +; pinned static registers for plane pointers, incremented by read/write ops +%define in0q r7q +%define out0q r8q +%define in1q r9q +%define out1q r10q +%define in2q r11q +%define out2q r12q +%define in3q r13q +%define out3q r14q + +; load the next operation kernel +%macro LOAD_CONT 1 ; reg + mov %1, [implq + SwsOpImpl.cont] +%endmacro + +; tail call into the next operation kernel +%macro CONTINUE 1 ; reg + add implq, SwsOpImpl.next + jmp %1 + annotate_function_size +%endmacro + +%macro CONTINUE 0 + LOAD_CONT tmp0q + CONTINUE tmp0q +%endmacro + +; helper for inline conditionals +%macro IF 2+ ; cond, body + %if %1 + %2 + %endif +%endmacro + +; alternate name for nested usage to work around some NASM bugs +%macro IF1 2+ + %if %1 + %2 + %endif +%endmacro diff --git a/libswscale/x86/ops_float.asm b/libswscale/x86/ops_float.asm new file mode 100644 index 0000000000..9077b266be --- /dev/null +++ b/libswscale/x86/ops_float.asm @@ -0,0 +1,386 @@ +;****************************************************************************** +;* Copyright (c) 2025 
Niklas Haas +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "ops_common.asm" + +SECTION .text + +;--------------------------------------------------------- +; Pixel type conversions + +%macro conv8to32f 0 +op convert_U8_F32 + LOAD_CONT tmp0q +IF X, vpsrldq xmx2, xmx, 8 +IF Y, vpsrldq xmy2, xmy, 8 +IF Z, vpsrldq xmz2, xmz, 8 +IF W, vpsrldq xmw2, xmw, 8 +IF X, pmovzxbd mx, xmx +IF Y, pmovzxbd my, xmy +IF Z, pmovzxbd mz, xmz +IF W, pmovzxbd mw, xmw +IF X, pmovzxbd mx2, xmx2 +IF Y, pmovzxbd my2, xmy2 +IF Z, pmovzxbd mz2, xmz2 +IF W, pmovzxbd mw2, xmw2 +IF X, vcvtdq2ps mx, mx +IF Y, vcvtdq2ps my, my +IF Z, vcvtdq2ps mz, mz +IF W, vcvtdq2ps mw, mw +IF X, vcvtdq2ps mx2, mx2 +IF Y, vcvtdq2ps my2, my2 +IF Z, vcvtdq2ps mz2, mz2 +IF W, vcvtdq2ps mw2, mw2 + CONTINUE tmp0q +%endmacro + +%macro conv16to32f 0 +op convert_U16_F32 + LOAD_CONT tmp0q +IF X, vextracti128 xmx2, mx, 1 +IF Y, vextracti128 xmy2, my, 1 +IF Z, vextracti128 xmz2, mz, 1 +IF W, vextracti128 xmw2, mw, 1 +IF X, pmovzxwd mx, xmx +IF Y, pmovzxwd my, xmy +IF Z, pmovzxwd mz, xmz +IF W, pmovzxwd mw, xmw +IF X, pmovzxwd mx2, xmx2 +IF Y, pmovzxwd my2, xmy2 +IF Z, pmovzxwd mz2, xmz2 +IF W, pmovzxwd mw2, xmw2 +IF X, vcvtdq2ps mx, mx +IF Y, vcvtdq2ps my, my +IF Z, vcvtdq2ps mz, mz +IF W, vcvtdq2ps mw, mw +IF X, vcvtdq2ps mx2, mx2 +IF Y, vcvtdq2ps my2, my2 +IF Z, vcvtdq2ps mz2, mz2 +IF W, vcvtdq2ps mw2, mw2 + CONTINUE tmp0q +%endmacro + +%macro conv32fto8 0 +op convert_F32_U8 + LOAD_CONT tmp0q +IF X, cvttps2dq mx, mx +IF Y, cvttps2dq my, my +IF Z, cvttps2dq mz, mz +IF W, cvttps2dq mw, mw +IF X, cvttps2dq mx2, mx2 +IF Y, cvttps2dq my2, my2 +IF Z, cvttps2dq mz2, mz2 +IF W, cvttps2dq mw2, mw2 +IF X, packusdw mx, mx2 +IF Y, packusdw my, my2 +IF Z, packusdw mz, mz2 +IF W, packusdw mw, mw2 +IF X, vextracti128 xmx2, mx, 1 +IF Y, vextracti128 xmy2, my, 1 +IF Z, vextracti128 xmz2, mz, 1 +IF W, vextracti128 xmw2, mw, 1 +IF X, packuswb xmx, xmx2 +IF Y, packuswb xmy, xmy2 +IF Z, packuswb xmz, xmz2 +IF W, packuswb xmw, xmw2 +IF X, vpshufd xmx, xmx, q3120 +IF Y, vpshufd xmy, xmy, q3120 +IF Z, vpshufd xmz, xmz, q3120 +IF W, vpshufd xmw, xmw, q3120 + CONTINUE tmp0q +%endmacro + +%macro conv32fto16 0 +op convert_F32_U16 + LOAD_CONT tmp0q +IF X, cvttps2dq mx, mx +IF Y, cvttps2dq my, my +IF Z, cvttps2dq mz, mz +IF W, cvttps2dq mw, mw +IF X, cvttps2dq mx2, mx2 +IF Y, cvttps2dq my2, my2 +IF Z, cvttps2dq mz2, mz2 +IF W, cvttps2dq mw2, mw2 +IF X, packusdw mx, mx2 +IF Y, packusdw my, my2 +IF Z, packusdw mz, mz2 +IF W, packusdw mw, mw2 +IF X, vpermq mx, mx, q3120 +IF Y, vpermq my, my, q3120 +IF Z, vpermq mz, mz, q3120 +IF W, vpermq mw, mw, q3120 + CONTINUE tmp0q +%endmacro + +%macro min_max 0 +op min +IF X, vbroadcastss m8, [implq + SwsOpImpl.priv + 0] +IF Y, 
vbroadcastss m9, [implq + SwsOpImpl.priv + 4] +IF Z, vbroadcastss m10, [implq + SwsOpImpl.priv + 8] +IF W, vbroadcastss m11, [implq + SwsOpImpl.priv + 12] + LOAD_CONT tmp0q +IF X, minps mx, mx, m8 +IF Y, minps my, my, m9 +IF Z, minps mz, mz, m10 +IF W, minps mw, mw, m11 +IF X, minps mx2, m8 +IF Y, minps my2, m9 +IF Z, minps mz2, m10 +IF W, minps mw2, m11 + CONTINUE tmp0q + +op max +IF X, vbroadcastss m8, [implq + SwsOpImpl.priv + 0] +IF Y, vbroadcastss m9, [implq + SwsOpImpl.priv + 4] +IF Z, vbroadcastss m10, [implq + SwsOpImpl.priv + 8] +IF W, vbroadcastss m11, [implq + SwsOpImpl.priv + 12] + LOAD_CONT tmp0q +IF X, maxps mx, m8 +IF Y, maxps my, m9 +IF Z, maxps mz, m10 +IF W, maxps mw, m11 +IF X, maxps mx2, m8 +IF Y, maxps my2, m9 +IF Z, maxps mz2, m10 +IF W, maxps mw2, m11 + CONTINUE tmp0q +%endmacro + +%macro scale 0 +op scale + vbroadcastss m8, [implq + SwsOpImpl.priv] + LOAD_CONT tmp0q +IF X, mulps mx, m8 +IF Y, mulps my, m8 +IF Z, mulps mz, m8 +IF W, mulps mw, m8 +IF X, mulps mx2, m8 +IF Y, mulps my2, m8 +IF Z, mulps mz2, m8 +IF W, mulps mw2, m8 + CONTINUE tmp0q +%endmacro + +%macro load_dither_row 5 ; size_log2, y, addr, out, out2 + lea tmp0q, %2 + and tmp0q, (1 << %1) - 1 + shl tmp0q, %1+2 +%if %1 == 2 + VBROADCASTI128 %4, [%3 + tmp0q] +%else + mova %4, [%3 + tmp0q] + %if (4 << %1) > mmsize + mova %5, [%3 + tmp0q + mmsize] + %endif +%endif +%endmacro + +%macro dither 1 ; size_log2 +op dither%1 + %define DX m8 + %define DY m9 + %define DZ m10 + %define DW m11 + %define DX2 DX + %define DY2 DY + %define DZ2 DZ + %define DW2 DW +%if %1 == 0 + ; constant offest for all channels + vbroadcastss DX, [implq + SwsOpImpl.priv] + %define DY DX + %define DZ DX + %define DW DX +%elif %1 == 1 + ; 2x2 matrix, only sign of y matters + mov tmp0d, yd + and tmp0d, 1 + shl tmp0d, 3 + %if X || Z + vbroadcastsd DX, [implq + SwsOpImpl.priv + tmp0q] + %endif + %if Y || W + xor tmp0d, 8 + vbroadcastsd DY, [implq + SwsOpImpl.priv + tmp0q] + %endif + %define DZ DX + %define DW DY +%else + ; matrix is at least 4x4, load all four channels with custom offset + %if (4 << %1) > mmsize + %define DX2 m12 + %define DY2 m13 + %define DZ2 m14 + %define DW2 m15 + %endif + mov tmp1q, [implq + SwsOpImpl.priv] + %if (4 << %1) > 2 * mmsize + ; need to add in x offset + mov tmp0d, bxd + shl tmp0d, 6 ; sizeof(float[16]) + and tmp0d, (4 << %1) - 1 + add tmp1q, tmp0q + %endif +IF X, load_dither_row %1, [yd + 0], tmp1q, DX, DX2 +IF Y, load_dither_row %1, [yd + 3], tmp1q, DY, DY2 +IF Z, load_dither_row %1, [yd + 2], tmp1q, DZ, DZ2 +IF W, load_dither_row %1, [yd + 5], tmp1q, DW, DW2 +%endif + LOAD_CONT tmp0q +IF X, addps mx, DX +IF Y, addps my, DY +IF Z, addps mz, DZ +IF W, addps mw, DW +IF X, addps mx2, DX2 +IF Y, addps my2, DY2 +IF Z, addps mz2, DZ2 +IF W, addps mw2, DW2 + CONTINUE tmp0q +%endmacro + +%macro dither_fns 0 + dither 0 + dither 1 + dither 2 + dither 3 + dither 4 + dither 5 + dither 6 + dither 7 + dither 8 +%endmacro + +%xdefine MASK(I, J) (1 << (5 * (I) + (J))) +%xdefine MASK_OFF(I) MASK(I, 4) +%xdefine MASK_ROW(I) (0b11111 << (5 * (I))) +%xdefine MASK_COL(J) (0b1000010000100001 << J) +%xdefine MASK_ALL (1 << 20) - 1 +%xdefine MASK_LUMA MASK(0, 0) | MASK_OFF(0) +%xdefine MASK_ALPHA MASK(3, 3) | MASK_OFF(3) +%xdefine MASK_DIAG3 MASK(0, 0) | MASK(1, 1) | MASK(2, 2) +%xdefine MASK_OFF3 MASK_OFF(0) | MASK_OFF(1) | MASK_OFF(2) +%xdefine MASK_MAT3 MASK(0, 0) | MASK(0, 1) | MASK(0, 2) |\ + MASK(1, 0) | MASK(1, 1) | MASK(1, 2) |\ + MASK(2, 0) | MASK(2, 1) | MASK(2, 2) +%xdefine MASK_DIAG4 MASK_DIAG3 | MASK(3, 3) +%xdefine 
MASK_OFF4 MASK_OFF3 | MASK_OFF(3) +%xdefine MASK_MAT4 MASK_ALL & ~MASK_OFF4 + +%macro linear_row 7 ; res, x, y, z, w, row, mask +%define COL(J) ((%7) & MASK(%6, J)) ; true if mask contains component J +%define NOP(J) (J == %6 && !COL(J)) ; true if J is untouched input component + + ; load weights + IF COL(0), vbroadcastss m12, [tmp0q + %6 * 20 + 0] + IF COL(1), vbroadcastss m13, [tmp0q + %6 * 20 + 4] + IF COL(2), vbroadcastss m14, [tmp0q + %6 * 20 + 8] + IF COL(3), vbroadcastss m15, [tmp0q + %6 * 20 + 12] + + ; initialize result vector as appropriate + %if COL(4) ; offset + vbroadcastss %1, [tmp0q + %6 * 20 + 16] + %elif NOP(0) + ; directly reuse first component vector if possible + mova %1, %2 + %else + xorps %1, %1 + %endif + + IF COL(0), mulps m12, %2 + IF COL(1), mulps m13, %3 + IF COL(2), mulps m14, %4 + IF COL(3), mulps m15, %5 + IF COL(0), addps %1, m12 + IF NOP(0) && COL(4), addps %1, %3 ; first vector was not reused + IF COL(1), addps %1, m13 + IF NOP(1), addps %1, %3 + IF COL(2), addps %1, m14 + IF NOP(2), addps %1, %4 + IF COL(3), addps %1, m15 + IF NOP(3), addps %1, %5 +%endmacro + +%macro linear_inner 5 ; x, y, z, w, mask + %define ROW(I) ((%5) & MASK_ROW(I)) + IF1 ROW(0), linear_row m8, %1, %2, %3, %4, 0, %5 + IF1 ROW(1), linear_row m9, %1, %2, %3, %4, 1, %5 + IF1 ROW(2), linear_row m10, %1, %2, %3, %4, 2, %5 + IF1 ROW(3), linear_row m11, %1, %2, %3, %4, 3, %5 + IF ROW(0), mova %1, m8 + IF ROW(1), mova %2, m9 + IF ROW(2), mova %3, m10 + IF ROW(3), mova %4, m11 +%endmacro + +%macro linear_mask 2 ; name, mask +op %1 + mov tmp0q, [implq + SwsOpImpl.priv] ; address of matrix + linear_inner mx, my, mz, mw, %2 + linear_inner mx2, my2, mz2, mw2, %2 + CONTINUE +%endmacro + +; specialized functions for very simple cases +%macro linear_dot3 0 +op dot3 + mov tmp0q, [implq + SwsOpImpl.priv] + vbroadcastss m12, [tmp0q + 0] + vbroadcastss m13, [tmp0q + 4] + vbroadcastss m14, [tmp0q + 8] + LOAD_CONT tmp0q + mulps mx, m12 + mulps m8, my, m13 + mulps m9, mz, m14 + addps mx, m8 + addps mx, m9 + mulps mx2, m12 + mulps m10, my2, m13 + mulps m11, mz2, m14 + addps mx2, m10 + addps mx2, m11 + CONTINUE tmp0q +%endmacro + +%macro linear_fns 0 + linear_dot3 + linear_mask luma, MASK_LUMA + linear_mask alpha, MASK_ALPHA + linear_mask lumalpha, MASK_LUMA | MASK_ALPHA + linear_mask row0, MASK_ROW(0) + linear_mask row0a, MASK_ROW(0) | MASK_ALPHA + linear_mask diag3, MASK_DIAG3 + linear_mask diag4, MASK_DIAG4 + linear_mask diagoff3, MASK_DIAG3 | MASK_OFF3 + linear_mask matrix3, MASK_MAT3 + linear_mask affine3, MASK_MAT3 | MASK_OFF3 + linear_mask affine3a, MASK_MAT3 | MASK_OFF3 | MASK_ALPHA + linear_mask matrix4, MASK_MAT4 + linear_mask affine4, MASK_MAT4 | MASK_OFF4 +%endmacro + +INIT_YMM avx2 +decl_common_patterns conv8to32f +decl_common_patterns conv16to32f +decl_common_patterns conv32fto8 +decl_common_patterns conv32fto16 +decl_common_patterns min_max +decl_common_patterns scale +decl_common_patterns dither_fns +linear_fns diff --git a/libswscale/x86/ops_int.asm b/libswscale/x86/ops_int.asm new file mode 100644 index 0000000000..ca5a483a2c --- /dev/null +++ b/libswscale/x86/ops_int.asm @@ -0,0 +1,1050 @@ +;****************************************************************************** +;* Copyright (c) 2025 Niklas Haas +;* +;* This file is part of FFmpeg. 
+;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "ops_common.asm" + +SECTION_RODATA + +expand16_shuf: db 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 +expand32_shuf: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 + +read8_unpack2: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 +read8_unpack3: db 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1 +read8_unpack4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 +read16_unpack2: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +read16_unpack3: db 0, 1, 6, 7, 2, 3, 8, 9, 4, 5, 10, 11, -1, -1, -1, -1 +read16_unpack4: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +write8_pack2: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +write8_pack3: db 0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, -1, -1, -1, -1 +write16_pack3: db 0, 1, 4, 5, 8, 9, 2, 3, 6, 7, 10, 11, -1, -1, -1, -1 + +%define write8_pack4 read8_unpack4 +%define write16_pack4 read16_unpack2 +%define write16_pack2 read16_unpack4 + +align 32 +bits_shuf: db 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, \ + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3 +bits_mask: db 128, 64, 32, 16, 8, 4, 2, 1,128, 64, 32, 16, 8, 4, 2, 1 +bits_reverse: db 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, + +mask1: times 32 db 0x01 +mask2: times 32 db 0x03 +mask3: times 32 db 0x07 +mask4: times 32 db 0x0F + +SECTION .text + +;--------------------------------------------------------- +; Global entry point + +%macro prep_addr 3 ; num_planes, dstp, srcp + %if %1 = 1 + mov tmp0q, [%3] + mov [%2], tmp0q + %elif %1 == 2 + mova xm0, [%3] + mova [%2], xm0 + %else + mova xm0, [%3] + mova xm1, [%3 + 16] + mova [%2], xm0 + mova [%2 + 16], xm1 + %endif +%endmacro + +%macro incr_addr 3 ; num_planes, addrp, stridep + %if %1 = 1 + mov tmp0q, [%2] + add tmp0q, [%3] + mov [%2], tmp0q + %elif %1 == 2 + mova xm0, [%2] + paddq xm0, [%3] + mova [%2], xm0 + %else + mova xm0, [%2] + mova xm1, [%2 + 16] + paddq xm0, [%3] + paddq xm1, [%3] + mova [%2], xm0 + mova [%2 + 16], xm1 + %endif +%endmacro + +%macro process_fn 1 ; num_planes +cglobal sws_process%1, 6, 7 + 2 * %1, 16 + ; Args: + ; execq, implq, bxd, yd, bxendd as defined in ops_common.int + ; tmp0d initially holds y_end, will be pushed to stack + ; Stack layout: + ; [rsp + 0] = [qword] in0 + ; [rsp + 8] = [qword] in1 + ; [rsp + 16] = [qword] in2 + ; [rsp + 24] = [qword] in3 + ; [rsp + 32] = [qword] out0 + ; [rsp + 40] = [qword] out1 + ; [rsp + 48] = [qword] out2 + ; [rsp + 56] = [qword] out3 + ; [rsp + 64] = [qword] saved impl + ; [rsp + 72] = [dword] saved bx start + ; [rsp + 76] = [dword] saved y end + ; [rsp + 80] = [qword] saved rsp + mov tmp1q, rsp + sub rsp, 88 + and rsp, -32 + mov [rsp + 64], implq + mov [rsp + 72], bxd + mov [rsp + 76], tmp0d + 
mov [rsp + 80], tmp1q + prep_addr %1, rsp, execq + SwsOpExec.in0 + prep_addr %1, rsp + 32, execq + SwsOpExec.out0 +.outer: + ; set up static registers + mov in0q, [rsp + 0] +IF %1 > 1, mov in1q, [rsp + 8] +IF %1 > 2, mov in2q, [rsp + 16] +IF %1 > 3, mov in3q, [rsp + 24] + mov out0q, [rsp + 32] +IF %1 > 1, mov out1q, [rsp + 40] +IF %1 > 2, mov out2q, [rsp + 48] +IF %1 > 3, mov out3q, [rsp + 56] +.inner: + mov tmp0q, [implq + SwsOpImpl.cont] + add implq, SwsOpImpl.next + call tmp0q + mov implq, [rsp + 64] + inc bxd + cmp bxd, bxendd + jne .inner + inc yd + cmp yd, [rsp + 76] + je .end + incr_addr %1, rsp, execq + SwsOpExec.in_stride0 + incr_addr %1, rsp + 32, execq + SwsOpExec.out_stride0 + mov bxd, [rsp + 72] + jmp .outer + +.end: + ; clean up + mov rsp, [rsp + 80] + RET +%endmacro + +;--------------------------------------------------------- +; Planar reads / writes + +%macro read_planar 1 ; elems +op read_planar%1 + movu mx, [in0q] +IF %1 > 1, movu my, [in1q] +IF %1 > 2, movu mz, [in2q] +IF %1 > 3, movu mw, [in3q] +%if V2 + movu mx2, [in0q + mmsize] +IF %1 > 1, movu my2, [in1q + mmsize] +IF %1 > 2, movu mz2, [in2q + mmsize] +IF %1 > 3, movu mw2, [in3q + mmsize] +%endif + LOAD_CONT tmp0q + add in0q, mmsize * (1 + V2) +IF %1 > 1, add in1q, mmsize * (1 + V2) +IF %1 > 2, add in2q, mmsize * (1 + V2) +IF %1 > 3, add in3q, mmsize * (1 + V2) + CONTINUE tmp0q +%endmacro + +%macro write_planar 1 ; elems +op write_planar%1 + movu [out0q], mx +IF %1 > 1, movu [out1q], my +IF %1 > 2, movu [out2q], mz +IF %1 > 3, movu [out3q], mw +%if V2 + movu [out0q + mmsize], mx2 +IF %1 > 1, movu [out1q + mmsize], my2 +IF %1 > 2, movu [out2q + mmsize], mz2 +IF %1 > 3, movu [out3q + mmsize], mw2 +%endif + add out0q, mmsize * (1 + V2) +IF %1 > 1, add out1q, mmsize * (1 + V2) +IF %1 > 2, add out2q, mmsize * (1 + V2) +IF %1 > 3, add out3q, mmsize * (1 + V2) + RET +%endmacro + +%macro read_packed2 1 ; depth +op read%1_packed2 + movu m8, [in0q + 0*mmsize] + movu m9, [in0q + 1*mmsize] + IF V2, movu m10, [in0q + 2*mmsize] + IF V2, movu m11, [in0q + 3*mmsize] +IF %1 < 32, VBROADCASTI128 m12, [read%1_unpack2] + LOAD_CONT tmp0q + add in0q, mmsize * (2 + V2 * 2) +%if %1 == 32 + shufps m8, m8, q3120 + shufps m9, m9, q3120 + IF V2, shufps m10, m10, q3120 + IF V2, shufps m11, m11, q3120 +%else + pshufb m8, m12 ; { X0 Y0 | X1 Y1 } + pshufb m9, m12 ; { X2 Y2 | X3 Y3 } + IF V2, pshufb m10, m12 + IF V2, pshufb m11, m12 +%endif + unpcklpd mx, m8, m9 ; { X0 X2 | X1 X3 } + unpckhpd my, m8, m9 ; { Y0 Y2 | Y1 Y3 } + IF V2, unpcklpd mx2, m10, m11 + IF V2, unpckhpd my2, m10, m11 +%if avx_enabled + vpermq mx, mx, q3120 ; { X0 X1 | X2 X3 } + vpermq my, my, q3120 ; { Y0 Y1 | Y2 Y3 } + IF V2, vpermq mx2, mx2, q3120 + IF V2, vpermq my2, my2, q3120 +%endif + CONTINUE tmp0q +%endmacro + +%macro write_packed2 1 ; depth +op write%1_packed2 +IF %1 < 32, VBROADCASTI128 m12, [write%1_pack2] + LOAD_CONT tmp0q +%if avx_enabled + vpermq mx, mx, q3120 ; { X0 X2 | X1 X3 } + vpermq my, my, q3120 ; { Y0 Y2 | Y1 Y3 } + IF V2, vpermq mx2, mx2, q3120 + IF V2, vpermq my2, my2, q3120 +%endif + unpcklpd m8, mx, my ; { X0 Y0 | X1 Y1 } + unpckhpd m9, mx, my ; { X2 Y2 | X3 Y3 } + IF V2, unpcklpd m10, mx2, my2 + IF V2, unpckhpd m11, mx2, my2 +%if %1 == 32 + shufps m8, m8, q3120 + shufps m9, m9, q3120 + IF V2, shufps m10, m10, q3120 + IF V2, shufps m11, m11, q3120 +%else + pshufb m8, m12 + pshufb m9, m12 + IF V2, pshufb m10, m12 + IF V2, pshufb m11, m12 +%endif + movu [out0q + 0*mmsize], m8 + movu [out0q + 1*mmsize], m9 +IF V2, movu [out0q + 2*mmsize], m10 +IF V2, movu 
[out0q + 3*mmsize], m11 + add out0q, mmsize * (2 + V2 * 2) + RET +%endmacro + +%macro read_packed_inner 7 ; x, y, z, w, addr, num, depth + movu xm8, [%5 + 0 * %6] + movu xm9, [%5 + 4 * %6] + movu xm10, [%5 + 8 * %6] + movu xm11, [%5 + 12 * %6] + %if avx_enabled + vinserti128 m8, m8, [%5 + 16 * %6], 1 + vinserti128 m9, m9, [%5 + 20 * %6], 1 + vinserti128 m10, m10, [%5 + 24 * %6], 1 + vinserti128 m11, m11, [%5 + 28 * %6], 1 + %endif + %if %7 == 32 + mova %1, m8 + mova %2, m9 + mova %3, m10 + mova %4, m11 + %else + pshufb %1, m8, m12 ; { X0 Y0 Z0 W0 | X4 Y4 Z4 W4 } + pshufb %2, m9, m12 ; { X1 Y1 Z1 W1 | X5 Y5 Z5 W5 } + pshufb %3, m10, m12 ; { X2 Y2 Z2 W2 | X6 Y6 Z6 W6 } + pshufb %4, m11, m12 ; { X3 Y3 Z3 W3 | X7 Y7 Z7 W7 } + %endif + punpckldq m8, %1, %2 ; { X0 X1 Y0 Y1 | X4 X5 Y4 Y5 } + punpckldq m9, %3, %4 ; { X2 X3 Y2 Y3 | X6 X7 Y6 Y7 } + punpckhdq m10, %1, %2 ; { Z0 Z1 W0 W1 | Z4 Z5 W4 W5 } + punpckhdq m11, %3, %4 ; { Z2 Z3 W2 W3 | Z6 Z7 W6 W7 } + punpcklqdq %1, m8, m9 ; { X0 X1 X2 X3 | X4 X5 X6 X7 } + punpckhqdq %2, m8, m9 ; { Y0 Y1 Y2 Y3 | Y4 Y5 Y6 Y7 } + punpcklqdq %3, m10, m11 ; { Z0 Z1 Z2 Z3 | Z4 Z5 Z6 Z7 } +IF %6 > 3, punpckhqdq %4, m10, m11 ; { W0 W1 W2 W3 | W4 W5 W6 W7 } +%endmacro + +%macro read_packed 2 ; num, depth +op read%2_packed%1 +IF %2 < 32, VBROADCASTI128 m12, [read%2_unpack%1] + LOAD_CONT tmp0q + read_packed_inner mx, my, mz, mw, in0q, %1, %2 +IF1 V2, read_packed_inner mx2, my2, mz2, mw2, in0q + %1 * mmsize, %1, %2 + add in0q, %1 * mmsize * (1 + V2) + CONTINUE tmp0q +%endmacro + +%macro write_packed_inner 7 ; x, y, z, w, addr, num, depth + punpckldq m8, %1, %2 ; { X0 Y0 X1 Y1 | X4 Y4 X5 Y5 } + punpckldq m9, %3, %4 ; { Z0 W0 Z1 W1 | Z4 W4 Z5 W5 } + punpckhdq m10, %1, %2 ; { X2 Y2 X3 Y3 | X6 Y6 X7 Y7 } + punpckhdq m11, %3, %4 ; { Z2 W2 Z3 W3 | Z6 W6 Z7 W7 } + punpcklqdq %1, m8, m9 ; { X0 Y0 Z0 W0 | X4 Y4 Z4 W4 } + punpckhqdq %2, m8, m9 ; { X1 Y1 Z1 W1 | X5 Y5 Z5 W5 } + punpcklqdq %3, m10, m11 ; { X2 Y2 Z2 W2 | X6 Y6 Z6 W6 } + punpckhqdq %4, m10, m11 ; { X3 Y3 Z3 W3 | X7 Y7 Z7 W7 } + %if %7 == 32 + mova m8, %1 + mova m9, %2 + mova m10, %3 + mova m11, %4 + %else + pshufb m8, %1, m12 + pshufb m9, %2, m12 + pshufb m10, %3, m12 + pshufb m11, %4, m12 + %endif + movu [%5 + 0*%6], xm8 + movu [%5 + 4*%6], xm9 + movu [%5 + 8*%6], xm10 + movu [%5 + 12*%6], xm11 + %if avx_enabled + vextracti128 [%5 + 16*%6], m8, 1 + vextracti128 [%5 + 20*%6], m9, 1 + vextracti128 [%5 + 24*%6], m10, 1 + vextracti128 [%5 + 28*%6], m11, 1 + %endif +%endmacro + +%macro write_packed 2 ; num, depth +op write%2_packed%1 +IF %2 < 32, VBROADCASTI128 m12, [write%2_pack%1] + write_packed_inner mx, my, mz, mw, out0q, %1, %2 +IF1 V2, write_packed_inner mx2, my2, mz2, mw2, out0q + %1 * mmsize, %1, %2 + add out0q, %1 * mmsize * (1 + V2) + RET +%endmacro + +%macro rw_packed 1 ; depth + read_packed2 %1 + read_packed 3, %1 + read_packed 4, %1 + write_packed2 %1 + write_packed 3, %1 + write_packed 4, %1 +%endmacro + +%macro read_nibbles 0 +op read_nibbles1 +%if avx_enabled + movu xmx, [in0q] +IF V2, movu xmx2, [in0q + 16] +%else + movq xmx, [in0q] +IF V2, movq xmx2, [in0q + 8] +%endif + VBROADCASTI128 m8, [mask4] + LOAD_CONT tmp0q + add in0q, (mmsize >> 1) * (1 + V2) + pmovzxbw mx, xmx +IF V2, pmovzxbw mx2, xmx2 + psllw my, mx, 8 +IF V2, psllw my2, mx2, 8 + psrlw mx, 4 +IF V2, psrlw mx2, 4 + pand my, m8 +IF V2, pand my2, m8 + por mx, my +IF V2, por mx2, my2 + CONTINUE tmp0q +%endmacro + +%macro read_bits 0 +op read_bits1 +%if avx_enabled + vpbroadcastd mx, [in0q] +IF V2, vpbroadcastd mx2, [in0q + 4] +%else + movd mx, 
[in0q] +IF V2, movd mx2, [in0q + 2] +%endif + mova m8, [bits_shuf] + VBROADCASTI128 m9, [bits_mask] + VBROADCASTI128 m10, [mask1] + LOAD_CONT tmp0q + add in0q, (mmsize >> 3) * (1 + V2) + pshufb mx, m8 +IF V2, pshufb mx2, m8 + pand mx, m9 +IF V2, pand mx2, m9 + pcmpeqb mx, m9 +IF V2, pcmpeqb mx2, m9 + pand mx, m10 +IF V2, pand mx2, m10 + CONTINUE tmp0q +%endmacro + +%macro write_bits 0 +op write_bits1 + VBROADCASTI128 m8, [bits_reverse] + psllw mx, 7 +IF V2, psllw mx2, 7 + pshufb mx, m8 +IF V2, pshufb mx2, m8 + pmovmskb tmp0d, mx +IF V2, pmovmskb tmp1d, mx2 +%if avx_enabled + mov [out0q], tmp0d +IF V2, mov [out0q + 4], tmp1d +%else + mov [out0q], tmp0d +IF V2, mov [out0q + 2], tmp1d +%endif + add out0q, (mmsize >> 3) * (1 + V2) + RET +%endmacro + +;-------------------------- +; Pixel packing / unpacking + +%macro pack_generic 3-4 0 ; x, y, z, w +op pack_%1%2%3%4 + ; pslld works for all sizes because the input should not overflow +IF %2, pslld mx, %4+%3+%2 +IF %3, pslld my, %4+%3 +IF %4, pslld mz, %4 +IF %2, por mx, my +IF %3, por mx, mz +IF %4, por mx, mw + %if V2 +IF %2, pslld mx2, %4+%3+%2 +IF %3, pslld my2, %4+%3 +IF %4, pslld mz2, %4 +IF %2, por mx2, my2 +IF %3, por mx2, mz2 +IF %4, por mx2, mw2 + %endif + CONTINUE +%endmacro + +%macro unpack 5-6 0 ; type, bits, x, y, z, w +op unpack_%3%4%5%6 + ; clear high bits by shifting left +IF %6, vpsll%1 mw, mx, %2 - (%6) +IF %5, vpsll%1 mz, mx, %2 - (%6+%5) +IF %4, vpsll%1 my, mx, %2 - (%6+%5+%4) + psrl%1 mx, %4+%5+%6 +IF %4, psrl%1 my, %2 - %4 +IF %5, psrl%1 mz, %2 - %5 +IF %6, psrl%1 mw, %2 - %6 + %if V2 +IF %6, vpsll%1 mw2, mx2, %2 - (%6) +IF %5, vpsll%1 mz2, mx2, %2 - (%6+%5) +IF %4, vpsll%1 my2, mx2, %2 - (%6+%5+%4) + psrl%1 mx2, %4+%5+%6 +IF %4, psrl%1 my2, %2 - %4 +IF %5, psrl%1 mz2, %2 - %5 +IF %6, psrl%1 mw2, %2 - %6 + %endif + CONTINUE +%endmacro + +%macro unpack8 3 ; x, y, z +op unpack_%1%2%3 %+ 0 + pand mz, mx, [mask%3] + psrld my, mx, %3 + psrld mx, %3+%2 + pand my, [mask%2] + pand mx, [mask%1] + %if V2 + pand mz2, mx2, [mask%3] + psrld my2, mx2, %3 + psrld mx2, %3+%2 + pand my2, [mask%2] + pand mx2, [mask%1] + %endif + CONTINUE +%endmacro + +;--------------------------------------------------------- +; Generic byte order shuffle (packed swizzle, endian, etc) + +%macro shuffle 0 +op shuffle + VBROADCASTI128 m8, [implq + SwsOpImpl.priv] + LOAD_CONT tmp0q +IF X, pshufb mx, m8 +IF Y, pshufb my, m8 +IF Z, pshufb mz, m8 +IF W, pshufb mw, m8 +%if V2 +IF X, pshufb mx2, m8 +IF Y, pshufb my2, m8 +IF Z, pshufb mz2, m8 +IF W, pshufb mw2, m8 +%endif + CONTINUE tmp0q +%endmacro + +;--------------------------------------------------------- +; Clearing + +%macro clear_alpha 3 ; idx, vreg, vreg2 +op clear_alpha%1 + LOAD_CONT tmp0q + pcmpeqb %2, %2 +IF V2, mova %3, %2 + CONTINUE tmp0q +%endmacro + +%macro clear_zero 3 ; idx, vreg, vreg2 +op clear_zero%1 + LOAD_CONT tmp0q + pxor %2, %2 +IF V2, mova %3, %2 + CONTINUE tmp0q +%endmacro + +%macro clear_generic 0 +op clear + LOAD_CONT tmp0q +%if avx_enabled + IF !X, vpbroadcastd mx, [implq + SwsOpImpl.priv + 0] + IF !Y, vpbroadcastd my, [implq + SwsOpImpl.priv + 4] + IF !Z, vpbroadcastd mz, [implq + SwsOpImpl.priv + 8] + IF !W, vpbroadcastd mw, [implq + SwsOpImpl.priv + 12] +%else ; !avx_enabled + IF !X, movd mx, [implq + SwsOpImpl.priv + 0] + IF !Y, movd my, [implq + SwsOpImpl.priv + 4] + IF !Z, movd mz, [implq + SwsOpImpl.priv + 8] + IF !W, movd mw, [implq + SwsOpImpl.priv + 12] + IF !X, pshufd mx, mx, 0 + IF !Y, pshufd my, my, 0 + IF !Z, pshufd mz, mz, 0 + IF !W, pshufd mw, mw, 0 +%endif +%if V2 + IF 
!X, mova mx2, mx + IF !Y, mova my2, my + IF !Z, mova mz2, mz + IF !W, mova mw2, mw +%endif + CONTINUE tmp0q +%endmacro + +%macro clear_funcs 0 + decl_pattern 1, 1, 1, 0, clear_generic + decl_pattern 0, 1, 1, 1, clear_generic + decl_pattern 0, 0, 1, 1, clear_generic + decl_pattern 1, 0, 0, 1, clear_generic + decl_pattern 1, 1, 0, 0, clear_generic + decl_pattern 0, 1, 0, 1, clear_generic + decl_pattern 1, 0, 1, 0, clear_generic + decl_pattern 1, 0, 0, 0, clear_generic + decl_pattern 0, 1, 0, 0, clear_generic + decl_pattern 0, 0, 1, 0, clear_generic +%endmacro + +;--------------------------------------------------------- +; Swizzling and duplicating + +; mA := mB, mB := mC, ... mX := mA +%macro vrotate 2-* ; A, B, C, ... + %rep %0 + %assign rot_a %1 + 4 + %assign rot_b %2 + 4 + mova m%1, m%2 + IF V2, mova m%[rot_a], m%[rot_b] + %rotate 1 + %endrep + %undef rot_a + %undef rot_b +%endmacro + +%macro swizzle_funcs 0 +op swizzle_3012 + LOAD_CONT tmp0q + vrotate 8, 0, 3, 2, 1 + CONTINUE tmp0q + +op swizzle_3021 + LOAD_CONT tmp0q + vrotate 8, 0, 3, 1 + CONTINUE tmp0q + +op swizzle_2103 + LOAD_CONT tmp0q + vrotate 8, 0, 2 + CONTINUE tmp0q + +op swizzle_3210 + LOAD_CONT tmp0q + vrotate 8, 0, 3 + vrotate 8, 1, 2 + CONTINUE tmp0q + +op swizzle_3102 + LOAD_CONT tmp0q + vrotate 8, 0, 3, 2 + CONTINUE tmp0q + +op swizzle_3201 + LOAD_CONT tmp0q + vrotate 8, 0, 3, 1, 2 + CONTINUE tmp0q + +op swizzle_1203 + LOAD_CONT tmp0q + vrotate 8, 0, 1, 2 + CONTINUE tmp0q + +op swizzle_1023 + LOAD_CONT tmp0q + vrotate 8, 0, 1 + CONTINUE tmp0q + +op swizzle_2013 + LOAD_CONT tmp0q + vrotate 8, 0, 2, 1 + CONTINUE tmp0q + +op swizzle_2310 + LOAD_CONT tmp0q + vrotate 8, 0, 2, 1, 3 + CONTINUE tmp0q + +op swizzle_2130 + LOAD_CONT tmp0q + vrotate 8, 0, 2, 3 + CONTINUE tmp0q + +op swizzle_1230 + LOAD_CONT tmp0q + vrotate 8, 0, 1, 2, 3 + CONTINUE tmp0q + +op swizzle_1320 + LOAD_CONT tmp0q + vrotate 8, 0, 1, 3 + CONTINUE tmp0q + +op swizzle_0213 + LOAD_CONT tmp0q + vrotate 8, 1, 2 + CONTINUE tmp0q + +op swizzle_0231 + LOAD_CONT tmp0q + vrotate 8, 1, 2, 3 + CONTINUE tmp0q + +op swizzle_0312 + LOAD_CONT tmp0q + vrotate 8, 1, 3, 2 + CONTINUE tmp0q + +op swizzle_3120 + LOAD_CONT tmp0q + vrotate 8, 0, 3 + CONTINUE tmp0q + +op swizzle_0321 + LOAD_CONT tmp0q + vrotate 8, 1, 3 + CONTINUE tmp0q + +op swizzle_0003 + LOAD_CONT tmp0q + mova my, mx + mova mz, mx +%if V2 + mova my2, mx2 + mova mz2, mx2 +%endif + CONTINUE tmp0q + +op swizzle_0001 + LOAD_CONT tmp0q + mova mw, my + mova mz, mx + mova my, mx +%if V2 + mova mw2, my2 + mova mz2, mx2 + mova my2, mx2 +%endif + CONTINUE tmp0q + +op swizzle_3000 + LOAD_CONT tmp0q + mova my, mx + mova mz, mx + mova mx, mw + mova mw, my +%if V2 + mova my2, mx2 + mova mz2, mx2 + mova mx2, mw2 + mova mw2, my2 +%endif + CONTINUE tmp0q + +op swizzle_1000 + LOAD_CONT tmp0q + mova mz, mx + mova mw, mx + mova mx, my + mova my, mz +%if V2 + mova mz2, mx2 + mova mw2, mx2 + mova mx2, my2 + mova my2, mz2 +%endif + CONTINUE tmp0q +%endmacro + +%macro packed_shuffle 2 ; size_in, size_out +cglobal packed_shuffle%1_%2, 6, 10, 2, \ + exec, shuffle, bx, y, bxend, yend, src, dst, src_stride, dst_stride + mov srcq, [execq + SwsOpExec.in0] + mov dstq, [execq + SwsOpExec.out0] + mov src_strideq, [execq + SwsOpExec.in_stride0] + mov dst_strideq, [execq + SwsOpExec.out_stride0] + VBROADCASTI128 m1, [shuffleq] + sub bxendd, bxd + sub yendd, yd + ; reuse regs + %define srcidxq execq + imul srcidxq, bxendq, -%1 +%if %1 = %2 + %define dstidxq srcidxq +%else + %define dstidxq shuffleq ; no longer needed reg + imul dstidxq, bxendq, -%2 
+%endif + sub srcq, srcidxq + sub dstq, dstidxq +.loop: + %if %1 <= 4 + movd m0, [srcq + srcidxq] + %elif %1 <= 8 + movq m0, [srcq + srcidxq] + %else + movu m0, [srcq + srcidxq] + %endif + pshufb m0, m1 + movu [dstq + dstidxq], m0 + add srcidxq, %1 +IF %1 != %2,add dstidxq, %2 + jnz .loop + add srcq, src_strideq + add dstq, dst_strideq + imul srcidxq, bxendq, -%1 +IF %1 != %2,imul dstidxq, bxendq, -%2 + dec yendd + jnz .loop + RET +%endmacro + +;--------------------------------------------------------- +; Pixel type conversions + +%macro conv8to16 1 ; type +op %1_U8_U16 + LOAD_CONT tmp0q +%if V2 + %if avx_enabled + IF X, vextracti128 xmx2, mx, 1 + IF Y, vextracti128 xmy2, my, 1 + IF Z, vextracti128 xmz2, mz, 1 + IF W, vextracti128 xmw2, mw, 1 + %else + IF X, psrldq xmx2, mx, 8 + IF Y, psrldq xmy2, my, 8 + IF Z, psrldq xmz2, mz, 8 + IF W, psrldq xmw2, mw, 8 + %endif + IF X, pmovzxbw mx2, xmx2 + IF Y, pmovzxbw my2, xmy2 + IF Z, pmovzxbw mz2, xmz2 + IF W, pmovzxbw mw2, xmw2 +%endif ; V2 + IF X, pmovzxbw mx, xmx + IF Y, pmovzxbw my, xmy + IF Z, pmovzxbw mz, xmz + IF W, pmovzxbw mw, xmw + +%ifidn %1, expand + VBROADCASTI128 m8, [expand16_shuf] + %if V2 + IF X, pshufb mx2, m8 + IF Y, pshufb my2, m8 + IF Z, pshufb mz2, m8 + IF W, pshufb mw2, m8 + %endif + IF X, pshufb mx, m8 + IF Y, pshufb my, m8 + IF Z, pshufb mz, m8 + IF W, pshufb mw, m8 +%endif ; expand + CONTINUE tmp0q +%endmacro + +%macro conv16to8 0 +op convert_U16_U8 + LOAD_CONT tmp0q +%if V2 + ; this code technically works for the !V2 case as well, but slower +IF X, packuswb mx, mx2 +IF Y, packuswb my, my2 +IF Z, packuswb mz, mz2 +IF W, packuswb mw, mw2 +IF X, vpermq mx, mx, q3120 +IF Y, vpermq my, my, q3120 +IF Z, vpermq mz, mz, q3120 +IF W, vpermq mw, mw, q3120 +%else +IF X, vextracti128 xm8, mx, 1 +IF Y, vextracti128 xm9, my, 1 +IF Z, vextracti128 xm10, mz, 1 +IF W, vextracti128 xm11, mw, 1 +IF X, packuswb xmx, xm8 +IF Y, packuswb xmy, xm9 +IF Z, packuswb xmz, xm10 +IF W, packuswb xmw, xm11 +%endif + CONTINUE tmp0q +%endmacro + +%macro conv8to32 1 ; type +op %1_U8_U32 + LOAD_CONT tmp0q +IF X, psrldq xmx2, xmx, 8 +IF Y, psrldq xmy2, xmy, 8 +IF Z, psrldq xmz2, xmz, 8 +IF W, psrldq xmw2, xmw, 8 +IF X, pmovzxbd mx, xmx +IF Y, pmovzxbd my, xmy +IF Z, pmovzxbd mz, xmz +IF W, pmovzxbd mw, xmw +IF X, pmovzxbd mx2, xmx2 +IF Y, pmovzxbd my2, xmy2 +IF Z, pmovzxbd mz2, xmz2 +IF W, pmovzxbd mw2, xmw2 +%ifidn %1, expand + VBROADCASTI128 m8, [expand32_shuf] +IF X, pshufb mx, m8 +IF Y, pshufb my, m8 +IF Z, pshufb mz, m8 +IF W, pshufb mw, m8 +IF X, pshufb mx2, m8 +IF Y, pshufb my2, m8 +IF Z, pshufb mz2, m8 +IF W, pshufb mw2, m8 +%endif ; expand + CONTINUE tmp0q +%endmacro + +%macro conv32to8 0 +op convert_U32_U8 + LOAD_CONT tmp0q +IF X, packusdw mx, mx2 +IF Y, packusdw my, my2 +IF Z, packusdw mz, mz2 +IF W, packusdw mw, mw2 +IF X, vextracti128 xmx2, mx, 1 +IF Y, vextracti128 xmy2, my, 1 +IF Z, vextracti128 xmz2, mz, 1 +IF W, vextracti128 xmw2, mw, 1 +IF X, packuswb xmx, xmx2 +IF Y, packuswb xmy, xmy2 +IF Z, packuswb xmz, xmz2 +IF W, packuswb xmw, xmw2 +IF X, vpshufd xmx, xmx, q3120 +IF Y, vpshufd xmy, xmy, q3120 +IF Z, vpshufd xmz, xmz, q3120 +IF W, vpshufd xmw, xmw, q3120 + CONTINUE tmp0q +%endmacro + +%macro conv16to32 0 +op convert_U16_U32 + LOAD_CONT tmp0q +IF X, vextracti128 xmx2, mx, 1 +IF Y, vextracti128 xmy2, my, 1 +IF Z, vextracti128 xmz2, mz, 1 +IF W, vextracti128 xmw2, mw, 1 +IF X, pmovzxwd mx, xmx +IF Y, pmovzxwd my, xmy +IF Z, pmovzxwd mz, xmz +IF W, pmovzxwd mw, xmw +IF X, pmovzxwd mx2, xmx2 +IF Y, pmovzxwd my2, xmy2 +IF Z, pmovzxwd mz2, 
xmz2 +IF W, pmovzxwd mw2, xmw2 + CONTINUE tmp0q +%endmacro + +%macro conv32to16 0 +op convert_U32_U16 + LOAD_CONT tmp0q +IF X, packusdw mx, mx2 +IF Y, packusdw my, my2 +IF Z, packusdw mz, mz2 +IF W, packusdw mw, mw2 +IF X, vpermq mx, mx, q3120 +IF Y, vpermq my, my, q3120 +IF Z, vpermq mz, mz, q3120 +IF W, vpermq mw, mw, q3120 + CONTINUE tmp0q +%endmacro + +;--------------------------------------------------------- +; Shifting + +%macro lshift16 0 +op lshift16 + vmovq xm8, [implq + SwsOpImpl.priv] + LOAD_CONT tmp0q +IF X, psllw mx, xm8 +IF Y, psllw my, xm8 +IF Z, psllw mz, xm8 +IF W, psllw mw, xm8 +%if V2 +IF X, psllw mx2, xm8 +IF Y, psllw my2, xm8 +IF Z, psllw mz2, xm8 +IF W, psllw mw2, xm8 +%endif + CONTINUE tmp0q +%endmacro + +%macro rshift16 0 +op rshift16 + vmovq xm8, [implq + SwsOpImpl.priv] + LOAD_CONT tmp0q +IF X, psrlw mx, xm8 +IF Y, psrlw my, xm8 +IF Z, psrlw mz, xm8 +IF W, psrlw mw, xm8 +%if V2 +IF X, psrlw mx2, xm8 +IF Y, psrlw my2, xm8 +IF Z, psrlw mz2, xm8 +IF W, psrlw mw2, xm8 +%endif + CONTINUE tmp0q +%endmacro + +;--------------------------------------------------------- +; Function instantiations + +%macro funcs_u8 0 + read_planar 1 + read_planar 2 + read_planar 3 + read_planar 4 + write_planar 1 + write_planar 2 + write_planar 3 + write_planar 4 + + rw_packed 8 + read_nibbles + read_bits + write_bits + + pack_generic 1, 2, 1 + pack_generic 3, 3, 2 + pack_generic 2, 3, 3 + unpack8 1, 2, 1 + unpack8 3, 3, 2 + unpack8 2, 3, 3 + + clear_alpha 0, mx, mx2 + clear_alpha 1, my, my2 + clear_alpha 3, mw, mw2 + clear_zero 0, mx, mx2 + clear_zero 1, my, my2 + clear_zero 3, mw, mw2 + clear_funcs + swizzle_funcs + + decl_common_patterns shuffle +%endmacro + +%macro funcs_u16 0 + rw_packed 16 + pack_generic 4, 4, 4 + pack_generic 5, 5, 5 + pack_generic 5, 6, 5 + unpack w, 16, 4, 4, 4 + unpack w, 16, 5, 5, 5 + unpack w, 16, 5, 6, 5 + decl_common_patterns conv8to16 convert + decl_common_patterns conv8to16 expand + decl_common_patterns conv16to8 + decl_common_patterns lshift16 + decl_common_patterns rshift16 +%endmacro + +INIT_XMM sse4 +decl_v2 0, funcs_u8 +decl_v2 1, funcs_u8 + +process_fn 1 +process_fn 2 +process_fn 3 +process_fn 4 + +packed_shuffle 5, 15 ; 8 -> 24 +packed_shuffle 4, 16 ; 8 -> 32, 16 -> 64 +packed_shuffle 2, 12 ; 8 -> 48 +packed_shuffle 10, 15 ; 16 -> 24 +packed_shuffle 8, 16 ; 16 -> 32, 32 -> 64 +packed_shuffle 4, 12 ; 16 -> 48 +packed_shuffle 15, 15 ; 24 -> 24 +packed_shuffle 12, 16 ; 24 -> 32 +packed_shuffle 6, 12 ; 24 -> 48 +packed_shuffle 16, 12 ; 32 -> 24, 64 -> 48 +packed_shuffle 16, 16 ; 32 -> 32, 64 -> 64 +packed_shuffle 8, 12 ; 32 -> 48 +packed_shuffle 12, 12 ; 48 -> 48 + +INIT_YMM avx2 +decl_v2 0, funcs_u8 +decl_v2 1, funcs_u8 +decl_v2 0, funcs_u16 +decl_v2 1, funcs_u16 + +packed_shuffle 32, 32 + +INIT_YMM avx2 +decl_v2 1, rw_packed 32 +decl_v2 1, pack_generic 10, 10, 10, 2 +decl_v2 1, pack_generic 2, 10, 10, 10 +decl_v2 1, unpack d, 32, 10, 10, 10, 2 +decl_v2 1, unpack d, 32, 2, 10, 10, 10 +decl_common_patterns conv8to32 convert +decl_common_patterns conv8to32 expand +decl_common_patterns conv32to8 +decl_common_patterns conv16to32 +decl_common_patterns conv32to16 + +INIT_ZMM avx512 +packed_shuffle 64, 64 -- 2.49.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".