From: Niklas Haas <g...@haasn.dev> See doc/swscale-v2.txt for design details. --- libswscale/Makefile | 1 + libswscale/ops_chain.c | 283 +++++++++++++++++++++++++++++++++++++++++ libswscale/ops_chain.h | 134 +++++++++++++++++++ 3 files changed, 418 insertions(+) create mode 100644 libswscale/ops_chain.c create mode 100644 libswscale/ops_chain.h
diff --git a/libswscale/Makefile b/libswscale/Makefile index 810c9dee78..c9dfa78c89 100644 --- a/libswscale/Makefile +++ b/libswscale/Makefile @@ -16,6 +16,7 @@ OBJS = alphablend.o \ input.o \ lut3d.o \ ops.o \ + ops_chain.o \ ops_optimizer.o \ options.o \ output.o \ diff --git a/libswscale/ops_chain.c b/libswscale/ops_chain.c new file mode 100644 index 0000000000..129408df27 --- /dev/null +++ b/libswscale/ops_chain.c @@ -0,0 +1,283 @@ +/** + * Copyright (C) 2025 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/avassert.h" +#include "libavutil/mem.h" +#include "libavutil/rational.h" + +#include "ops_chain.h" + +#define Q(N) ((AVRational) { N, 1 }) + +SwsOpChain *ff_sws_op_chain_alloc(void) +{ + return av_mallocz(sizeof(SwsOpChain)); +} + +void ff_sws_op_chain_free(SwsOpChain *chain) +{ + if (!chain) + return; + + for (int i = 0; i < chain->num_impl + 1; i++) { + if (chain->free[i]) + chain->free[i](chain->impl[i].priv.ptr); + } + + av_free(chain); +} + +int ff_sws_op_chain_append(SwsOpChain *chain, SwsFuncPtr func, + void (*free)(void *), SwsOpPriv priv) +{ + const int idx = chain->num_impl; + if (idx == SWS_MAX_OPS) + return AVERROR(EINVAL); + + av_assert1(func); + chain->impl[idx].cont = func; + chain->impl[idx + 1].priv = priv; + chain->free[idx + 1] = free; + chain->num_impl++; + return 0; +} + +/** + * Match an operation against a reference operation. Returns a score for how + * well the reference matches the operation, or 0 if there is no match. + * + * If `entry` has any flags set, they must be set in `op` as well. + * Likewise, if `entry` has any components marked as unused, they must be + * marked as unused in `op` as well. + * + * For SWS_OP_LINEAR, `entry->linear_mask` must be a superset of + * `op->lin.mask`, but may not contain any columns explicitly ignored by + * `op->comps.unused`. + * + * For SWS_OP_READ, SWS_OP_WRITE, SWS_OP_SWAP_BYTES and SWS_OP_SWIZZLE, the + * exact type is not checked, just the size. + * + * Components set in `next.unused` are ignored when matching. If `flexible` + * is true, the op body is ignored - only the operation, pixel type, and + * component masks are checked.
+ */ +static int op_match(const SwsOp *op, const SwsOpEntry *entry, const SwsComps next) +{ + int score = 10; + if (op->op != entry->op) + return 0; + + switch (op->op) { + case SWS_OP_READ: + case SWS_OP_WRITE: + case SWS_OP_SWAP_BYTES: + case SWS_OP_SWIZZLE: + /* Only the size matters for these operations */ + if (ff_sws_pixel_type_size(op->type) != ff_sws_pixel_type_size(entry->type)) + return 0; + break; + default: + if (op->type != entry->type) + return 0; + break; + } + + for (int i = 0; i < 4; i++) { + if (entry->unused[i]) { + if (op->comps.unused[i]) + score += 1; /* Operating on fewer components is better .. */ + else + return false; /* .. but not too few! */ + } + } + + /* Flexible variants always match, but lower the score to prioritize more + * specific implementations if they exist */ + if (entry->flexible) + return score - 5; + + switch (op->op) { + case SWS_OP_INVALID: + return 0; + case SWS_OP_READ: + case SWS_OP_WRITE: + if (op->rw.elems != entry->rw.elems || + op->rw.frac != entry->rw.frac || + (op->rw.elems > 1 && op->rw.packed != entry->rw.packed)) + return 0; + return score; + case SWS_OP_SWAP_BYTES: + return score; + case SWS_OP_PACK: + case SWS_OP_UNPACK: + for (int i = 0; i < 4 && op->pack.pattern[i]; i++) { + if (op->pack.pattern[i] != entry->pack.pattern[i]) + return 0; + } + return score; + case SWS_OP_CLEAR: + for (int i = 0; i < 4; i++) { + if (!op->c.q4[i].den) + continue; + if (av_cmp_q(op->c.q4[i], Q(entry->clear_value)) && !next.unused[i]) + return 0; + } + return score; + case SWS_OP_LSHIFT: + case SWS_OP_RSHIFT: + av_assert1(entry->flexible); + return score; + case SWS_OP_SWIZZLE: + for (int i = 0; i < 4; i++) { + if (op->swizzle.in[i] != entry->swizzle.in[i] && !next.unused[i]) + return 0; + } + return score; + case SWS_OP_CONVERT: + if (op->convert.to != entry->convert.to || + op->convert.expand != entry->convert.expand) + return 0; + return score; + case SWS_OP_DITHER: + return op->dither.size_log2 == entry->dither_size ? 
score : 0; + case SWS_OP_MIN: + case SWS_OP_MAX: + av_assert1(entry->flexible); + return score; + case SWS_OP_LINEAR: + /* All required elements must be present */ + if (op->lin.mask & ~entry->linear_mask) + return 0; + /* To avoid operating on possibly undefined memory, filter out + * implementations that operate on more input components */ + for (int i = 0; i < 4; i++) { + if ((entry->linear_mask & SWS_MASK_COL(i)) && op->comps.unused[i]) + return 0; + } + /* Prioritize smaller implementations */ + score += av_popcount(SWS_MASK_ALL ^ entry->linear_mask); + return score; + case SWS_OP_SCALE: + return score; + case SWS_OP_TYPE_NB: + break; + } + + av_assert0(!"Invalid operation type!"); + return 0; +} + +int ff_sws_op_compile_tables(const SwsOpTable *const tables[], int num_tables, + SwsOpList *ops, const int block_size, + SwsOpChain *chain) +{ + static const SwsOp dummy = { .comps.unused = { true, true, true, true }}; + const SwsOp *next = ops->num_ops > 1 ? &ops->ops[1] : &dummy; + const unsigned cpu_flags = av_get_cpu_flags(); + const SwsOpEntry *best = NULL; + const SwsOp *op = &ops->ops[0]; + int ret, best_score = 0, best_cpu_flags; + SwsOpPriv priv = {0}; + + for (int n = 0; n < num_tables; n++) { + const SwsOpTable *table = tables[n]; + if (table->block_size && table->block_size != block_size || + table->cpu_flags & ~cpu_flags) + continue; + + for (int i = 0; table->entries[i]; i++) { + const SwsOpEntry *entry = table->entries[i]; + int score = op_match(op, entry, next->comps); + if (score > best_score) { + best_score = score; + best_cpu_flags = table->cpu_flags; + best = entry; + } + } + } + + if (!best) + return AVERROR(ENOTSUP); + + if (best->setup) { + ret = best->setup(op, &priv); + if (ret < 0) + return ret; + } + + chain->cpu_flags |= best_cpu_flags; + ret = ff_sws_op_chain_append(chain, best->func, best->free, priv); + if (ret < 0) { + if (best->free) + best->free(&priv); + return ret; + } + + ops->ops++; + ops->num_ops--; + return ops->num_ops ? 
AVERROR(EAGAIN) : 0; +} + +#define q2pixel(type, q) ((q).den ? (type) (q).num / (q).den : 0) + +int ff_sws_setup_u8(const SwsOp *op, SwsOpPriv *out) +{ + out->u8[0] = op->c.u; + return 0; +} + +int ff_sws_setup_u(const SwsOp *op, SwsOpPriv *out) +{ + switch (op->type) { + case SWS_PIXEL_U8: out->u8[0] = op->c.u; return 0; + case SWS_PIXEL_U16: out->u16[0] = op->c.u; return 0; + case SWS_PIXEL_U32: out->u32[0] = op->c.u; return 0; + case SWS_PIXEL_F32: out->f32[0] = op->c.u; return 0; + default: return AVERROR(EINVAL); + } +} + +int ff_sws_setup_q(const SwsOp *op, SwsOpPriv *out) +{ + switch (op->type) { + case SWS_PIXEL_U8: out->u8[0] = q2pixel(uint8_t, op->c.q); return 0; + case SWS_PIXEL_U16: out->u16[0] = q2pixel(uint16_t, op->c.q); return 0; + case SWS_PIXEL_U32: out->u32[0] = q2pixel(uint32_t, op->c.q); return 0; + case SWS_PIXEL_F32: out->f32[0] = q2pixel(float, op->c.q); return 0; + default: return AVERROR(EINVAL); + } + + return 0; +} + +int ff_sws_setup_q4(const SwsOp *op, SwsOpPriv *out) +{ + for (int i = 0; i < 4; i++) { + switch (op->type) { + case SWS_PIXEL_U8: out->u8[i] = q2pixel(uint8_t, op->c.q4[i]); break; + case SWS_PIXEL_U16: out->u16[i] = q2pixel(uint16_t, op->c.q4[i]); break; + case SWS_PIXEL_U32: out->u32[i] = q2pixel(uint32_t, op->c.q4[i]); break; + case SWS_PIXEL_F32: out->f32[i] = q2pixel(float, op->c.q4[i]); break; + default: return AVERROR(EINVAL); + } + } + + return 0; +} diff --git a/libswscale/ops_chain.h b/libswscale/ops_chain.h new file mode 100644 index 0000000000..777d8e13c1 --- /dev/null +++ b/libswscale/ops_chain.h @@ -0,0 +1,134 @@ +/** + * Copyright (C) 2025 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef SWSCALE_OPS_CHAIN_H +#define SWSCALE_OPS_CHAIN_H + +#include "libavutil/cpu.h" + +#include "ops_internal.h" + +/** + * Helpers for SIMD implementations based on chained kernels, using a + * continuation passing style to link them together. + * + * The basic idea here is to "link" together a series of different operation + * kernels by constructing a list of kernel addresses into an SwsOpChain. Each + * kernel will load the address of the next kernel (the "continuation") from + * this struct, and jump directly into it; using an internal function signature + * that is an implementation detail of the specific backend. + */ + +/** + * Private data for each kernel. + */ +typedef union SwsOpPriv { + DECLARE_ALIGNED_16(char, data)[16]; + + /* Common types */ + void *ptr; + uint8_t u8[16]; + uint16_t u16[8]; + uint32_t u32[4]; + float f32[4]; +} SwsOpPriv; + +static_assert(sizeof(SwsOpPriv) == 16, "SwsOpPriv size mismatch"); + +/* Setup helpers */ +int ff_sws_setup_u(const SwsOp *op, SwsOpPriv *out); +int ff_sws_setup_u8(const SwsOp *op, SwsOpPriv *out); +int ff_sws_setup_q(const SwsOp *op, SwsOpPriv *out); +int ff_sws_setup_q4(const SwsOp *op, SwsOpPriv *out); + +/** + * Per-kernel execution context. + * + * Note: This struct is hard-coded in assembly, so do not change the layout. + */ +typedef void (*SwsFuncPtr)(void); +typedef struct SwsOpImpl { + SwsFuncPtr cont; /* [offset = 0] Continuation for this operation. */ + SwsOpPriv priv; /* [offset = 16] Private data for this operation. 
 */ +} SwsOpImpl; + +static_assert(sizeof(SwsOpImpl) == 32, "SwsOpImpl layout mismatch"); +static_assert(offsetof(SwsOpImpl, priv) == 16, "SwsOpImpl layout mismatch"); + +/** + * Compiled "chain" of operations, which can be dispatched efficiently. + * Effectively just a list of function pointers, alongside a small amount of + * private data for each operation. + */ +typedef struct SwsOpChain { +#define SWS_MAX_OPS 16 + SwsOpImpl impl[SWS_MAX_OPS + 1]; /* reserve extra space for the entrypoint */ + void (*free[SWS_MAX_OPS + 1])(void *); + int num_impl; + int cpu_flags; /* set of all used CPU flags */ +} SwsOpChain; + +SwsOpChain *ff_sws_op_chain_alloc(void); +void ff_sws_op_chain_free(SwsOpChain *chain); + +/* Returns 0 on success, or a negative error code. */ +int ff_sws_op_chain_append(SwsOpChain *chain, SwsFuncPtr func, + void (*free)(void *), SwsOpPriv priv); + +typedef struct SwsOpEntry { + /* Kernel metadata; reduced size subset of SwsOp */ + SwsOpType op; + SwsPixelType type; + bool flexible; /* if true, only the type and op are matched */ + bool unused[4]; /* for kernels which operate on a subset of components */ + + union { /* extra data defining the operation, unless `flexible` is true */ + SwsReadWriteOp rw; + SwsPackOp pack; + SwsSwizzleOp swizzle; + SwsConvertOp convert; + uint32_t linear_mask; /* subset of SwsLinearOp */ + int dither_size; /* subset of SwsDitherOp */ + int clear_value; /* clear value for integer clears */ + }; + + /* Kernel implementation */ + SwsFuncPtr func; + int (*setup)(const SwsOp *op, SwsOpPriv *out); /* optional */ + void (*free)(void *priv); +} SwsOpEntry; + +typedef struct SwsOpTable { + unsigned cpu_flags; /* required CPU flags for this table */ + int block_size; /* fixed block size of this table */ + const SwsOpEntry *entries[]; /* terminated by NULL */ +} SwsOpTable; + +/** + * "Compile" a single op by looking it up in a list of fixed size op tables. + * See `op_match` in `ops_chain.c` for details on how the matching works.
+ * + * Returns 0, AVERROR(EAGAIN), or a negative error code. + */ +int ff_sws_op_compile_tables(const SwsOpTable *const tables[], int num_tables, + SwsOpList *ops, const int block_size, + SwsOpChain *chain); + +#endif -- 2.49.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".