From: Niklas Haas <g...@haasn.dev>

See doc/swscale-v2.txt for design details.
---
 libswscale/Makefile    |   1 +
 libswscale/ops_chain.c | 291 +++++++++++++++++++++++++++++++++++++++++
 libswscale/ops_chain.h | 108 +++++++++++++++
 3 files changed, 400 insertions(+)
 create mode 100644 libswscale/ops_chain.c
 create mode 100644 libswscale/ops_chain.h
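A rough usage sketch for review purposes (not part of the patch): a backend
would drain a non-empty SwsOpList into an SwsOpChain by calling
ff_sws_op_compile_tables() until it stops returning AVERROR(EAGAIN). The
wrapper function name below is hypothetical; only the chain allocation,
compilation and free calls come from this patch.

    #include "libavutil/error.h"
    #include "ops_chain.h"

    /* Hypothetical helper: compile every op in `ops` against the given
     * backend tables, or fail cleanly if some op has no implementation.
     * Assumes `ops` is non-empty and already optimized. */
    static int compile_chain_example(const SwsOpTable *const tables[],
                                     int num_tables, SwsOpList *ops,
                                     int block_size, SwsOpChain **out)
    {
        SwsOpChain *chain = ff_sws_op_chain_alloc();
        int ret;

        if (!chain)
            return AVERROR(ENOMEM);

        do {
            /* Compiles and consumes ops->ops[0]; AVERROR(EAGAIN) means
             * there are more ops left to compile. */
            ret = ff_sws_op_compile_tables(tables, num_tables, ops,
                                           block_size, chain);
        } while (ret == AVERROR(EAGAIN));

        if (ret < 0) {
            /* Also frees any per-op private data appended so far */
            ff_sws_op_chain_free(chain);
            return ret;
        }

        *out = chain;
        return 0;
    }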
diff --git a/libswscale/Makefile b/libswscale/Makefile
index 810c9dee78..c9dfa78c89 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -16,6 +16,7 @@ OBJS = alphablend.o \
        input.o \
        lut3d.o \
        ops.o \
+       ops_chain.o \
        ops_optimizer.o \
        options.o \
        output.o \
diff --git a/libswscale/ops_chain.c b/libswscale/ops_chain.c
new file mode 100644
index 0000000000..92e22cb384
--- /dev/null
+++ b/libswscale/ops_chain.c
@@ -0,0 +1,291 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/mem.h"
+#include "libavutil/rational.h"
+
+#include "ops_chain.h"
+
+SwsOpChain *ff_sws_op_chain_alloc(void)
+{
+    return av_mallocz(sizeof(SwsOpChain));
+}
+
+void ff_sws_op_chain_free(SwsOpChain *chain)
+{
+    if (!chain)
+        return;
+
+    for (int i = 0; i < chain->num_impl + 1; i++) {
+        if (chain->free[i])
+            chain->free[i](chain->impl[i].priv.ptr);
+    }
+
+    av_free(chain);
+}
+
+int ff_sws_op_chain_append(SwsOpChain *chain, SwsFuncPtr func,
+                           void (*free)(void *), SwsOpPriv priv)
+{
+    const int idx = chain->num_impl;
+    if (idx == SWS_MAX_OPS)
+        return AVERROR(EINVAL);
+
+    av_assert1(func);
+    chain->impl[idx].cont = func;
+    chain->impl[idx + 1].priv = priv;
+    chain->free[idx + 1] = free;
+    chain->num_impl++;
+    return 0;
+}
+
+/**
+ * Match an operation against a reference operation. Returns a score for how
+ * well the reference matches the operation, or 0 if there is no match.
+ *
+ * If `ref->comps` has any flags set, they must be set in `op` as well.
+ * Likewise, if `ref->comps` has any components marked as unused, they must be
+ * marked as unused in `op` as well.
+ *
+ * For SWS_OP_LINEAR, `ref->lin.mask` must be a strict superset of
+ * `op->lin.mask`, but may not contain any columns explicitly ignored by
+ * `op->comps.unused`.
+ *
+ * For SWS_OP_READ, SWS_OP_WRITE, SWS_OP_SWAP_BYTES and SWS_OP_SWIZZLE, the
+ * exact type is not checked, just the size.
+ *
+ * Components set in `next.unused` are ignored when matching. If
+ * `entry->flexible` is set, the op body is ignored - only the operation,
+ * pixel type, and component masks are checked.
+ */
+static int op_match(const SwsOp *op, const SwsOpEntry *entry, const SwsComps next)
+{
+    const SwsOp *ref = &entry->op;
+    int score = 10;
+    if (op->op != ref->op)
+        return 0;
+
+    switch (op->op) {
+    case SWS_OP_READ:
+    case SWS_OP_WRITE:
+    case SWS_OP_SWAP_BYTES:
+    case SWS_OP_SWIZZLE:
+        /* Only the size matters for these operations */
+        if (ff_sws_pixel_type_size(op->type) != ff_sws_pixel_type_size(ref->type))
+            return 0;
+        break;
+    default:
+        if (op->type != ref->type)
+            return 0;
+        break;
+    }
+
+    for (int i = 0; i < 4; i++) {
+        if (ref->comps.unused[i]) {
+            if (op->comps.unused[i])
+                score += 1; /* Operating on fewer components is better .. */
+            else
+                return false; /* .. but not too few! */
+        }
+
+        if (ref->comps.flags[i]) {
+            if (ref->comps.flags[i] & ~op->comps.flags[i]) {
+                return false; /* Missing required output assumptions */
+            } else {
+                /* Implementation is more specialized */
+                score += av_popcount(ref->comps.flags[i]);
+            }
+        }
+    }
+
+    /* Flexible variants always match, but lower the score to prioritize more
+     * specific implementations if they exist */
+    if (entry->flexible)
+        return score - 5;
+
+    switch (op->op) {
+    case SWS_OP_INVALID:
+        return 0;
+    case SWS_OP_READ:
+    case SWS_OP_WRITE:
+        if (op->rw.elems  != ref->rw.elems  ||
+            op->rw.packed != ref->rw.packed ||
+            op->rw.frac   != ref->rw.frac)
+            return 0;
+        return score;
+    case SWS_OP_SWAP_BYTES:
+        return score;
+    case SWS_OP_PACK:
+    case SWS_OP_UNPACK:
+        for (int i = 0; i < 4 && op->pack.pattern[i]; i++) {
+            if (op->pack.pattern[i] != ref->pack.pattern[i])
+                return 0;
+        }
+        return score;
+    case SWS_OP_CLEAR:
+        for (int i = 0; i < 4; i++) {
+            if (!op->c.q4[i].den)
+                continue;
+            if (av_cmp_q(op->c.q4[i], ref->c.q4[i]) && !next.unused[i])
+                return 0;
+        }
+        return score;
+    case SWS_OP_LSHIFT:
+    case SWS_OP_RSHIFT:
+        return op->c.u == ref->c.u ? score : 0;
+    case SWS_OP_SWIZZLE:
+        for (int i = 0; i < 4; i++) {
+            if (op->swizzle.in[i] != ref->swizzle.in[i] && !next.unused[i])
+                return 0;
+        }
+        return score;
+    case SWS_OP_CONVERT:
+        if (op->convert.to != ref->convert.to ||
+            op->convert.expand != ref->convert.expand)
+            return 0;
+        return score;
+    case SWS_OP_DITHER:
+        return op->dither.size_log2 == ref->dither.size_log2 ? score : 0;
+    case SWS_OP_MIN:
+    case SWS_OP_MAX:
+        for (int i = 0; i < 4; i++) {
+            if (av_cmp_q(op->c.q4[i], ref->c.q4[i]) && !next.unused[i])
+                return 0;
+        }
+        return score;
+    case SWS_OP_LINEAR:
+        /* All required elements must be present */
+        if (op->lin.mask & ~ref->lin.mask)
+            return 0;
+        /* To avoid operating on possibly undefined memory, filter out
+         * implementations that operate on more input components */
+        for (int i = 0; i < 4; i++) {
+            if ((ref->lin.mask & SWS_MASK_COL(i)) && op->comps.unused[i])
+                return 0;
+        }
+        /* Prioritize smaller implementations */
+        score += av_popcount(SWS_MASK_ALL ^ ref->lin.mask);
+        return score;
+    case SWS_OP_SCALE:
+        return score;
+    case SWS_OP_TYPE_NB:
+        break;
+    }
+
+    av_assert0(!"Invalid operation type!");
+    return 0;
+}
+
+int ff_sws_op_compile_tables(const SwsOpTable *const tables[], int num_tables,
+                             SwsOpList *ops, const int block_size,
+                             SwsOpChain *chain)
+{
+    static const SwsOp dummy = { .comps.unused = { true, true, true, true }};
+    const SwsOp *next = ops->num_ops > 1 ? &ops->ops[1] : &dummy;
+    const unsigned cpu_flags = av_get_cpu_flags();
+    const SwsOpEntry *best = NULL;
+    const SwsOp *op = &ops->ops[0];
+    int ret, best_score = 0;
+    SwsOpPriv priv = {0};
+
+    for (int n = 0; n < num_tables; n++) {
+        const SwsOpTable *table = tables[n];
+        if ((table->block_size && table->block_size != block_size) ||
+            (table->cpu_flags & ~cpu_flags))
+            continue;
+
+        for (int i = 0; table->entries[i].op.op; i++) {
+            const SwsOpEntry *entry = &table->entries[i];
+            int score = op_match(op, entry, next->comps);
+            if (score > best_score) {
+                best_score = score;
+                best = entry;
+            }
+        }
+    }
+
+    if (!best)
+        return AVERROR(ENOTSUP);
+
+    if (best->setup) {
+        ret = best->setup(op, &priv);
+        if (ret < 0)
+            return ret;
+    }
+
+    ret = ff_sws_op_chain_append(chain, best->func, best->free, priv);
+    if (ret < 0) {
+        if (best->free)
+            best->free(&priv);
+        return ret;
+    }
+
+    ops->ops++;
+    ops->num_ops--;
+    return ops->num_ops ? AVERROR(EAGAIN) : 0;
+}
+
+#define q2pixel(type, q) ((q).den ? (type) (q).num / (q).den : 0)
+
+int ff_sws_setup_u8(const SwsOp *op, SwsOpPriv *out)
+{
+    out->u8[0] = op->c.u;
+    return 0;
+}
+
+int ff_sws_setup_u(const SwsOp *op, SwsOpPriv *out)
+{
+    switch (op->type) {
+    case SWS_PIXEL_U8:  out->u8[0]  = op->c.u; return 0;
+    case SWS_PIXEL_U16: out->u16[0] = op->c.u; return 0;
+    case SWS_PIXEL_U32: out->u32[0] = op->c.u; return 0;
+    case SWS_PIXEL_F32: out->f32[0] = op->c.u; return 0;
+    default: return AVERROR(EINVAL);
+    }
+}
+
+int ff_sws_setup_q(const SwsOp *op, SwsOpPriv *out)
+{
+    switch (op->type) {
+    case SWS_PIXEL_U8:  out->u8[0]  = q2pixel(uint8_t,  op->c.q); return 0;
+    case SWS_PIXEL_U16: out->u16[0] = q2pixel(uint16_t, op->c.q); return 0;
+    case SWS_PIXEL_U32: out->u32[0] = q2pixel(uint32_t, op->c.q); return 0;
+    case SWS_PIXEL_F32: out->f32[0] = q2pixel(float,    op->c.q); return 0;
+    default: return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
+int ff_sws_setup_q4(const SwsOp *op, SwsOpPriv *out)
+{
+    for (int i = 0; i < 4; i++) {
+        switch (op->type) {
+        case SWS_PIXEL_U8:  out->u8[i]  = q2pixel(uint8_t,  op->c.q4[i]); break;
+        case SWS_PIXEL_U16: out->u16[i] = q2pixel(uint16_t, op->c.q4[i]); break;
+        case SWS_PIXEL_U32: out->u32[i] = q2pixel(uint32_t, op->c.q4[i]); break;
+        case SWS_PIXEL_F32: out->f32[i] = q2pixel(float,    op->c.q4[i]); break;
+        default: return AVERROR(EINVAL);
+        }
+    }
+
+    return 0;
+}
diff --git a/libswscale/ops_chain.h b/libswscale/ops_chain.h
new file mode 100644
index 0000000000..cf13f1a8af
--- /dev/null
+++ b/libswscale/ops_chain.h
@@ -0,0 +1,108 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_OPS_CHAIN_H
+#define SWSCALE_OPS_CHAIN_H
+
+#include "libavutil/cpu.h"
+
+#include "ops_internal.h"
+
+/**
+ * Helpers for SIMD implementations based on chained kernels, using a
+ * continuation passing style to link them together.
+ */
+
+/**
+ * Private data for each kernel.
+ */
+typedef union SwsOpPriv {
+    DECLARE_ALIGNED_16(char, data)[16];
+
+    /* Common types */
+    void *ptr;
+    uint8_t  u8[16];
+    uint16_t u16[8];
+    uint32_t u32[4];
+    float    f32[4];
+} SwsOpPriv;
+
+static_assert(sizeof(SwsOpPriv) == 16, "SwsOpPriv size mismatch");
+
+/* Setup helpers */
+int ff_sws_setup_u(const SwsOp *op, SwsOpPriv *out);
+int ff_sws_setup_u8(const SwsOp *op, SwsOpPriv *out);
+int ff_sws_setup_q(const SwsOp *op, SwsOpPriv *out);
+int ff_sws_setup_q4(const SwsOp *op, SwsOpPriv *out);
+
+/**
+ * Per-kernel execution context.
+ *
+ * Note: This struct is hard-coded in assembly, so do not change the layout.
+ */
+typedef void (*SwsFuncPtr)(void);
+typedef struct SwsOpImpl {
+    SwsFuncPtr cont; /* [offset =  0] Continuation for this operation. */
+    SwsOpPriv  priv; /* [offset = 16] Private data for this operation. */
+} SwsOpImpl;
+
+static_assert(sizeof(SwsOpImpl) == 32, "SwsOpImpl layout mismatch");
+static_assert(offsetof(SwsOpImpl, priv) == 16, "SwsOpImpl layout mismatch");
+
+/* Compiled chain of operations, which can be dispatched efficiently */
+typedef struct SwsOpChain {
+#define SWS_MAX_OPS 16
+    SwsOpImpl impl[SWS_MAX_OPS + 1]; /* reserve extra space for the entrypoint */
+    void (*free[SWS_MAX_OPS + 1])(void *);
+    int num_impl;
+} SwsOpChain;
+
+SwsOpChain *ff_sws_op_chain_alloc(void);
+void ff_sws_op_chain_free(SwsOpChain *chain);
+
+/* Returns 0 on success, or a negative error code. */
+int ff_sws_op_chain_append(SwsOpChain *chain, SwsFuncPtr func,
+                           void (*free)(void *), SwsOpPriv priv);
+
+typedef struct SwsOpEntry {
+    SwsOp op;
+    SwsFuncPtr func;
+    bool flexible; /* if true, only the op, pixel type and component masks are matched */
+    int (*setup)(const SwsOp *op, SwsOpPriv *out); /* optional */
+    void (*free)(void *priv);
+} SwsOpEntry;
+
+typedef struct SwsOpTable {
+    unsigned cpu_flags;   /* required CPU flags for this table */
+    int block_size;       /* fixed block size of this table */
+    SwsOpEntry entries[]; /* terminated by {0} */
+} SwsOpTable;
+
+/**
+ * "Compile" a single op by looking it up in a list of fixed-size op tables.
+ * See `op_match` in `ops_chain.c` for details on how the matching works.
+ *
+ * On success, consumes the first op in `ops`. Returns 0 if the list is now
+ * empty, AVERROR(EAGAIN) if ops remain to be compiled, or another negative
+ * error code on failure.
+ */
+int ff_sws_op_compile_tables(const SwsOpTable *const tables[], int num_tables,
+                             SwsOpList *ops, const int block_size,
+                             SwsOpChain *chain);
+
+#endif
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".