From: Niklas Haas <g...@haasn.dev> See doc/swscale-v2.txt for design details. --- libswscale/Makefile | 1 + libswscale/ops_chain.c | 283 +++++++++++++++++++++++++++++++++++++++++ libswscale/ops_chain.h | 134 +++++++++++++++++++ 3 files changed, 418 insertions(+) create mode 100644 libswscale/ops_chain.c create mode 100644 libswscale/ops_chain.h
diff --git a/libswscale/Makefile b/libswscale/Makefile index 810c9dee78..c9dfa78c89 100644 --- a/libswscale/Makefile +++ b/libswscale/Makefile @@ -16,6 +16,7 @@ OBJS = alphablend.o \ input.o \ lut3d.o \ ops.o \ + ops_chain.o \ ops_optimizer.o \ options.o \ output.o \ diff --git a/libswscale/ops_chain.c b/libswscale/ops_chain.c new file mode 100644 index 0000000000..129408df27 --- /dev/null +++ b/libswscale/ops_chain.c @@ -0,0 +1,283 @@ +/** + * Copyright (C) 2025 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/avassert.h" +#include "libavutil/mem.h" +#include "libavutil/rational.h" + +#include "ops_chain.h" + +#define Q(N) ((AVRational) { N, 1 }) + +SwsOpChain *ff_sws_op_chain_alloc(void) +{ + return av_mallocz(sizeof(SwsOpChain)); +} + +void ff_sws_op_chain_free(SwsOpChain *chain) +{ + if (!chain) + return; + + for (int i = 0; i < chain->num_impl + 1; i++) { + if (chain->free[i]) + chain->free[i](chain->impl[i].priv.ptr); + } + + av_free(chain); +} + +int ff_sws_op_chain_append(SwsOpChain *chain, SwsFuncPtr func, + void (*free)(void *), SwsOpPriv priv) +{ + const int idx = chain->num_impl; + if (idx == SWS_MAX_OPS) + return AVERROR(EINVAL); + + av_assert1(func); + chain->impl[idx].cont = func; + chain->impl[idx + 1].priv = priv; + chain->free[idx + 1] = free; + chain->num_impl++; + return 0; +} + +/** + * Match an operation against a reference operation. Returns a score for how + * well the reference matches the operation, or 0 if there is no match. + * + * If `entry` has any flags set, they must be set in `op` as well. + * Likewise, if `entry` has any components marked as unused, they must be + * marked as unused in `op` as well. + * + * For SWS_OP_LINEAR, `entry->linear_mask` must be a superset of + * `op->lin.mask`, but may not contain any columns explicitly ignored by + * `op->comps.unused`. + * + * For SWS_OP_READ, SWS_OP_WRITE, SWS_OP_SWAP_BYTES and SWS_OP_SWIZZLE, the + * exact type is not checked, just the size. + * + * Components set in `next.unused` are ignored when matching. If `flexible` + * is true, the op body is ignored - only the operation, pixel type, and + * component masks are checked.
+ */ +static int op_match(const SwsOp *op, const SwsOpEntry *entry, const SwsComps next) +{ + int score = 10; + if (op->op != entry->op) + return 0; + + switch (op->op) { + case SWS_OP_READ: + case SWS_OP_WRITE: + case SWS_OP_SWAP_BYTES: + case SWS_OP_SWIZZLE: + /* Only the size matters for these operations */ + if (ff_sws_pixel_type_size(op->type) != ff_sws_pixel_type_size(entry->type)) + return 0; + break; + default: + if (op->type != entry->type) + return 0; + break; + } + + for (int i = 0; i < 4; i++) { + if (entry->unused[i]) { + if (op->comps.unused[i]) + score += 1; /* Operating on fewer components is better .. */ + else + return false; /* .. but not too few! */ + } + } + + /* Flexible variants always match, but lower the score to prioritize more + * specific implementations if they exist */ + if (entry->flexible) + return score - 5; + + switch (op->op) { + case SWS_OP_INVALID: + return 0; + case SWS_OP_READ: + case SWS_OP_WRITE: + if (op->rw.elems != entry->rw.elems || + op->rw.frac != entry->rw.frac || + (op->rw.elems > 1 && op->rw.packed != entry->rw.packed)) + return 0; + return score; + case SWS_OP_SWAP_BYTES: + return score; + case SWS_OP_PACK: + case SWS_OP_UNPACK: + for (int i = 0; i < 4 && op->pack.pattern[i]; i++) { + if (op->pack.pattern[i] != entry->pack.pattern[i]) + return 0; + } + return score; + case SWS_OP_CLEAR: + for (int i = 0; i < 4; i++) { + if (!op->c.q4[i].den) + continue; + if (av_cmp_q(op->c.q4[i], Q(entry->clear_value)) && !next.unused[i]) + return 0; + } + return score; + case SWS_OP_LSHIFT: + case SWS_OP_RSHIFT: + av_assert1(entry->flexible); + return score; + case SWS_OP_SWIZZLE: + for (int i = 0; i < 4; i++) { + if (op->swizzle.in[i] != entry->swizzle.in[i] && !next.unused[i]) + return 0; + } + return score; + case SWS_OP_CONVERT: + if (op->convert.to != entry->convert.to || + op->convert.expand != entry->convert.expand) + return 0; + return score; + case SWS_OP_DITHER: + return op->dither.size_log2 == entry->dither_size ? 
score : 0; + case SWS_OP_MIN: + case SWS_OP_MAX: + av_assert1(entry->flexible); + return score; + case SWS_OP_LINEAR: + /* All required elements must be present */ + if (op->lin.mask & ~entry->linear_mask) + return 0; + /* To avoid operating on possibly undefined memory, filter out + * implementations that operate on more input components */ + for (int i = 0; i < 4; i++) { + if ((entry->linear_mask & SWS_MASK_COL(i)) && op->comps.unused[i]) + return 0; + } + /* Prioritize smaller implementations */ + score += av_popcount(SWS_MASK_ALL ^ entry->linear_mask); + return score; + case SWS_OP_SCALE: + return score; + case SWS_OP_TYPE_NB: + break; + } + + av_assert0(!"Invalid operation type!"); + return 0; +} + +int ff_sws_op_compile_tables(const SwsOpTable *const tables[], int num_tables, + SwsOpList *ops, const int block_size, + SwsOpChain *chain) +{ + static const SwsOp dummy = { .comps.unused = { true, true, true, true }}; + const SwsOp *next = ops->num_ops > 1 ? &ops->ops[1] : &dummy; + const unsigned cpu_flags = av_get_cpu_flags(); + const SwsOpEntry *best = NULL; + const SwsOp *op = &ops->ops[0]; + int ret, best_score = 0, best_cpu_flags; + SwsOpPriv priv = {0}; + + for (int n = 0; n < num_tables; n++) { + const SwsOpTable *table = tables[n]; + if (table->block_size && table->block_size != block_size || + table->cpu_flags & ~cpu_flags) + continue; + + for (int i = 0; table->entries[i]; i++) { + const SwsOpEntry *entry = table->entries[i]; + int score = op_match(op, entry, next->comps); + if (score > best_score) { + best_score = score; + best_cpu_flags = table->cpu_flags; + best = entry; + } + } + } + + if (!best) + return AVERROR(ENOTSUP); + + if (best->setup) { + ret = best->setup(op, &priv); + if (ret < 0) + return ret; + } + + chain->cpu_flags |= best_cpu_flags; + ret = ff_sws_op_chain_append(chain, best->func, best->free, priv); + if (ret < 0) { + if (best->free) + best->free(&priv); + return ret; + } + + ops->ops++; + ops->num_ops--; + return ops->num_ops ? 
AVERROR(EAGAIN) : 0; +} + +#define q2pixel(type, q) ((q).den ? (type) (q).num / (q).den : 0) + +int ff_sws_setup_u8(const SwsOp *op, SwsOpPriv *out) +{ + out->u8[0] = op->c.u; + return 0; +} + +int ff_sws_setup_u(const SwsOp *op, SwsOpPriv *out) +{ + switch (op->type) { + case SWS_PIXEL_U8: out->u8[0] = op->c.u; return 0; + case SWS_PIXEL_U16: out->u16[0] = op->c.u; return 0; + case SWS_PIXEL_U32: out->u32[0] = op->c.u; return 0; + case SWS_PIXEL_F32: out->f32[0] = op->c.u; return 0; + default: return AVERROR(EINVAL); + } +} + +int ff_sws_setup_q(const SwsOp *op, SwsOpPriv *out) +{ + switch (op->type) { + case SWS_PIXEL_U8: out->u8[0] = q2pixel(uint8_t, op->c.q); return 0; + case SWS_PIXEL_U16: out->u16[0] = q2pixel(uint16_t, op->c.q); return 0; + case SWS_PIXEL_U32: out->u32[0] = q2pixel(uint32_t, op->c.q); return 0; + case SWS_PIXEL_F32: out->f32[0] = q2pixel(float, op->c.q); return 0; + default: return AVERROR(EINVAL); + } + + return 0; +} + +int ff_sws_setup_q4(const SwsOp *op, SwsOpPriv *out) +{ + for (int i = 0; i < 4; i++) { + switch (op->type) { + case SWS_PIXEL_U8: out->u8[i] = q2pixel(uint8_t, op->c.q4[i]); break; + case SWS_PIXEL_U16: out->u16[i] = q2pixel(uint16_t, op->c.q4[i]); break; + case SWS_PIXEL_U32: out->u32[i] = q2pixel(uint32_t, op->c.q4[i]); break; + case SWS_PIXEL_F32: out->f32[i] = q2pixel(float, op->c.q4[i]); break; + default: return AVERROR(EINVAL); + } + } + + return 0; +} diff --git a/libswscale/ops_chain.h b/libswscale/ops_chain.h new file mode 100644 index 0000000000..777d8e13c1 --- /dev/null +++ b/libswscale/ops_chain.h @@ -0,0 +1,134 @@ +/** + * Copyright (C) 2025 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef SWSCALE_OPS_CHAIN_H +#define SWSCALE_OPS_CHAIN_H + +#include "libavutil/cpu.h" + +#include "ops_internal.h" + +/** + * Helpers for SIMD implementations based on chained kernels, using a + * continuation passing style to link them together. + * + * The basic idea here is to "link" together a series of different operation + * kernels by constructing a list of kernel addresses into an SwsOpChain. Each + * kernel will load the address of the next kernel (the "continuation") from + * this struct, and jump directly into it; using an internal function signature + * that is an implementation detail of the specific backend. + */ + +/** + * Private data for each kernel. + */ +typedef union SwsOpPriv { + DECLARE_ALIGNED_16(char, data)[16]; + + /* Common types */ + void *ptr; + uint8_t u8[16]; + uint16_t u16[8]; + uint32_t u32[4]; + float f32[4]; +} SwsOpPriv; + +static_assert(sizeof(SwsOpPriv) == 16, "SwsOpPriv size mismatch"); + +/* Setup helpers */ +int ff_sws_setup_u(const SwsOp *op, SwsOpPriv *out); +int ff_sws_setup_u8(const SwsOp *op, SwsOpPriv *out); +int ff_sws_setup_q(const SwsOp *op, SwsOpPriv *out); +int ff_sws_setup_q4(const SwsOp *op, SwsOpPriv *out); + +/** + * Per-kernel execution context. + * + * Note: This struct is hard-coded in assembly, so do not change the layout. + */ +typedef void (*SwsFuncPtr)(void); +typedef struct SwsOpImpl { + SwsFuncPtr cont; /* [offset = 0] Continuation for this operation. */ + SwsOpPriv priv; /* [offset = 16] Private data for this operation. 
 */ +} SwsOpImpl; + +static_assert(sizeof(SwsOpImpl) == 32, "SwsOpImpl layout mismatch"); +static_assert(offsetof(SwsOpImpl, priv) == 16, "SwsOpImpl layout mismatch"); + +/** + * Compiled "chain" of operations, which can be dispatched efficiently. + * Effectively just a list of function pointers, alongside a small amount of + * private data for each operation. + */ +typedef struct SwsOpChain { +#define SWS_MAX_OPS 16 + SwsOpImpl impl[SWS_MAX_OPS + 1]; /* reserve extra space for the entrypoint */ + void (*free[SWS_MAX_OPS + 1])(void *); + int num_impl; + int cpu_flags; /* set of all used CPU flags */ +} SwsOpChain; + +SwsOpChain *ff_sws_op_chain_alloc(void); +void ff_sws_op_chain_free(SwsOpChain *chain); + +/* Returns 0 on success, or a negative error code. */ +int ff_sws_op_chain_append(SwsOpChain *chain, SwsFuncPtr func, + void (*free)(void *), SwsOpPriv priv); + +typedef struct SwsOpEntry { + /* Kernel metadata; reduced size subset of SwsOp */ + SwsOpType op; + SwsPixelType type; + bool flexible; /* if true, only the type and op are matched */ + bool unused[4]; /* for kernels which operate on a subset of components */ + + union { /* extra data defining the operation, unless `flexible` is true */ + SwsReadWriteOp rw; + SwsPackOp pack; + SwsSwizzleOp swizzle; + SwsConvertOp convert; + uint32_t linear_mask; /* subset of SwsLinearOp */ + int dither_size; /* subset of SwsDitherOp */ + int clear_value; /* clear value for integer clears */ + }; + + /* Kernel implementation */ + SwsFuncPtr func; + int (*setup)(const SwsOp *op, SwsOpPriv *out); /* optional */ + void (*free)(void *priv); +} SwsOpEntry; + +typedef struct SwsOpTable { + unsigned cpu_flags; /* required CPU flags for this table */ + int block_size; /* fixed block size of this table */ + const SwsOpEntry *entries[]; /* terminated by NULL */ +} SwsOpTable; + +/** + * "Compile" a single op by looking it up in a list of fixed size op tables. + * See `op_match` in `ops_chain.c` for details on how the matching works.
+ * + * Returns 0, AVERROR(EAGAIN), or a negative error code. + */ +int ff_sws_op_compile_tables(const SwsOpTable *const tables[], int num_tables, + SwsOpList *ops, const int block_size, + SwsOpChain *chain); + +#endif -- 2.49.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".