uops: auto-generate reference C backend from uops_macros.h

Niklas Haas via ffmpeg-cvslog Tue, 09 Jun 2026 10:01:58 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 8118e964bbe5acd0ec6a7665208ec9d3038c692f
Author:     Niklas Haas <[email protected]>
AuthorDate: Wed Apr 8 14:43:06 2026 +0200
Commit:     Niklas Haas <[email protected]>
CommitDate: Tue Jun 9 18:27:20 2026 +0200

    swscale/uops: auto-generate reference C backend from uops_macros.h
    
    Instead of choosing by hand which kernels to implement, this rewrite focuses
    on leveraging the power of uops_macros.h to auto-generate all needed kernels.
    This not only simplifies maintenance, but also improves performance.
    
    I have decided to develop the replacement backend as a separate file, under
    a separate prefix, for the explicit purpose of being able to verify the
    correctness of the rewrite using the current backend as a checkasm reference.
    
    The code for the kernels themselves has been largely copied from the old
    C backend, modified slightly to conform to the uop template style. This does
    result in some code duplication, but a following commit will clean it up.
    I nonetheless want to preserve this commit for bisection purposes, to ensure
    we have one commit that contains both backends side-by-side.
    
    Overall speedup=1.182x faster, min=0.197x max=3.450x
    
    The big slowdowns are flukes caused by tiny deviations in the runtime of
    a noop memcpy conversion.
    
    As a nice side benefit, the compiled binary is now also ~10% smaller, and
    the code ~50% smaller.
    
    Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/Makefile       |   1 +
 libswscale/ops.c          |   2 +
 libswscale/uops_backend.c | 197 ++++++++++++
 libswscale/uops_tmpl.c    | 802 ++++++++++++++++++++++++++++++++++++++++++++++
 libswscale/uops_tmpl.h    | 146 +++++++++
 5 files changed, 1148 insertions(+)

diff --git a/libswscale/Makefile b/libswscale/Makefile
index 2738d66f35..f69b39972a 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -36,6 +36,7 @@ OBJS-$(CONFIG_UNSTABLE) +=                              \
        ops_memcpy.o                                     \
        ops_optimizer.o                                  \
        uops.o                                           \
+       uops_backend.o                                   \
 
 ifeq ($(CONFIG_UNSTABLE),yes)
 include $(SRC_PATH)/libswscale/vulkan/Makefile
diff --git a/libswscale/ops.c b/libswscale/ops.c
index b28dbec75f..719198e116 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -32,6 +32,7 @@
 #include "ops_internal.h"
 
 extern const SwsOpBackend backend_c;
+extern const SwsOpBackend backend_uops;
 extern const SwsOpBackend backend_murder;
 extern const SwsOpBackend backend_aarch64;
 extern const SwsOpBackend backend_x86;
@@ -49,6 +50,7 @@ const SwsOpBackend * const ff_sws_op_backends[] = {
 #elif ARCH_X86_64 && HAVE_X86ASM
     &backend_x86,
 #endif
+    &backend_uops,
     &backend_c,
 #if HAVE_SPIRV_HEADERS_SPIRV_H || HAVE_SPIRV_UNIFIED1_SPIRV_H
     &backend_spirv,
diff --git a/libswscale/uops_backend.c b/libswscale/uops_backend.c
new file mode 100644
index 0000000000..591fc154db
--- /dev/null
+++ b/libswscale/uops_backend.c
@@ -0,0 +1,197 @@
+/**
+ * Copyright (C) 2026 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+
+#include "uops_tmpl.h"
+
+/**
+ * We want to disable FP contraction because this is a reference backend that
+ * establishes a bit-exact reference result.
+ */
+#ifdef __clang__
+#pragma STDC FP_CONTRACT OFF
+#elif AV_GCC_VERSION_AT_LEAST(4, 8)
+#pragma GCC optimize ("fp-contract=off")
+#elif defined(_MSC_VER)
+#pragma fp_contract (off)
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(4, 4)
+#pragma GCC optimize ("finite-math-only")
+#endif
+
+/* Integer types */
+#define IS_FLOAT 0
+#  define BIT_DEPTH 8
+#    include "uops_tmpl.c"
+#  undef BIT_DEPTH
+#  define BIT_DEPTH 16
+#    include "uops_tmpl.c"
+#  undef BIT_DEPTH
+#  define BIT_DEPTH 32
+#    include "uops_tmpl.c"
+#  undef BIT_DEPTH
+#undef IS_FLOAT
+
+/* Floating point types */
+#define IS_FLOAT 1
+#  define BIT_DEPTH 32
+#    include "uops_tmpl.c"
+#  undef BIT_DEPTH
+#undef IS_FLOAT
+
+/* Expanded as new uop types are implemented in the C/template backend.
+ *
+ * Applies SWS_FOR(TYPE, <uop>, REF_ENTRY) once for every uop kind listed
+ * below; op_table.entries expands it once per pixel type (U8/U16/U32/F32).
+ * NOTE(review): SWS_FOR and REF_ENTRY are defined outside this file; each
+ * REF_ENTRY presumably yields a reference to an entry emitted by DECL_ENTRY
+ * in uops_tmpl.c - confirm against uops_macros.h / uops_tmpl.h. */
+#define REF_ALL_UOPS(TYPE)                                  \
+    SWS_FOR(TYPE, READ_PLANAR,    REF_ENTRY)                \
+    SWS_FOR(TYPE, READ_PLANAR_FV, REF_ENTRY)                \
+    SWS_FOR(TYPE, READ_PLANAR_FH, REF_ENTRY)                \
+    SWS_FOR(TYPE, READ_PACKED,    REF_ENTRY)                \
+    SWS_FOR(TYPE, READ_NIBBLE,    REF_ENTRY)                \
+    SWS_FOR(TYPE, READ_BIT,       REF_ENTRY)                \
+    SWS_FOR(TYPE, PERMUTE,        REF_ENTRY)                \
+    SWS_FOR(TYPE, COPY,           REF_ENTRY)                \
+    SWS_FOR(TYPE, WRITE_PLANAR,   REF_ENTRY)                \
+    SWS_FOR(TYPE, WRITE_PACKED,   REF_ENTRY)                \
+    SWS_FOR(TYPE, WRITE_NIBBLE,   REF_ENTRY)                \
+    SWS_FOR(TYPE, WRITE_BIT,      REF_ENTRY)                \
+    SWS_FOR(TYPE, SWAP_BYTES,     REF_ENTRY)                \
+    SWS_FOR(TYPE, EXPAND_BIT,     REF_ENTRY)                \
+    SWS_FOR(TYPE, EXPAND_PAIR,    REF_ENTRY)                \
+    SWS_FOR(TYPE, EXPAND_QUAD,    REF_ENTRY)                \
+    SWS_FOR(TYPE, TO_U8,          REF_ENTRY)                \
+    SWS_FOR(TYPE, TO_U16,         REF_ENTRY)                \
+    SWS_FOR(TYPE, TO_U32,         REF_ENTRY)                \
+    SWS_FOR(TYPE, TO_F32,         REF_ENTRY)                \
+    SWS_FOR(TYPE, SCALE,          REF_ENTRY)                \
+    SWS_FOR(TYPE, ADD,            REF_ENTRY)                \
+    SWS_FOR(TYPE, MIN,            REF_ENTRY)                \
+    SWS_FOR(TYPE, MAX,            REF_ENTRY)                \
+    SWS_FOR(TYPE, UNPACK,         REF_ENTRY)                \
+    SWS_FOR(TYPE, PACK,           REF_ENTRY)                \
+    SWS_FOR(TYPE, LSHIFT,         REF_ENTRY)                \
+    SWS_FOR(TYPE, RSHIFT,         REF_ENTRY)                \
+    SWS_FOR(TYPE, CLEAR,          REF_ENTRY)                \
+    SWS_FOR(TYPE, LINEAR,         REF_ENTRY)                \
+    SWS_FOR(TYPE, DITHER,         REF_ENTRY)                \
+    /* end of macro */
+
+static const SwsOpTable op_table = { /* The only kernel table of this backend;
+                                      * compile() passes it to
+                                      * ff_sws_uop_lookup() for every uop. */
+    .block_size = SWS_BLOCK_SIZE, /* same block size compile() reports */
+    .uops = true, /* NOTE(review): presumably marks a uop-keyed table - confirm */
+    .entries = {
+        /* One expansion of the full uop list per supported pixel type */
+        REF_ALL_UOPS(U8)
+        REF_ALL_UOPS(U16)
+        REF_ALL_UOPS(U32)
+        REF_ALL_UOPS(F32)
+        NULL /* last element; presumably the list terminator */
+    },
+};
+
+/**
+ * Run the compiled kernel chain over a rectangle of blocks.
+ *
+ * Rows y_start .. y_end-1 are visited in order. Within a row, blocks
+ * bx_start .. bx_end-1 are handled left to right, each one entering the chain
+ * through CONTINUE(). After every row the plane cursors are advanced by the
+ * per-plane bump values taken from `exec`.
+ *
+ * @param exec     execution parameters (plane pointers, strides, bumps)
+ * @param priv     the SwsOpChain that compile() stored in SwsCompiledOp.priv
+ * @param bx_start first block index, in units of SWS_BLOCK_SIZE pixels
+ * @param y_start  first row to process
+ * @param bx_end   one past the last block index
+ * @param y_end    one past the last row
+ */
+static void process(const SwsOpExec *exec, const void *priv,
+                    const int bx_start, const int y_start,
+                    int bx_end, int y_end)
+{
+    const SwsOpChain *chain = priv;
+    const SwsOpImpl *impl = chain->impl;
+
+    /* Scratch blocks, large enough for any intermediate pixel type */
+    block_t x, y, z, w;
+
+    /* CONTINUE() expects a local pointer that is literally named `iter` */
+    SwsOpIter state;
+    SwsOpIter *iter = &state;
+
+    iter->exec = exec;
+    for (int plane = 0; plane < 4; plane++) {
+        iter->in[plane]  = (uintptr_t) exec->in[plane];
+        iter->out[plane] = (uintptr_t) exec->out[plane];
+    }
+
+    iter->y = y_start;
+    while (iter->y < y_end) {
+        for (int blk = bx_start; blk < bx_end; blk++) {
+            iter->x = blk * SWS_BLOCK_SIZE;
+            CONTINUE(&x, &y, &z, &w);
+        }
+
+        /* Optional extra number of input lines to skip after this row */
+        int y_bump = 0;
+        if (exec->in_bump_y)
+            y_bump = exec->in_bump_y[iter->y];
+
+        for (int plane = 0; plane < 4; plane++) {
+            iter->in[plane]  += exec->in_bump[plane] +
+                                y_bump * exec->in_stride[plane];
+            iter->out[plane] += exec->out_bump[plane];
+        }
+
+        iter->y++;
+    }
+}
+
+/**
+ * Translate an op list into micro-ops and assemble the matching C kernels
+ * into an executable chain.
+ *
+ * On success `*out` takes ownership of the chain (released through
+ * ff_sws_op_chain_free_cb). On failure the chain is freed here. The temporary
+ * micro-op list is always released before returning.
+ *
+ * @param ctx  context used for kernel lookup and logging
+ * @param ops  op list to translate
+ * @param out  receives the compiled operation on success
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
+{
+    int ret;
+    SwsUOpList *uops = NULL;
+    SwsOpChain *chain = ff_sws_op_chain_alloc();
+
+    if (!chain)
+        return AVERROR(ENOMEM);
+
+    uops = ff_sws_uop_list_alloc();
+    if (!uops) {
+        ret = AVERROR(ENOMEM);
+        goto done;
+    }
+
+    ret = ff_sws_ops_translate(ops, uops);
+    if (ret < 0)
+        goto done;
+
+    av_assert0(uops->num_ops > 0);
+    for (int n = 0; n < uops->num_ops; n++) {
+        const SwsOpTable *table = &op_table;
+        ret = ff_sws_uop_lookup(ctx, &table, 1, &uops->ops[n],
+                                SWS_BLOCK_SIZE, chain);
+        if (ret < 0)
+            goto done;
+    }
+
+    *out = (SwsCompiledOp) {
+        .slice_align = 1,
+        .block_size  = SWS_BLOCK_SIZE,
+        .cpu_flags   = chain->cpu_flags,
+        .over_read   = chain->over_read,
+        .over_write  = chain->over_write,
+        .priv        = chain,
+        .free        = ff_sws_op_chain_free_cb,
+        .func        = process,
+    };
+
+    av_log(ctx, AV_LOG_DEBUG, "Compiled micro-ops:\n");
+    for (int n = 0; n < uops->num_ops; n++) {
+        char name[SWS_UOP_NAME_MAX];
+        ff_sws_uop_name(&uops->ops[n], name);
+        av_log(ctx, AV_LOG_DEBUG, "    %s\n", name);
+    }
+
+    /* The lookup result may be non-zero; success is always reported as 0 */
+    ret = 0;
+
+done:
+    ff_sws_uop_list_free(&uops);
+    if (ret < 0)
+        ff_sws_op_chain_free(chain);
+    return ret;
+}
+
+const SwsOpBackend backend_uops = { /* Backend descriptor; declared extern in
+                                     * ops.c and listed there ahead of
+                                     * backend_c in ff_sws_op_backends[]. */
+    .name       = "uops",
+    .flags      = SWS_BACKEND_C,
+    .compile    = compile, /* builds the kernel chain executed by process() */
+    .hw_format  = AV_PIX_FMT_NONE,
+};
diff --git a/libswscale/uops_tmpl.c b/libswscale/uops_tmpl.c
new file mode 100644
index 0000000000..9e0d35ea1f
--- /dev/null
+++ b/libswscale/uops_tmpl.c
@@ -0,0 +1,802 @@
+/**
+ * Copyright (C) 2026 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <libavutil/bswap.h>
+
+#include "uops_tmpl.h"
+
+#ifndef BIT_DEPTH
+#  define BIT_DEPTH 8 /* fallback when the includer did not choose a depth */
+#endif
+
+#if IS_FLOAT && BIT_DEPTH == 32 /* float: no PIXEL_MAX / PIXEL_SWAP defined */
+#  define PIXEL_TYPE SWS_PIXEL_F32
+#  define pixel_t    float /* storage type of one sample */
+#  define inter_t    float /* accumulator type used by read_planar_fh */
+#  define PX         F32   /* type tag handed to SWS_FOR / SWS_FOR_STRUCT */
+#  define px         f32   /* member selected on block_t and impl->priv */
+#elif BIT_DEPTH == 32
+#  define PIXEL_MAX  0xFFFFFFFFu /* all-ones value (expand_bit, clear) */
+#  define PIXEL_SWAP av_bswap32  /* enables the swap_bytes kernel */
+#  define pixel_t    uint32_t
+#  define inter_t    int64_t
+#  define PX         U32
+#  define px         u32
+#elif BIT_DEPTH == 16
+#  define PIXEL_MAX  0xFFFFu
+#  define PIXEL_SWAP av_bswap16
+#  define pixel_t    uint16_t
+#  define inter_t    int64_t
+#  define PX         U16
+#  define px         u16
+#elif BIT_DEPTH == 8 /* single byte: no PIXEL_SWAP, so no swap_bytes kernel */
+#  define PIXEL_MAX  0xFFu
+#  define pixel_t    uint8_t
+#  define inter_t    int32_t
+#  define PX         U8
+#  define px         u8
+#else
+#  error Invalid BIT_DEPTH
+#endif
+
+/*********************************
+ * Generic read/write operations *
+ *********************************/
+
+DECL_READ(read_planar, const SwsCompMask mask)
+{ /* Planar read. For each component enabled through X/Y/Z/W, copies one
+   * block of SWS_BLOCK_SIZE samples from its own input plane (in0..in3) into
+   * x/y/z/w, advances that plane's cursor in iter->in[] by SIZEOF_BLOCK and
+   * then invokes CONTINUE() on the loaded blocks.
+   * NOTE(review): X/Y/Z/W, SWS_LOOP, SIZEOF_BLOCK and CONTINUE are defined in
+   * uops_tmpl.h (not visible here); X..W presumably test bits of `mask`. */
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) x[i] = in0[i];
+        if (Y) y[i] = in1[i];
+        if (Z) z[i] = in2[i];
+        if (W) w[i] = in3[i];
+    }
+
+    if (X) iter->in[0] += SIZEOF_BLOCK;
+    if (Y) iter->in[1] += SIZEOF_BLOCK;
+    if (Z) iter->in[2] += SIZEOF_BLOCK;
+    if (W) iter->in[3] += SIZEOF_BLOCK;
+
+    CONTINUE(x, y, z, w);
+}
+
+DECL_READ(read_packed, const SwsCompMask mask)
+{ /* Packed (interleaved) read from plane 0 only. `elems` is the number of
+   * interleaved samples per pixel, chosen from the highest enabled component
+   * (W -> 4, Z -> 3, Y -> 2, otherwise 1). Enabled components are
+   * de-interleaved into x/y/z/w and iter->in[0] advances by
+   * SIZEOF_BLOCK * elems.
+   * NOTE(review): X..W presumably test bits of `mask` (see uops_tmpl.h). */
+    const int elems = W ? 4 : Z ? 3 : Y ? 2 : 1;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) x[i] = in0[elems * i + 0];
+        if (Y) y[i] = in0[elems * i + 1];
+        if (Z) z[i] = in0[elems * i + 2];
+        if (W) w[i] = in0[elems * i + 3];
+    }
+
+    iter->in[0] += SIZEOF_BLOCK * elems;
+    CONTINUE(x, y, z, w);
+}
+
+DECL_WRITE(write_planar, const SwsCompMask mask)
+{ /* Planar write. Stores one block of each enabled component (x/y/z/w) to
+   * its own output plane (out0..out3) and advances the matching iter->out[]
+   * cursor by SIZEOF_BLOCK. No CONTINUE() call is made here.
+   * NOTE(review): X..W presumably test bits of `mask` (see uops_tmpl.h). */
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) out0[i] = x[i];
+        if (Y) out1[i] = y[i];
+        if (Z) out2[i] = z[i];
+        if (W) out3[i] = w[i];
+    }
+
+    if (X) iter->out[0] += SIZEOF_BLOCK;
+    if (Y) iter->out[1] += SIZEOF_BLOCK;
+    if (Z) iter->out[2] += SIZEOF_BLOCK;
+    if (W) iter->out[3] += SIZEOF_BLOCK;
+}
+
+DECL_WRITE(write_packed, const SwsCompMask mask)
+{ /* Packed (interleaved) write to plane 0 only. `elems` samples are stored
+   * per pixel, chosen from the highest enabled component (W -> 4, Z -> 3,
+   * Y -> 2, otherwise 1). Slots of components that are not enabled are left
+   * untouched. iter->out[0] advances by SIZEOF_BLOCK * elems and no
+   * CONTINUE() call is made. */
+    const int elems = W ? 4 : Z ? 3 : Y ? 2 : 1;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) out0[elems * i + 0] = x[i];
+        if (Y) out0[elems * i + 1] = y[i];
+        if (Z) out0[elems * i + 2] = z[i];
+        if (W) out0[elems * i + 3] = w[i];
+    }
+
+    iter->out[0] += SIZEOF_BLOCK * elems;
+}
+
+#if BIT_DEPTH == 8
+
+DECL_READ(read_bit, const SwsCompMask mask)
+{ /* 1 bit per pixel read (only built for BIT_DEPTH == 8). Every input byte
+   * of plane 0 is expanded into eight samples in x, most significant bit
+   * first, each holding 0 or 1. Consumes SIZEOF_BLOCK / 8 input bytes. */
+    av_assert2(mask == SWS_COMP_ELEMS(1));
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
+        const pixel_t val = ((const pixel_t *) in0)[i >> 3];
+        x[i + 0] = (val >> 7) & 1;
+        x[i + 1] = (val >> 6) & 1;
+        x[i + 2] = (val >> 5) & 1;
+        x[i + 3] = (val >> 4) & 1;
+        x[i + 4] = (val >> 3) & 1;
+        x[i + 5] = (val >> 2) & 1;
+        x[i + 6] = (val >> 1) & 1;
+        x[i + 7] = (val >> 0) & 1;
+    }
+
+    iter->in[0] += SIZEOF_BLOCK >> 3;
+    CONTINUE(x, y, z, w);
+}
+
+DECL_READ(read_nibble, const SwsCompMask mask)
+{ /* 4 bits per pixel read (only built for BIT_DEPTH == 8). Every input byte
+   * of plane 0 yields two samples in x, high nibble first. Consumes
+   * SIZEOF_BLOCK / 2 input bytes. */
+    av_assert2(mask == SWS_COMP_ELEMS(1));
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i += 2) {
+        const pixel_t val = in0[i >> 1];
+        x[i + 0] = val >> 4;  /* high nibble */
+        x[i + 1] = val & 0xF; /* low nibble */
+    }
+
+    iter->in[0] += SIZEOF_BLOCK >> 1;
+    CONTINUE(x, y, z, w);
+}
+
+DECL_WRITE(write_bit, const SwsCompMask mask)
+{ /* 1 bit per pixel write (only built for BIT_DEPTH == 8). Eight consecutive
+   * samples of x are packed into one output byte, first sample in the most
+   * significant bit. The samples are not masked here, so they are expected
+   * to already be 0 or 1. Produces SIZEOF_BLOCK / 8 output bytes. */
+    av_assert2(mask == SWS_COMP_ELEMS(1));
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
+        out0[i >> 3] = x[i + 0] << 7 |
+                       x[i + 1] << 6 |
+                       x[i + 2] << 5 |
+                       x[i + 3] << 4 |
+                       x[i + 4] << 3 |
+                       x[i + 5] << 2 |
+                       x[i + 6] << 1 |
+                       x[i + 7];
+    }
+
+    iter->out[0] += SIZEOF_BLOCK >> 3;
+}
+
+DECL_WRITE(write_nibble, const SwsCompMask mask)
+{ /* 4 bits per pixel write (only built for BIT_DEPTH == 8). Two consecutive
+   * samples of x are packed into one output byte, first sample in the high
+   * nibble. The samples are not masked here, so they are expected to fit in
+   * 4 bits. Produces SIZEOF_BLOCK / 2 output bytes. */
+    av_assert2(mask == SWS_COMP_ELEMS(1));
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i += 2)
+        out0[i >> 1] = x[i] << 4 | x[i + 1];
+
+    iter->out[0] += SIZEOF_BLOCK >> 1;
+}
+
+#endif /* BIT_DEPTH == 8 */
+
+SWS_FOR(PX, READ_PLANAR,    DECL_IMPL_READ,     read_planar)
+SWS_FOR(PX, READ_PACKED,    DECL_IMPL_READ,     read_packed)
+SWS_FOR(PX, READ_NIBBLE,    DECL_IMPL_READ,     read_nibble)
+SWS_FOR(PX, READ_BIT,       DECL_IMPL_READ,     read_bit)
+SWS_FOR(PX, WRITE_PLANAR,   DECL_IMPL_WRITE,    write_planar)
+SWS_FOR(PX, WRITE_PACKED,   DECL_IMPL_WRITE,    write_packed)
+SWS_FOR(PX, WRITE_NIBBLE,   DECL_IMPL_WRITE,    write_nibble)
+SWS_FOR(PX, WRITE_BIT,      DECL_IMPL_WRITE,    write_bit)
+
+SWS_FOR_STRUCT(PX, READ_PLANAR,     DECL_ENTRY)
+SWS_FOR_STRUCT(PX, READ_PACKED,     DECL_ENTRY)
+SWS_FOR_STRUCT(PX, READ_NIBBLE,     DECL_ENTRY)
+SWS_FOR_STRUCT(PX, READ_BIT,        DECL_ENTRY)
+SWS_FOR_STRUCT(PX, WRITE_PLANAR,    DECL_ENTRY)
+SWS_FOR_STRUCT(PX, WRITE_PACKED,    DECL_ENTRY)
+SWS_FOR_STRUCT(PX, WRITE_NIBBLE,    DECL_ENTRY)
+SWS_FOR_STRUCT(PX, WRITE_BIT,       DECL_ENTRY)
+
+/*****************************
+ * Scaling / filtering reads *
+ *****************************/
+
+DECL_SETUP(setup_filter_v, params, out)
+{ /* Setup hook for the vertical filter read. Converts the kernel's integer
+   * weights to floats (divided by SWS_FILTER_SCALE) and stores the new array
+   * in out->priv.ptr, the tap count in out->priv.i32[2] and ff_op_priv_free
+   * as the release callback.
+   * Returns 0 on success or AVERROR(ENOMEM) if the allocation fails.
+   * NOTE(review): the static_assert suggests priv.ptr and priv.i32[] share
+   * storage, with slot 2 lying past an 8-byte pointer - confirm. */
+    const SwsFilterWeights *filter = params->uop->data.kernel;
+    static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]),
+                  ">8 byte pointers not supported");
+
+    /* Pre-convert weights to float */
+    float *weights = av_calloc(filter->num_weights, sizeof(float));
+    if (!weights)
+        return AVERROR(ENOMEM);
+
+    for (int i = 0; i < filter->num_weights; i++)
+        weights[i] = (float) filter->weights[i] / SWS_FILTER_SCALE;
+
+    out->priv.ptr = weights;
+    out->priv.i32[2] = filter->filter_size;
+    out->free = ff_op_priv_free;
+    return 0;
+}
+
+/* Fully general vertical planar filter case.
+ *
+ * Uses the float weights prepared by setup_filter_v; the taps for the current
+ * output row start at weights[filter_size * iter->y]. For every tap, one
+ * block from each enabled plane is multiplied by the tap weight and
+ * accumulated into float temporaries, then the local plane pointers step
+ * down one input line (exec->in_stride[]). Afterwards the iter->in[] cursors
+ * advance horizontally by SIZEOF_BLOCK and the float sums are passed on. */
+DECL_READ(read_planar_fv, const SwsCompMask mask)
+{
+    const SwsOpExec *exec = iter->exec;
+    const float *restrict weights = impl->priv.ptr;
+    const int filter_size = impl->priv.i32[2];
+    weights += filter_size * iter->y;
+
+    block_t xs, ys, zs, ws;
+    if (X) memset(&xs.f32, 0, sizeof(xs.f32));
+    if (Y) memset(&ys.f32, 0, sizeof(ys.f32));
+    if (Z) memset(&zs.f32, 0, sizeof(zs.f32));
+    if (W) memset(&ws.f32, 0, sizeof(ws.f32));
+
+    for (int j = 0; j < filter_size; j++) {
+        const float weight = weights[j];
+
+        SWS_LOOP
+        for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+            if (X) xs.f32[i] += weight * in0[i];
+            if (Y) ys.f32[i] += weight * in1[i];
+            if (Z) zs.f32[i] += weight * in2[i];
+            if (W) ws.f32[i] += weight * in3[i];
+        }
+
+        if (X) in0 = bump_ptr(in0, exec->in_stride[0]);
+        if (Y) in1 = bump_ptr(in1, exec->in_stride[1]);
+        if (Z) in2 = bump_ptr(in2, exec->in_stride[2]);
+        if (W) in3 = bump_ptr(in3, exec->in_stride[3]);
+    }
+
+    if (X) iter->in[0] += SIZEOF_BLOCK;
+    if (Y) iter->in[1] += SIZEOF_BLOCK;
+    if (Z) iter->in[2] += SIZEOF_BLOCK;
+    if (W) iter->in[3] += SIZEOF_BLOCK;
+
+    CONTINUE(&xs, &ys, &zs, &ws);
+}
+
+DECL_SETUP(setup_filter_h, params, out)
+{ /* Setup hook for the horizontal filter read. Takes an additional reference
+   * on the kernel's integer weight array (av_refstruct_ref) instead of
+   * copying it, stores it in out->priv.ptr with the tap count in
+   * out->priv.i32[2], and installs ff_op_priv_unref as the release callback.
+   * Always returns 0. */
+    SwsFilterWeights *filter = params->uop->data.kernel;
+    out->priv.ptr = av_refstruct_ref(filter->weights);
+    out->priv.i32[2] = filter->filter_size;
+    out->free = ff_op_priv_unref;
+    return 0;
+}
+
+/* Fully general horizontal planar filter case.
+ *
+ * For output pixel xpos + i, the source position is the byte offset
+ * exec->in_offset_x[xpos + i], applied to each plane pointer. filter_size
+ * consecutive source samples are weighted with that pixel's own set of
+ * integer taps (starting at weights[filter_size * iter->x] and advancing by
+ * filter_size per pixel) and summed in inter_t. The sum is converted to float
+ * and scaled by 1 / SWS_FILTER_SCALE. This kernel does not modify
+ * iter->in[]. */
+DECL_READ(read_planar_fh, const SwsCompMask mask)
+{
+    const SwsOpExec *exec = iter->exec;
+    const int *restrict weights = impl->priv.ptr;
+    const int filter_size = impl->priv.i32[2];
+    const float scale = 1.0f / SWS_FILTER_SCALE;
+    const int xpos = iter->x;
+    weights += filter_size * iter->x;
+
+    block_t xs, ys, zs, ws;
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        const int offset = exec->in_offset_x[xpos + i];
+        /* Pointers are formed for all four planes; only enabled ones are read */
+        pixel_t *start0 = bump_ptr(in0, offset);
+        pixel_t *start1 = bump_ptr(in1, offset);
+        pixel_t *start2 = bump_ptr(in2, offset);
+        pixel_t *start3 = bump_ptr(in3, offset);
+
+        inter_t sx = 0, sy = 0, sz = 0, sw = 0;
+        for (int j = 0; j < filter_size; j++) {
+            const int weight = weights[j];
+            if (X) sx += weight * start0[j];
+            if (Y) sy += weight * start1[j];
+            if (Z) sz += weight * start2[j];
+            if (W) sw += weight * start3[j];
+        }
+
+        if (X) xs.f32[i] = (float) sx * scale;
+        if (Y) ys.f32[i] = (float) sy * scale;
+        if (Z) zs.f32[i] = (float) sz * scale;
+        if (W) ws.f32[i] = (float) sw * scale;
+
+        weights += filter_size;
+    }
+
+    CONTINUE(&xs, &ys, &zs, &ws);
+}
+
+SWS_FOR(PX, READ_PLANAR_FV, DECL_IMPL_READ, read_planar_fv)
+SWS_FOR(PX, READ_PLANAR_FH, DECL_IMPL_READ, read_planar_fh)
+SWS_FOR_STRUCT(PX, READ_PLANAR_FV, DECL_ENTRY, .setup = fn(setup_filter_v) )
+SWS_FOR_STRUCT(PX, READ_PLANAR_FH, DECL_ENTRY, .setup = fn(setup_filter_h) )
+
+/***************************
+ * Permutation and copying *
+ ***************************/
+
+/* Permute by directly swapping the order of arguments to the continuation.
+ *
+ * Generates NAME##_c, which moves no pixel data: it forwards the incoming
+ * block pointers in0..in3 to CONTINUE() in the order IDX0..IDX3. The DUMMY,
+ * TYPE, UOP and MASK arguments are accepted but unused. */
+#define DECL_PERMUTE(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3)     \
+    static void NAME##_c(SwsOpIter *restrict iter,                             \
+                         const SwsOpImpl *restrict impl,                       \
+                         void *restrict in0, void *restrict in1,               \
+                         void *restrict in2, void *restrict in3)               \
+    {                                                                          \
+        CONTINUE(in##IDX0, in##IDX1, in##IDX2, in##IDX3);                      \
+    }
+
+/* Copy variant of the permutation.
+ *
+ * Generates NAME##_c, which duplicates data instead of only reordering
+ * pointers: for every component enabled in MASK, the source block
+ * in##IDXn is copied (SIZEOF_BLOCK bytes) into a fresh local block, and that
+ * copy is forwarded. Components not enabled in MASK are forwarded unchanged
+ * as in0..in3. DUMMY, TYPE and UOP are accepted but unused.
+ * NOTE(review): X/Y/Z/W presumably test bits of the local `mask`; they are
+ * defined in uops_tmpl.h. */
+#define DECL_COPY(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3)        \
+    static void NAME##_c(SwsOpIter *restrict iter,                             \
+                         const SwsOpImpl *restrict impl,                       \
+                         void *restrict in0, void *restrict in1,               \
+                         void *restrict in2, void *restrict in3)               \
+    {                                                                          \
+        const SwsCompMask mask = (MASK);                                       \
+        block_t x, y, z, w;                                                    \
+                                                                               \
+        if (X) memcpy(&x.px, in##IDX0, SIZEOF_BLOCK);                          \
+        if (Y) memcpy(&y.px, in##IDX1, SIZEOF_BLOCK);                          \
+        if (Z) memcpy(&z.px, in##IDX2, SIZEOF_BLOCK);                          \
+        if (W) memcpy(&w.px, in##IDX3, SIZEOF_BLOCK);                          \
+                                                                               \
+        CONTINUE(X ? &x : in0, Y ? &y : in1, Z ? &z : in2, W ? &w : in3);      \
+    }
+
+SWS_FOR(PX, PERMUTE, DECL_PERMUTE)
+SWS_FOR(PX, COPY,    DECL_COPY)
+SWS_FOR_STRUCT(PX, PERMUTE, DECL_ENTRY)
+SWS_FOR_STRUCT(PX, COPY,    DECL_ENTRY)
+
+/*********************
+ * Format conversion *
+ *********************/
+
+/* Generates the conversion kernel to_<dst> for the current pixel type.
+ *
+ * Each enabled component is converted element by element through plain C
+ * assignment into the `dst` member of a fresh block, and the converted
+ * blocks are passed to CONTINUE(). The macro also emits the matching
+ * SWS_FOR / SWS_FOR_STRUCT instantiations for the TO_<DST> uop. */
+#define DECL_CAST(DST, dst)                                                    \
+    DECL_FUNC(to_##dst, const SwsCompMask mask)                                \
+    {                                                                          \
+        block_t xx, yy, zz, ww;                                                \
+                                                                               \
+        SWS_LOOP                                                               \
+        for (int i = 0; i < SWS_BLOCK_SIZE; i++) {                             \
+            if (X) xx.dst[i] = x[i];                                           \
+            if (Y) yy.dst[i] = y[i];                                           \
+            if (Z) zz.dst[i] = z[i];                                           \
+            if (W) ww.dst[i] = w[i];                                           \
+        }                                                                      \
+                                                                               \
+        CONTINUE(&xx, &yy, &zz, &ww);                                          \
+    }                                                                          \
+                                                                               \
+    SWS_FOR(PX, TO_##DST, DECL_IMPL, to_##dst)                                 \
+    SWS_FOR_STRUCT(PX, TO_##DST, DECL_ENTRY)
+
+DECL_CAST(U8,  u8)
+DECL_CAST(U16, u16)
+DECL_CAST(U32, u32)
+DECL_CAST(F32, f32)
+
+/********************
+ * Bit manipulation *
+ ********************/
+
+#if !IS_FLOAT
+DECL_FUNC(lshift, const SwsCompMask mask, const uint8_t amount)
+{ /* Shifts every sample of each enabled component left by `amount` bits, in
+   * place, then continues the chain. Only compiled for integer pixel types
+   * (!IS_FLOAT). */
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) x[i] <<= amount;
+        if (Y) y[i] <<= amount;
+        if (Z) z[i] <<= amount;
+        if (W) w[i] <<= amount;
+    }
+
+    CONTINUE(x, y, z, w);
+}
+
+DECL_FUNC(rshift, const SwsCompMask mask, const uint8_t amount)
+{ /* Shifts every sample of each enabled component right by `amount` bits, in
+   * place, then continues the chain. Only compiled for integer pixel types
+   * (!IS_FLOAT). */
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) x[i] >>= amount;
+        if (Y) y[i] >>= amount;
+        if (Z) z[i] >>= amount;
+        if (W) w[i] >>= amount;
+    }
+
+    CONTINUE(x, y, z, w);
+}
+#endif
+
+SWS_FOR(PX, LSHIFT, DECL_IMPL, lshift)
+SWS_FOR(PX, RSHIFT, DECL_IMPL, rshift)
+
+SWS_FOR_STRUCT(PX, LSHIFT, DECL_ENTRY)
+SWS_FOR_STRUCT(PX, RSHIFT, DECL_ENTRY)
+
+#ifdef PIXEL_SWAP
+DECL_FUNC(swap_bytes, const SwsCompMask mask)
+{ /* Reverses the byte order of every sample of each enabled component, in
+   * place, using PIXEL_SWAP. Only compiled when PIXEL_SWAP is defined, which
+   * the type table at the top of this template does for 16- and 32-bit
+   * integer pixels only. */
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) x[i] = PIXEL_SWAP(x[i]);
+        if (Y) y[i] = PIXEL_SWAP(y[i]);
+        if (Z) z[i] = PIXEL_SWAP(z[i]);
+        if (W) w[i] = PIXEL_SWAP(w[i]);
+    }
+
+    CONTINUE(x, y, z, w);
+}
+#endif /* PIXEL_SWAP */
+
+SWS_FOR(PX, SWAP_BYTES, DECL_IMPL, swap_bytes)
+SWS_FOR_STRUCT(PX, SWAP_BYTES, DECL_ENTRY)
+
+#ifdef PIXEL_MAX
+DECL_FUNC(expand_bit, const SwsCompMask mask)
+{ /* Expands a boolean sample to full range, in place: any non-zero value
+   * becomes PIXEL_MAX and zero stays 0. Only compiled when PIXEL_MAX is
+   * defined (integer pixel types). */
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) x[i] = x[i] ? PIXEL_MAX : 0;
+        if (Y) y[i] = y[i] ? PIXEL_MAX : 0;
+        if (Z) z[i] = z[i] ? PIXEL_MAX : 0;
+        if (W) w[i] = w[i] ? PIXEL_MAX : 0;
+    }
+
+    CONTINUE(x, y, z, w);
+}
+#endif
+
+#if BIT_DEPTH == 8
+DECL_FUNC(expand_pair, const SwsCompMask mask)
+{ /* Widens 8-bit samples to 16 bits by repeating the byte in both halves
+   * (0xAB -> 0xABAB). Results are written to fresh blocks through their u16
+   * member and those blocks are passed on. Only built for BIT_DEPTH == 8. */
+    block_t x16, y16, z16, w16;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) x16.u16[i] = x[i] << 8 | x[i];
+        if (Y) y16.u16[i] = y[i] << 8 | y[i];
+        if (Z) z16.u16[i] = z[i] << 8 | z[i];
+        if (W) w16.u16[i] = w[i] << 8 | w[i];
+    }
+
+    CONTINUE(&x16, &y16, &z16, &w16);
+}
+
+/* Widens 8-bit samples to 32 bits by repeating the byte in all four positions
+ * (0xAB -> 0xABABABAB). Results are written to fresh blocks through their u32
+ * member and those blocks are passed on. Only built for BIT_DEPTH == 8. */
+DECL_FUNC(expand_quad, const SwsCompMask mask)
+{
+    block_t x32, y32, z32, w32;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        /* The cast keeps the << 24 in unsigned arithmetic */
+        if (X) x32.u32[i] = (uint32_t) x[i] << 24 | x[i] << 16 | x[i] << 8 | x[i];
+        if (Y) y32.u32[i] = (uint32_t) y[i] << 24 | y[i] << 16 | y[i] << 8 | y[i];
+        if (Z) z32.u32[i] = (uint32_t) z[i] << 24 | z[i] << 16 | z[i] << 8 | z[i];
+        if (W) w32.u32[i] = (uint32_t) w[i] << 24 | w[i] << 16 | w[i] << 8 | w[i];
+    }
+
+    CONTINUE(&x32, &y32, &z32, &w32);
+}
+#endif /* BIT_DEPTH == 8 */
+
+SWS_FOR(PX, EXPAND_BIT,  DECL_IMPL, expand_bit)
+SWS_FOR(PX, EXPAND_PAIR, DECL_IMPL, expand_pair)
+SWS_FOR(PX, EXPAND_QUAD, DECL_IMPL, expand_quad)
+SWS_FOR_STRUCT(PX, EXPAND_BIT,  DECL_ENTRY)
+SWS_FOR_STRUCT(PX, EXPAND_PAIR, DECL_ENTRY)
+SWS_FOR_STRUCT(PX, EXPAND_QUAD, DECL_ENTRY)
+
+/*************************
+ * Packing and unpacking *
+ ************************/
+
+#if !IS_FLOAT
+DECL_FUNC(unpack, const SwsCompMask mask,
+                  const uint8_t bx, const uint8_t by,
+                  const uint8_t bz, const uint8_t bw)
+{ /* Splits the packed value held in x into up to four bit fields. bx/by/bz/bw
+   * are the field widths in bits; w occupies the least significant bits and
+   * x the most significant ones. Each packed value is read before x is
+   * overwritten. Only compiled for integer pixel types (!IS_FLOAT). */
+    const uint8_t sx = bw + bz + by;
+    const uint8_t sy = bw + bz;
+    const uint8_t sz = bw;
+    const uint8_t sw = 0;
+
+    const pixel_t mx = (1 << bx) - 1;
+    const pixel_t my = (1 << by) - 1;
+    const pixel_t mz = (1 << bz) - 1;
+    const pixel_t mw = (1 << bw) - 1;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        const pixel_t val = x[i];
+        if (X) x[i] = (val >> sx) & mx;
+        if (Y) y[i] = (val >> sy) & my;
+        if (Z) z[i] = (val >> sz) & mz;
+        if (W) w[i] = (val >> sw) & mw;
+    }
+
+    CONTINUE(x, y, z, w);
+}
+
+DECL_FUNC(pack, const SwsCompMask mask,
+                const uint8_t bx, const uint8_t by,
+                const uint8_t bz, const uint8_t bw)
+{ /* Inverse of unpack: merges the enabled components into one packed value
+   * that is stored in x. bx/by/bz/bw are the field widths in bits, with w in
+   * the least significant bits. The inputs are not masked here, so they are
+   * expected to fit their field. bx itself is not used in the body. Only
+   * compiled for integer pixel types (!IS_FLOAT). */
+    const uint8_t sx = bw + bz + by;
+    const uint8_t sy = bw + bz;
+    const uint8_t sz = bw;
+    const uint8_t sw = 0;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        pixel_t val = 0;
+        if (X) val |= x[i] << sx;
+        if (Y) val |= y[i] << sy;
+        if (Z) val |= z[i] << sz;
+        if (W) val |= w[i] << sw;
+        x[i] = val;
+    }
+
+    CONTINUE(x, y, z, w);
+}
+#endif /* !IS_FLOAT */
+
+SWS_FOR(PX, UNPACK, DECL_IMPL, unpack)
+SWS_FOR(PX, PACK,   DECL_IMPL, pack)
+SWS_FOR_STRUCT(PX, UNPACK,  DECL_ENTRY)
+SWS_FOR_STRUCT(PX, PACK,    DECL_ENTRY)
+
+/***********************
+ * Pixel data clearing *
+ ***********************/
+
+#ifdef PIXEL_MAX
+DECL_FUNC(clear, const SwsCompMask mask, const SwsCompMask one,
+                 const SwsCompMask zero)
+{ /* Overwrites each enabled component with a constant. For component N the
+   * constant is PIXEL_MAX if N is set in `one`, else 0 if N is set in `zero`,
+   * else the value impl->priv.px[N]. Only compiled when PIXEL_MAX is defined
+   * (integer pixel types). */
+    #define ONE(N)  SWS_COMP_TEST(one, N)
+    #define ZERO(N) SWS_COMP_TEST(zero, N)
+    const pixel_t cx = ONE(0) ? PIXEL_MAX : ZERO(0) ? 0 : impl->priv.px[0];
+    const pixel_t cy = ONE(1) ? PIXEL_MAX : ZERO(1) ? 0 : impl->priv.px[1];
+    const pixel_t cz = ONE(2) ? PIXEL_MAX : ZERO(2) ? 0 : impl->priv.px[2];
+    const pixel_t cw = ONE(3) ? PIXEL_MAX : ZERO(3) ? 0 : impl->priv.px[3];
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) x[i] = cx;
+        if (Y) y[i] = cy;
+        if (Z) z[i] = cz;
+        if (W) w[i] = cw;
+    }
+
+    CONTINUE(x, y, z, w);
+}
+#endif
+
+SWS_FOR(PX, CLEAR, DECL_IMPL, clear)
+SWS_FOR_STRUCT(PX, CLEAR, DECL_ENTRY, .setup = ff_sws_setup_vec4)
+
+/*************************
+ * Arithmetic operations *
+ *************************/
+
+DECL_FUNC(scale, const SwsCompMask mask)
+{ /* Multiplies every sample of each enabled component, in place, by the
+   * single factor stored in impl->priv.px[0]. */
+    const pixel_t scale = impl->priv.px[0];
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) x[i] *= scale;
+        if (Y) y[i] *= scale;
+        if (Z) z[i] *= scale;
+        if (W) w[i] *= scale;
+    }
+
+    CONTINUE(x, y, z, w);
+}
+
+DECL_FUNC(add, const SwsCompMask mask)
+{ /* Adds a per-component constant, in place: component N of every pixel is
+   * increased by impl->priv.px[N]. */
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) x[i] += impl->priv.px[0];
+        if (Y) y[i] += impl->priv.px[1];
+        if (Z) z[i] += impl->priv.px[2];
+        if (W) w[i] += impl->priv.px[3];
+    }
+
+    CONTINUE(x, y, z, w);
+}
+
+DECL_FUNC(min, const SwsCompMask mask)
+{ /* Upper clamp, in place: component N of every pixel becomes the smaller of
+   * its current value and impl->priv.px[N] (FFMIN). */
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) x[i] = FFMIN(x[i], impl->priv.px[0]);
+        if (Y) y[i] = FFMIN(y[i], impl->priv.px[1]);
+        if (Z) z[i] = FFMIN(z[i], impl->priv.px[2]);
+        if (W) w[i] = FFMIN(w[i], impl->priv.px[3]);
+    }
+
+    CONTINUE(x, y, z, w);
+}
+
+DECL_FUNC(max, const SwsCompMask mask)
+{ /* Lower clamp, in place: component N of every pixel becomes the larger of
+   * its current value and impl->priv.px[N] (FFMAX). */
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (X) x[i] = FFMAX(x[i], impl->priv.px[0]);
+        if (Y) y[i] = FFMAX(y[i], impl->priv.px[1]);
+        if (Z) z[i] = FFMAX(z[i], impl->priv.px[2]);
+        if (W) w[i] = FFMAX(w[i], impl->priv.px[3]);
+    }
+
+    CONTINUE(x, y, z, w);
+}
+
+SWS_FOR(PX, SCALE, DECL_IMPL, scale)
+SWS_FOR(PX, ADD,   DECL_IMPL, add)
+SWS_FOR(PX, MIN,   DECL_IMPL, min)
+SWS_FOR(PX, MAX,   DECL_IMPL, max)
+SWS_FOR_STRUCT(PX, SCALE, DECL_ENTRY, .setup = ff_sws_setup_scalar )
+SWS_FOR_STRUCT(PX, ADD,   DECL_ENTRY, .setup = ff_sws_setup_vec4 )
+SWS_FOR_STRUCT(PX, MIN,   DECL_ENTRY, .setup = ff_sws_setup_vec4 )
+SWS_FOR_STRUCT(PX, MAX,   DECL_ENTRY, .setup = ff_sws_setup_vec4 )
+
+/*************
+ * Dithering *
+ *************/
+
+DECL_SETUP(setup_dither, params, out)
+{ /* Setup hook for the dither kernel. The matrix is 1 << size_log2 entries
+   * wide. If that is at least SWS_BLOCK_SIZE, the uop's matrix is shared by
+   * taking a new reference. Otherwise a private copy is built whose rows are
+   * SWS_BLOCK_SIZE entries wide, each source row being repeated horizontally
+   * to fill it.
+   * Returns 0 on success or AVERROR(ENOMEM) if the allocation fails. */
+    const SwsUOp *uop = params->uop;
+    const SwsDitherUOp *dither = &uop->par.dither;
+    const int size = 1 << dither->size_log2;
+    if (size >= SWS_BLOCK_SIZE) {
+        /* No extra padding needed */
+        out->priv.ptr = av_refstruct_ref(uop->data.ptr);
+        out->free = ff_op_priv_unref;
+        return 0;
+    }
+
+    /* size < SWS_BLOCK_SIZE here, so this equals SWS_BLOCK_SIZE; the same
+     * expression is used by the dither kernel to derive the row stride */
+    const int stride = FFMAX(size, SWS_BLOCK_SIZE);
+    const int height = ff_sws_dither_height(dither);
+    pixel_t *matrix = av_malloc(sizeof(pixel_t) * height * stride);
+    if (!matrix)
+        return AVERROR(ENOMEM);
+    out->priv.ptr = matrix;
+    out->free = ff_op_priv_free;
+
+    /* Pad to multiple of block size. We don't need extra padding for the
+     * height because ff_sws_dither_height() already includes any padding
+     * necessary for the y_offset */
+    for (int y = 0; y < height; y++) {
+        pixel_t *row = &matrix[y * stride];
+        for (int x = 0; x < size; x++)
+            row[x] = uop->data.ptr[y * size + x].px;
+        /* Fill the rest of the row by repeating the first `size` entries */
+        for (int x = size; x < stride; x++)
+            row[x] = row[x % size];
+    }
+
+    return 0;
+}
+
+DECL_FUNC(dither, const SwsCompMask mask,
+                  const uint8_t off0, const uint8_t off1,
+                  const uint8_t off2, const uint8_t off3,
+                  const uint8_t size_log2)
+{
+    /* Geometry must agree with the layout built by fn(setup_dither) */
+    const int dim   = 1 << size_log2;
+    const int pitch = FFMAX(dim, SWS_BLOCK_SIZE);
+    const int line  = iter->y & (dim - 1);
+    const int col   = (iter->x & (dim - 1)) & ~(SWS_BLOCK_SIZE - 1);
+    const pixel_t *const base = (const pixel_t *) impl->priv.ptr
+                              + line * pitch + col;
+    const pixel_t *const dx = &base[off0 * pitch]; /* offN: extra row shift */
+    const pixel_t *const dy = &base[off1 * pitch];
+    const pixel_t *const dz = &base[off2 * pitch];
+    const pixel_t *const dw = &base[off3 * pitch];
+
+    SWS_LOOP
+    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
+        if (X) x[n] += dx[n];
+        if (Y) y[n] += dy[n];
+        if (Z) z[n] += dz[n];
+        if (W) w[n] += dw[n];
+    }
+
+    CONTINUE(x, y, z, w);
+}
+
+SWS_FOR(PX, DITHER, DECL_IMPL, dither) /* presumably one <NAME>_c per variant */
+SWS_FOR_STRUCT(PX, DITHER, DECL_ENTRY, .setup = fn(setup_dither) ) /* op_<NAME> */
+
+/*********************
+ * Linear operations *
+ *********************/
+
+typedef struct {
+    /* Stored in split form for convenience */
+    pixel_t m[4][4]; /* m[i][j]: weight of input component j in output i */
+    pixel_t k[4];    /* k[i]: constant term of output component i */
+} fn(LinCoeffs);
+
+DECL_SETUP(setup_linear, params, out)
+{
+    /* Per mat4 row: entries 0..3 become weights, entry 4 the constant term */
+    fn(LinCoeffs) coeffs;
+    for (int row = 0; row < 4; row++) {
+        coeffs.k[row] = params->uop->data.mat4[row][4].px;
+        for (int col = 0; col < 4; col++)
+            coeffs.m[row][col] = params->uop->data.mat4[row][col].px;
+    }
+    out->free = ff_op_priv_free;
+    out->priv.ptr = av_memdup(&coeffs, sizeof(coeffs)); /* heap copy */
+    if (!out->priv.ptr)
+        return AVERROR(ENOMEM);
+    return 0;
+}
+
+/**
+ * General affine transform: out[I] = k[I] + sum over J of m[I][J] * in[J],
+ * with `m` (4x4) and `k` from fn(LinCoeffs). A bit set in `zero` drops that
+ * term; a bit set in `one` adds the input unscaled. Should never be called
+ * without constant `mask`, so that it compiles down to the required subset.
+ */
+DECL_FUNC(linear, const SwsCompMask mask, const uint32_t one, const uint32_t zero)
+{
+    const fn(LinCoeffs) c = *(const fn(LinCoeffs) *) impl->priv.ptr;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        const pixel_t xx = x[i]; /* copy inputs: the rows below overwrite them */
+        const pixel_t yy = y[i];
+        const pixel_t zz = z[i];
+        const pixel_t ww = w[i];
+
+#define LIN_VAL(I, J, val) \
+    ((one & SWS_MASK(I, J)) ? (val) : c.m[I][J] * (val))
+
+#define LIN_ROW(I, var) do {                                    \
+    var[i] = (zero & SWS_MASK(I, 4)) ? 0 : c.k[I];              \
+    if (!(zero & SWS_MASK(I, 0))) var[i] += LIN_VAL(I, 0, xx);  \
+    if (!(zero & SWS_MASK(I, 1))) var[i] += LIN_VAL(I, 1, yy);  \
+    if (!(zero & SWS_MASK(I, 2))) var[i] += LIN_VAL(I, 2, zz);  \
+    if (!(zero & SWS_MASK(I, 3))) var[i] += LIN_VAL(I, 3, ww);  \
+} while (0)
+
+        if (X) LIN_ROW(0, x);
+        if (Y) LIN_ROW(1, y);
+        if (Z) LIN_ROW(2, z);
+        if (W) LIN_ROW(3, w);
+    }
+
+    CONTINUE(x, y, z, w);
+}
+
+SWS_FOR(PX, LINEAR, DECL_IMPL, linear)
+SWS_FOR_STRUCT(PX, LINEAR, DECL_ENTRY, .setup = fn(setup_linear) )
+/* Drop the per-type names (presumably set by the includer -- TODO confirm) */
+#undef PIXEL_MAX
+#undef PIXEL_SWAP
+#undef pixel_t
+#undef inter_t
+#undef block_t
+#undef PX
+#undef px
diff --git a/libswscale/uops_tmpl.h b/libswscale/uops_tmpl.h
new file mode 100644
index 0000000000..80c6e5221d
--- /dev/null
+++ b/libswscale/uops_tmpl.h
@@ -0,0 +1,146 @@
+/**
+ * Copyright (C) 2026 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_UOPS_TMPL_H
+#define SWSCALE_UOPS_TMPL_H
+
+/**
+ * Helper macros for the C-based backend.
+ *
+ * To use these macros, define `pixel_t` (pixel type) and `PX` (fn() suffix).
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+
+#include "ops_chain.h"
+#include "uops_macros.h"
+
+#ifndef SWS_BLOCK_SIZE
+#  define SWS_BLOCK_SIZE 32 /* default, used only if not already defined */
+#endif
+/* Storage for SWS_BLOCK_SIZE samples, viewable as u8, u16, u32 or f32 */
+typedef union block_t {
+    uint8_t   u8[SWS_BLOCK_SIZE];
+    uint16_t u16[SWS_BLOCK_SIZE];
+    uint32_t u32[SWS_BLOCK_SIZE];
+    float    f32[SWS_BLOCK_SIZE];
+} block_t;
+/* Size in bytes of SWS_BLOCK_SIZE samples of the current pixel_t */
+#define SIZEOF_BLOCK (sizeof(pixel_t) * SWS_BLOCK_SIZE)
+
+/**
+ * Internal context holding per-iter execution data. The data pointers will be
+ * directly incremented by the corresponding read/write functions.
+ */
+typedef struct SwsOpIter {
+    uintptr_t in[4];  /* input addresses; DECL_IMPL_READ casts to pixel_t * */
+    uintptr_t out[4]; /* output addresses; DECL_IMPL_WRITE casts likewise */
+    int x, y;         /* coordinates, e.g. used to index the dither matrix */
+
+    /* Link back to per-slice execution context */
+    const SwsOpExec *exec;
+} SwsOpIter;
+
+#if defined(__GNUC__) && !defined(__clang__) /* GNU-compatible, not clang */
+#  define SWS_LOOP AV_PRAGMA(GCC ivdep)
+#elif defined(__clang__)
+#  define SWS_LOOP AV_PRAGMA(clang loop vectorize(assume_safety))
+#else /* no loop hint for other compilers */
+#  define SWS_LOOP
+#endif
+
+/* Miscellaneous helpers */
+#define bitfn2(name, ext) name ## _ ## ext
+#define bitfn(name, ext)  bitfn2(name, ext) /* extra level: expand ext first */
+#define fn(name)          bitfn(name, PX)   /* yields name_<PX> */
+/* Offset `ptr` by `bump` bytes (not elements) and view it as pixel_t * */
+#define bump_ptr(ptr, bump) ((pixel_t *) ((uintptr_t) (ptr) + (bump)))
+
+/* Component-mask tests; each expects a variable named `mask` in scope */
+#define X SWS_COMP_TEST(mask, 0)
+#define Y SWS_COMP_TEST(mask, 1)
+#define Z SWS_COMP_TEST(mask, 2)
+#define W SWS_COMP_TEST(mask, 3)
+
+/* DECL_FUNC: signature of an always-inline typed kernel named fn(NAME) */
+#define DECL_FUNC(NAME, ...)                                                   \
+    static av_always_inline void                                               \
+        fn(NAME)(SwsOpIter *restrict iter, const SwsOpImpl *restrict impl,     \
+                 pixel_t *restrict x, pixel_t *restrict y,                     \
+                 pixel_t *restrict z, pixel_t *restrict w,                     \
+                 __VA_ARGS__)
+/* DECL_READ: DECL_FUNC plus four read-only input pointers in0..in3 */
+#define DECL_READ(NAME, ...)                                                   \
+    DECL_FUNC(NAME, __VA_ARGS__,                                               \
+              const pixel_t *restrict in0, const pixel_t *restrict in1,        \
+              const pixel_t *restrict in2, const pixel_t *restrict in3)
+/* DECL_WRITE: DECL_FUNC plus four writable output pointers out0..out3 */
+#define DECL_WRITE(NAME, ...)                                                  \
+    DECL_FUNC(NAME, __VA_ARGS__,                                               \
+              pixel_t *restrict out0, pixel_t *restrict out1,                  \
+              pixel_t *restrict out2, pixel_t *restrict out3)
+
+/* CALL runs the typed kernel fn(NAME) on the caller's iter/impl/x/y/z/w.
+ * CONTINUE invokes impl->cont, cast to the generic kernel type, on &impl[1]. */
+#define CALL(NAME, ...) fn(NAME)(iter, impl, x, y, z, w, __VA_ARGS__)
+#define CONTINUE(...)                                                          \
+    ((void (*)(SwsOpIter *, const SwsOpImpl *,                                 \
+               void *restrict x, void *restrict y,                             \
+               void *restrict z, void *restrict w)) impl->cont)                \
+        (iter, &impl[1], __VA_ARGS__)
+
+/* Signature of a per-type setup function: int fn(NAME)(PARAMS, OUT) */
+#define DECL_SETUP(NAME, PARAMS, OUT)                                          \
+    static av_unused int fn(NAME)(const SwsImplParams *PARAMS,                 \
+                                  SwsImplResult *OUT)
+
+/* Kernel entry point NAME_c: calls fn(FUNC) with the extra args (TYPE, UOP unused) */
+#define DECL_IMPL(FUNC, NAME, TYPE, UOP, ...)                                  \
+    static av_flatten void NAME##_c(SwsOpIter *restrict iter,                  \
+                                    const SwsOpImpl *restrict impl,            \
+                                    void *restrict x, void *restrict y,        \
+                                    void *restrict z, void *restrict w)        \
+    {                                                                          \
+        CALL(FUNC, __VA_ARGS__);                                               \
+    }
+/* As DECL_IMPL, additionally passing iter->in[0..3] as const pixel_t * */
+#define DECL_IMPL_READ(...)                                                    \
+    DECL_IMPL(__VA_ARGS__,                                                     \
+              (const pixel_t *) iter->in[0], (const pixel_t *) iter->in[1],    \
+              (const pixel_t *) iter->in[2], (const pixel_t *) iter->in[3])
+/* As DECL_IMPL, additionally passing iter->out[0..3] as pixel_t * */
+#define DECL_IMPL_WRITE(...)                                                   \
+    DECL_IMPL(__VA_ARGS__,                                                     \
+              (pixel_t *) iter->out[0], (pixel_t *) iter->out[1],              \
+              (pixel_t *) iter->out[2], (pixel_t *) iter->out[3])
+
+#define REF_ENTRY(DUMMY, NAME, ...) &op_##NAME, /* address of op_<NAME> */
+#define DECL_ENTRY(SETUP, NAME, ...) /* defines the SwsOpEntry op_<NAME> */    \
+    static const SwsOpEntry op_##NAME = {                                      \
+        .func = (SwsFuncPtr) NAME##_c,                                         \
+        __VA_ARGS__,                                                           \
+        SETUP                                                                  \
+    };
+
+#endif

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 11/34: swscale/uops: auto-generate reference C backend from uops_macros.h

Reply via email to