ops_backend: add reference backend based on C templates

Niklas Haas Sat, 26 Apr 2025 10:58:12 -0700

From: Niklas Haas <g...@haasn.dev>

This will serve as a reference for the SIMD backends to come. That said,
with auto-vectorization enabled, the performance of this is not atrocious, and
can often beat even the old SIMD.


In theory, we can dramatically speed it up by using GCC vectors instead of
arrays, but the performance gains from this are too dependent on exact GCC
versions and flags, so in practice it's not a substitute for a SIMD
implementation.
---
 libswscale/Makefile          |   6 +
 libswscale/ops.c             |   3 +
 libswscale/ops.h             |   2 -
 libswscale/ops_backend.c     | 101 ++++++
 libswscale/ops_backend.h     | 181 +++++++++++
 libswscale/ops_tmpl_common.c | 176 ++++++++++
 libswscale/ops_tmpl_float.c  | 255 +++++++++++++++
 libswscale/ops_tmpl_int.c    | 609 +++++++++++++++++++++++++++++++++++
 8 files changed, 1331 insertions(+), 2 deletions(-)
 create mode 100644 libswscale/ops_backend.c
 create mode 100644 libswscale/ops_backend.h
 create mode 100644 libswscale/ops_tmpl_common.c
 create mode 100644 libswscale/ops_tmpl_float.c
 create mode 100644 libswscale/ops_tmpl_int.c

diff --git a/libswscale/Makefile b/libswscale/Makefile
index c9dfa78c89..6e5696c5a6 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -16,6 +16,7 @@ OBJS = alphablend.o                                     \
        input.o                                          \
        lut3d.o                                          \
        ops.o                                            \
+       ops_backend.o                                    \
        ops_chain.o                                      \
        ops_optimizer.o                                  \
        options.o                                        \
@@ -29,6 +30,11 @@ OBJS = alphablend.o                                     \
        yuv2rgb.o                                        \
        vscale.o                                         \
 
+OPS-CFLAGS = -Wno-uninitialized \
+             -ffinite-math-only
+
+$(SUBDIR)ops_backend.o: CFLAGS += $(OPS-CFLAGS)
+
 # Objects duplicated from other libraries for shared builds
 SHLIBOBJS                    += log2_tab.o half2float.o
 
diff --git a/libswscale/ops.c b/libswscale/ops.c
index 6d9a844e06..9600e3c9df 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -27,7 +27,10 @@
 #include "ops.h"
 #include "ops_internal.h"
 
+extern SwsOpBackend backend_c;
+
 const SwsOpBackend * const ff_sws_op_backends[] = {
+    &backend_c,
     NULL
 };
 
diff --git a/libswscale/ops.h b/libswscale/ops.h
index c9c5706cbf..b8ab6d8522 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -91,8 +91,6 @@ typedef struct SwsComps {
 } SwsComps;
 
 typedef struct SwsReadWriteOp {
-    /* Note: Unread pixel data is explicitly cleared to {0} for sanity */
-
     int elems;   /* number of elements (of type `op.type`) to read/write */
     bool packed; /* read multiple elements from a single plane */
     int frac;    /* fractional pixel step factor (log2) */
diff --git a/libswscale/ops_backend.c b/libswscale/ops_backend.c
new file mode 100644
index 0000000000..6cd2b2d9b9
--- /dev/null
+++ b/libswscale/ops_backend.c
@@ -0,0 +1,101 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "ops_backend.h"
+
+/* Array-based reference implementation */
+
+#ifndef SWS_BLOCK_SIZE
+#  define SWS_BLOCK_SIZE 32
+#endif
+
+typedef  uint8_t  u8block_t[SWS_BLOCK_SIZE];
+typedef uint16_t u16block_t[SWS_BLOCK_SIZE];
+typedef uint32_t u32block_t[SWS_BLOCK_SIZE];
+typedef    float f32block_t[SWS_BLOCK_SIZE];
+
+#define BIT_DEPTH 8
+# include "ops_tmpl_int.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 16
+# include "ops_tmpl_int.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 32
+# include "ops_tmpl_int.c"
+# include "ops_tmpl_float.c"
+#undef BIT_DEPTH
+
+/**
+ * Executor installed as SwsCompiledOp.func by compile(): runs the op chain
+ * passed in `priv` over `num_blocks` consecutive blocks.
+ *
+ * A local SwsOpIter is seeded from `exec` (start column, line number and the
+ * four input/output pointers) and handed to the first continuation of the
+ * chain once per block; `x` advances by SWS_BLOCK_SIZE after every call.
+ */
+static void process(const SwsOpExec *exec, const void *priv, int num_blocks)
+{
+    typedef void (*ChainEntry)(SwsOpIter *, const SwsOpImpl *);
+
+    const SwsOpChain *chain = priv;
+    const SwsOpImpl *head = chain->impl;
+    SwsOpIter iter;
+
+    for (int n = 0; n < 4; n++) {
+        iter.in[n]  = exec->in[n];
+        iter.out[n] = exec->out[n];
+    }
+    iter.y = exec->y;
+    iter.x = exec->x;
+
+    while (num_blocks-- > 0) {
+        const ChainEntry entry = (ChainEntry) head->cont;
+
+        entry(&iter, &head[1]);
+        iter.x += SWS_BLOCK_SIZE;
+    }
+}
+
+/**
+ * SwsOpBackend.compile callback of the C backend.
+ *
+ * Allocates an op chain, fills it from the four per-type op tables (u8, u16,
+ * u32, f32) and, on success, publishes it through `*out` together with
+ * process() as the executor and ff_sws_op_chain_free() as the destructor.
+ *
+ * @return 0 on success, AVERROR(ENOMEM) if the chain cannot be allocated, or
+ *         the negative error code returned by ff_sws_op_compile_tables().
+ */
+static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
+{
+    static const SwsOpTable *const tables[] = {
+        &bitfn(op_table_int,    u8),
+        &bitfn(op_table_int,   u16),
+        &bitfn(op_table_int,   u32),
+        &bitfn(op_table_float, f32),
+    };
+
+    SwsOpChain *chain = ff_sws_op_chain_alloc();
+    int ret = AVERROR(EAGAIN);
+
+    if (!chain)
+        return AVERROR(ENOMEM);
+
+    /* Re-run the table compiler for as long as it reports EAGAIN */
+    while (ret == AVERROR(EAGAIN)) {
+        ret = ff_sws_op_compile_tables(tables, FF_ARRAY_ELEMS(tables), ops,
+                                       SWS_BLOCK_SIZE, chain);
+    }
+
+    if (ret >= 0) {
+        *out = (SwsCompiledOp) {
+            .func = process,
+            .block_size = SWS_BLOCK_SIZE,
+            .priv = chain,
+            .free = (void (*)(void *)) ff_sws_op_chain_free,
+        };
+        return 0;
+    }
+
+    /* Compilation failed: the chain is not handed out, so release it here */
+    ff_sws_op_chain_free(chain);
+    return ret;
+}
+
+SwsOpBackend backend_c = { /* C reference backend; referenced from ff_sws_op_backends[] in ops.c */
+    .name       = "c",
+    .compile    = compile,
+};
diff --git a/libswscale/ops_backend.h b/libswscale/ops_backend.h
new file mode 100644
index 0000000000..3d09ba791a
--- /dev/null
+++ b/libswscale/ops_backend.h
@@ -0,0 +1,181 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_OPS_BACKEND_H
+#define SWSCALE_OPS_BACKEND_H
+
+/**
+ * Helper macros for the C-based backend.
+ *
+ * To use these macros, the following types must be defined:
+ *  - PIXEL_TYPE should be one of SWS_PIXEL_*
+ *  - pixel_t should be the type of pixels
+ *  - block_t should be the type of blocks (groups of pixels)
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/mem.h"
+
+#include "ops_chain.h"
+
+/**
+ * Internal context holding per-iter execution data. The data pointers will be
+ * directly incremented by the corresponding read/write functions.
+ *
+ * process() in ops_backend.c fills one instance from the SwsOpExec it is
+ * given, and the CONTINUE() macro forwards the same pointer to each
+ * following continuation.
+ */
+typedef struct SwsOpIter {
+    const uint8_t *in[4];  /* input pointers, initialised from SwsOpExec.in[] */
+    uint8_t *out[4];       /* output pointers, initialised from SwsOpExec.out[] */
+    int x, y;              /* x: starts at SwsOpExec.x, advanced by SWS_BLOCK_SIZE per block; y: SwsOpExec.y */
+} SwsOpIter;
+
+#ifdef __clang__
+#  define SWS_FUNC
+#  define SWS_LOOP AV_PRAGMA(clang loop vectorize(assume_safety))
+#elif defined(__GNUC__)
+#  define SWS_FUNC __attribute__((optimize("tree-vectorize")))
+#  define SWS_LOOP AV_PRAGMA(GCC ivdep)
+#else
+#  define SWS_FUNC
+#  define SWS_LOOP
+#endif
+
+#if defined(__clang__)
+#  define SWS_ASSUME(cond) __builtin_assume(cond)
+#elif defined(__GNUC__)
+#  define SWS_ASSUME(cond) { if (!(cond)) __builtin_unreachable(); }
+#else
+#  define SWS_ASSUME(cond) ((void) (cond))
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+#  define SWS_ASSUME_ALIGNED(ptr, align)  __builtin_assume_aligned(ptr, align)
+#else
+#  define SWS_ASSUME_ALIGNED(ptr, align) ((void *) (ptr))
+#endif
+
+/* Miscellaneous helpers */
+#define bitfn2(name, ext) name ## _ ## ext
+#define bitfn(name, ext)  bitfn2(name, ext)
+
+#define FN_SUFFIX AV_JOIN(FMT_CHAR, BIT_DEPTH)
+#define fn(name)  bitfn(name, FN_SUFFIX)
+
+#define av_q2pixel(q) ((q).den ? (pixel_t) (q).num / (q).den : 0)
+
+/* Helper macros to make writing common function signatures less painful */
+#define DECL_FUNC(NAME, ...)                                                   
 \
+    static av_always_inline void fn(NAME)(SwsOpIter *restrict iter,            
 \
+                                          const SwsOpImpl *restrict impl,      
 \
+                                          block_t x, block_t y,                
 \
+                                          block_t z, block_t w,                
 \
+                                          __VA_ARGS__)
+
+#define DECL_READ(NAME, ...)                                                   
 \
+    static av_always_inline void fn(NAME)(SwsOpIter *restrict iter,            
 \
+                                          const SwsOpImpl *restrict impl,      
 \
+                                          const pixel_t *restrict in0,         
 \
+                                          const pixel_t *restrict in1,         
 \
+                                          const pixel_t *restrict in2,         
 \
+                                          const pixel_t *restrict in3,         
 \
+                                          __VA_ARGS__)
+
+#define DECL_WRITE(NAME, ...)                                                  
 \
+    DECL_FUNC(NAME, pixel_t *restrict out0, pixel_t *restrict out1,            
 \
+                    pixel_t *restrict out2, pixel_t *restrict out3,            
 \
+                    __VA_ARGS__)
+
+/* Helper macros to call into functions declared with DECL_FUNC_* */
+#define CALL(FUNC, ...) \
+    fn(FUNC)(iter, impl, x, y, z, w, __VA_ARGS__)
+
+#define CALL_READ(FUNC, ...)                                                   
 \
+    fn(FUNC)(iter, impl, (const pixel_t *) iter->in[0],                        
 \
+                         (const pixel_t *) iter->in[1],                        
 \
+                         (const pixel_t *) iter->in[2],                        
 \
+                         (const pixel_t *) iter->in[3], __VA_ARGS__)
+
+#define CALL_WRITE(FUNC, ...)                                                  
 \
+    CALL(FUNC, (pixel_t *) iter->out[0], (pixel_t *) iter->out[1],             
 \
+               (pixel_t *) iter->out[2], (pixel_t *) iter->out[3], __VA_ARGS__)
+
+/* Helper macros to declare continuation functions */
+#define DECL_IMPL(NAME)                                                        
 \
+    static SWS_FUNC void fn(NAME)(SwsOpIter *restrict iter,                    
 \
+                                  const SwsOpImpl *restrict impl,              
 \
+                                  block_t x, block_t y,                        
 \
+                                  block_t z, block_t w)                        
 \
+
+/* Helper macro to call into the next continuation with a given type */
+#define CONTINUE(TYPE, ...)                                                    
 \
+    ((void (*)(SwsOpIter *, const SwsOpImpl *,                                 
 \
+               TYPE x, TYPE y, TYPE z, TYPE w)) impl->cont)                    
 \
+        (iter, &impl[1], __VA_ARGS__)
+
+/* Helper macros for common op setup code */
+#define DECL_SETUP(NAME)                                                       
 \
+    static int fn(NAME)(const SwsOp *op, SwsOpPriv *out)
+
+#define SETUP_MEMDUP(c) ff_setup_memdup(&(c), sizeof(c), out)
+/**
+ * Duplicate `size` bytes starting at `c` with av_memdup() and store the copy
+ * in `out->ptr`. Normally invoked through the SETUP_MEMDUP() macro.
+ *
+ * @return 0 on success, AVERROR(ENOMEM) if the duplication failed (in which
+ *         case `out->ptr` is NULL).
+ */
+static inline int ff_setup_memdup(const void *c, size_t size, SwsOpPriv *out)
+{
+    void *const copy = av_memdup(c, size);
+
+    out->ptr = copy;
+    if (!copy)
+        return AVERROR(ENOMEM);
+    return 0;
+}
+
+/* Helper macro for declaring op table entries */
+#define DECL_ENTRY(NAME, ...)                                                  
 \
+    static const SwsOpEntry fn(op_##NAME) = {                                  
 \
+        .func = (SwsFuncPtr) fn(NAME),                                         
 \
+        .op.type = PIXEL_TYPE,                                                 
 \
+        __VA_ARGS__                                                            
 \
+    }
+
+/* Helpers to define functions for common subsets of components */
+#define DECL_PATTERN(NAME) \
+    DECL_FUNC(NAME, const bool X, const bool Y, const bool Z, const bool W)
+
+#define WRAP_PATTERN(FUNC, X, Y, Z, W, ...)                                    
 \
+    DECL_IMPL(FUNC##_##X##Y##Z##W)                                             
 \
+    {                                                                          
 \
+        CALL(FUNC, X, Y, Z, W);                                                
 \
+    }                                                                          
 \
+                                                                               
 \
+    DECL_ENTRY(FUNC##_##X##Y##Z##W,                                            
 \
+        .op.comps.unused = { !X, !Y, !Z, !W },                                 
 \
+        __VA_ARGS__                                                            
 \
+    )
+
+#define WRAP_COMMON_PATTERNS(FUNC, ...)                                        
 \
+    WRAP_PATTERN(FUNC, 1, 0, 0, 0, __VA_ARGS__);                               
 \
+    WRAP_PATTERN(FUNC, 1, 0, 0, 1, __VA_ARGS__);                               
 \
+    WRAP_PATTERN(FUNC, 1, 1, 1, 0, __VA_ARGS__);                               
 \
+    WRAP_PATTERN(FUNC, 1, 1, 1, 1, __VA_ARGS__)
+
+#define REF_COMMON_PATTERNS(NAME)                                              
 \
+    fn(op_##NAME##_1000),                                                      
 \
+    fn(op_##NAME##_1001),                                                      
 \
+    fn(op_##NAME##_1110),                                                      
 \
+    fn(op_##NAME##_1111)
+
+#endif
diff --git a/libswscale/ops_tmpl_common.c b/libswscale/ops_tmpl_common.c
new file mode 100644
index 0000000000..a9410a8a61
--- /dev/null
+++ b/libswscale/ops_tmpl_common.c
@@ -0,0 +1,176 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "ops_backend.h"
+
+#ifndef BIT_DEPTH
+#  error Should only be included from ops_tmpl_*.c!
+#endif
+
+#define WRAP_CONVERT_UINT(N)                                                   
 \
+DECL_PATTERN(convert_uint##N)                                                  
 \
+{                                                                              
 \
+    u##N##block_t xu, yu, zu, wu;                                              
 \
+                                                                               
 \
+    SWS_LOOP                                                                   
 \
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {                                 
 \
+        if (X)                                                                 
 \
+            xu[i] = x[i];                                                      
 \
+        if (Y)                                                                 
 \
+            yu[i] = y[i];                                                      
 \
+        if (Z)                                                                 
 \
+            zu[i] = z[i];                                                      
 \
+        if (W)                                                                 
 \
+            wu[i] = w[i];                                                      
 \
+    }                                                                          
 \
+                                                                               
 \
+    CONTINUE(u##N##block_t, xu, yu, zu, wu);                                   
 \
+}                                                                              
 \
+                                                                               
 \
+WRAP_COMMON_PATTERNS(convert_uint##N,                                          
 \
+    .op.op = SWS_OP_CONVERT,                                                   
 \
+    .op.convert.to = SWS_PIXEL_U##N,                                           
 \
+);
+
+#if BIT_DEPTH != 8
+WRAP_CONVERT_UINT(8)
+#endif
+
+#if BIT_DEPTH != 16
+WRAP_CONVERT_UINT(16)
+#endif
+
+#if BIT_DEPTH != 32 || defined(IS_FLOAT)
+WRAP_CONVERT_UINT(32)
+#endif
+
+DECL_FUNC(clear, const bool X, const bool Y, const bool Z, const bool W) /* a true flag KEEPS the component, a false flag overwrites it */
+{
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        if (!X)
+            x[i] = impl->priv.px[0]; /* per-component constant taken from the op's private data */
+        if (!Y)
+            y[i] = impl->priv.px[1];
+        if (!Z)
+            z[i] = impl->priv.px[2];
+        if (!W)
+            w[i] = impl->priv.px[3];
+    }
+
+    CONTINUE(block_t, x, y, z, w); /* hand the (partially overwritten) block to the next op */
+}
+
+#define WRAP_CLEAR(X, Y, Z, W)                                                 
 \
+DECL_IMPL(clear##_##X##Y##Z##W)                                                
 \
+{                                                                              
 \
+    CALL(clear, X, Y, Z, W);                                                   
 \
+}                                                                              
 \
+                                                                               
 \
+DECL_ENTRY(clear##_##X##Y##Z##W,                                               
 \
+    .setup = ff_sws_setup_q4,                                                  
 \
+    .flexible = true,                                                          
 \
+    .op.op = SWS_OP_CLEAR,                                                     
 \
+    .op.comps.unused = { !X, !Y, !Z, !W },                                     
 \
+);
+
+WRAP_CLEAR(1, 1, 1, 0) /* rgba alpha */
+WRAP_CLEAR(0, 1, 1, 1) /* argb alpha */
+
+WRAP_CLEAR(0, 0, 1, 1) /* vuya chroma */
+WRAP_CLEAR(1, 0, 0, 1) /* yuva chroma */
+WRAP_CLEAR(1, 1, 0, 0) /* ayuv chroma */
+WRAP_CLEAR(0, 1, 0, 1) /* uyva chroma */
+WRAP_CLEAR(1, 0, 1, 0) /* xvyu chroma */
+
+WRAP_CLEAR(1, 0, 0, 0) /* gray -> yuva */
+WRAP_CLEAR(0, 1, 0, 0) /* gray -> ayuv */
+WRAP_CLEAR(0, 0, 1, 0) /* gray -> vuya */
+
+/*
+ * Clamp from above: every component selected by X/Y/Z/W is replaced by the
+ * smaller of itself and the matching constant in impl->priv.px[].
+ * Unselected components are passed on untouched.
+ */
+DECL_PATTERN(min)
+{
+    SWS_LOOP
+    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
+        if (X) {
+            const pixel_t limit = impl->priv.px[0];
+            x[n] = FFMIN(x[n], limit);
+        }
+        if (Y) {
+            const pixel_t limit = impl->priv.px[1];
+            y[n] = FFMIN(y[n], limit);
+        }
+        if (Z) {
+            const pixel_t limit = impl->priv.px[2];
+            z[n] = FFMIN(z[n], limit);
+        }
+        if (W) {
+            const pixel_t limit = impl->priv.px[3];
+            w[n] = FFMIN(w[n], limit);
+        }
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+/*
+ * Clamp from below: every component selected by X/Y/Z/W is replaced by the
+ * larger of itself and the matching constant in impl->priv.px[].
+ * Unselected components are passed on untouched.
+ */
+DECL_PATTERN(max)
+{
+    SWS_LOOP
+    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
+        if (X) {
+            const pixel_t floor_val = impl->priv.px[0];
+            x[n] = FFMAX(x[n], floor_val);
+        }
+        if (Y) {
+            const pixel_t floor_val = impl->priv.px[1];
+            y[n] = FFMAX(y[n], floor_val);
+        }
+        if (Z) {
+            const pixel_t floor_val = impl->priv.px[2];
+            z[n] = FFMAX(z[n], floor_val);
+        }
+        if (W) {
+            const pixel_t floor_val = impl->priv.px[3];
+            w[n] = FFMAX(w[n], floor_val);
+        }
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+WRAP_COMMON_PATTERNS(min,
+    .op.op = SWS_OP_MIN,
+    .setup = ff_sws_setup_q4,
+    .flexible = true,
+);
+
+WRAP_COMMON_PATTERNS(max,
+    .op.op = SWS_OP_MAX,
+    .setup = ff_sws_setup_q4,
+    .flexible = true,
+);
+
+/*
+ * Multiply every component selected by X/Y/Z/W by the single factor stored
+ * in impl->priv.px[0]; the result is converted back to pixel_t on store.
+ *
+ * NOTE(review): for 8/16-bit pixel_t the product is evaluated in `int` after
+ * integer promotion -- confirm callers never pass operands whose product
+ * exceeds INT_MAX in the 16-bit instantiation.
+ */
+DECL_PATTERN(scale)
+{
+    const pixel_t factor = impl->priv.px[0];
+
+    SWS_LOOP
+    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
+        if (X)
+            x[n] = x[n] * factor;
+        if (Y)
+            y[n] = y[n] * factor;
+        if (Z)
+            z[n] = z[n] * factor;
+        if (W)
+            w[n] = w[n] * factor;
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+WRAP_COMMON_PATTERNS(scale,
+    .op.op = SWS_OP_SCALE,
+    .setup = ff_sws_setup_q,
+    .flexible = true,
+);
diff --git a/libswscale/ops_tmpl_float.c b/libswscale/ops_tmpl_float.c
new file mode 100644
index 0000000000..9acdbd01bf
--- /dev/null
+++ b/libswscale/ops_tmpl_float.c
@@ -0,0 +1,255 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+
+#include "ops_backend.h"
+
+#ifndef BIT_DEPTH
+#  define BIT_DEPTH 32
+#endif
+
+#if BIT_DEPTH == 32
+#  define PIXEL_TYPE SWS_PIXEL_F32
+#  define PIXEL_MAX  FLT_MAX
+#  define PIXEL_MIN  FLT_MIN
+#  define pixel_t    float
+#  define block_t    f32block_t
+#  define px         f32
+#else
+#  error Invalid BIT_DEPTH
+#endif
+
+#define IS_FLOAT 1
+#define FMT_CHAR f
+#include "ops_tmpl_common.c"
+
+#define MAX_DITHER_SIZE 16
+#if MAX_DITHER_SIZE > SWS_BLOCK_SIZE
+#  define DITHER_ROW_SIZE MAX_DITHER_SIZE
+#else
+#  define DITHER_ROW_SIZE SWS_BLOCK_SIZE
+#endif
+
+typedef struct {
+    pixel_t matrix[MAX_DITHER_SIZE][DITHER_ROW_SIZE];
+} fn(DitherCoeffs);
+
+/**
+ * Setup callback for SWS_OP_DITHER: expands the rational dither matrix of
+ * `op` into a pixel_t lookup table and stores a heap copy in `out->ptr`.
+ *
+ * Each of the `size` matrix rows is converted with av_q2pixel() and then
+ * repeated horizontally up to SWS_BLOCK_SIZE entries, so a whole block can
+ * be dithered without wrapping the column index.
+ *
+ * A 1x1 matrix (size_log2 == 0) needs no table: the dither kernel adds the
+ * constant 0.5 itself in that case, so `out->ptr` is set to NULL.
+ *
+ * @return 0 on success, AVERROR(ENOMEM) if the table cannot be allocated.
+ */
+DECL_SETUP(setup_dither)
+{
+    fn(DitherCoeffs) c = {0};
+    const int size = 1 << op->dither.size_log2;
+
+    /* `1 << n` is never 0, so the trivial matrix must be detected as size 1 */
+    if (size == 1) {
+        /* We special case this value */
+        av_assert1(!av_cmp_q(op->dither.matrix[0], av_make_q(1, 2)));
+        out->ptr = NULL;
+        return 0;
+    }
+
+    for (int y = 0; y < size; y++) {
+        for (int x = 0; x < size; x++)
+            c.matrix[y][x] = av_q2pixel(op->dither.matrix[y * size + x]);
+        for (int x = size; x < SWS_BLOCK_SIZE; x++)
+            c.matrix[y][x] = c.matrix[y][x % size]; /* pad to chunk size */
+    }
+
+    return SETUP_MEMDUP(c);
+}
+
+DECL_FUNC(dither, const int size_log2) /* adds a dither offset to all four components of the block */
+{
+    const fn(DitherCoeffs) *restrict c = impl->priv.ptr; /* table built by setup_dither; never dereferenced when size_log2 == 0 */
+    const int mask = (1 << size_log2) - 1; /* wraps a row index into [0, 1 << size_log2) */
+    const int y_line = iter->y;
+    const int row0 = (y_line +  0) & mask; /* each component reads a different matrix row (offsets 0, 3, 2, 5) */
+    const int row1 = (y_line +  3) & mask;
+    const int row2 = (y_line +  2) & mask;
+    const int row3 = (y_line +  5) & mask;
+    const int base = iter->x & (SWS_BLOCK_SIZE & (MAX_DITHER_SIZE - 1)); /* NOTE(review): with the default SWS_BLOCK_SIZE 32 and MAX_DITHER_SIZE 16 this mask is 0, so base is always 0 -- confirm intended */
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        x[i] += size_log2 ? c->matrix[row0][base + i] : (pixel_t) 0.5; /* size_log2 == 0: constant 0.5 instead of a table lookup */
+        y[i] += size_log2 ? c->matrix[row1][base + i] : (pixel_t) 0.5;
+        z[i] += size_log2 ? c->matrix[row2][base + i] : (pixel_t) 0.5;
+        w[i] += size_log2 ? c->matrix[row3][base + i] : (pixel_t) 0.5;
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+#define WRAP_DITHER(N)                                                         
 \
+DECL_IMPL(dither##N)                                                           
 \
+{                                                                              
 \
+    CALL(dither, N);                                                           
 \
+}                                                                              
 \
+                                                                               
 \
+DECL_ENTRY(dither##N,                                                          
 \
+    .op.op = SWS_OP_DITHER,                                                    
 \
+    .op.dither.size_log2 = N,                                                  
 \
+    .setup = fn(setup_dither),                                                 
 \
+    .free = av_free,                                                           
 \
+);
+
+WRAP_DITHER(0)
+WRAP_DITHER(1)
+WRAP_DITHER(2)
+WRAP_DITHER(3)
+WRAP_DITHER(4)
+
+typedef struct {
+    /* Stored in split form for convenience */
+    pixel_t m[4][4];
+    pixel_t k[4];
+} fn(LinCoeffs);
+
+/**
+ * Setup callback for SWS_OP_LINEAR: converts the rational coefficients in
+ * op->lin.m to pixel_t and stores a heap copy of the result in `out->ptr`.
+ * Columns 0-3 of each of the four rows go to LinCoeffs.m, column 4 goes to
+ * LinCoeffs.k.
+ *
+ * @return 0 on success, AVERROR(ENOMEM) if the copy cannot be allocated.
+ */
+DECL_SETUP(setup_linear)
+{
+    fn(LinCoeffs) coeffs;
+
+    for (int row = 0; row < 4; row++) {
+        coeffs.k[row] = av_q2pixel(op->lin.m[row][4]);
+        for (int col = 0; col < 4; col++)
+            coeffs.m[row][col] = av_q2pixel(op->lin.m[row][col]);
+    }
+
+    return SETUP_MEMDUP(coeffs);
+}
+
+/**
+ * Fully general case for a 5x5 linear affine transformation. Should never be
+ * called without constant `mask`. This function will compile down to the
+ * appropriately optimized version for the required subset of operations when
+ * called with a constant mask.
+ *
+ * For every pixel of the block, each output component starts from its
+ * constant offset c.k[] (or 0 if the SWS_MASK_OFF bit is clear) and then
+ * accumulates the products selected by `mask`. A cleared diagonal bit passes
+ * the input component through unchanged; a cleared off-diagonal bit
+ * contributes nothing. All four inputs are latched before any output is
+ * written, so the rows do not see each other's results.
+ */
+DECL_FUNC(linear_mask, const uint32_t mask)
+{
+    const fn(LinCoeffs) c = *(const fn(LinCoeffs) *) impl->priv.ptr; /* by-value copy of the coefficients built by setup_linear */
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        const pixel_t xx = x[i];
+        const pixel_t yy = y[i];
+        const pixel_t zz = z[i];
+        const pixel_t ww = w[i];
+
+        x[i]  = (mask & SWS_MASK_OFF(0)) ? c.k[0] : 0;
+        x[i] += (mask & SWS_MASK(0, 0))  ? c.m[0][0] * xx : xx; /* diagonal: identity when bit is clear */
+        x[i] += (mask & SWS_MASK(0, 1))  ? c.m[0][1] * yy : 0;
+        x[i] += (mask & SWS_MASK(0, 2))  ? c.m[0][2] * zz : 0;
+        x[i] += (mask & SWS_MASK(0, 3))  ? c.m[0][3] * ww : 0;
+
+        y[i]  = (mask & SWS_MASK_OFF(1)) ? c.k[1] : 0;
+        y[i] += (mask & SWS_MASK(1, 0))  ? c.m[1][0] * xx : 0;
+        y[i] += (mask & SWS_MASK(1, 1))  ? c.m[1][1] * yy : yy;
+        y[i] += (mask & SWS_MASK(1, 2))  ? c.m[1][2] * zz : 0;
+        y[i] += (mask & SWS_MASK(1, 3))  ? c.m[1][3] * ww : 0;
+
+        z[i]  = (mask & SWS_MASK_OFF(2)) ? c.k[2] : 0;
+        z[i] += (mask & SWS_MASK(2, 0))  ? c.m[2][0] * xx : 0;
+        z[i] += (mask & SWS_MASK(2, 1))  ? c.m[2][1] * yy : 0;
+        z[i] += (mask & SWS_MASK(2, 2))  ? c.m[2][2] * zz : zz;
+        z[i] += (mask & SWS_MASK(2, 3))  ? c.m[2][3] * ww : 0;
+
+        w[i]  = (mask & SWS_MASK_OFF(3)) ? c.k[3] : 0;
+        w[i] += (mask & SWS_MASK(3, 0))  ? c.m[3][0] * xx : 0;
+        w[i] += (mask & SWS_MASK(3, 1))  ? c.m[3][1] * yy : 0;
+        w[i] += (mask & SWS_MASK(3, 2))  ? c.m[3][2] * zz : 0;
+        w[i] += (mask & SWS_MASK(3, 3))  ? c.m[3][3] * ww : ww;
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+#define WRAP_LINEAR(NAME, MASK)                                                
 \
+DECL_IMPL(linear_##NAME)                                                       
 \
+{                                                                              
 \
+    CALL(linear_mask, MASK);                                                   
 \
+}                                                                              
 \
+                                                                               
 \
+DECL_ENTRY(linear_##NAME,                                                      
 \
+    .setup = fn(setup_linear),                                                 
 \
+    .free = av_free,                                                           
 \
+    .op.op = SWS_OP_LINEAR,                                                    
 \
+    .op.lin.mask = (MASK),                                                     
 \
+);
+
+WRAP_LINEAR(luma,      SWS_MASK_LUMA)
+WRAP_LINEAR(alpha,     SWS_MASK_ALPHA)
+WRAP_LINEAR(lumalpha,  SWS_MASK_LUMA | SWS_MASK_ALPHA)
+WRAP_LINEAR(dot3,      0b111)
+WRAP_LINEAR(row0,      SWS_MASK_ROW(0))
+WRAP_LINEAR(row0a,     SWS_MASK_ROW(0) | SWS_MASK_ALPHA)
+WRAP_LINEAR(diag3,     SWS_MASK_DIAG3)
+WRAP_LINEAR(diag4,     SWS_MASK_DIAG4)
+WRAP_LINEAR(diagoff3,  SWS_MASK_DIAG3 | SWS_MASK_OFF3)
+WRAP_LINEAR(matrix3,   SWS_MASK_MAT3)
+WRAP_LINEAR(affine3,   SWS_MASK_MAT3 | SWS_MASK_OFF3)
+WRAP_LINEAR(affine3a,  SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA)
+WRAP_LINEAR(matrix4,   SWS_MASK_MAT4)
+WRAP_LINEAR(affine4,   SWS_MASK_MAT4 | SWS_MASK_OFF4)
+
+static const SwsOpTable fn(op_table_float) = { /* f32 op table; listed in tables[] of compile() in ops_backend.c */
+    .block_size = SWS_BLOCK_SIZE,
+    .entries = {
+        REF_COMMON_PATTERNS(convert_uint8), /* each REF_COMMON_PATTERNS expands to the _1000/_1001/_1110/_1111 entries */
+        REF_COMMON_PATTERNS(convert_uint16),
+        REF_COMMON_PATTERNS(convert_uint32),
+
+        fn(op_clear_1110), /* only this WRAP_CLEAR variant is listed for the float table */
+        REF_COMMON_PATTERNS(min),
+        REF_COMMON_PATTERNS(max),
+        REF_COMMON_PATTERNS(scale),
+
+        fn(op_dither0),
+        fn(op_dither1),
+        fn(op_dither2),
+        fn(op_dither3),
+        fn(op_dither4),
+
+        fn(op_linear_luma),
+        fn(op_linear_alpha),
+        fn(op_linear_lumalpha),
+        fn(op_linear_dot3),
+        fn(op_linear_row0),
+        fn(op_linear_row0a),
+        fn(op_linear_diag3),
+        fn(op_linear_diag4),
+        fn(op_linear_diagoff3),
+        fn(op_linear_matrix3),
+        fn(op_linear_affine3),
+        fn(op_linear_affine3a),
+        fn(op_linear_matrix4),
+        fn(op_linear_affine4),
+
+        {{0}} /* all-zero entry; presumably the list terminator -- confirm against ops_chain */
+    },
+};
+
+#undef PIXEL_TYPE
+#undef PIXEL_MAX
+#undef PIXEL_MIN
+#undef pixel_t
+#undef block_t
+#undef px
+
+#undef FMT_CHAR
+#undef IS_FLOAT
diff --git a/libswscale/ops_tmpl_int.c b/libswscale/ops_tmpl_int.c
new file mode 100644
index 0000000000..e91ff4fe2c
--- /dev/null
+++ b/libswscale/ops_tmpl_int.c
@@ -0,0 +1,609 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/bswap.h"
+
+#include "ops_backend.h"
+
+/* Fall back to 8-bit pixels when no depth was selected before inclusion. */
+#ifndef BIT_DEPTH
+#  define BIT_DEPTH 8
+#endif
+
+/*
+ * Map BIT_DEPTH onto the unsigned pixel type of this instantiation:
+ *   PIXEL_TYPE  - matching SWS_PIXEL_* tag
+ *   PIXEL_MAX   - largest representable value
+ *   SWAP_BYTES  - byte-swap helper; not defined for 8-bit pixels, which
+ *                 compiles out everything guarded by #ifdef SWAP_BYTES
+ *   pixel_t     - scalar element type
+ *   block_t     - block type holding the pixels processed per kernel call
+ *   px          - short type name (u8 / u16 / u32)
+ */
+#if BIT_DEPTH == 32
+#  define PIXEL_TYPE SWS_PIXEL_U32
+#  define PIXEL_MAX  0xFFFFFFFFu
+#  define SWAP_BYTES av_bswap32
+#  define pixel_t    uint32_t
+#  define block_t    u32block_t
+#  define px         u32
+#elif BIT_DEPTH == 16
+#  define PIXEL_TYPE SWS_PIXEL_U16
+#  define PIXEL_MAX  0xFFFFu
+#  define SWAP_BYTES av_bswap16
+#  define pixel_t    uint16_t
+#  define block_t    u16block_t
+#  define px         u16
+#elif BIT_DEPTH == 8
+#  define PIXEL_TYPE SWS_PIXEL_U8
+#  define PIXEL_MAX  0xFFu
+#  define pixel_t    uint8_t
+#  define block_t    u8block_t
+#  define px         u8
+#else
+#  error Invalid BIT_DEPTH
+#endif
+
+/* Integer-specific settings consumed by the shared template included below. */
+#define IS_FLOAT  0
+#define FMT_CHAR  u
+#define PIXEL_MIN 0
+#include "ops_tmpl_common.c"
+
+/**
+ * Load one block of pixels from planar input.
+ *
+ * The first plane (in0) is always read; in1, in2 and in3 are read only when
+ * `elems` is at least 2, 3 and 4 respectively. Components that are not read
+ * are passed on to the continuation uninitialized.
+ */
+DECL_READ(read_planar, const int elems)
+{
+    block_t x, y, z, w;
+
+    SWS_LOOP
+    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
+        x[idx] = in0[idx];
+        if (elems >= 2) {
+            y[idx] = in1[idx];
+        }
+        if (elems >= 3) {
+            z[idx] = in2[idx];
+        }
+        if (elems >= 4) {
+            w[idx] = in3[idx];
+        }
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+/**
+ * Load one block of pixels from interleaved (packed) input.
+ *
+ * All components come from in0: pixel `idx` occupies elements
+ * elems * idx .. elems * idx + elems - 1. Only the first `elems` components
+ * are read; the others are passed on uninitialized.
+ */
+DECL_READ(read_packed, const int elems)
+{
+    block_t x, y, z, w;
+
+    SWS_LOOP
+    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
+        const int base = elems * idx; /* first element of this pixel */
+
+        x[idx] = in0[base];
+        if (elems >= 2) {
+            y[idx] = in0[base + 1];
+        }
+        if (elems >= 3) {
+            z[idx] = in0[base + 2];
+        }
+        if (elems >= 4) {
+            w[idx] = in0[base + 3];
+        }
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+/**
+ * Store one block of pixels to planar output.
+ *
+ * x always goes to out0; y, z and w go to out1, out2 and out3 only when
+ * `elems` is at least 2, 3 and 4 respectively.
+ */
+DECL_WRITE(write_planar, const int elems)
+{
+    SWS_LOOP
+    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
+        out0[idx] = x[idx];
+        if (elems >= 2) {
+            out1[idx] = y[idx];
+        }
+        if (elems >= 3) {
+            out2[idx] = z[idx];
+        }
+        if (elems >= 4) {
+            out3[idx] = w[idx];
+        }
+    }
+}
+
+/**
+ * Store one block of pixels to interleaved (packed) output.
+ *
+ * Pixel `idx` is written to out0 starting at element elems * idx, in x, y, z,
+ * w order; only the first `elems` components are stored.
+ */
+DECL_WRITE(write_packed, const int elems)
+{
+    SWS_LOOP
+    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
+        const int base = elems * idx; /* first element of this pixel */
+
+        out0[base] = x[idx];
+        if (elems >= 2) {
+            out0[base + 1] = y[idx];
+        }
+        if (elems >= 3) {
+            out0[base + 2] = z[idx];
+        }
+        if (elems >= 4) {
+            out0[base + 3] = w[idx];
+        }
+    }
+}
+
+/**
+ * Wrap a DECL_READ kernel into a chain implementation plus table entry.
+ *
+ * FUNC   - name of the DECL_READ kernel
+ * ELEMS  - number of components read per pixel (also appended to the name)
+ * FRAC   - right shift applied to the pointer advance, for sub-byte formats
+ * PACKED - true if all components are interleaved in the first plane
+ *
+ * After the read, the used input pointers are advanced by one block: a single
+ * pointer by ELEMS blocks for packed data, or ELEMS pointers by one block each
+ * for planar data, in both cases shifted right by FRAC.
+ */
+#define WRAP_READ(FUNC, ELEMS, FRAC, PACKED) \
+DECL_IMPL(FUNC##ELEMS) \
+{ \
+    CALL_READ(FUNC, ELEMS); \
+    for (int i = 0; i < (PACKED ? 1 : ELEMS); i++) \
+        iter->in[i] += (sizeof(block_t) * (PACKED ? ELEMS : 1)) >> FRAC; \
+} \
+ \
+DECL_ENTRY(FUNC##ELEMS, \
+    .op.op = SWS_OP_READ, \
+    .op.rw = { \
+        .elems  = ELEMS, \
+        .packed = PACKED, \
+        .frac   = FRAC, \
+    }, \
+);
+
+WRAP_READ(read_planar, 1, 0, false)
+WRAP_READ(read_planar, 2, 0, false)
+WRAP_READ(read_planar, 3, 0, false)
+WRAP_READ(read_planar, 4, 0, false)
+WRAP_READ(read_packed, 2, 0, true)
+WRAP_READ(read_packed, 3, 0, true)
+WRAP_READ(read_packed, 4, 0, true)
+
+/**
+ * Wrap a DECL_WRITE kernel into a chain implementation plus table entry.
+ *
+ * FUNC   - name of the DECL_WRITE kernel
+ * ELEMS  - number of components written per pixel (also appended to the name)
+ * FRAC   - right shift applied to the pointer advance, for sub-byte formats
+ * PACKED - true if all components are interleaved in the first plane
+ *
+ * After the write, the used output pointers are advanced by one block: a
+ * single pointer by ELEMS blocks for packed data, or ELEMS pointers by one
+ * block each for planar data, in both cases shifted right by FRAC.
+ */
+#define WRAP_WRITE(FUNC, ELEMS, FRAC, PACKED) \
+DECL_IMPL(FUNC##ELEMS) \
+{ \
+    CALL_WRITE(FUNC, ELEMS); \
+    for (int i = 0; i < (PACKED ? 1 : ELEMS); i++) \
+        iter->out[i] += (sizeof(block_t) * (PACKED ? ELEMS : 1)) >> FRAC; \
+} \
+ \
+DECL_ENTRY(FUNC##ELEMS, \
+    .op.op = SWS_OP_WRITE, \
+    .op.rw = { \
+        .elems  = ELEMS, \
+        .packed = PACKED, \
+        .frac   = FRAC, \
+    }, \
+);
+
+WRAP_WRITE(write_planar, 1, 0, false)
+WRAP_WRITE(write_planar, 2, 0, false)
+WRAP_WRITE(write_planar, 3, 0, false)
+WRAP_WRITE(write_planar, 4, 0, false)
+WRAP_WRITE(write_packed, 2, 0, true)
+WRAP_WRITE(write_packed, 3, 0, true)
+WRAP_WRITE(write_packed, 4, 0, true)
+
+#if BIT_DEPTH == 8
+/**
+ * Load one block of 4-bit values, two per input byte.
+ *
+ * The high nibble of each byte becomes the even-indexed pixel and the low
+ * nibble the following odd-indexed one. Only x is filled; y, z and w are
+ * passed on uninitialized and `elems` is not used.
+ */
+DECL_READ(read_nibbles, const int elems)
+{
+    block_t x, y, z, w;
+    const pixel_t *const src = (const pixel_t *) in0;
+
+    SWS_LOOP
+    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx += 2) {
+        const pixel_t pair = src[idx / 2];
+
+        x[idx]     = pair >> 4;   /* high nibble */
+        x[idx + 1] = pair & 0x0F; /* low nibble */
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+/**
+ * Load one block of 1-bit values, eight per input byte.
+ *
+ * Bits are consumed most significant first: bit 7 of a byte becomes the first
+ * of its eight pixels and bit 0 the last. Only x is filled; y, z and w are
+ * passed on uninitialized and `elems` is not used.
+ */
+DECL_READ(read_bits, const int elems)
+{
+    block_t x, y, z, w;
+    const pixel_t *const src = (const pixel_t *) in0;
+
+    SWS_LOOP
+    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx += 8) {
+        const pixel_t bits = src[idx / 8];
+
+        for (int bit = 0; bit < 8; bit++)
+            x[idx + bit] = (bits >> (7 - bit)) & 1;
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+/*
+ * Sub-byte readers: FRAC is 1 for nibbles and 3 for bits. WRAP_READ shifts the
+ * per-block input pointer advance right by FRAC.
+ */
+WRAP_READ(read_nibbles, 1, 1, false)
+WRAP_READ(read_bits,    1, 3, false)
+
+/**
+ * Store one block of 4-bit values, two per output byte.
+ *
+ * Each even-indexed pixel is placed in the high nibble and the following
+ * odd-indexed pixel in the low nibble. Values are not masked here, and
+ * `elems` is not used.
+ */
+DECL_WRITE(write_nibbles, const int elems)
+{
+    SWS_LOOP
+    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx += 2) {
+        out0[idx / 2] = (x[idx] << 4) | x[idx + 1];
+    }
+}
+
+/**
+ * Store one block of 1-bit values, eight per output byte.
+ *
+ * The first pixel of each group of eight lands in bit 7 and the last in
+ * bit 0. Values are not masked here, and `elems` is not used.
+ */
+DECL_WRITE(write_bits, const int elems)
+{
+    SWS_LOOP
+    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx += 8) {
+        int byte = 0;
+
+        for (int bit = 0; bit < 8; bit++)
+            byte |= x[idx + bit] << (7 - bit);
+
+        out0[idx / 8] = byte;
+    }
+}
+
+/*
+ * Sub-byte writers: FRAC is 1 for nibbles and 3 for bits. WRAP_WRITE shifts
+ * the per-block output pointer advance right by FRAC.
+ */
+WRAP_WRITE(write_nibbles, 1, 1, false)
+WRAP_WRITE(write_bits,    1, 3, false)
+#endif /* BIT_DEPTH == 8 */
+
+#ifdef SWAP_BYTES
+/**
+ * Byte-swap every pixel of the selected components.
+ *
+ * X, Y, Z and W select which components are touched; the others are passed
+ * through unchanged. Only built when SWAP_BYTES is defined for this depth.
+ */
+DECL_PATTERN(swap_bytes)
+{
+    SWS_LOOP
+    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
+        if (X) {
+            x[idx] = SWAP_BYTES(x[idx]);
+        }
+        if (Y) {
+            y[idx] = SWAP_BYTES(y[idx]);
+        }
+        if (Z) {
+            z[idx] = SWAP_BYTES(z[idx]);
+        }
+        if (W) {
+            w[idx] = SWAP_BYTES(w[idx]);
+        }
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+WRAP_COMMON_PATTERNS(swap_bytes, .op.op = SWS_OP_SWAP_BYTES);
+#endif /* SWAP_BYTES */
+
+#if BIT_DEPTH == 8
+/**
+ * Expand 8-bit pixels to 16 bits by replicating the byte into both halves
+ * (v -> v * 0x0101, i.e. 0xAB -> 0xABAB).
+ *
+ * Only the components selected by X, Y, Z and W are converted; the others are
+ * passed on uninitialized.
+ */
+DECL_PATTERN(expand16)
+{
+    u16block_t x16, y16, z16, w16;
+
+    SWS_LOOP
+    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
+        if (X) {
+            x16[idx] = x[idx] * 0x0101;
+        }
+        if (Y) {
+            y16[idx] = y[idx] * 0x0101;
+        }
+        if (Z) {
+            z16[idx] = z[idx] * 0x0101;
+        }
+        if (W) {
+            w16[idx] = w[idx] * 0x0101;
+        }
+    }
+
+    CONTINUE(u16block_t, x16, y16, z16, w16);
+}
+
+WRAP_COMMON_PATTERNS(expand16,
+    .op.op = SWS_OP_CONVERT,
+    .op.convert.to = SWS_PIXEL_U16,
+    .op.convert.expand = true,
+);
+
+/**
+ * Expand 8-bit pixels to 32 bits by replicating the byte into all four byte
+ * lanes (0xAB -> 0xABABABAB).
+ *
+ * The replication is done as an unsigned multiplication by 0x01010101.
+ * Shifting the promoted (signed int) pixel left by 24 would be undefined
+ * behavior for any value >= 0x80.
+ *
+ * All four components are converted unconditionally.
+ */
+DECL_PATTERN(expand32)
+{
+    u32block_t x32, y32, z32, w32;
+
+    SWS_LOOP
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+        x32[i] = x[i] * 0x01010101u;
+        y32[i] = y[i] * 0x01010101u;
+        z32[i] = z[i] * 0x01010101u;
+        w32[i] = w[i] * 0x01010101u;
+    }
+
+    CONTINUE(u32block_t, x32, y32, z32, w32);
+}
+
+WRAP_COMMON_PATTERNS(expand32,
+    .op.op = SWS_OP_CONVERT,
+    .op.convert.to = SWS_PIXEL_U32,
+    .op.convert.expand = true,
+);
+#endif
+
+/**
+ * Declare a pack/unpack kernel pair and their table entries for one bit
+ * layout.
+ *
+ * X, Y, Z and W are the bit widths of the four fields, most significant
+ * first; a width of 0 marks an absent component.
+ *
+ *  - pack:   x = x << (Y+Z+W) | y << (Z+W) | z << W | w
+ *            (inputs are not masked)
+ *  - unpack: splits x back into its fields, masking each one to its width.
+ *            The leading field is masked too, so that padding bits above it
+ *            (when X+Y+Z+W is smaller than the pixel width) cannot leak into
+ *            the result.
+ */
+#define WRAP_PACK_UNPACK(X, Y, Z, W) \
+inline DECL_IMPL(pack_##X##Y##Z##W) \
+{ \
+    SWS_LOOP \
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) { \
+        x[i] = x[i] << (Y+Z+W); \
+        if (Y) \
+            x[i] |= y[i] << (Z+W); \
+        if (Z) \
+            x[i] |= z[i] << W; \
+        if (W) \
+            x[i] |= w[i]; \
+    } \
+ \
+    CONTINUE(block_t, x, y, z, w); \
+} \
+ \
+DECL_ENTRY(pack_##X##Y##Z##W, \
+    .op.op = SWS_OP_PACK, \
+    .op.pack.pattern = { X, Y, Z, W }, \
+    .op.comps.unused = { !X, !Y, !Z, !W }, \
+); \
+ \
+inline DECL_IMPL(unpack_##X##Y##Z##W) \
+{ \
+    SWS_LOOP \
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) { \
+        const pixel_t val = x[i]; \
+        x[i] = (val >> (Y+Z+W)) & ((1 << X) - 1); \
+        if (Y) \
+            y[i] = (val >> (Z+W)) & ((1 << Y) - 1); \
+        if (Z) \
+            z[i] = (val >> W) & ((1 << Z) - 1); \
+        if (W) \
+            w[i] = val & ((1 << W) - 1); \
+    } \
+ \
+    CONTINUE(block_t, x, y, z, w); \
+} \
+ \
+DECL_ENTRY(unpack_##X##Y##Z##W, \
+    .op.op = SWS_OP_UNPACK, \
+    .op.pack.pattern = { X, Y, Z, W }, \
+    .op.comps.flags = { \
+        X ? 0 : SWS_COMP_GARBAGE, Y ? 0 : SWS_COMP_GARBAGE, \
+        Z ? 0 : SWS_COMP_GARBAGE, W ? 0 : SWS_COMP_GARBAGE, \
+    }, \
+);
+
+WRAP_PACK_UNPACK( 3,  3,  2,  0)
+WRAP_PACK_UNPACK( 2,  3,  3,  0)
+WRAP_PACK_UNPACK( 1,  2,  1,  0)
+WRAP_PACK_UNPACK( 5,  6,  5,  0)
+WRAP_PACK_UNPACK( 5,  5,  5,  0)
+WRAP_PACK_UNPACK( 4,  4,  4,  0)
+WRAP_PACK_UNPACK( 2, 10, 10, 10)
+WRAP_PACK_UNPACK(10, 10, 10,  2)
+
+#if BIT_DEPTH != 8
+/**
+ * Shift all four components left by a per-op amount.
+ *
+ * The shift count is read from impl->priv.u8[0]. Every component is shifted,
+ * regardless of the X/Y/Z/W pattern flags.
+ */
+DECL_PATTERN(lshift)
+{
+    const uint8_t shift = impl->priv.u8[0];
+
+    SWS_LOOP
+    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
+        x[idx] = x[idx] << shift;
+        y[idx] = y[idx] << shift;
+        z[idx] = z[idx] << shift;
+        w[idx] = w[idx] << shift;
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+/**
+ * Shift all four components right by a per-op amount.
+ *
+ * The shift count is read from impl->priv.u8[0]. Every component is shifted,
+ * regardless of the X/Y/Z/W pattern flags.
+ */
+DECL_PATTERN(rshift)
+{
+    const uint8_t shift = impl->priv.u8[0];
+
+    SWS_LOOP
+    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
+        x[idx] = x[idx] >> shift;
+        y[idx] = y[idx] >> shift;
+        z[idx] = z[idx] >> shift;
+        w[idx] = w[idx] >> shift;
+    }
+
+    CONTINUE(block_t, x, y, z, w);
+}
+
+/*
+ * Register the shift kernels for the common component patterns.
+ * NOTE(review): ff_sws_setup_u8 presumably stores the op's shift amount in
+ * impl->priv.u8[0], which is where both kernels read it from; confirm.
+ */
+WRAP_COMMON_PATTERNS(lshift,
+    .op.op    = SWS_OP_LSHIFT,
+    .setup    = ff_sws_setup_u8,
+    .flexible = true,
+);
+
+WRAP_COMMON_PATTERNS(rshift,
+    .op.op    = SWS_OP_RSHIFT,
+    .setup    = ff_sws_setup_u8,
+    .flexible = true,
+);
+#endif /* BIT_DEPTH != 8 */
+
+/**
+ * Convert all four integer components to an f32block_t each and continue the
+ * chain with the converted blocks.
+ *
+ * Every component is converted, regardless of the X/Y/Z/W pattern flags.
+ */
+DECL_PATTERN(convert_float)
+{
+    f32block_t fx, fy, fz, fw;
+
+    SWS_LOOP
+    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
+        fx[idx] = x[idx];
+        fy[idx] = y[idx];
+        fz[idx] = z[idx];
+        fw[idx] = w[idx];
+    }
+
+    CONTINUE(f32block_t, fx, fy, fz, fw);
+}
+
+WRAP_COMMON_PATTERNS(convert_float,
+    .op.op = SWS_OP_CONVERT,
+    .op.convert.to = SWS_PIXEL_F32,
+);
+
+/**
+ * Swizzle by directly swapping the order of arguments to the continuation.
+ * Note that this is only safe to do if no arguments are duplicated.
+ *
+ * DECL_SWIZZLE(X, Y, Z, W) declares a kernel that forwards its input blocks
+ * cX, cY, cZ, cW (in that order) to the continuation, together with a table
+ * entry matching SWS_SWIZZLE(X, Y, Z, W).
+ */
+#define DECL_SWIZZLE(X, Y, Z, W) \
+static SWS_FUNC void \
+fn(swizzle_##X##Y##Z##W)(SwsOpIter *restrict iter, \
+                         const SwsOpImpl *restrict impl, \
+                         block_t c0, block_t c1, block_t c2, block_t c3) \
+{ \
+    CONTINUE(block_t, c##X, c##Y, c##Z, c##W); \
+} \
+ \
+DECL_ENTRY(swizzle_##X##Y##Z##W, \
+    .op.op = SWS_OP_SWIZZLE, \
+    .op.swizzle = SWS_SWIZZLE(X, Y, Z, W), \
+);
+
+DECL_SWIZZLE(3, 0, 1, 2)
+DECL_SWIZZLE(3, 0, 2, 1)
+DECL_SWIZZLE(2, 1, 0, 3)
+DECL_SWIZZLE(3, 2, 1, 0)
+DECL_SWIZZLE(3, 1, 0, 2)
+DECL_SWIZZLE(3, 2, 0, 1)
+DECL_SWIZZLE(1, 2, 0, 3)
+DECL_SWIZZLE(1, 0, 2, 3)
+DECL_SWIZZLE(2, 0, 1, 3)
+DECL_SWIZZLE(2, 3, 1, 0)
+DECL_SWIZZLE(2, 1, 3, 0)
+DECL_SWIZZLE(1, 2, 3, 0)
+DECL_SWIZZLE(1, 3, 2, 0)
+DECL_SWIZZLE(0, 2, 1, 3)
+DECL_SWIZZLE(0, 2, 3, 1)
+DECL_SWIZZLE(0, 3, 1, 2)
+DECL_SWIZZLE(3, 1, 2, 0)
+DECL_SWIZZLE(0, 3, 2, 1)
+
+/**
+ * Broadcast luma -> rgb (only used for y(a) -> rgb(a)).
+ *
+ * DECL_EXPAND_LUMA(X, W, T0, T1) copies c0 into the two scratch blocks T0 and
+ * T1 and continues with (cX, T0, T1, cW). The table entry matches
+ * SWS_SWIZZLE(X, 0, 0, W). T0 and T1 must name two of the input blocks that
+ * are not otherwise forwarded, since they are overwritten.
+ */
+#define DECL_EXPAND_LUMA(X, W, T0, T1) \
+static SWS_FUNC void \
+fn(expand_luma_##X##W)(SwsOpIter *restrict iter, \
+                       const SwsOpImpl *restrict impl, \
+                       block_t c0, block_t c1, block_t c2, block_t c3) \
+{ \
+    SWS_LOOP \
+    for (int i = 0; i < SWS_BLOCK_SIZE; i++) \
+        T0[i] = T1[i] = c0[i]; \
+ \
+    CONTINUE(block_t, c##X, T0, T1, c##W); \
+} \
+ \
+DECL_ENTRY(expand_luma_##X##W, \
+    .op.op = SWS_OP_SWIZZLE, \
+    .op.swizzle = SWS_SWIZZLE(X, 0, 0, W), \
+);
+
+DECL_EXPAND_LUMA(0, 3, c1, c2)
+DECL_EXPAND_LUMA(3, 0, c1, c2)
+DECL_EXPAND_LUMA(1, 0, c2, c3)
+DECL_EXPAND_LUMA(0, 1, c2, c3)
+
+/*
+ * Table of all kernels exposed by this integer instantiation. Which entries
+ * are present depends on BIT_DEPTH:
+ *   - sub-byte read/write and the expand16/expand32 conversions only exist
+ *     for 8-bit pixels
+ *   - each pack/unpack layout is listed only for one pixel depth
+ *   - swap_bytes requires SWAP_BYTES, shifts require more than 8 bits
+ *   - the conversion to the instantiation's own integer type is omitted
+ *
+ * NOTE(review): the trailing all-zero entry looks like the list terminator;
+ * confirm against the code that walks SwsOpTable.entries.
+ */
+static const SwsOpTable fn(op_table_int) = {
+    .block_size = SWS_BLOCK_SIZE,
+    .entries = {
+        fn(op_read_planar1),
+        fn(op_read_planar2),
+        fn(op_read_planar3),
+        fn(op_read_planar4),
+        fn(op_read_packed2),
+        fn(op_read_packed3),
+        fn(op_read_packed4),
+
+        fn(op_write_planar1),
+        fn(op_write_planar2),
+        fn(op_write_planar3),
+        fn(op_write_planar4),
+        fn(op_write_packed2),
+        fn(op_write_packed3),
+        fn(op_write_packed4),
+
+#if BIT_DEPTH == 8
+        fn(op_read_bits1),
+        fn(op_read_nibbles1),
+        fn(op_write_bits1),
+        fn(op_write_nibbles1),
+
+        fn(op_pack_1210),
+        fn(op_pack_2330),
+        fn(op_pack_3320),
+
+        fn(op_unpack_1210),
+        fn(op_unpack_2330),
+        fn(op_unpack_3320),
+
+
+        REF_COMMON_PATTERNS(expand16),
+        REF_COMMON_PATTERNS(expand32),
+#elif BIT_DEPTH == 16
+        fn(op_pack_4440),
+        fn(op_pack_5550),
+        fn(op_pack_5650),
+        fn(op_unpack_4440),
+        fn(op_unpack_5550),
+        fn(op_unpack_5650),
+#elif BIT_DEPTH == 32
+        fn(op_pack_2101010),
+        fn(op_pack_1010102),
+        fn(op_unpack_2101010),
+        fn(op_unpack_1010102),
+#endif
+
+#ifdef SWAP_BYTES
+        REF_COMMON_PATTERNS(swap_bytes),
+#endif
+
+        REF_COMMON_PATTERNS(min),
+        REF_COMMON_PATTERNS(max),
+        REF_COMMON_PATTERNS(scale),
+        REF_COMMON_PATTERNS(convert_float),
+
+        fn(op_clear_1110),
+        fn(op_clear_0111),
+        fn(op_clear_0011),
+        fn(op_clear_1001),
+        fn(op_clear_1100),
+        fn(op_clear_0101),
+        fn(op_clear_1010),
+        fn(op_clear_1000),
+        fn(op_clear_0100),
+        fn(op_clear_0010),
+
+        fn(op_swizzle_3012),
+        fn(op_swizzle_3021),
+        fn(op_swizzle_2103),
+        fn(op_swizzle_3210),
+        fn(op_swizzle_3102),
+        fn(op_swizzle_3201),
+        fn(op_swizzle_1203),
+        fn(op_swizzle_1023),
+        fn(op_swizzle_2013),
+        fn(op_swizzle_2310),
+        fn(op_swizzle_2130),
+        fn(op_swizzle_1230),
+        fn(op_swizzle_1320),
+        fn(op_swizzle_0213),
+        fn(op_swizzle_0231),
+        fn(op_swizzle_0312),
+        fn(op_swizzle_3120),
+        fn(op_swizzle_0321),
+
+        fn(op_expand_luma_03),
+        fn(op_expand_luma_30),
+        fn(op_expand_luma_10),
+        fn(op_expand_luma_01),
+
+#if BIT_DEPTH != 8
+        REF_COMMON_PATTERNS(lshift),
+        REF_COMMON_PATTERNS(rshift),
+        REF_COMMON_PATTERNS(convert_uint8),
+#endif /* BIT_DEPTH != 8 */
+
+#if BIT_DEPTH != 16
+        REF_COMMON_PATTERNS(convert_uint16),
+#endif
+#if BIT_DEPTH != 32
+        REF_COMMON_PATTERNS(convert_uint32),
+#endif
+
+        {{0}}
+    },
+};
+
+/*
+ * Drop the per-depth template parameters defined at the top of this file.
+ * NOTE(review): presumably so that the file can be included again with a
+ * different BIT_DEPTH; note that BIT_DEPTH itself is not undefined here.
+ */
+#undef PIXEL_TYPE
+#undef PIXEL_MAX
+#undef PIXEL_MIN
+#undef SWAP_BYTES
+#undef pixel_t
+#undef block_t
+#undef px
+
+#undef FMT_CHAR
+#undef IS_FLOAT
-- 
2.49.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 10/17] swscale/ops_backend: add reference backend based on C templates

Reply via email to