This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 8118e964bbe5acd0ec6a7665208ec9d3038c692f
Author:     Niklas Haas <[email protected]>
AuthorDate: Wed Apr 8 14:43:06 2026 +0200
Commit:     Niklas Haas <[email protected]>
CommitDate: Tue Jun 9 18:27:20 2026 +0200

    swscale/uops: auto-generate reference C backend from uops_macros.h

    Instead of choosing by hand which kernels to implement, this rewrite focuses on leveraging the power of uops_macros.h to auto-generate all needed kernels. This not only simplifies maintenance, but also improves performance.

    I have decided to develop the replacement backend as a separate file, under a separate prefix, for the explicit purpose of being able to verify the correctness of the rewrite using the current backend as a checkasm reference.

    The code for the kernels themselves has been largely copied from the old C backend, modified slightly to conform to the uop template style. This does result in some code duplication, but a following commit will clean it up. I nonetheless want to preserve this commit for bisection purposes, to ensure we have one commit that contains both backends side-by-side.

    Overall speedup=1.182x faster, min=0.197x max=3.450x

    The big slowdowns are flukes caused by tiny deviations in the runtime of a noop memcpy conversion.

    As a nice side benefit, the compiled binary is now also ~10% smaller, and the code ~50% smaller.

Signed-off-by: Niklas Haas <[email protected]> --- libswscale/Makefile | 1 + libswscale/ops.c | 2 + libswscale/uops_backend.c | 197 ++++++++++++ libswscale/uops_tmpl.c | 802 ++++++++++++++++++++++++++++++++++++++++++++++ libswscale/uops_tmpl.h | 146 +++++++++ 5 files changed, 1148 insertions(+) diff --git a/libswscale/Makefile b/libswscale/Makefile index 2738d66f35..f69b39972a 100644 --- a/libswscale/Makefile +++ b/libswscale/Makefile @@ -36,6 +36,7 @@ OBJS-$(CONFIG_UNSTABLE) += \ ops_memcpy.o \ ops_optimizer.o \ uops.o \ + uops_backend.o \ ifeq ($(CONFIG_UNSTABLE),yes) include $(SRC_PATH)/libswscale/vulkan/Makefile diff --git a/libswscale/ops.c b/libswscale/ops.c index b28dbec75f..719198e116 100644 --- a/libswscale/ops.c +++ b/libswscale/ops.c @@ -32,6 +32,7 @@ #include "ops_internal.h" extern const SwsOpBackend backend_c; +extern const SwsOpBackend backend_uops; extern const SwsOpBackend backend_murder; extern const SwsOpBackend backend_aarch64; extern const SwsOpBackend backend_x86; @@ -49,6 +50,7 @@ const SwsOpBackend * const ff_sws_op_backends[] = { #elif ARCH_X86_64 && HAVE_X86ASM &backend_x86, #endif + &backend_uops, &backend_c, #if HAVE_SPIRV_HEADERS_SPIRV_H || HAVE_SPIRV_UNIFIED1_SPIRV_H &backend_spirv, diff --git a/libswscale/uops_backend.c b/libswscale/uops_backend.c new file mode 100644 index 0000000000..591fc154db --- /dev/null +++ b/libswscale/uops_backend.c @@ -0,0 +1,197 @@ +/** + * Copyright (C) 2026 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/avassert.h" + +#include "uops_tmpl.h" + +/** + * We want to disable FP contraction because this is a reference backend that + * establishes a bit-exact reference result. + */ +#ifdef __clang__ +#pragma STDC FP_CONTRACT OFF +#elif AV_GCC_VERSION_AT_LEAST(4, 8) +#pragma GCC optimize ("fp-contract=off") +#elif defined(_MSC_VER) +#pragma fp_contract (off) +#endif + +#if AV_GCC_VERSION_AT_LEAST(4, 4) +#pragma GCC optimize ("finite-math-only") +#endif + +/* Integer types */ +#define IS_FLOAT 0 +# define BIT_DEPTH 8 +# include "uops_tmpl.c" +# undef BIT_DEPTH +# define BIT_DEPTH 16 +# include "uops_tmpl.c" +# undef BIT_DEPTH +# define BIT_DEPTH 32 +# include "uops_tmpl.c" +# undef BIT_DEPTH +#undef IS_FLOAT + +/* Floating point types */ +#define IS_FLOAT 1 +# define BIT_DEPTH 32 +# include "uops_tmpl.c" +# undef BIT_DEPTH +#undef IS_FLOAT + +/* Expanded as new uop types are implemented in the C/template backend */ +#define REF_ALL_UOPS(TYPE) \ + SWS_FOR(TYPE, READ_PLANAR, REF_ENTRY) \ + SWS_FOR(TYPE, READ_PLANAR_FV, REF_ENTRY) \ + SWS_FOR(TYPE, READ_PLANAR_FH, REF_ENTRY) \ + SWS_FOR(TYPE, READ_PACKED, REF_ENTRY) \ + SWS_FOR(TYPE, READ_NIBBLE, REF_ENTRY) \ + SWS_FOR(TYPE, READ_BIT, REF_ENTRY) \ + SWS_FOR(TYPE, PERMUTE, REF_ENTRY) \ + SWS_FOR(TYPE, COPY, REF_ENTRY) \ + SWS_FOR(TYPE, WRITE_PLANAR, REF_ENTRY) \ + SWS_FOR(TYPE, WRITE_PACKED, REF_ENTRY) \ + SWS_FOR(TYPE, WRITE_NIBBLE, REF_ENTRY) \ + SWS_FOR(TYPE, WRITE_BIT, REF_ENTRY) \ + SWS_FOR(TYPE, SWAP_BYTES, REF_ENTRY) \ + SWS_FOR(TYPE, EXPAND_BIT, REF_ENTRY) \ + SWS_FOR(TYPE, EXPAND_PAIR, REF_ENTRY) \ + SWS_FOR(TYPE, EXPAND_QUAD, REF_ENTRY) \ + SWS_FOR(TYPE, TO_U8, REF_ENTRY) \ + SWS_FOR(TYPE, TO_U16, REF_ENTRY) \ 
+ SWS_FOR(TYPE, TO_U32, REF_ENTRY) \ + SWS_FOR(TYPE, TO_F32, REF_ENTRY) \ + SWS_FOR(TYPE, SCALE, REF_ENTRY) \ + SWS_FOR(TYPE, ADD, REF_ENTRY) \ + SWS_FOR(TYPE, MIN, REF_ENTRY) \ + SWS_FOR(TYPE, MAX, REF_ENTRY) \ + SWS_FOR(TYPE, UNPACK, REF_ENTRY) \ + SWS_FOR(TYPE, PACK, REF_ENTRY) \ + SWS_FOR(TYPE, LSHIFT, REF_ENTRY) \ + SWS_FOR(TYPE, RSHIFT, REF_ENTRY) \ + SWS_FOR(TYPE, CLEAR, REF_ENTRY) \ + SWS_FOR(TYPE, LINEAR, REF_ENTRY) \ + SWS_FOR(TYPE, DITHER, REF_ENTRY) \ + /* end of macro */ + +static const SwsOpTable op_table = { + .block_size = SWS_BLOCK_SIZE, + .uops = true, + .entries = { + REF_ALL_UOPS(U8) + REF_ALL_UOPS(U16) + REF_ALL_UOPS(U32) + REF_ALL_UOPS(F32) + NULL + }, +}; + +static void process(const SwsOpExec *exec, const void *priv, + const int bx_start, const int y_start, + int bx_end, int y_end) +{ + const SwsOpChain *chain = priv; + const SwsOpImpl *impl = chain->impl; + block_t x, y, z, w; /* allocate enough space for any intermediate */ + + SwsOpIter iterdata; + SwsOpIter *iter = &iterdata; /* for CONTINUE() macro to work */ + iter->exec = exec; + for (int i = 0; i < 4; i++) { + iter->in[i] = (uintptr_t) exec->in[i]; + iter->out[i] = (uintptr_t) exec->out[i]; + } + + for (iter->y = y_start; iter->y < y_end; iter->y++) { + for (int block = bx_start; block < bx_end; block++) { + iter->x = block * SWS_BLOCK_SIZE; + CONTINUE(&x, &y, &z, &w); + } + + const int y_bump = exec->in_bump_y ? 
exec->in_bump_y[iter->y] : 0; + for (int i = 0; i < 4; i++) { + iter->in[i] += exec->in_bump[i] + y_bump * exec->in_stride[i]; + iter->out[i] += exec->out_bump[i]; + } + } +} + +static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out) +{ + int ret; + + SwsOpChain *chain = ff_sws_op_chain_alloc(); + if (!chain) + return AVERROR(ENOMEM); + + SwsUOpList *uops = ff_sws_uop_list_alloc(); + if (!uops) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = ff_sws_ops_translate(ops, uops); + if (ret < 0) + goto fail; + + av_assert0(uops->num_ops > 0); + for (int i = 0; i < uops->num_ops; i++) { + const SwsOpTable *table = &op_table; + ret = ff_sws_uop_lookup(ctx, &table, 1, &uops->ops[i], + SWS_BLOCK_SIZE, chain); + if (ret < 0) + goto fail; + } + + *out = (SwsCompiledOp) { + .slice_align = 1, + .block_size = SWS_BLOCK_SIZE, + .cpu_flags = chain->cpu_flags, + .over_read = chain->over_read, + .over_write = chain->over_write, + .priv = chain, + .free = ff_sws_op_chain_free_cb, + .func = process, + }; + + av_log(ctx, AV_LOG_DEBUG, "Compiled micro-ops:\n"); + for (int i = 0; i < uops->num_ops; i++) { + char name[SWS_UOP_NAME_MAX]; + ff_sws_uop_name(&uops->ops[i], name); + av_log(ctx, AV_LOG_DEBUG, " %s\n", name); + } + + ff_sws_uop_list_free(&uops); + return 0; + +fail: + ff_sws_uop_list_free(&uops); + ff_sws_op_chain_free(chain); + return ret; +} + +const SwsOpBackend backend_uops = { + .name = "uops", + .flags = SWS_BACKEND_C, + .compile = compile, + .hw_format = AV_PIX_FMT_NONE, +}; diff --git a/libswscale/uops_tmpl.c b/libswscale/uops_tmpl.c new file mode 100644 index 0000000000..9e0d35ea1f --- /dev/null +++ b/libswscale/uops_tmpl.c @@ -0,0 +1,802 @@ +/** + * Copyright (C) 2026 Niklas Haas + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <libavutil/bswap.h> + +#include "uops_tmpl.h" + +#ifndef BIT_DEPTH +# define BIT_DEPTH 8 +#endif + +#if IS_FLOAT && BIT_DEPTH == 32 +# define PIXEL_TYPE SWS_PIXEL_F32 +# define pixel_t float +# define inter_t float +# define PX F32 +# define px f32 +#elif BIT_DEPTH == 32 +# define PIXEL_MAX 0xFFFFFFFFu +# define PIXEL_SWAP av_bswap32 +# define pixel_t uint32_t +# define inter_t int64_t +# define PX U32 +# define px u32 +#elif BIT_DEPTH == 16 +# define PIXEL_MAX 0xFFFFu +# define PIXEL_SWAP av_bswap16 +# define pixel_t uint16_t +# define inter_t int64_t +# define PX U16 +# define px u16 +#elif BIT_DEPTH == 8 +# define PIXEL_MAX 0xFFu +# define pixel_t uint8_t +# define inter_t int32_t +# define PX U8 +# define px u8 +#else +# error Invalid BIT_DEPTH +#endif + +/********************************* + * Generic read/write operations * + *********************************/ + +DECL_READ(read_planar, const SwsCompMask mask) +{ + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) x[i] = in0[i]; + if (Y) y[i] = in1[i]; + if (Z) z[i] = in2[i]; + if (W) w[i] = in3[i]; + } + + if (X) iter->in[0] += SIZEOF_BLOCK; + if (Y) iter->in[1] += SIZEOF_BLOCK; + if (Z) iter->in[2] += SIZEOF_BLOCK; + if (W) iter->in[3] += SIZEOF_BLOCK; + + 
CONTINUE(x, y, z, w); +} + +DECL_READ(read_packed, const SwsCompMask mask) +{ + const int elems = W ? 4 : Z ? 3 : Y ? 2 : 1; + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) x[i] = in0[elems * i + 0]; + if (Y) y[i] = in0[elems * i + 1]; + if (Z) z[i] = in0[elems * i + 2]; + if (W) w[i] = in0[elems * i + 3]; + } + + iter->in[0] += SIZEOF_BLOCK * elems; + CONTINUE(x, y, z, w); +} + +DECL_WRITE(write_planar, const SwsCompMask mask) +{ + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) out0[i] = x[i]; + if (Y) out1[i] = y[i]; + if (Z) out2[i] = z[i]; + if (W) out3[i] = w[i]; + } + + if (X) iter->out[0] += SIZEOF_BLOCK; + if (Y) iter->out[1] += SIZEOF_BLOCK; + if (Z) iter->out[2] += SIZEOF_BLOCK; + if (W) iter->out[3] += SIZEOF_BLOCK; +} + +DECL_WRITE(write_packed, const SwsCompMask mask) +{ + const int elems = W ? 4 : Z ? 3 : Y ? 2 : 1; + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) out0[elems * i + 0] = x[i]; + if (Y) out0[elems * i + 1] = y[i]; + if (Z) out0[elems * i + 2] = z[i]; + if (W) out0[elems * i + 3] = w[i]; + } + + iter->out[0] += SIZEOF_BLOCK * elems; +} + +#if BIT_DEPTH == 8 + +DECL_READ(read_bit, const SwsCompMask mask) +{ + av_assert2(mask == SWS_COMP_ELEMS(1)); + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) { + const pixel_t val = ((const pixel_t *) in0)[i >> 3]; + x[i + 0] = (val >> 7) & 1; + x[i + 1] = (val >> 6) & 1; + x[i + 2] = (val >> 5) & 1; + x[i + 3] = (val >> 4) & 1; + x[i + 4] = (val >> 3) & 1; + x[i + 5] = (val >> 2) & 1; + x[i + 6] = (val >> 1) & 1; + x[i + 7] = (val >> 0) & 1; + } + + iter->in[0] += SIZEOF_BLOCK >> 3; + CONTINUE(x, y, z, w); +} + +DECL_READ(read_nibble, const SwsCompMask mask) +{ + av_assert2(mask == SWS_COMP_ELEMS(1)); + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i += 2) { + const pixel_t val = in0[i >> 1]; + x[i + 0] = val >> 4; /* high nibble */ + x[i + 1] = val & 0xF; /* low nibble */ + } + + iter->in[0] += SIZEOF_BLOCK >> 1; + CONTINUE(x, y, z, w); 
+} + +DECL_WRITE(write_bit, const SwsCompMask mask) +{ + av_assert2(mask == SWS_COMP_ELEMS(1)); + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) { + out0[i >> 3] = x[i + 0] << 7 | + x[i + 1] << 6 | + x[i + 2] << 5 | + x[i + 3] << 4 | + x[i + 4] << 3 | + x[i + 5] << 2 | + x[i + 6] << 1 | + x[i + 7]; + } + + iter->out[0] += SIZEOF_BLOCK >> 3; +} + +DECL_WRITE(write_nibble, const SwsCompMask mask) +{ + av_assert2(mask == SWS_COMP_ELEMS(1)); + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i += 2) + out0[i >> 1] = x[i] << 4 | x[i + 1]; + + iter->out[0] += SIZEOF_BLOCK >> 1; +} + +#endif /* BIT_DEPTH == 8 */ + +SWS_FOR(PX, READ_PLANAR, DECL_IMPL_READ, read_planar) +SWS_FOR(PX, READ_PACKED, DECL_IMPL_READ, read_packed) +SWS_FOR(PX, READ_NIBBLE, DECL_IMPL_READ, read_nibble) +SWS_FOR(PX, READ_BIT, DECL_IMPL_READ, read_bit) +SWS_FOR(PX, WRITE_PLANAR, DECL_IMPL_WRITE, write_planar) +SWS_FOR(PX, WRITE_PACKED, DECL_IMPL_WRITE, write_packed) +SWS_FOR(PX, WRITE_NIBBLE, DECL_IMPL_WRITE, write_nibble) +SWS_FOR(PX, WRITE_BIT, DECL_IMPL_WRITE, write_bit) + +SWS_FOR_STRUCT(PX, READ_PLANAR, DECL_ENTRY) +SWS_FOR_STRUCT(PX, READ_PACKED, DECL_ENTRY) +SWS_FOR_STRUCT(PX, READ_NIBBLE, DECL_ENTRY) +SWS_FOR_STRUCT(PX, READ_BIT, DECL_ENTRY) +SWS_FOR_STRUCT(PX, WRITE_PLANAR, DECL_ENTRY) +SWS_FOR_STRUCT(PX, WRITE_PACKED, DECL_ENTRY) +SWS_FOR_STRUCT(PX, WRITE_NIBBLE, DECL_ENTRY) +SWS_FOR_STRUCT(PX, WRITE_BIT, DECL_ENTRY) + +/***************************** + * Scaling / filtering reads * + *****************************/ + +DECL_SETUP(setup_filter_v, params, out) +{ + const SwsFilterWeights *filter = params->uop->data.kernel; + static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]), + ">8 byte pointers not supported"); + + /* Pre-convert weights to float */ + float *weights = av_calloc(filter->num_weights, sizeof(float)); + if (!weights) + return AVERROR(ENOMEM); + + for (int i = 0; i < filter->num_weights; i++) + weights[i] = (float) filter->weights[i] / SWS_FILTER_SCALE; + + 
out->priv.ptr = weights; + out->priv.i32[2] = filter->filter_size; + out->free = ff_op_priv_free; + return 0; +} + +/* Fully general vertical planar filter case */ +DECL_READ(read_planar_fv, const SwsCompMask mask) +{ + const SwsOpExec *exec = iter->exec; + const float *restrict weights = impl->priv.ptr; + const int filter_size = impl->priv.i32[2]; + weights += filter_size * iter->y; + + block_t xs, ys, zs, ws; + if (X) memset(&xs.f32, 0, sizeof(xs.f32)); + if (Y) memset(&ys.f32, 0, sizeof(ys.f32)); + if (Z) memset(&zs.f32, 0, sizeof(zs.f32)); + if (W) memset(&ws.f32, 0, sizeof(ws.f32)); + + for (int j = 0; j < filter_size; j++) { + const float weight = weights[j]; + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) xs.f32[i] += weight * in0[i]; + if (Y) ys.f32[i] += weight * in1[i]; + if (Z) zs.f32[i] += weight * in2[i]; + if (W) ws.f32[i] += weight * in3[i]; + } + + if (X) in0 = bump_ptr(in0, exec->in_stride[0]); + if (Y) in1 = bump_ptr(in1, exec->in_stride[1]); + if (Z) in2 = bump_ptr(in2, exec->in_stride[2]); + if (W) in3 = bump_ptr(in3, exec->in_stride[3]); + } + + if (X) iter->in[0] += SIZEOF_BLOCK; + if (Y) iter->in[1] += SIZEOF_BLOCK; + if (Z) iter->in[2] += SIZEOF_BLOCK; + if (W) iter->in[3] += SIZEOF_BLOCK; + + CONTINUE(&xs, &ys, &zs, &ws); +} + +DECL_SETUP(setup_filter_h, params, out) +{ + SwsFilterWeights *filter = params->uop->data.kernel; + out->priv.ptr = av_refstruct_ref(filter->weights); + out->priv.i32[2] = filter->filter_size; + out->free = ff_op_priv_unref; + return 0; +} + +/* Fully general horizontal planar filter case */ +DECL_READ(read_planar_fh, const SwsCompMask mask) +{ + const SwsOpExec *exec = iter->exec; + const int *restrict weights = impl->priv.ptr; + const int filter_size = impl->priv.i32[2]; + const float scale = 1.0f / SWS_FILTER_SCALE; + const int xpos = iter->x; + weights += filter_size * iter->x; + + block_t xs, ys, zs, ws; + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + const int offset = exec->in_offset_x[xpos + 
i]; + pixel_t *start0 = bump_ptr(in0, offset); + pixel_t *start1 = bump_ptr(in1, offset); + pixel_t *start2 = bump_ptr(in2, offset); + pixel_t *start3 = bump_ptr(in3, offset); + + inter_t sx = 0, sy = 0, sz = 0, sw = 0; + for (int j = 0; j < filter_size; j++) { + const int weight = weights[j]; + if (X) sx += weight * start0[j]; + if (Y) sy += weight * start1[j]; + if (Z) sz += weight * start2[j]; + if (W) sw += weight * start3[j]; + } + + if (X) xs.f32[i] = (float) sx * scale; + if (Y) ys.f32[i] = (float) sy * scale; + if (Z) zs.f32[i] = (float) sz * scale; + if (W) ws.f32[i] = (float) sw * scale; + + weights += filter_size; + } + + CONTINUE(&xs, &ys, &zs, &ws); +} + +SWS_FOR(PX, READ_PLANAR_FV, DECL_IMPL_READ, read_planar_fv) +SWS_FOR(PX, READ_PLANAR_FH, DECL_IMPL_READ, read_planar_fh) +SWS_FOR_STRUCT(PX, READ_PLANAR_FV, DECL_ENTRY, .setup = fn(setup_filter_v) ) +SWS_FOR_STRUCT(PX, READ_PLANAR_FH, DECL_ENTRY, .setup = fn(setup_filter_h) ) + +/*************************** + * Permutation and copying * + ***************************/ + +/* Permute by directly swapping the order of arguments to the continuation. */ +#define DECL_PERMUTE(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3) \ + static void NAME##_c(SwsOpIter *restrict iter, \ + const SwsOpImpl *restrict impl, \ + void *restrict in0, void *restrict in1, \ + void *restrict in2, void *restrict in3) \ + { \ + CONTINUE(in##IDX0, in##IDX1, in##IDX2, in##IDX3); \ + } + +#define DECL_COPY(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3) \ + static void NAME##_c(SwsOpIter *restrict iter, \ + const SwsOpImpl *restrict impl, \ + void *restrict in0, void *restrict in1, \ + void *restrict in2, void *restrict in3) \ + { \ + const SwsCompMask mask = (MASK); \ + block_t x, y, z, w; \ + \ + if (X) memcpy(&x.px, in##IDX0, SIZEOF_BLOCK); \ + if (Y) memcpy(&y.px, in##IDX1, SIZEOF_BLOCK); \ + if (Z) memcpy(&z.px, in##IDX2, SIZEOF_BLOCK); \ + if (W) memcpy(&w.px, in##IDX3, SIZEOF_BLOCK); \ + \ + CONTINUE(X ? 
&x : in0, Y ? &y : in1, Z ? &z : in2, W ? &w : in3); \ + } + +SWS_FOR(PX, PERMUTE, DECL_PERMUTE) +SWS_FOR(PX, COPY, DECL_COPY) +SWS_FOR_STRUCT(PX, PERMUTE, DECL_ENTRY) +SWS_FOR_STRUCT(PX, COPY, DECL_ENTRY) + +/********************* + * Format conversion * + *********************/ + +#define DECL_CAST(DST, dst) \ + DECL_FUNC(to_##dst, const SwsCompMask mask) \ + { \ + block_t xx, yy, zz, ww; \ + \ + SWS_LOOP \ + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { \ + if (X) xx.dst[i] = x[i]; \ + if (Y) yy.dst[i] = y[i]; \ + if (Z) zz.dst[i] = z[i]; \ + if (W) ww.dst[i] = w[i]; \ + } \ + \ + CONTINUE(&xx, &yy, &zz, &ww); \ + } \ + \ + SWS_FOR(PX, TO_##DST, DECL_IMPL, to_##dst) \ + SWS_FOR_STRUCT(PX, TO_##DST, DECL_ENTRY) + +DECL_CAST(U8, u8) +DECL_CAST(U16, u16) +DECL_CAST(U32, u32) +DECL_CAST(F32, f32) + +/******************** + * Bit manipulation * + ********************/ + +#if !IS_FLOAT +DECL_FUNC(lshift, const SwsCompMask mask, const uint8_t amount) +{ + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) x[i] <<= amount; + if (Y) y[i] <<= amount; + if (Z) z[i] <<= amount; + if (W) w[i] <<= amount; + } + + CONTINUE(x, y, z, w); +} + +DECL_FUNC(rshift, const SwsCompMask mask, const uint8_t amount) +{ + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) x[i] >>= amount; + if (Y) y[i] >>= amount; + if (Z) z[i] >>= amount; + if (W) w[i] >>= amount; + } + + CONTINUE(x, y, z, w); +} +#endif + +SWS_FOR(PX, LSHIFT, DECL_IMPL, lshift) +SWS_FOR(PX, RSHIFT, DECL_IMPL, rshift) + +SWS_FOR_STRUCT(PX, LSHIFT, DECL_ENTRY) +SWS_FOR_STRUCT(PX, RSHIFT, DECL_ENTRY) + +#ifdef PIXEL_SWAP +DECL_FUNC(swap_bytes, const SwsCompMask mask) +{ + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) x[i] = PIXEL_SWAP(x[i]); + if (Y) y[i] = PIXEL_SWAP(y[i]); + if (Z) z[i] = PIXEL_SWAP(z[i]); + if (W) w[i] = PIXEL_SWAP(w[i]); + } + + CONTINUE(x, y, z, w); +} +#endif /* PIXEL_SWAP */ + +SWS_FOR(PX, SWAP_BYTES, DECL_IMPL, swap_bytes) +SWS_FOR_STRUCT(PX, SWAP_BYTES, 
DECL_ENTRY) + +#ifdef PIXEL_MAX +DECL_FUNC(expand_bit, const SwsCompMask mask) +{ + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) x[i] = x[i] ? PIXEL_MAX : 0; + if (Y) y[i] = y[i] ? PIXEL_MAX : 0; + if (Z) z[i] = z[i] ? PIXEL_MAX : 0; + if (W) w[i] = w[i] ? PIXEL_MAX : 0; + } + + CONTINUE(x, y, z, w); +} +#endif + +#if BIT_DEPTH == 8 +DECL_FUNC(expand_pair, const SwsCompMask mask) +{ + block_t x16, y16, z16, w16; + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) x16.u16[i] = x[i] << 8 | x[i]; + if (Y) y16.u16[i] = y[i] << 8 | y[i]; + if (Z) z16.u16[i] = z[i] << 8 | z[i]; + if (W) w16.u16[i] = w[i] << 8 | w[i]; + } + + CONTINUE(&x16, &y16, &z16, &w16); +} + +DECL_FUNC(expand_quad, const SwsCompMask mask) +{ + block_t x32, y32, z32, w32; + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) x32.u32[i] = (uint32_t) x[i] << 24 | x[i] << 16 | x[i] << 8 | x[i]; + if (Y) y32.u32[i] = (uint32_t) y[i] << 24 | y[i] << 16 | y[i] << 8 | y[i]; + if (Z) z32.u32[i] = (uint32_t) z[i] << 24 | z[i] << 16 | z[i] << 8 | z[i]; + if (W) w32.u32[i] = (uint32_t) w[i] << 24 | w[i] << 16 | w[i] << 8 | w[i]; + } + + CONTINUE(&x32, &y32, &z32, &w32); +} +#endif /* BIT_DEPTH == 8 */ + +SWS_FOR(PX, EXPAND_BIT, DECL_IMPL, expand_bit) +SWS_FOR(PX, EXPAND_PAIR, DECL_IMPL, expand_pair) +SWS_FOR(PX, EXPAND_QUAD, DECL_IMPL, expand_quad) +SWS_FOR_STRUCT(PX, EXPAND_BIT, DECL_ENTRY) +SWS_FOR_STRUCT(PX, EXPAND_PAIR, DECL_ENTRY) +SWS_FOR_STRUCT(PX, EXPAND_QUAD, DECL_ENTRY) + +/************************* + * Packing and unpacking * + ************************/ + +#if !IS_FLOAT +DECL_FUNC(unpack, const SwsCompMask mask, + const uint8_t bx, const uint8_t by, + const uint8_t bz, const uint8_t bw) +{ + const uint8_t sx = bw + bz + by; + const uint8_t sy = bw + bz; + const uint8_t sz = bw; + const uint8_t sw = 0; + + const pixel_t mx = (1 << bx) - 1; + const pixel_t my = (1 << by) - 1; + const pixel_t mz = (1 << bz) - 1; + const pixel_t mw = (1 << bw) - 1; + + 
SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + const pixel_t val = x[i]; + if (X) x[i] = (val >> sx) & mx; + if (Y) y[i] = (val >> sy) & my; + if (Z) z[i] = (val >> sz) & mz; + if (W) w[i] = (val >> sw) & mw; + } + + CONTINUE(x, y, z, w); +} + +DECL_FUNC(pack, const SwsCompMask mask, + const uint8_t bx, const uint8_t by, + const uint8_t bz, const uint8_t bw) +{ + const uint8_t sx = bw + bz + by; + const uint8_t sy = bw + bz; + const uint8_t sz = bw; + const uint8_t sw = 0; + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + pixel_t val = 0; + if (X) val |= x[i] << sx; + if (Y) val |= y[i] << sy; + if (Z) val |= z[i] << sz; + if (W) val |= w[i] << sw; + x[i] = val; + } + + CONTINUE(x, y, z, w); +} +#endif /* !IS_FLOAT */ + +SWS_FOR(PX, UNPACK, DECL_IMPL, unpack) +SWS_FOR(PX, PACK, DECL_IMPL, pack) +SWS_FOR_STRUCT(PX, UNPACK, DECL_ENTRY) +SWS_FOR_STRUCT(PX, PACK, DECL_ENTRY) + +/*********************** + * Pixel data clearing * + ***********************/ + +#ifdef PIXEL_MAX +DECL_FUNC(clear, const SwsCompMask mask, const SwsCompMask one, + const SwsCompMask zero) +{ + #define ONE(N) SWS_COMP_TEST(one, N) + #define ZERO(N) SWS_COMP_TEST(zero, N) + const pixel_t cx = ONE(0) ? PIXEL_MAX : ZERO(0) ? 0 : impl->priv.px[0]; + const pixel_t cy = ONE(1) ? PIXEL_MAX : ZERO(1) ? 0 : impl->priv.px[1]; + const pixel_t cz = ONE(2) ? PIXEL_MAX : ZERO(2) ? 0 : impl->priv.px[2]; + const pixel_t cw = ONE(3) ? PIXEL_MAX : ZERO(3) ? 
0 : impl->priv.px[3]; + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) x[i] = cx; + if (Y) y[i] = cy; + if (Z) z[i] = cz; + if (W) w[i] = cw; + } + + CONTINUE(x, y, z, w); +} +#endif + +SWS_FOR(PX, CLEAR, DECL_IMPL, clear) +SWS_FOR_STRUCT(PX, CLEAR, DECL_ENTRY, .setup = ff_sws_setup_vec4) + +/************************* + * Arithmetic operations * + *************************/ + +DECL_FUNC(scale, const SwsCompMask mask) +{ + const pixel_t scale = impl->priv.px[0]; + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) x[i] *= scale; + if (Y) y[i] *= scale; + if (Z) z[i] *= scale; + if (W) w[i] *= scale; + } + + CONTINUE(x, y, z, w); +} + +DECL_FUNC(add, const SwsCompMask mask) +{ + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) x[i] += impl->priv.px[0]; + if (Y) y[i] += impl->priv.px[1]; + if (Z) z[i] += impl->priv.px[2]; + if (W) w[i] += impl->priv.px[3]; + } + + CONTINUE(x, y, z, w); +} + +DECL_FUNC(min, const SwsCompMask mask) +{ + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) x[i] = FFMIN(x[i], impl->priv.px[0]); + if (Y) y[i] = FFMIN(y[i], impl->priv.px[1]); + if (Z) z[i] = FFMIN(z[i], impl->priv.px[2]); + if (W) w[i] = FFMIN(w[i], impl->priv.px[3]); + } + + CONTINUE(x, y, z, w); +} + +DECL_FUNC(max, const SwsCompMask mask) +{ + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) x[i] = FFMAX(x[i], impl->priv.px[0]); + if (Y) y[i] = FFMAX(y[i], impl->priv.px[1]); + if (Z) z[i] = FFMAX(z[i], impl->priv.px[2]); + if (W) w[i] = FFMAX(w[i], impl->priv.px[3]); + } + + CONTINUE(x, y, z, w); +} + +SWS_FOR(PX, SCALE, DECL_IMPL, scale) +SWS_FOR(PX, ADD, DECL_IMPL, add) +SWS_FOR(PX, MIN, DECL_IMPL, min) +SWS_FOR(PX, MAX, DECL_IMPL, max) +SWS_FOR_STRUCT(PX, SCALE, DECL_ENTRY, .setup = ff_sws_setup_scalar ) +SWS_FOR_STRUCT(PX, ADD, DECL_ENTRY, .setup = ff_sws_setup_vec4 ) +SWS_FOR_STRUCT(PX, MIN, DECL_ENTRY, .setup = ff_sws_setup_vec4 ) +SWS_FOR_STRUCT(PX, MAX, DECL_ENTRY, .setup = ff_sws_setup_vec4 ) 
+ +/************* + * Dithering * + *************/ + +DECL_SETUP(setup_dither, params, out) +{ + const SwsUOp *uop = params->uop; + const SwsDitherUOp *dither = &uop->par.dither; + const int size = 1 << dither->size_log2; + if (size >= SWS_BLOCK_SIZE) { + /* No extra padding needed */ + out->priv.ptr = av_refstruct_ref(uop->data.ptr); + out->free = ff_op_priv_unref; + return 0; + } + + const int stride = FFMAX(size, SWS_BLOCK_SIZE); + const int height = ff_sws_dither_height(dither); + pixel_t *matrix = av_malloc(sizeof(pixel_t) * height * stride); + if (!matrix) + return AVERROR(ENOMEM); + out->priv.ptr = matrix; + out->free = ff_op_priv_free; + + /* Pad to multiple of block size. We don't need extra padding for the + * height because ff_sws_dither_height() already includes any padding + * necessary for the y_offset */ + for (int y = 0; y < height; y++) { + pixel_t *row = &matrix[y * stride]; + for (int x = 0; x < size; x++) + row[x] = uop->data.ptr[y * size + x].px; + for (int x = size; x < stride; x++) + row[x] = row[x % size]; + } + + return 0; +} + +DECL_FUNC(dither, const SwsCompMask mask, + const uint8_t off0, const uint8_t off1, + const uint8_t off2, const uint8_t off3, + const uint8_t size_log2) +{ + const int size = 1 << size_log2; + const int stride = FFMAX(size, SWS_BLOCK_SIZE); + + const pixel_t *matrix = impl->priv.ptr; + matrix += (iter->y & (size - 1)) * stride; + matrix += (iter->x & (size - 1)) & ~(SWS_BLOCK_SIZE - 1); + + const pixel_t *const row0 = &matrix[off0 * stride]; + const pixel_t *const row1 = &matrix[off1 * stride]; + const pixel_t *const row2 = &matrix[off2 * stride]; + const pixel_t *const row3 = &matrix[off3 * stride]; + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + if (X) x[i] += row0[i]; + if (Y) y[i] += row1[i]; + if (Z) z[i] += row2[i]; + if (W) w[i] += row3[i]; + } + + CONTINUE(x, y, z, w); +} + +SWS_FOR(PX, DITHER, DECL_IMPL, dither) +SWS_FOR_STRUCT(PX, DITHER, DECL_ENTRY, .setup = fn(setup_dither) ) + 
+/********************* + * Linear operations * + *********************/ + +typedef struct { + /* Stored in split form for convenience */ + pixel_t m[4][4]; + pixel_t k[4]; +} fn(LinCoeffs); + +DECL_SETUP(setup_linear, params, out) +{ + const SwsUOp *uop = params->uop; + fn(LinCoeffs) c; + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + c.m[i][j] = uop->data.mat4[i][j].px; + c.k[i] = uop->data.mat4[i][4].px; + } + + out->priv.ptr = av_memdup(&c, sizeof(c)); + out->free = ff_op_priv_free; + return out->priv.ptr ? 0 : AVERROR(ENOMEM); +} + +/** + * Fully general case for a 5x5 linear affine transformation. Should never be + * called without constant `mask`. This function will compile down to the + * appropriately optimized version for the required subset of operations when + * called with a constant mask. + */ +DECL_FUNC(linear, const SwsCompMask mask, const uint32_t one, const uint32_t zero) +{ + const fn(LinCoeffs) c = *(const fn(LinCoeffs) *) impl->priv.ptr; + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + const pixel_t xx = x[i]; + const pixel_t yy = y[i]; + const pixel_t zz = z[i]; + const pixel_t ww = w[i]; + +#define LIN_VAL(I, J, val) \ + ((one & SWS_MASK(I, J)) ? (val) : c.m[I][J] * (val)) + +#define LIN_ROW(I, var) do { \ + var[i] = (zero & SWS_MASK(I, 4)) ? 
0 : c.k[I]; \ + if (!(zero & SWS_MASK(I, 0))) var[i] += LIN_VAL(I, 0, xx); \ + if (!(zero & SWS_MASK(I, 1))) var[i] += LIN_VAL(I, 1, yy); \ + if (!(zero & SWS_MASK(I, 2))) var[i] += LIN_VAL(I, 2, zz); \ + if (!(zero & SWS_MASK(I, 3))) var[i] += LIN_VAL(I, 3, ww); \ +} while (0) + + if (X) LIN_ROW(0, x); + if (Y) LIN_ROW(1, y); + if (Z) LIN_ROW(2, z); + if (W) LIN_ROW(3, w); + } + + CONTINUE(x, y, z, w); +} + +SWS_FOR(PX, LINEAR, DECL_IMPL, linear) +SWS_FOR_STRUCT(PX, LINEAR, DECL_ENTRY, .setup = fn(setup_linear) ) + +#undef PIXEL_MAX +#undef PIXEL_SWAP +#undef pixel_t +#undef inter_t +#undef block_t +#undef PX +#undef px diff --git a/libswscale/uops_tmpl.h b/libswscale/uops_tmpl.h new file mode 100644 index 0000000000..80c6e5221d --- /dev/null +++ b/libswscale/uops_tmpl.h @@ -0,0 +1,146 @@ +/** + * Copyright (C) 2026 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef SWSCALE_UOPS_TMPL_H +#define SWSCALE_UOPS_TMPL_H + +/** + * Helper macros for the C-based backend. + * + * To use these macros, `pixel_t` should be defined as the type of pixels. 
+ */ + +#include <assert.h> +#include <float.h> +#include <stdint.h> + +#include "libavutil/attributes.h" + +#include "ops_chain.h" +#include "uops_macros.h" + +#ifndef SWS_BLOCK_SIZE +# define SWS_BLOCK_SIZE 32 +#endif + +typedef union block_t { + uint8_t u8[SWS_BLOCK_SIZE]; + uint16_t u16[SWS_BLOCK_SIZE]; + uint32_t u32[SWS_BLOCK_SIZE]; + float f32[SWS_BLOCK_SIZE]; +} block_t; + +#define SIZEOF_BLOCK (sizeof(pixel_t) * SWS_BLOCK_SIZE) + +/** + * Internal context holding per-iter execution data. The data pointers will be + * directly incremented by the corresponding read/write functions. + */ +typedef struct SwsOpIter { + uintptr_t in[4]; + uintptr_t out[4]; + int x, y; + + /* Link back to per-slice execution context */ + const SwsOpExec *exec; +} SwsOpIter; + +#ifdef __clang__ +# define SWS_LOOP AV_PRAGMA(clang loop vectorize(assume_safety)) +#elif defined(__GNUC__) +# define SWS_LOOP AV_PRAGMA(GCC ivdep) +#else +# define SWS_LOOP +#endif + +/* Miscellaneous helpers */ +#define bitfn2(name, ext) name ## _ ## ext +#define bitfn(name, ext) bitfn2(name, ext) +#define fn(name) bitfn(name, PX) + +#define bump_ptr(ptr, bump) ((pixel_t *) ((uintptr_t) (ptr) + (bump))) + +/* Helpers for dealing with component masks */ +#define X SWS_COMP_TEST(mask, 0) +#define Y SWS_COMP_TEST(mask, 1) +#define Z SWS_COMP_TEST(mask, 2) +#define W SWS_COMP_TEST(mask, 3) + +/* Helper macros to make writing common function signatures less painful */ +#define DECL_FUNC(NAME, ...) \ + static av_always_inline void \ + fn(NAME)(SwsOpIter *restrict iter, const SwsOpImpl *restrict impl, \ + pixel_t *restrict x, pixel_t *restrict y, \ + pixel_t *restrict z, pixel_t *restrict w, \ + __VA_ARGS__) + +#define DECL_READ(NAME, ...) \ + DECL_FUNC(NAME, __VA_ARGS__, \ + const pixel_t *restrict in0, const pixel_t *restrict in1, \ + const pixel_t *restrict in2, const pixel_t *restrict in3) \ + +#define DECL_WRITE(NAME, ...) 
\ + DECL_FUNC(NAME, __VA_ARGS__, \ + pixel_t *restrict out0, pixel_t *restrict out1, \ + pixel_t *restrict out2, pixel_t *restrict out3) \ + +#define CALL(NAME, ...) fn(NAME)(iter, impl, x, y, z, w, __VA_ARGS__) + +/* Helper macro to call into the next continuation with a given type */ +#define CONTINUE(...) \ + ((void (*)(SwsOpIter *, const SwsOpImpl *, \ + void *restrict x, void *restrict y, \ + void *restrict z, void *restrict w)) impl->cont) \ + (iter, &impl[1], __VA_ARGS__) + +/* Helper macros for common op setup code */ +#define DECL_SETUP(NAME, PARAMS, OUT) \ + static av_unused int fn(NAME)(const SwsImplParams *PARAMS, \ + SwsImplResult *OUT) + +/* Helper macro for declaring kernel entry points */ +#define DECL_IMPL(FUNC, NAME, TYPE, UOP, ...) \ + static av_flatten void NAME##_c(SwsOpIter *restrict iter, \ + const SwsOpImpl *restrict impl, \ + void *restrict x, void *restrict y, \ + void *restrict z, void *restrict w) \ + { \ + CALL(FUNC, __VA_ARGS__); \ + } + +#define DECL_IMPL_READ(...) \ + DECL_IMPL(__VA_ARGS__, \ + (const pixel_t *) iter->in[0], (const pixel_t *) iter->in[1], \ + (const pixel_t *) iter->in[2], (const pixel_t *) iter->in[3]) + +#define DECL_IMPL_WRITE(...) \ + DECL_IMPL(__VA_ARGS__, \ + (pixel_t *) iter->out[0], (pixel_t *) iter->out[1], \ + (pixel_t *) iter->out[2], (pixel_t *) iter->out[3]) + +#define REF_ENTRY(DUMMY, NAME, ...) &op_##NAME, +#define DECL_ENTRY(SETUP, NAME, ...) \ + static const SwsOpEntry op_##NAME = { \ + .func = (SwsFuncPtr) NAME##_c, \ + __VA_ARGS__, \ + SETUP \ + }; + +#endif _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
