From: Niklas Haas <g...@haasn.dev>

See the design document introduced in the previous commit for an in-depth introduction to the new approach.
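As a rough sketch of the idea, each format conversion decomposes into a chain of primitive ops. Purely for illustration (the real decompositions are defined by the subsequent commits), a gray8 -> gray16 conversion might look like: read (1x u8) -> convert u8 -> u16 (expand) -> write (1x u16).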
This commit merely introduces the common dispatch code, the SwsOpList boilerplate, and the high-level optimizer. The subsequent commits will add the underlying implementations. --- libswscale/Makefile | 2 + libswscale/ops.c | 843 +++++++++++++++++++++++++++++++++++++ libswscale/ops.h | 265 ++++++++++++ libswscale/ops_internal.h | 103 +++++ libswscale/ops_optimizer.c | 810 +++++++++++++++++++++++++++++++++++ 5 files changed, 2023 insertions(+) create mode 100644 libswscale/ops.c create mode 100644 libswscale/ops.h create mode 100644 libswscale/ops_internal.h create mode 100644 libswscale/ops_optimizer.c diff --git a/libswscale/Makefile b/libswscale/Makefile index d5e10d17dc..810c9dee78 100644 --- a/libswscale/Makefile +++ b/libswscale/Makefile @@ -15,6 +15,8 @@ OBJS = alphablend.o \ graph.o \ input.o \ lut3d.o \ + ops.o \ + ops_optimizer.o \ options.o \ output.o \ rgb2rgb.o \ diff --git a/libswscale/ops.c b/libswscale/ops.c new file mode 100644 index 0000000000..6d9a844e06 --- /dev/null +++ b/libswscale/ops.c @@ -0,0 +1,843 @@ +/** + * Copyright (C) 2025 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/avassert.h" +#include "libavutil/bswap.h" +#include "libavutil/mem.h" +#include "libavutil/rational.h" +#include "libavutil/refstruct.h" + +#include "ops.h" +#include "ops_internal.h" + +const SwsOpBackend * const ff_sws_op_backends[] = { + NULL +}; + +const int ff_sws_num_op_backends = FF_ARRAY_ELEMS(ff_sws_op_backends) - 1; + +#define Q(N) ((AVRational) { N, 1 }) + +#define RET(x) \ + do { \ + if ((ret = (x)) < 0) \ + return ret; \ + } while (0) + +const char *ff_sws_pixel_type_name(SwsPixelType type) +{ + switch (type) { + case SWS_PIXEL_U8: return "u8"; + case SWS_PIXEL_U16: return "u16"; + case SWS_PIXEL_U32: return "u32"; + case SWS_PIXEL_F32: return "f32"; + case SWS_PIXEL_NONE: return "none"; + case SWS_PIXEL_TYPE_NB: break; + } + + av_assert0(!"Invalid pixel type!"); + return "ERR"; +} + +int ff_sws_pixel_type_size(SwsPixelType type) +{ + switch (type) { + case SWS_PIXEL_U8: return sizeof(uint8_t); + case SWS_PIXEL_U16: return sizeof(uint16_t); + case SWS_PIXEL_U32: return sizeof(uint32_t); + case SWS_PIXEL_F32: return sizeof(float); + case SWS_PIXEL_NONE: break; + case SWS_PIXEL_TYPE_NB: break; + } + + av_assert0(!"Invalid pixel type!"); + return 0; +} + +bool ff_sws_pixel_type_is_int(SwsPixelType type) +{ + switch (type) { + case SWS_PIXEL_U8: + case SWS_PIXEL_U16: + case SWS_PIXEL_U32: + return true; + case SWS_PIXEL_F32: + return false; + case SWS_PIXEL_NONE: + case SWS_PIXEL_TYPE_NB: break; + } + + av_assert0(!"Invalid pixel type!"); + return false; +} + +SwsPixelType ff_sws_pixel_type_to_uint(SwsPixelType type) +{ + if (!type) + return type; + + switch (ff_sws_pixel_type_size(type)) { + case 1: return SWS_PIXEL_U8; + case 2: 
return SWS_PIXEL_U16; + case 4: return SWS_PIXEL_U32; + } + + av_assert0(!"Invalid pixel type!"); + return SWS_PIXEL_NONE; +} + +/* biased towards `a` */ +static AVRational av_min_q(AVRational a, AVRational b) +{ + return av_cmp_q(a, b) == 1 ? b : a; +} + +static AVRational av_max_q(AVRational a, AVRational b) +{ + return av_cmp_q(a, b) == -1 ? b : a; +} + +static AVRational expand_factor(SwsPixelType from, SwsPixelType to) +{ + const int src = ff_sws_pixel_type_size(from); + const int dst = ff_sws_pixel_type_size(to); + int scale = 0; + for (int i = 0; i < dst / src; i++) + scale = scale << src * 8 | 1; + return Q(scale); +} + +void ff_sws_apply_op_q(const SwsOp *op, AVRational x[4]) +{ + switch (op->op) { + case SWS_OP_READ: + case SWS_OP_WRITE: + return; + case SWS_OP_UNPACK: { + unsigned val = x[0].num; + int shift = ff_sws_pixel_type_size(op->type) * 8; + for (int i = 0; i < 4; i++) { + const unsigned mask = (1 << op->pack.pattern[i]) - 1; + shift -= op->pack.pattern[i]; + x[i] = Q((val >> shift) & mask); + } + return; + } + case SWS_OP_PACK: { + unsigned val = 0; + int shift = ff_sws_pixel_type_size(op->type) * 8; + for (int i = 0; i < 4; i++) { + const unsigned mask = (1 << op->pack.pattern[i]) - 1; + shift -= op->pack.pattern[i]; + val |= (x[i].num & mask) << shift; + } + x[0] = Q(val); + return; + } + case SWS_OP_SWAP_BYTES: + switch (ff_sws_pixel_type_size(op->type)) { + case 2: + for (int i = 0; i < 4; i++) + x[i].num = av_bswap16(x[i].num); + break; + case 4: + for (int i = 0; i < 4; i++) + x[i].num = av_bswap32(x[i].num); + break; + } + return; + case SWS_OP_CLEAR: + for (int i = 0; i < 4; i++) { + if (op->c.q4[i].den) + x[i] = op->c.q4[i]; + } + return; + case SWS_OP_LSHIFT: { + AVRational mult = Q(1 << op->c.u); + for (int i = 0; i < 4; i++) + x[i] = x[i].den ? av_mul_q(x[i], mult) : x[i]; + return; + } + case SWS_OP_RSHIFT: { + AVRational mult = Q(1 << op->c.u); + for (int i = 0; i < 4; i++) + x[i] = x[i].den ? av_div_q(x[i], mult) : x[i]; + return; + } + case SWS_OP_SWIZZLE: { + const AVRational orig[4] = { x[0], x[1], x[2], x[3] }; + for (int i = 0; i < 4; i++) + x[i] = orig[op->swizzle.in[i]]; + return; + } + case SWS_OP_CONVERT: + if (ff_sws_pixel_type_is_int(op->convert.to)) { + const AVRational scale = expand_factor(op->type, op->convert.to); + for (int i = 0; i < 4; i++) { + x[i] = x[i].den ? Q(x[i].num / x[i].den) : x[i]; + if (op->convert.expand) + x[i] = av_mul_q(x[i], scale); + } + } + return; + case SWS_OP_DITHER: + for (int i = 0; i < 4; i++) + x[i] = x[i].den ? av_add_q(x[i], av_make_q(1, 2)) : x[i]; + return; + case SWS_OP_MIN: + for (int i = 0; i < 4; i++) + x[i] = av_min_q(x[i], op->c.q4[i]); + return; + case SWS_OP_MAX: + for (int i = 0; i < 4; i++) + x[i] = av_max_q(x[i], op->c.q4[i]); + return; + case SWS_OP_LINEAR: { + const AVRational orig[4] = { x[0], x[1], x[2], x[3] }; + for (int i = 0; i < 4; i++) { + AVRational sum = op->lin.m[i][4]; + for (int j = 0; j < 4; j++) + sum = av_add_q(sum, av_mul_q(orig[j], op->lin.m[i][j])); + x[i] = sum; + } + return; + } + case SWS_OP_SCALE: + for (int i = 0; i < 4; i++) + x[i] = x[i].den ? 
av_mul_q(x[i], op->c.q) : x[i]; + return; + } + + av_assert0(!"Invalid operation type!"); +} + +void ff_sws_op_uninit(SwsOp *op) +{ + switch (op->op) { + case SWS_OP_DITHER: + av_refstruct_unref(&op->dither.matrix); + break; + } + + *op = (SwsOp) {0}; +} + +SwsOpList *ff_sws_op_list_alloc(void) +{ + return av_mallocz(sizeof(SwsOpList)); +} + +void ff_sws_op_list_free(SwsOpList **p_ops) +{ + SwsOpList *ops = *p_ops; + if (!ops) + return; + + for (int i = 0; i < ops->num_ops; i++) + ff_sws_op_uninit(&ops->ops[i]); + + av_freep(&ops->ops); + av_free(ops); + *p_ops = NULL; +} + +SwsOpList *ff_sws_op_list_duplicate(const SwsOpList *ops) +{ + SwsOpList *copy = av_malloc(sizeof(*copy)); + if (!copy) + return NULL; + + *copy = *ops; + copy->ops = av_memdup(ops->ops, ops->num_ops * sizeof(ops->ops[0])); + if (!copy->ops) { + av_free(copy); + return NULL; + } + + for (int i = 0; i < ops->num_ops; i++) { + const SwsOp *op = &ops->ops[i]; + switch (op->op) { + case SWS_OP_DITHER: + av_refstruct_ref(copy->ops[i].dither.matrix); + break; + } + } + + return copy; +} + +void ff_sws_op_list_remove_at(SwsOpList *ops, int index, int count) +{ + const int end = ops->num_ops - count; + av_assert2(index >= 0 && count >= 0 && index + count <= ops->num_ops); + for (int i = index; i < end; i++) + ops->ops[i] = ops->ops[i + count]; + ops->num_ops = end; +} + +int ff_sws_op_list_insert_at(SwsOpList *ops, int index, SwsOp *op) +{ + void *ret; + ret = av_dynarray2_add((void **) &ops->ops, &ops->num_ops, sizeof(*op), + (const void *) op); + if (!ret) { + ff_sws_op_uninit(op); + return AVERROR(ENOMEM); + } + + for (int i = ops->num_ops - 1; i > index; i--) + ops->ops[i] = ops->ops[i - 1]; + ops->ops[index] = *op; + *op = (SwsOp) {0}; + return 0; +} + +int ff_sws_op_list_append(SwsOpList *ops, SwsOp *op) +{ + return ff_sws_op_list_insert_at(ops, ops->num_ops, op); +} + +int ff_sws_op_list_max_size(const SwsOpList *ops) +{ + int max_size = 0; + for (int i = 0; i < ops->num_ops; i++) { + const int size = ff_sws_pixel_type_size(ops->ops[i].type); + max_size = FFMAX(max_size, size); + } + + return max_size; +} + +uint32_t ff_sws_linear_mask(const SwsLinearOp c) +{ + uint32_t mask = 0; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 5; j++) { + if (av_cmp_q(c.m[i][j], Q(i == j))) + mask |= SWS_MASK(i, j); + } + } + return mask; +} + +static const char *describe_lin_mask(uint32_t mask) +{ + /* Try to be fairly descriptive without assuming too much */ + static const struct { + const char *name; + uint32_t mask; + } patterns[] = { + { "noop", 0 }, + { "luma", SWS_MASK_LUMA }, + { "alpha", SWS_MASK_ALPHA }, + { "luma+alpha", SWS_MASK_LUMA | SWS_MASK_ALPHA }, + { "dot3", 0b111 }, + { "dot4", 0b1111 }, + { "row0", SWS_MASK_ROW(0) }, + { "row0+alpha", SWS_MASK_ROW(0) | SWS_MASK_ALPHA }, + { "col0", SWS_MASK_COL(0) }, + { "col0+off3", SWS_MASK_COL(0) | SWS_MASK_OFF3 }, + { "off3", SWS_MASK_OFF3 }, + { "off3+alpha", SWS_MASK_OFF3 | SWS_MASK_ALPHA }, + { "diag3", SWS_MASK_DIAG3 }, + { "diag4", SWS_MASK_DIAG4 }, + { "diag3+alpha", SWS_MASK_DIAG3 | SWS_MASK_ALPHA }, + { "diag3+off3", SWS_MASK_DIAG3 | SWS_MASK_OFF3 }, + { "diag3+off3+alpha", SWS_MASK_DIAG3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA }, + { "diag4+off4", SWS_MASK_DIAG4 | SWS_MASK_OFF4 }, + { "matrix3", SWS_MASK_MAT3 }, + { "matrix3+off3", SWS_MASK_MAT3 | SWS_MASK_OFF3 }, + { "matrix3+off3+alpha", SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA }, + { "matrix4", SWS_MASK_MAT4 }, + { "matrix4+off4", SWS_MASK_MAT4 | SWS_MASK_OFF4 }, + }; + + for (int i = 0; i < FF_ARRAY_ELEMS(patterns); i++) { 
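+ /* A pattern covers `mask` iff the mask contains no coefficients outside + * of it; the table is ordered from most specific to most generic, so the + * first match is the tightest available description. */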
if (!(mask & ~patterns[i].mask)) + return patterns[i].name; + } + + return "full"; +} + +static char describe_comp_flags(unsigned flags) +{ + if (flags & SWS_COMP_GARBAGE) + return 'X'; + else if (flags & SWS_COMP_ZERO) + return '0'; + else if (flags & SWS_COMP_EXACT) + return '+'; + else + return '.'; +} + +static const char *print_q(const AVRational q, char buf[], int buf_len) +{ + if (!q.den) { + switch (q.num) { + case 1: return "inf"; + case -1: return "-inf"; + default: return "nan"; + } + } + + if (q.den == 1) { + snprintf(buf, buf_len, "%d", q.num); + return buf; + } + + if (abs(q.num) > 1000 || abs(q.den) > 1000) { + snprintf(buf, buf_len, "%f", av_q2d(q)); + return buf; + } + + snprintf(buf, buf_len, "%d/%d", q.num, q.den); + return buf; +} + +#define PRINTQ(q) print_q(q, (char[32]){0}, sizeof(char[32]) - 1) + + +void ff_sws_op_list_print(void *log, int lev, const SwsOpList *ops) +{ + if (!ops->num_ops) { + av_log(log, lev, " (empty)\n"); + return; + } + + for (int i = 0; i < ops->num_ops; i++) { + const SwsOp *op = &ops->ops[i]; + av_log(log, lev, " [%3s %c%c%c%c -> %c%c%c%c] ", + ff_sws_pixel_type_name(op->type), + op->comps.unused[0] ? 'X' : '.', + op->comps.unused[1] ? 'X' : '.', + op->comps.unused[2] ? 'X' : '.', + op->comps.unused[3] ? 'X' : '.', + describe_comp_flags(op->comps.flags[0]), + describe_comp_flags(op->comps.flags[1]), + describe_comp_flags(op->comps.flags[2]), + describe_comp_flags(op->comps.flags[3])); + + switch (op->op) { + case SWS_OP_INVALID: + av_log(log, lev, "SWS_OP_INVALID\n"); + break; + case SWS_OP_READ: + case SWS_OP_WRITE: + av_log(log, lev, "%-20s: %d elem(s) %s >> %d\n", + op->op == SWS_OP_READ ? "SWS_OP_READ" + : "SWS_OP_WRITE", + op->rw.elems, op->rw.packed ? "packed" : "planar", + op->rw.frac); + break; + case SWS_OP_SWAP_BYTES: + av_log(log, lev, "SWS_OP_SWAP_BYTES\n"); + break; + case SWS_OP_LSHIFT: + av_log(log, lev, "%-20s: << %u\n", "SWS_OP_LSHIFT", op->c.u); + break; + case SWS_OP_RSHIFT: + av_log(log, lev, "%-20s: >> %u\n", "SWS_OP_RSHIFT", op->c.u); + break; + case SWS_OP_PACK: + case SWS_OP_UNPACK: + av_log(log, lev, "%-20s: {%d %d %d %d}\n", + op->op == SWS_OP_PACK ? "SWS_OP_PACK" + : "SWS_OP_UNPACK", + op->pack.pattern[0], op->pack.pattern[1], + op->pack.pattern[2], op->pack.pattern[3]); + break; + case SWS_OP_CLEAR: + av_log(log, lev, "%-20s: {%s %s %s %s}\n", "SWS_OP_CLEAR", + op->c.q4[0].den ? PRINTQ(op->c.q4[0]) : "_", + op->c.q4[1].den ? PRINTQ(op->c.q4[1]) : "_", + op->c.q4[2].den ? PRINTQ(op->c.q4[2]) : "_", + op->c.q4[3].den ? PRINTQ(op->c.q4[3]) : "_"); + break; + case SWS_OP_SWIZZLE: + av_log(log, lev, "%-20s: %d%d%d%d\n", "SWS_OP_SWIZZLE", + op->swizzle.x, op->swizzle.y, op->swizzle.z, op->swizzle.w); + break; + case SWS_OP_CONVERT: + av_log(log, lev, "%-20s: %s -> %s%s\n", "SWS_OP_CONVERT", + ff_sws_pixel_type_name(op->type), + ff_sws_pixel_type_name(op->convert.to), + op->convert.expand ? " (expand)" : ""); + break; + case SWS_OP_DITHER: + av_log(log, lev, "%-20s: %dx%d matrix\n", "SWS_OP_DITHER", + 1 << op->dither.size_log2, 1 << op->dither.size_log2); + break; + case SWS_OP_MIN: + av_log(log, lev, "%-20s: x <= {%s %s %s %s}\n", "SWS_OP_MIN", + op->c.q4[0].den ? PRINTQ(op->c.q4[0]) : "_", + op->c.q4[1].den ? PRINTQ(op->c.q4[1]) : "_", + op->c.q4[2].den ? PRINTQ(op->c.q4[2]) : "_", + op->c.q4[3].den ? PRINTQ(op->c.q4[3]) : "_"); + break; + case SWS_OP_MAX: + av_log(log, lev, "%-20s: {%s %s %s %s} <= x\n", "SWS_OP_MAX", + op->c.q4[0].den ? PRINTQ(op->c.q4[0]) : "_", + op->c.q4[1].den ? 
PRINTQ(op->c.q4[1]) : "_", + op->c.q4[2].den ? PRINTQ(op->c.q4[2]) : "_", + op->c.q4[3].den ? PRINTQ(op->c.q4[3]) : "_"); + break; + case SWS_OP_LINEAR: + av_log(log, lev, "%-20s: %s [[%s %s %s %s %s] " + "[%s %s %s %s %s] " + "[%s %s %s %s %s] " + "[%s %s %s %s %s]]\n", + "SWS_OP_LINEAR", describe_lin_mask(op->lin.mask), + PRINTQ(op->lin.m[0][0]), PRINTQ(op->lin.m[0][1]), PRINTQ(op->lin.m[0][2]), PRINTQ(op->lin.m[0][3]), PRINTQ(op->lin.m[0][4]), + PRINTQ(op->lin.m[1][0]), PRINTQ(op->lin.m[1][1]), PRINTQ(op->lin.m[1][2]), PRINTQ(op->lin.m[1][3]), PRINTQ(op->lin.m[1][4]), + PRINTQ(op->lin.m[2][0]), PRINTQ(op->lin.m[2][1]), PRINTQ(op->lin.m[2][2]), PRINTQ(op->lin.m[2][3]), PRINTQ(op->lin.m[2][4]), + PRINTQ(op->lin.m[3][0]), PRINTQ(op->lin.m[3][1]), PRINTQ(op->lin.m[3][2]), PRINTQ(op->lin.m[3][3]), PRINTQ(op->lin.m[3][4])); + break; + case SWS_OP_SCALE: + av_log(log, lev, "%-20s: * %s\n", "SWS_OP_SCALE", + PRINTQ(op->c.q)); + break; + case SWS_OP_TYPE_NB: + break; + } + + if (op->comps.min[0].den || op->comps.min[1].den || + op->comps.min[2].den || op->comps.min[3].den || + op->comps.max[0].den || op->comps.max[1].den || + op->comps.max[2].den || op->comps.max[3].den) + { + av_log(log, AV_LOG_TRACE, " min: {%s, %s, %s, %s}, max: {%s, %s, %s, %s}\n", + PRINTQ(op->comps.min[0]), PRINTQ(op->comps.min[1]), + PRINTQ(op->comps.min[2]), PRINTQ(op->comps.min[3]), + PRINTQ(op->comps.max[0]), PRINTQ(op->comps.max[1]), + PRINTQ(op->comps.max[2]), PRINTQ(op->comps.max[3])); + } + + } + + av_log(log, lev, " (X = unused, + = exact, 0 = zero)\n"); +} + +typedef struct SwsOpPass { + SwsCompiledOp comp; + SwsOpExec exec_base; + int num_blocks; + int tail_off_in; + int tail_off_out; + int tail_size_in; + int tail_size_out; + bool memcpy_in; + bool memcpy_out; +} SwsOpPass; + +static void op_pass_free(void *ptr) +{ + SwsOpPass *p = ptr; + if (!p) + return; + + if (p->comp.free) + p->comp.free(p->comp.priv); + + av_free(p); +} + +static void op_pass_setup(const SwsImg *out, const SwsImg *in, const SwsPass *pass) +{ + const AVPixFmtDescriptor *indesc = av_pix_fmt_desc_get(in->fmt); + const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->fmt); + + SwsOpPass *p = pass->priv; + SwsOpExec *exec = &p->exec_base; + const SwsCompiledOp *comp = &p->comp; + const int block_size = comp->block_size; + p->num_blocks = (pass->width + block_size - 1) / block_size; + + /* Set up main loop parameters */ + const int aligned_w = p->num_blocks * block_size; + const int safe_width = (p->num_blocks - 1) * block_size; + const int tail_size = pass->width - safe_width; + p->tail_off_in = safe_width * exec->pixel_bits_in >> 3; + p->tail_off_out = safe_width * exec->pixel_bits_out >> 3; + p->tail_size_in = tail_size * exec->pixel_bits_in >> 3; + p->tail_size_out = tail_size * exec->pixel_bits_out >> 3; + p->memcpy_in = false; + p->memcpy_out = false; + + for (int i = 0; i < 4 && in->data[i]; i++) { + const int sub_x = (i == 1 || i == 2) ? indesc->log2_chroma_w : 0; + const int plane_w = (aligned_w + sub_x) >> sub_x; + const int plane_pad = (comp->over_read + sub_x) >> sub_x; + const int plane_size = plane_w * exec->pixel_bits_in >> 3; + p->memcpy_in |= plane_size + plane_pad > in->linesize[i]; + exec->in_stride[i] = in->linesize[i]; + } + + for (int i = 0; i < 4 && out->data[i]; i++) { + const int sub_x = (i == 1 || i == 2) ? 
outdesc->log2_chroma_w : 0; + const int plane_w = (aligned_w + sub_x) >> sub_x; + const int plane_pad = (comp->over_write + sub_x) >> sub_x; + const int plane_size = plane_w * exec->pixel_bits_out >> 3; + p->memcpy_out |= plane_size + plane_pad > out->linesize[i]; + exec->out_stride[i] = out->linesize[i]; + } +} + +/* Dispatch kernel over the last column of the image using memcpy */ +static av_always_inline void +handle_tail(const SwsOpPass *p, SwsOpExec *exec, + const SwsImg *out_base, const bool copy_out, + const SwsImg *in_base, const bool copy_in, + const int y, const int y_end) +{ + DECLARE_ALIGNED_64(uint8_t, tmp)[2][4][sizeof(uint32_t[128])]; + + const SwsCompiledOp *comp = &p->comp; + const int block_size = comp->block_size; + const int tail_size_in = p->tail_size_in; + const int tail_size_out = p->tail_size_out; + + SwsImg in = ff_sws_img_shift(*in_base, y); + SwsImg out = ff_sws_img_shift(*out_base, y); + for (int i = 0; i < 4 && in.data[i]; i++) { + in.data[i] += p->tail_off_in; + if (copy_in) { + exec->in[i] = (void *) tmp[0][i]; + exec->in_stride[i] = sizeof(tmp[0][i]); + } else { + exec->in[i] = in.data[i]; + } + } + + for (int i = 0; i < 4 && out.data[i]; i++) { + out.data[i] += p->tail_off_out; + if (copy_out) { + exec->out[i] = (void *) tmp[1][i]; + exec->out_stride[i] = sizeof(tmp[1][i]); + } else { + exec->out[i] = out.data[i]; + } + } + + exec->x = (p->num_blocks - 1) * block_size; + for (exec->y = y; exec->y < y_end; exec->y++) { + if (copy_in) { + for (int i = 0; i < 4 && in.data[i]; i++) { + av_assert2(tmp[0][i] + tail_size_in < (uint8_t *) tmp[1]); + memcpy(tmp[0][i], in.data[i], tail_size_in); + in.data[i] += in.linesize[i]; + } + } + + comp->func(exec, comp->priv, 1); + + if (copy_out) { + for (int i = 0; i < 4 && out.data[i]; i++) { + av_assert2(tmp[1][i] + tail_size_out < (uint8_t *) tmp[2]); + memcpy(out.data[i], tmp[1][i], tail_size_out); + out.data[i] += out.linesize[i]; + } + } + + for (int i = 0; i < 4; i++) { + if (!copy_in) + exec->in[i] += in.linesize[i]; + if (!copy_out) + exec->out[i] += out.linesize[i]; + } + } +} + +static av_always_inline void +op_pass_run(const SwsImg *out_base, const SwsImg *in_base, + const int y, const int h, const SwsPass *pass) +{ + const SwsOpPass *p = pass->priv; + const SwsCompiledOp *comp = &p->comp; + + /* Fill exec metadata for this slice */ + const SwsImg in = ff_sws_img_shift(*in_base, y); + const SwsImg out = ff_sws_img_shift(*out_base, y); + SwsOpExec exec = p->exec_base; + exec.slice_y = y; + exec.slice_h = h; + for (int i = 0; i < 4; i++) { + exec.in[i] = in.data[i]; + exec.out[i] = out.data[i]; + } + + /** + * To ensure safety, we need to consider the following: + * + * 1. We can overread the input, unless this is the last line of an + * unpadded buffer. All operation chains must be able to handle + * arbitrary pixel input, so arbitrary overread is fine. + * + * 2. We can overwrite the output, as long as we don't write more than the + * amount of pixels that fit into one linesize. So we always need to + * memcpy the last column on the output side if unpadded. + * + * 3. For the last row, we also need to memcpy the remainder of the input, + * to avoid reading past the end of the buffer. Note that since we know + * the run() function is called on stripes of the same buffer, we don't + * need to worry about this for the end of a slice. 
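+ * + * As a purely illustrative example: with width = 65 and block_size = 16, + * num_blocks = 5 and the safe width is 64, so the main loop covers four + * blocks per row while the final 1-pixel column goes through handle_tail(), + * which bounces the data through the on-stack temporaries whenever the + * corresponding memcpy_in / memcpy_out flag is set.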
+ */ + + const int last_slice = y + h == pass->height; + const bool memcpy_in = last_slice && p->memcpy_in; + const bool memcpy_out = p->memcpy_out; + const int num_blocks = p->num_blocks; + const int blocks_main = num_blocks - memcpy_out; + const int y_end = y + h - memcpy_in; + + /* Handle main section */ + for (exec.y = y; exec.y < y_end; exec.y++) { + comp->func(&exec, comp->priv, blocks_main); + for (int i = 0; i < 4; i++) { + exec.in[i] += in.linesize[i]; + exec.out[i] += out.linesize[i]; + } + } + + if (memcpy_in) + comp->func(&exec, comp->priv, num_blocks - 1); /* safe part of last row */ + + /* Handle last column via memcpy, takes over `exec` so call these last */ + if (memcpy_out) + handle_tail(p, &exec, out_base, true, in_base, false, y, y_end); + if (memcpy_in) + handle_tail(p, &exec, out_base, memcpy_out, in_base, true, y_end, y + h); +} + +static int rw_pixel_bits(const SwsOp *op) +{ + const int elems = op->rw.packed ? op->rw.elems : 1; + const int size = ff_sws_pixel_type_size(op->type); + const int bits = 8 >> op->rw.frac; + av_assert1(bits >= 1); + return elems * size * bits; +} + +int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend, + const SwsOpList *ops, SwsCompiledOp *out) +{ + SwsOpList *copy, rest; + int ret = 0; + + copy = ff_sws_op_list_duplicate(ops); + if (!copy) + return AVERROR(ENOMEM); + + /* Ensure these are always set during compilation */ + ff_sws_op_list_update_comps(copy); + + /* Make an on-stack copy of `ops` to ensure we can still properly clean up + * the copy afterwards */ + rest = *copy; + + ret = backend->compile(ctx, &rest, out); + if (ret == AVERROR(ENOTSUP)) { + av_log(ctx, AV_LOG_DEBUG, "Backend '%s' does not support operations:\n", backend->name); + ff_sws_op_list_print(ctx, AV_LOG_DEBUG, &rest); + } else if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, "Failed to compile operations: %s\n", av_err2str(ret)); + ff_sws_op_list_print(ctx, AV_LOG_ERROR, &rest); + } + + ff_sws_op_list_free(&copy); + return ret; +} + +int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out) +{ + for (int n = 0; ff_sws_op_backends[n]; n++) { + const SwsOpBackend *backend = ff_sws_op_backends[n]; + if (ff_sws_ops_compile_backend(ctx, backend, ops, out) < 0) + continue; + + av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': " + "block size = %d, over-read = %d, over-write = %d\n", + backend->name, out->block_size, out->over_read, out->over_write); + return 0; + } + + av_log(ctx, AV_LOG_WARNING, "No backend found for operations:\n"); + ff_sws_op_list_print(ctx, AV_LOG_WARNING, ops); + return AVERROR(ENOTSUP); +} + +int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat dst, + SwsPass *input, SwsPass **output) +{ + SwsContext *ctx = graph->ctx; + SwsOpPass *p = NULL; + const SwsOp *read = &ops->ops[0]; + const SwsOp *write = &ops->ops[ops->num_ops - 1]; + SwsPass *pass; + int ret; + + if (ops->num_ops < 2) { + av_log(ctx, AV_LOG_ERROR, "Need at least two operations.\n"); + return AVERROR(EINVAL); + } + + if (read->op != SWS_OP_READ || write->op != SWS_OP_WRITE) { + av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read " + "and write, respectively.\n"); + return AVERROR(EINVAL); + } + + if (flags & SWS_OP_FLAG_OPTIMIZE) + RET(ff_sws_op_list_optimize(ops)); + else + ff_sws_op_list_update_comps(ops); + + p = av_mallocz(sizeof(*p)); + if (!p) + return AVERROR(ENOMEM); + + p->exec_base = (SwsOpExec) { + .width = dst.width, + .height = dst.height, + .pixel_bits_in = rw_pixel_bits(read), + 
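/* cached here so that op_pass_setup() can derive the per-plane tail offsets and sizes */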
.pixel_bits_out = rw_pixel_bits(write), + }; + + ret = ff_sws_ops_compile(ctx, ops, &p->comp); + if (ret < 0) + goto fail; + + pass = ff_sws_graph_add_pass(graph, dst.format, dst.width, dst.height, input, + 1, p, op_pass_run); + if (!pass) { + ret = AVERROR(ENOMEM); + goto fail; + } + pass->setup = op_pass_setup; + pass->free = op_pass_free; + + *output = pass; + return 0; + +fail: + op_pass_free(p); + return ret; +} diff --git a/libswscale/ops.h b/libswscale/ops.h new file mode 100644 index 0000000000..c9c5706cbf --- /dev/null +++ b/libswscale/ops.h @@ -0,0 +1,265 @@ +/** + * Copyright (C) 2025 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef SWSCALE_OPS_H +#define SWSCALE_OPS_H + +#include <assert.h> +#include <stdbool.h> +#include <stdalign.h> + +#include "graph.h" + +typedef enum SwsPixelType { + SWS_PIXEL_NONE = 0, + SWS_PIXEL_U8, + SWS_PIXEL_U16, + SWS_PIXEL_U32, + SWS_PIXEL_F32, + SWS_PIXEL_TYPE_NB +} SwsPixelType; + +const char *ff_sws_pixel_type_name(SwsPixelType type); +int ff_sws_pixel_type_size(SwsPixelType type) av_const; +bool ff_sws_pixel_type_is_int(SwsPixelType type) av_const; +SwsPixelType ff_sws_pixel_type_to_uint(SwsPixelType type) av_const; + +typedef enum SwsOpType { + SWS_OP_INVALID = 0, + + /* Input/output handling */ + SWS_OP_READ, /* gather raw pixels from planes */ + SWS_OP_WRITE, /* write raw pixels to planes */ + SWS_OP_SWAP_BYTES, /* swap byte order (for differing endianness) */ + SWS_OP_UNPACK, /* split tightly packed data into components */ + SWS_OP_PACK, /* compress components into tightly packed data */ + + /* Pixel manipulation */ + SWS_OP_CLEAR, /* clear pixel values */ + SWS_OP_LSHIFT, /* logical left shift of raw pixel values by (u8) */ + SWS_OP_RSHIFT, /* right shift of raw pixel values by (u8) */ + SWS_OP_SWIZZLE, /* rearrange channel order, or duplicate channels */ + SWS_OP_CONVERT, /* convert (cast) between formats */ + SWS_OP_DITHER, /* add dithering noise */ + + /* Arithmetic operations */ + SWS_OP_LINEAR, /* generalized linear affine transform */ + SWS_OP_SCALE, /* multiplication by scalar (q) */ + SWS_OP_MIN, /* numeric minimum (q4) */ + SWS_OP_MAX, /* numeric maximum (q4) */ + + SWS_OP_TYPE_NB, +} SwsOpType; + +enum SwsCompFlags { + SWS_COMP_GARBAGE = 1 << 0, /* contents are undefined / garbage data */ + SWS_COMP_EXACT = 1 << 1, /* value is an in-range, exact, integer */ + SWS_COMP_ZERO = 1 << 2, /* known to be a constant zero */ +}; + +typedef union SwsConst { + /* Generic constant value */ + AVRational q; + AVRational q4[4]; + unsigned u; +} SwsConst; + +typedef struct SwsComps { + unsigned flags[4]; /* knowledge about (output) component contents */ + bool unused[4]; /* which input components are definitely unused */ + + /* Keeps track of the known possible value range, or {0, 0} for undefined + * or 
(unknown range) floating point inputs */ + AVRational min[4], max[4]; +} SwsComps; + +typedef struct SwsReadWriteOp { + /* Note: Unread pixel data is explicitly cleared to {0} for sanity */ + + int elems; /* number of elements (of type `op.type`) to read/write */ + bool packed; /* read multiple elements from a single plane */ + int frac; /* fractional pixel step factor (log2) */ + + /** Examples: + * rgba = 4x u8 packed + * yuv444p = 3x u8 + * rgb565 = 1x u16 <- use SWS_OP_UNPACK to unpack + * monow = 1x u8 (frac 3) + * rgb4 = 1x u8 (frac 1) + */ +} SwsReadWriteOp; + +typedef struct SwsPackOp { + int pattern[4]; /* bit depth pattern, from MSB to LSB */ +} SwsPackOp; + +typedef struct SwsSwizzleOp { + /** + * Input component for each output component: + * Out[x] := In[swizzle.in[x]] + */ + union { + uint32_t mask; + uint8_t in[4]; + struct { uint8_t x, y, z, w; }; + }; +} SwsSwizzleOp; + +#define SWS_SWIZZLE(X,Y,Z,W) ((SwsSwizzleOp) { .in = {X, Y, Z, W} }) + +typedef struct SwsConvertOp { + SwsPixelType to; /* type of pixel to convert to */ + bool expand; /* if true, integers are expanded to the full range */ +} SwsConvertOp; + +typedef struct SwsDitherOp { + AVRational *matrix; /* tightly packed dither matrix (refstruct) */ + int size_log2; /* size (in bits) of the dither matrix */ +} SwsDitherOp; + +typedef struct SwsLinearOp { + /** + * Generalized 5x5 affine transformation: + * [ Out.x ] = [ A B C D E ] + * [ Out.y ] = [ F G H I J ] * [ x y z w 1 ] + * [ Out.z ] = [ K L M N O ] + * [ Out.w ] = [ P Q R S T ] + * + * The mask keeps track of which components differ from an identity matrix. + * There may be more efficient implementations of particular subsets, for + * example the common subset of {A, E, G, J, M, O} can be implemented with + * just three fused multiply-add operations. + */ + AVRational m[4][5]; + uint32_t mask; /* m[i][j] <-> 1 << (5 * i + j) */ +} SwsLinearOp; + +#define SWS_MASK(I, J) (1 << (5 * (I) + (J))) +#define SWS_MASK_OFF(I) SWS_MASK(I, 4) +#define SWS_MASK_ROW(I) (0b11111 << (5 * (I))) +#define SWS_MASK_COL(J) (0b1000010000100001 << J) + +enum { + SWS_MASK_ALL = (1 << 20) - 1, + SWS_MASK_LUMA = SWS_MASK(0, 0) | SWS_MASK_OFF(0), + SWS_MASK_ALPHA = SWS_MASK(3, 3) | SWS_MASK_OFF(3), + + SWS_MASK_DIAG3 = SWS_MASK(0, 0) | SWS_MASK(1, 1) | SWS_MASK(2, 2), + SWS_MASK_OFF3 = SWS_MASK_OFF(0) | SWS_MASK_OFF(1) | SWS_MASK_OFF(2), + SWS_MASK_MAT3 = SWS_MASK(0, 0) | SWS_MASK(0, 1) | SWS_MASK(0, 2) | + SWS_MASK(1, 0) | SWS_MASK(1, 1) | SWS_MASK(1, 2) | + SWS_MASK(2, 0) | SWS_MASK(2, 1) | SWS_MASK(2, 2), + + SWS_MASK_DIAG4 = SWS_MASK_DIAG3 | SWS_MASK(3, 3), + SWS_MASK_OFF4 = SWS_MASK_OFF3 | SWS_MASK_OFF(3), + SWS_MASK_MAT4 = SWS_MASK_ALL & ~SWS_MASK_OFF4, +}; + +/* Helper function to compute the correct mask */ +uint32_t ff_sws_linear_mask(SwsLinearOp); + +typedef struct SwsOp { + SwsOpType op; /* operation to perform */ + SwsPixelType type; /* pixel type to operate on */ + union { + SwsReadWriteOp rw; + SwsPackOp pack; + SwsSwizzleOp swizzle; + SwsConvertOp convert; + SwsDitherOp dither; + SwsLinearOp lin; + SwsConst c; + }; + + /* For internal use inside ff_sws_*() functions */ + SwsComps comps; +} SwsOp; + +/** + * Frees any allocations associated with an SwsOp and sets it to {0}. + */ +void ff_sws_op_uninit(SwsOp *op); + +/** + * Apply an operation to an AVRational. No-op for read/write operations. + */ +void ff_sws_apply_op_q(const SwsOp *op, AVRational x[4]); + +/** + * Helper struct for representing a list of operations. 
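+ * + * Minimal usage sketch (error handling omitted, values illustrative): + * + * SwsOpList *ops = ff_sws_op_list_alloc(); + * ff_sws_op_list_append(ops, &(SwsOp) { + * .op = SWS_OP_READ, + * .type = SWS_PIXEL_U8, + * .rw = { .elems = 1 }, + * }); + * ... (append further ops, ending in a SWS_OP_WRITE) ... + * ff_sws_op_list_free(&ops);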
+ */ +typedef struct SwsOpList { + SwsOp *ops; + int num_ops; +} SwsOpList; + +SwsOpList *ff_sws_op_list_alloc(void); +void ff_sws_op_list_free(SwsOpList **ops); + +/** + * Returns a duplicate of `ops`, or NULL on OOM. + */ +SwsOpList *ff_sws_op_list_duplicate(const SwsOpList *ops); + +/** + * Returns the size of the largest pixel type used in `ops`. + */ +int ff_sws_op_list_max_size(const SwsOpList *ops); + +/** + * These will take over ownership of `op` and set it to {0}, even on failure. + */ +int ff_sws_op_list_append(SwsOpList *ops, SwsOp *op); +int ff_sws_op_list_insert_at(SwsOpList *ops, int index, SwsOp *op); + +void ff_sws_op_list_remove_at(SwsOpList *ops, int index, int count); + +/** + * Print out the contents of an operation list. + */ +void ff_sws_op_list_print(void *log_ctx, int log_level, const SwsOpList *ops); + +/** + * Infer + propagate known information about components. Called automatically + * when needed by the optimizer and compiler. + */ +void ff_sws_op_list_update_comps(SwsOpList *ops); + +/** + * Fuse compatible and eliminate redundant operations, as well as replacing + * some operations with more efficient alternatives. + */ +int ff_sws_op_list_optimize(SwsOpList *ops); + +enum SwsOpCompileFlags { + /* Automatically optimize the operations when compiling */ + SWS_OP_FLAG_OPTIMIZE = 1 << 0, +}; + +/** + * Resolves an operation list to a graph pass. The first and last operations + * must be a read/write respectively. `flags` is a list of SwsOpCompileFlags. + * + * Note: `ops` may be modified by this function. + */ +int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat dst, + SwsPass *input, SwsPass **output); + +#endif diff --git a/libswscale/ops_internal.h b/libswscale/ops_internal.h new file mode 100644 index 0000000000..ac0319321e --- /dev/null +++ b/libswscale/ops_internal.h @@ -0,0 +1,103 @@ +/** + * Copyright (C) 2025 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef SWSCALE_OPS_INTERNAL_H +#define SWSCALE_OPS_INTERNAL_H + +#include "libavutil/mem_internal.h" + +#include "ops.h" + +/** + * Global execution context for all compiled functions. + * + * Note: This struct is hard-coded in assembly, so do not change the layout + * without updating the corresponding assembly definitions. 
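+ * + * (This is also why the struct is guarded by the static_assert below: any + * layout change must be caught at compile time before it can silently go + * out of sync with the assembly implementations.)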
+ */ +typedef struct SwsOpExec { + /* The data pointers point to the first pixel to process */ + const uint8_t *in[4]; + uint8_t *out[4]; + + /* Separation between lines in bytes */ + ptrdiff_t in_stride[4]; + ptrdiff_t out_stride[4]; + + /* Extra metadata, may or may not be useful */ + int32_t x, y; /* Starting pixel coordinates */ + int32_t width, height; /* Overall image dimensions */ + int32_t slice_y, slice_h; /* Start and height of current slice */ + int32_t pixel_bits_in; /* Bits per input pixel */ + int32_t pixel_bits_out; /* Bits per output pixel */ +} SwsOpExec; + +static_assert(sizeof(SwsOpExec) == 16 * sizeof(void *) + 8 * sizeof(int32_t), + "SwsOpExec layout mismatch"); + +/* Process a given number of pixel blocks */ +typedef void (*SwsOpFunc)(const SwsOpExec *exec, const void *priv, int blocks); + +#define SWS_DECL_FUNC(NAME) \ + void NAME(const SwsOpExec *, const void *, int) + +typedef struct SwsCompiledOp { + SwsOpFunc func; + + int block_size; /* number of pixels processed per iteration */ + int over_read; /* implementation over-reads input by this many bytes */ + int over_write; /* implementation over-writes output by this many bytes */ + + /* Arbitrary private data */ + void *priv; + void (*free)(void *priv); +} SwsCompiledOp; + +typedef struct SwsOpBackend { + const char *name; /* Descriptive name for this backend */ + + /** + * Compile an operation list to an implementation chain. May modify `ops` + * freely; the original list will be freed automatically by the caller. + * + * Returns 0 or a negative error code. + */ + int (*compile)(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out); +} SwsOpBackend; + +/* List of all backends, terminated by NULL */ +extern const SwsOpBackend *const ff_sws_op_backends[]; +extern const int ff_sws_num_op_backends; /* excludes terminating NULL */ + +/** + * Attempt to compile a list of operations using a specific backend. + * + * Returns 0 on success, or a negative error code on failure. + */ +int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend, + const SwsOpList *ops, SwsCompiledOp *out); + +/** + * Compile a list of operations using the best available backend. + * + * Returns 0 on success, or a negative error code on failure. + */ +int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out); + +#endif diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c new file mode 100644 index 0000000000..9f509085ba --- /dev/null +++ b/libswscale/ops_optimizer.c @@ -0,0 +1,810 @@ +/** + * Copyright (C) 2025 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/avassert.h" +#include "libavutil/mem.h" +#include "libavutil/rational.h" + +#include "ops.h" +#include "ops_internal.h" + +#define Q(N) ((AVRational) { N, 1 }) + +#define RET(x) \ + do { \ + if ((ret = (x)) < 0) \ + return ret; \ + } while (0) + +/* Returns true for operations that are independent per channel. These can + * usually be commuted freely with other such operations. */ +static bool op_type_is_independent(SwsOpType op) +{ + switch (op) { + case SWS_OP_SWAP_BYTES: + case SWS_OP_LSHIFT: + case SWS_OP_RSHIFT: + case SWS_OP_CONVERT: + case SWS_OP_DITHER: + case SWS_OP_MIN: + case SWS_OP_MAX: + case SWS_OP_SCALE: + return true; + case SWS_OP_INVALID: + case SWS_OP_READ: + case SWS_OP_WRITE: + case SWS_OP_SWIZZLE: + case SWS_OP_CLEAR: + case SWS_OP_LINEAR: + case SWS_OP_PACK: + case SWS_OP_UNPACK: + return false; + case SWS_OP_TYPE_NB: + break; + } + + av_assert0(!"Invalid operation type!"); + return false; +} + +static AVRational expand_factor(SwsPixelType from, SwsPixelType to) +{ + const int src = ff_sws_pixel_type_size(from); + const int dst = ff_sws_pixel_type_size(to); + int scale = 0; + for (int i = 0; i < dst / src; i++) + scale = scale << src * 8 | 1; + return Q(scale); +} + +/* merge_comp_flags() forms a monoid with flags_identity as the null element */ +static const unsigned flags_identity = SWS_COMP_ZERO | SWS_COMP_EXACT; +static unsigned merge_comp_flags(unsigned a, unsigned b) +{ + const unsigned flags_or = SWS_COMP_GARBAGE; + const unsigned flags_and = SWS_COMP_ZERO | SWS_COMP_EXACT; + return ((a & b) & flags_and) | ((a | b) & flags_or); +} + +/* Infer + propagate known information about components */ +void ff_sws_op_list_update_comps(SwsOpList *ops) +{ + SwsComps next = { .unused = {true, true, true, true} }; + SwsComps prev = { .flags = { + SWS_COMP_GARBAGE, SWS_COMP_GARBAGE, SWS_COMP_GARBAGE, SWS_COMP_GARBAGE, + }}; + + /* Forwards pass, propagates knowledge about the incoming pixel values */ + for (int n = 0; n < ops->num_ops; n++) { + SwsOp *op = &ops->ops[n]; + + /* Prefill min/max values automatically; may have to be fixed in + * special cases */ + memcpy(op->comps.min, prev.min, sizeof(prev.min)); + memcpy(op->comps.max, prev.max, sizeof(prev.max)); + ff_sws_apply_op_q(op, op->comps.min); + ff_sws_apply_op_q(op, op->comps.max); + + switch (op->op) { + case SWS_OP_READ: + for (int i = 0; i < op->rw.elems; i++) { + if (ff_sws_pixel_type_is_int(op->type)) { + const int size = ff_sws_pixel_type_size(op->type); + const uint64_t max_val = (1ULL << 8 * size) - 1; + op->comps.flags[i] |= SWS_COMP_EXACT; + op->comps.min[i] = Q(0); + op->comps.max[i] = Q(max_val); + } + } + for (int i = op->rw.elems; i < 4; i++) + op->comps.flags[i] |= prev.flags[i]; + break; + case SWS_OP_WRITE: + for (int i = 0; i < op->rw.elems; i++) + av_assert1(!(prev.flags[i] & SWS_COMP_GARBAGE)); + /* fall through */ + case SWS_OP_SWAP_BYTES: + case SWS_OP_LSHIFT: + case SWS_OP_RSHIFT: + case SWS_OP_MIN: + case SWS_OP_MAX: + /* Linearly propagate flags per component */ + for (int i = 0; i < 4; i++) + op->comps.flags[i] |= prev.flags[i]; + break; + case SWS_OP_DITHER: + /* Strip zero flag because of the nonzero dithering offset */ + for (int i = 0; i < 4; i++) + op->comps.flags[i] |= prev.flags[i] & ~SWS_COMP_ZERO; + break; + case SWS_OP_UNPACK: + for (int i = 0; 
i < 4; i++) { + if (op->pack.pattern[i]) + op->comps.flags[i] |= prev.flags[0]; + else + op->comps.flags[i] = SWS_COMP_GARBAGE; + } + break; + case SWS_OP_PACK: { + unsigned flags = flags_identity; + for (int i = 0; i < 4; i++) { + if (op->pack.pattern[i]) + flags = merge_comp_flags(flags, prev.flags[i]); + if (i > 0) /* clear remaining comps for sanity */ + op->comps.flags[i] = SWS_COMP_GARBAGE; + } + op->comps.flags[0] |= flags; + break; + } + case SWS_OP_CLEAR: + for (int i = 0; i < 4; i++) { + if (op->c.q4[i].den) { + if (op->c.q4[i].num == 0) + op->comps.flags[i] |= SWS_COMP_ZERO | SWS_COMP_EXACT; + if (op->c.q4[i].den == 1) + op->comps.flags[i] |= SWS_COMP_EXACT; + } + else + op->comps.flags[i] |= prev.flags[i]; + } + break; + case SWS_OP_SWIZZLE: + for (int i = 0; i < 4; i++) + op->comps.flags[i] |= prev.flags[op->swizzle.in[i]]; + break; + case SWS_OP_CONVERT: + for (int i = 0; i < 4; i++) { + op->comps.flags[i] |= prev.flags[i]; + if (ff_sws_pixel_type_is_int(op->convert.to)) + op->comps.flags[i] |= SWS_COMP_EXACT; + } + break; + case SWS_OP_LINEAR: + for (int i = 0; i < 4; i++) { + unsigned flags = flags_identity; + AVRational min = Q(0), max = Q(0); + for (int j = 0; j < 4; j++) { + const AVRational k = op->lin.m[i][j]; + AVRational mink = av_mul_q(prev.min[j], k); + AVRational maxk = av_mul_q(prev.max[j], k); + if (k.num) { + flags = merge_comp_flags(flags, prev.flags[j]); + if (k.den != 1) /* fractional coefficient */ + flags &= ~SWS_COMP_EXACT; + if (k.num < 0) + FFSWAP(AVRational, mink, maxk); + min = av_add_q(min, mink); + max = av_add_q(max, maxk); + } + } + if (op->lin.m[i][4].num) { /* nonzero offset */ + flags &= ~SWS_COMP_ZERO; + if (op->lin.m[i][4].den != 1) /* fractional offset */ + flags &= ~SWS_COMP_EXACT; + min = av_add_q(min, op->lin.m[i][4]); + max = av_add_q(max, op->lin.m[i][4]); + } + op->comps.flags[i] |= flags; + op->comps.min[i] = min; + op->comps.max[i] = max; + } + break; + case SWS_OP_SCALE: + for (int i = 0; i < 4; i++) { + op->comps.flags[i] |= prev.flags[i]; + if (op->c.q.den != 1) /* fractional scale */ + op->comps.flags[i] &= ~SWS_COMP_EXACT; + if (op->c.q.num < 0) + FFSWAP(AVRational, op->comps.min[i], op->comps.max[i]); + } + break; + + case SWS_OP_INVALID: + case SWS_OP_TYPE_NB: + av_assert0(!"Invalid operation type!"); + } + + prev = op->comps; + } + + /* Backwards pass, solves for component dependencies */ + for (int n = ops->num_ops - 1; n >= 0; n--) { + SwsOp *op = &ops->ops[n]; + + switch (op->op) { + case SWS_OP_READ: + case SWS_OP_WRITE: + for (int i = 0; i < op->rw.elems; i++) + op->comps.unused[i] = op->op == SWS_OP_READ; + for (int i = op->rw.elems; i < 4; i++) + op->comps.unused[i] |= next.unused[i]; + break; + case SWS_OP_SWAP_BYTES: + case SWS_OP_LSHIFT: + case SWS_OP_RSHIFT: + case SWS_OP_CONVERT: + case SWS_OP_DITHER: + case SWS_OP_MIN: + case SWS_OP_MAX: + case SWS_OP_SCALE: + for (int i = 0; i < 4; i++) + op->comps.unused[i] |= next.unused[i]; + break; + case SWS_OP_UNPACK: { + bool unused = true; + for (int i = 0; i < 4; i++) { + if (op->pack.pattern[i]) + unused &= next.unused[i]; + op->comps.unused[i] |= i > 0; + } + op->comps.unused[0] = unused; + break; + } + case SWS_OP_PACK: + for (int i = 0; i < 4; i++) { + if (op->pack.pattern[i]) + op->comps.unused[i] |= next.unused[0]; + else + op->comps.unused[i] = true; + } + break; + case SWS_OP_CLEAR: + for (int i = 0; i < 4; i++) { + if (op->c.q4[i].den) + op->comps.unused[i] = true; + else + op->comps.unused[i] |= next.unused[i]; + } + break; + case SWS_OP_SWIZZLE: { + bool 
unused[4] = { true, true, true, true }; + for (int i = 0; i < 4; i++) + unused[op->swizzle.in[i]] &= next.unused[i]; + for (int i = 0; i < 4; i++) + op->comps.unused[i] = unused[i]; + break; + } + case SWS_OP_LINEAR: + for (int j = 0; j < 4; j++) { + bool unused = true; + for (int i = 0; i < 4; i++) { + if (op->lin.m[i][j].num) + unused &= next.unused[i]; + } + op->comps.unused[j] = unused; + } + break; + } + + next = op->comps; + } +} + +/* returns log2(x) only if x is a power of two, or 0 otherwise */ +static int exact_log2(const int x) +{ + int p; + if (x <= 0) + return 0; + p = av_log2(x); + return (1 << p) == x ? p : 0; +} + +static int exact_log2_q(const AVRational x) +{ + if (x.den == 1) + return exact_log2(x.num); + else if (x.num == 1) + return -exact_log2(x.den); + else + return 0; +} + +/** + * If a linear operation can be reduced to a scalar multiplication, returns + * the corresponding scaling factor, or 0 otherwise. + */ +static bool extract_scalar(const SwsLinearOp *c, SwsComps prev, SwsComps next, + SwsConst *out_scale) +{ + SwsConst scale = {0}; + + /* There are components not on the main diagonal */ + if (c->mask & ~SWS_MASK_DIAG4) + return false; + + for (int i = 0; i < 4; i++) { + const AVRational s = c->m[i][i]; + if ((prev.flags[i] & SWS_COMP_ZERO) || next.unused[i]) + continue; + if (scale.q.den && av_cmp_q(s, scale.q)) + return false; + scale.q = s; + } + + if (scale.q.den) + *out_scale = scale; + return scale.q.den; +} + +/* Extracts an integer clear operation (subset) from the given linear op. */ +static bool extract_constant_rows(SwsLinearOp *c, SwsComps prev, + SwsConst *out_clear) +{ + SwsConst clear = {0}; + bool ret = false; + + for (int i = 0; i < 4; i++) { + bool const_row = c->m[i][4].den == 1; /* offset is integer */ + for (int j = 0; j < 4; j++) { + const_row &= c->m[i][j].num == 0 || /* scalar is zero */ + (prev.flags[j] & SWS_COMP_ZERO); /* input is zero */ + } + if (const_row && (c->mask & SWS_MASK_ROW(i))) { + clear.q4[i] = c->m[i][4]; + for (int j = 0; j < 5; j++) + c->m[i][j] = Q(i == j); + c->mask &= ~SWS_MASK_ROW(i); + ret = true; + } + } + + if (ret) + *out_clear = clear; + return ret; +} + +/* Unswizzle a linear operation by aligning single-input rows with + * their corresponding diagonal */ +static bool extract_swizzle(SwsLinearOp *op, SwsComps prev, SwsSwizzleOp *out_swiz) +{ + SwsSwizzleOp swiz = SWS_SWIZZLE(0, 1, 2, 3); + SwsLinearOp c = *op; + + for (int i = 0; i < 4; i++) { + int idx = -1; + for (int j = 0; j < 4; j++) { + if (!c.m[i][j].num || (prev.flags[j] & SWS_COMP_ZERO)) + continue; + if (idx >= 0) + return false; /* multiple inputs */ + idx = j; + } + + if (idx >= 0 && idx != i) { + /* Move coefficient to the diagonal */ + c.m[i][i] = c.m[i][idx]; + c.m[i][idx] = Q(0); + swiz.in[i] = idx; + } + } + + if (swiz.mask == SWS_SWIZZLE(0, 1, 2, 3).mask) + return false; /* no swizzle was identified */ + + c.mask = ff_sws_linear_mask(c); + *out_swiz = swiz; + *op = c; + return true; +} + +static void op_copy_flags(SwsOp *op, const SwsOp *op2) +{ + for (int i = 0; i < 4; i++) + op->comps.flags[i] = op2->comps.flags[i]; +} + +/* Should only be used on ops that commute with each other, and only after + * applying the necessary adjustments + */ +static void swap_ops(SwsOp *op, SwsOp *next) +{ + /* Clear all inferred flags */ + op->comps = next->comps = (SwsComps) {0}; + FFSWAP(SwsOp, *op, *next); +} + +int ff_sws_op_list_optimize(SwsOpList *ops) +{ + int prev_num_ops, ret; + bool progress; + + do { + prev_num_ops = ops->num_ops; + progress = 
false; + + ff_sws_op_list_update_comps(ops); + + for (int n = 0; n < ops->num_ops;) { + SwsOp dummy = {0}; + SwsOp *op = &ops->ops[n]; + SwsOp *prev = n ? &ops->ops[n - 1] : &dummy; + SwsOp *next = n + 1 < ops->num_ops ? &ops->ops[n + 1] : &dummy; + + /* common helper variables */ + bool changed = false; + bool noop = true; + + switch (op->op) { + case SWS_OP_READ: + /* Optimized further into refcopy / memcpy */ + if (next->op == SWS_OP_WRITE && + next->rw.elems == op->rw.elems && + next->rw.packed == op->rw.packed && + next->rw.frac == op->rw.frac) + { + ff_sws_op_list_remove_at(ops, n, 2); + av_assert1(ops->num_ops == 0); + return 0; + } + + /* Skip reading extra unneeded components */ + if (!op->rw.packed) { + int needed = op->rw.elems; + while (needed > 0 && next->comps.unused[needed - 1]) + needed--; + if (op->rw.elems != needed) { + op->rw.elems = needed; + op->rw.packed &= op->rw.elems > 1; + progress = true; + continue; + } + } + break; + + case SWS_OP_SWAP_BYTES: + /* Redundant (double) swap */ + if (next->op == SWS_OP_SWAP_BYTES) { + ff_sws_op_list_remove_at(ops, n, 2); + continue; + } + break; + + case SWS_OP_UNPACK: + /* Redundant unpack+pack */ + if (next->op == SWS_OP_PACK && next->type == op->type && + next->pack.pattern[0] == op->pack.pattern[0] && + next->pack.pattern[1] == op->pack.pattern[1] && + next->pack.pattern[2] == op->pack.pattern[2] && + next->pack.pattern[3] == op->pack.pattern[3]) + { + ff_sws_op_list_remove_at(ops, n, 2); + continue; + } + + /* Skip unpacking components that are not used */ + for (int i = 3; i > 0 && next->comps.unused[i]; i--) + op->pack.pattern[i] = 0; + break; + + case SWS_OP_PACK: + /* Skip packing known-to-be-zero components */ + for (int i = 3; i > 0; i--) { + if (!(prev->comps.flags[i] & SWS_COMP_ZERO)) + break; + op->pack.pattern[i] = 0; + } + break; + + case SWS_OP_LSHIFT: + case SWS_OP_RSHIFT: + /* Two shifts in the same direction */ + if (next->op == op->op) { + op->c.u += next->c.u; + ff_sws_op_list_remove_at(ops, n + 1, 1); + continue; + } + + /* No-op shift */ + if (!op->c.u) { + ff_sws_op_list_remove_at(ops, n, 1); + continue; + } + break; + + case SWS_OP_CLEAR: + for (int i = 0; i < 4; i++) { + if (!op->c.q4[i].den) + continue; + + if ((prev->comps.flags[i] & SWS_COMP_ZERO) && + !(prev->comps.flags[i] & SWS_COMP_GARBAGE) && + op->c.q4[i].num == 0) + { + /* Redundant clear-to-zero of zero component */ + op->c.q4[i].den = 0; + } else if (next->comps.unused[i]) { + /* Unnecessary clear of unused component */ + op->c.q4[i] = (AVRational) {0, 0}; + } else if (op->c.q4[i].den) { + noop = false; + } + } + + if (noop) { + ff_sws_op_list_remove_at(ops, n, 1); + continue; + } + + /* Transitive clear */ + if (next->op == SWS_OP_CLEAR) { + for (int i = 0; i < 4; i++) { + if (next->c.q4[i].den) + op->c.q4[i] = next->c.q4[i]; + } + ff_sws_op_list_remove_at(ops, n + 1, 1); + continue; + } + + /* Prefer to clear as late as possible, to avoid doing + * redundant work */ + if ((op_type_is_independent(next->op) && next->op != SWS_OP_SWAP_BYTES) || + next->op == SWS_OP_SWIZZLE) + { + if (next->op == SWS_OP_CONVERT) + op->type = next->convert.to; + ff_sws_apply_op_q(next, op->c.q4); + swap_ops(op, next); + progress = true; + continue; + } + break; + + case SWS_OP_SWIZZLE: { + bool seen[4] = {0}; + bool has_duplicates = false; + for (int i = 0; i < 4; i++) { + if (next->comps.unused[i]) + continue; + if (op->swizzle.in[i] != i) + noop = false; + has_duplicates |= seen[op->swizzle.in[i]]; + seen[op->swizzle.in[i]] = true; + } + + /* Identity swizzle 
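(all used lanes map back to themselves)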
*/ + if (noop) { + ff_sws_op_list_remove_at(ops, n, 1); + continue; + } + + /* Transitive swizzle */ + if (next->op == SWS_OP_SWIZZLE) { + const SwsSwizzleOp orig = op->swizzle; + for (int i = 0; i < 4; i++) + op->swizzle.in[i] = orig.in[next->swizzle.in[i]]; + op_copy_flags(op, next); + ff_sws_op_list_remove_at(ops, n + 1, 1); + continue; + } + + /* Try to push swizzles with duplicates towards the output */ + if (has_duplicates && op_type_is_independent(next->op)) { + if (next->op == SWS_OP_CONVERT) + op->type = next->convert.to; + if (next->op == SWS_OP_MIN || next->op == SWS_OP_MAX) { + /* Un-swizzle the next operation */ + const SwsConst c = next->c; + for (int i = 0; i < 4; i++) { + if (!next->comps.unused[i]) + next->c.q4[op->swizzle.in[i]] = c.q4[i]; + } + } + swap_ops(op, next); + progress = true; + continue; + } + break; + } + + case SWS_OP_CONVERT: + /* No-op conversion */ + if (op->type == op->convert.to) { + ff_sws_op_list_remove_at(ops, n, 1); + continue; + } + + /* Transitive conversion */ + if (next->op == SWS_OP_CONVERT && + op->convert.expand == next->convert.expand) + { + av_assert1(op->convert.to == next->type); + op->convert.to = next->convert.to; + op_copy_flags(op, next); + ff_sws_op_list_remove_at(ops, n + 1, 1); + continue; + } + + /* Conversion followed by integer expansion */ + if (next->op == SWS_OP_SCALE && + !av_cmp_q(next->c.q, expand_factor(op->type, op->convert.to))) + { + op->convert.expand = true; + ff_sws_op_list_remove_at(ops, n + 1, 1); + continue; + } + break; + + case SWS_OP_MIN: + for (int i = 0; i < 4; i++) { + if (next->comps.unused[i] || !op->c.q4[i].den) + continue; + if (av_cmp_q(op->c.q4[i], prev->comps.max[i]) < 0) + noop = false; + } + + if (noop) { + ff_sws_op_list_remove_at(ops, n, 1); + continue; + } + break; + + case SWS_OP_MAX: + for (int i = 0; i < 4; i++) { + if (next->comps.unused[i] || !op->c.q4[i].den) + continue; + if (av_cmp_q(prev->comps.min[i], op->c.q4[i]) < 0) + noop = false; + } + + if (noop) { + ff_sws_op_list_remove_at(ops, n, 1); + continue; + } + break; + + case SWS_OP_DITHER: + for (int i = 0; i < 4; i++) { + noop &= (prev->comps.flags[i] & SWS_COMP_EXACT) || + next->comps.unused[i]; + } + + if (noop) { + ff_sws_op_list_remove_at(ops, n, 1); + continue; + } + break; + + case SWS_OP_LINEAR: { + SwsSwizzleOp swizzle; + SwsConst c; + + /* No-op (identity) linear operation */ + if (!op->lin.mask) { + ff_sws_op_list_remove_at(ops, n, 1); + continue; + } + + if (next->op == SWS_OP_LINEAR) { + /* 5x5 matrix multiplication after appending [ 0 0 0 0 1 ] */ + const SwsLinearOp m1 = op->lin; + const SwsLinearOp m2 = next->lin; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 5; j++) { + AVRational sum = Q(0); + for (int k = 0; k < 4; k++) + sum = av_add_q(sum, av_mul_q(m2.m[i][k], m1.m[k][j])); + if (j == 4) /* m1.m[4][j] == 1 */ + sum = av_add_q(sum, m2.m[i][4]); + op->lin.m[i][j] = sum; + } + } + op_copy_flags(op, next); + op->lin.mask = ff_sws_linear_mask(op->lin); + ff_sws_op_list_remove_at(ops, n + 1, 1); + continue; + } + + /* Optimize away zero columns */ + for (int j = 0; j < 4; j++) { + const uint32_t col = SWS_MASK_COL(j); + if (!(prev->comps.flags[j] & SWS_COMP_ZERO) || !(op->lin.mask & col)) + continue; + for (int i = 0; i < 4; i++) + op->lin.m[i][j] = Q(i == j); + op->lin.mask &= ~col; + changed = true; + } + + /* Optimize away unused rows */ + for (int i = 0; i < 4; i++) { + const uint32_t row = SWS_MASK_ROW(i); + if (!next->comps.unused[i] || !(op->lin.mask & row)) + continue; + for (int j = 0; j < 5; j++) + 
op->lin.m[i][j] = Q(i == j); + op->lin.mask &= ~row; + changed = true; + } + + if (changed) { + progress = true; + continue; + } + + /* Convert constant rows to explicit clear instruction */ + if (extract_constant_rows(&op->lin, prev->comps, &c)) { + RET(ff_sws_op_list_insert_at(ops, n + 1, &(SwsOp) { + .op = SWS_OP_CLEAR, + .type = op->type, + .comps = op->comps, + .c = c, + })); + continue; + } + + /* Multiplication by scalar constant */ + if (extract_scalar(&op->lin, prev->comps, next->comps, &c)) { + op->op = SWS_OP_SCALE; + op->c = c; + progress = true; + continue; + } + + /* Swizzle by fixed pattern */ + if (extract_swizzle(&op->lin, prev->comps, &swizzle)) { + RET(ff_sws_op_list_insert_at(ops, n, &(SwsOp) { + .op = SWS_OP_SWIZZLE, + .type = op->type, + .swizzle = swizzle, + })); + continue; + } + break; + } + + case SWS_OP_SCALE: { + const int factor2 = exact_log2_q(op->c.q); + + /* No-op scaling */ + if (op->c.q.num == 1 && op->c.q.den == 1) { + ff_sws_op_list_remove_at(ops, n, 1); + continue; + } + + /* Scaling by integer before conversion to int */ + if (op->c.q.den == 1 && + next->op == SWS_OP_CONVERT && + ff_sws_pixel_type_is_int(next->convert.to)) + { + op->type = next->convert.to; + swap_ops(op, next); + progress = true; + continue; + } + + /* Scaling by exact power of two */ + if (factor2 && ff_sws_pixel_type_is_int(op->type)) { + op->op = factor2 > 0 ? SWS_OP_LSHIFT : SWS_OP_RSHIFT; + op->c.u = FFABS(factor2); + progress = true; + continue; + } + break; + } + } + + /* No optimization triggered, move on to next operation */ + n++; + } + } while (prev_num_ops != ops->num_ops || progress); + + return 0; +} -- 2.49.0