From: Niklas Haas <g...@haasn.dev> This is responsible for taking a "naive" ops list and optimizing it as much as possible. Also includes a small analyzer that generates component metadata for use by the optimizer. --- libswscale/Makefile | 1 + libswscale/ops.h | 12 + libswscale/ops_optimizer.c | 783 +++++++++++++++++++++++++++++++++++++ 3 files changed, 796 insertions(+) create mode 100644 libswscale/ops_optimizer.c
diff --git a/libswscale/Makefile b/libswscale/Makefile index e0beef4e69..810c9dee78 100644 --- a/libswscale/Makefile +++ b/libswscale/Makefile @@ -16,6 +16,7 @@ OBJS = alphablend.o \ input.o \ lut3d.o \ ops.o \ + ops_optimizer.o \ options.o \ output.o \ rgb2rgb.o \ diff --git a/libswscale/ops.h b/libswscale/ops.h index 85462ae337..ae65d578b3 100644 --- a/libswscale/ops.h +++ b/libswscale/ops.h @@ -237,4 +237,16 @@ void ff_sws_op_list_remove_at(SwsOpList *ops, int index, int count); */ void ff_sws_op_list_print(void *log_ctx, int log_level, const SwsOpList *ops); +/** + * Infer + propagate known information about components. Called automatically + * when needed by the optimizer and compiler. + */ +void ff_sws_op_list_update_comps(SwsOpList *ops); + +/** + * Fuse compatible and eliminate redundant operations, as well as replacing + * some operations with more efficient alternatives. + */ +int ff_sws_op_list_optimize(SwsOpList *ops); + #endif diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c new file mode 100644 index 0000000000..d503bf7bf3 --- /dev/null +++ b/libswscale/ops_optimizer.c @@ -0,0 +1,783 @@ +/** + * Copyright (C) 2025 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/avassert.h" +#include "libavutil/rational.h" + +#include "ops.h" + +#define Q(N) ((AVRational) { N, 1 }) + +#define RET(x) \ + do { \ + if ((ret = (x)) < 0) \ + return ret; \ + } while (0) + +/* Returns true for operations that are independent per channel. These can + * usually be commuted freely with other such operations. */ +static bool op_type_is_independent(SwsOpType op) +{ + switch (op) { + case SWS_OP_SWAP_BYTES: + case SWS_OP_LSHIFT: + case SWS_OP_RSHIFT: + case SWS_OP_CONVERT: + case SWS_OP_DITHER: + case SWS_OP_MIN: + case SWS_OP_MAX: + case SWS_OP_SCALE: + return true; + case SWS_OP_INVALID: + case SWS_OP_READ: + case SWS_OP_WRITE: + case SWS_OP_SWIZZLE: + case SWS_OP_CLEAR: + case SWS_OP_LINEAR: + case SWS_OP_PACK: + case SWS_OP_UNPACK: + return false; + case SWS_OP_TYPE_NB: + break; + } + + av_assert0(!"Invalid operation type!"); + return false; +} + +static AVRational expand_factor(SwsPixelType from, SwsPixelType to) +{ + const int src = ff_sws_pixel_type_size(from); + const int dst = ff_sws_pixel_type_size(to); + int scale = 0; + for (int i = 0; i < dst / src; i++) + scale = scale << src * 8 | 1; + return Q(scale); +} + +/* merge_comp_flags() forms a monoid with flags_identity as the null element */ +static const unsigned flags_identity = SWS_COMP_ZERO | SWS_COMP_EXACT; +static unsigned merge_comp_flags(unsigned a, unsigned b) +{ + const unsigned flags_or = SWS_COMP_GARBAGE; + const unsigned flags_and = SWS_COMP_ZERO | SWS_COMP_EXACT; + return ((a & b) & flags_and) | ((a | b) & flags_or); +} + +/* Infer + propagate known information about components */ +void ff_sws_op_list_update_comps(SwsOpList *ops) +{ + SwsComps next = { .unused = {true, true, true, true} }; + SwsComps prev = { .flags = { + SWS_COMP_GARBAGE, 
SWS_COMP_GARBAGE, SWS_COMP_GARBAGE, SWS_COMP_GARBAGE, + }}; + + /* Forwards pass, propagates knowledge about the incoming pixel values */ + for (int n = 0; n < ops->num_ops; n++) { + SwsOp *op = &ops->ops[n]; + + /* Prefill min/max values automatically; may have to be fixed in + * special cases */ + memcpy(op->comps.min, prev.min, sizeof(prev.min)); + memcpy(op->comps.max, prev.max, sizeof(prev.max)); + + if (op->op != SWS_OP_SWAP_BYTES) { + ff_sws_apply_op_q(op, op->comps.min); + ff_sws_apply_op_q(op, op->comps.max); + } + + switch (op->op) { + case SWS_OP_READ: + for (int i = 0; i < op->rw.elems; i++) { + if (ff_sws_pixel_type_is_int(op->type)) { + int bits = 8 * ff_sws_pixel_type_size(op->type); + if (!op->rw.packed && ops->src.desc) { + /* Use legal value range from pixdesc if available; + * we don't need to do this for packed formats because + * non-byte-aligned packed formats will necessarily go + * through SWS_OP_UNPACK anyway */ + for (int c = 0; c < 4; c++) { + if (ops->src.desc->comp[c].plane == i) { + bits = ops->src.desc->comp[c].depth; + break; + } + } + } + + op->comps.flags[i] = SWS_COMP_EXACT; + op->comps.min[i] = Q(0); + op->comps.max[i] = Q((1ULL << bits) - 1); + } + } + for (int i = op->rw.elems; i < 4; i++) + op->comps.flags[i] = prev.flags[i]; + break; + case SWS_OP_WRITE: + for (int i = 0; i < op->rw.elems; i++) + av_assert1(!(prev.flags[i] & SWS_COMP_GARBAGE)); + /* fall through */ + case SWS_OP_SWAP_BYTES: + case SWS_OP_LSHIFT: + case SWS_OP_RSHIFT: + case SWS_OP_MIN: + case SWS_OP_MAX: + /* Linearly propagate flags per component */ + for (int i = 0; i < 4; i++) + op->comps.flags[i] = prev.flags[i]; + break; + case SWS_OP_DITHER: + /* Strip zero flag because of the nonzero dithering offset */ + for (int i = 0; i < 4; i++) + op->comps.flags[i] = prev.flags[i] & ~SWS_COMP_ZERO; + break; + case SWS_OP_UNPACK: + for (int i = 0; i < 4; i++) { + if (op->pack.pattern[i]) + op->comps.flags[i] = prev.flags[0]; + else + op->comps.flags[i] = 
SWS_COMP_GARBAGE; + } + break; + case SWS_OP_PACK: { + unsigned flags = flags_identity; + for (int i = 0; i < 4; i++) { + if (op->pack.pattern[i]) + flags = merge_comp_flags(flags, prev.flags[i]); + if (i > 0) /* clear remaining comps for sanity */ + op->comps.flags[i] = SWS_COMP_GARBAGE; + } + op->comps.flags[0] = flags; + break; + } + case SWS_OP_CLEAR: + for (int i = 0; i < 4; i++) { + if (op->c.q4[i].den) { + if (op->c.q4[i].num == 0) { + op->comps.flags[i] = SWS_COMP_ZERO | SWS_COMP_EXACT; + } else if (op->c.q4[i].den == 1) { + op->comps.flags[i] = SWS_COMP_EXACT; + } + } else { + op->comps.flags[i] = prev.flags[i]; + } + } + break; + case SWS_OP_SWIZZLE: + for (int i = 0; i < 4; i++) + op->comps.flags[i] = prev.flags[op->swizzle.in[i]]; + break; + case SWS_OP_CONVERT: + for (int i = 0; i < 4; i++) { + op->comps.flags[i] = prev.flags[i]; + if (ff_sws_pixel_type_is_int(op->convert.to)) + op->comps.flags[i] |= SWS_COMP_EXACT; + } + break; + case SWS_OP_LINEAR: + for (int i = 0; i < 4; i++) { + unsigned flags = flags_identity; + AVRational min = Q(0), max = Q(0); + for (int j = 0; j < 4; j++) { + const AVRational k = op->lin.m[i][j]; + AVRational mink = av_mul_q(prev.min[j], k); + AVRational maxk = av_mul_q(prev.max[j], k); + if (k.num) { + flags = merge_comp_flags(flags, prev.flags[j]); + if (k.den != 1) /* fractional coefficient */ + flags &= ~SWS_COMP_EXACT; + if (k.num < 0) + FFSWAP(AVRational, mink, maxk); + min = av_add_q(min, mink); + max = av_add_q(max, maxk); + } + } + if (op->lin.m[i][4].num) { /* nonzero offset */ + flags &= ~SWS_COMP_ZERO; + if (op->lin.m[i][4].den != 1) /* fractional offset */ + flags &= ~SWS_COMP_EXACT; + min = av_add_q(min, op->lin.m[i][4]); + max = av_add_q(max, op->lin.m[i][4]); + } + op->comps.flags[i] = flags; + op->comps.min[i] = min; + op->comps.max[i] = max; + } + break; + case SWS_OP_SCALE: + for (int i = 0; i < 4; i++) { + op->comps.flags[i] = prev.flags[i]; + if (op->c.q.den != 1) /* fractional scale */ + 
op->comps.flags[i] &= ~SWS_COMP_EXACT; + if (op->c.q.num < 0) + FFSWAP(AVRational, op->comps.min[i], op->comps.max[i]); + } + break; + + case SWS_OP_INVALID: + case SWS_OP_TYPE_NB: + av_assert0(!"Invalid operation type!"); + } + + prev = op->comps; + } + + /* Backwards pass, solves for component dependencies */ + for (int n = ops->num_ops - 1; n >= 0; n--) { + SwsOp *op = &ops->ops[n]; + + switch (op->op) { + case SWS_OP_READ: + case SWS_OP_WRITE: + for (int i = 0; i < op->rw.elems; i++) + op->comps.unused[i] = op->op == SWS_OP_READ; + for (int i = op->rw.elems; i < 4; i++) + op->comps.unused[i] = next.unused[i]; + break; + case SWS_OP_SWAP_BYTES: + case SWS_OP_LSHIFT: + case SWS_OP_RSHIFT: + case SWS_OP_CONVERT: + case SWS_OP_DITHER: + case SWS_OP_MIN: + case SWS_OP_MAX: + case SWS_OP_SCALE: + for (int i = 0; i < 4; i++) + op->comps.unused[i] = next.unused[i]; + break; + case SWS_OP_UNPACK: { + bool unused = true; + for (int i = 0; i < 4; i++) { + if (op->pack.pattern[i]) + unused &= next.unused[i]; + op->comps.unused[i] = i > 0; + } + op->comps.unused[0] = unused; + break; + } + case SWS_OP_PACK: + for (int i = 0; i < 4; i++) { + if (op->pack.pattern[i]) + op->comps.unused[i] = next.unused[0]; + else + op->comps.unused[i] = true; + } + break; + case SWS_OP_CLEAR: + for (int i = 0; i < 4; i++) { + if (op->c.q4[i].den) + op->comps.unused[i] = true; + else + op->comps.unused[i] = next.unused[i]; + } + break; + case SWS_OP_SWIZZLE: { + bool unused[4] = { true, true, true, true }; + for (int i = 0; i < 4; i++) + unused[op->swizzle.in[i]] &= next.unused[i]; + for (int i = 0; i < 4; i++) + op->comps.unused[i] = unused[i]; + break; + } + case SWS_OP_LINEAR: + for (int j = 0; j < 4; j++) { + bool unused = true; + for (int i = 0; i < 4; i++) { + if (op->lin.m[i][j].num) + unused &= next.unused[i]; + } + op->comps.unused[j] = unused; + } + break; + } + + next = op->comps; + } +} + +/* returns log2(x) only if x is a power of two, or 0 otherwise */ +static int 
exact_log2(const int x) +{ + int p; + if (x <= 0) + return 0; + p = av_log2(x); + return (1 << p) == x ? p : 0; +} + +static int exact_log2_q(const AVRational x) +{ + if (x.den == 1) + return exact_log2(x.num); + else if (x.num == 1) + return -exact_log2(x.den); + else + return 0; +} + +/** + * If a linear operation can be reduced to a scalar multiplication, returns + * true and writes the corresponding scaling factor to out_scale. + */ +static bool extract_scalar(const SwsLinearOp *c, SwsComps prev, SwsComps next, + SwsConst *out_scale) +{ + SwsConst scale = {0}; + + /* There are components not on the main diagonal */ + if (c->mask & ~SWS_MASK_DIAG4) + return false; + + for (int i = 0; i < 4; i++) { + const AVRational s = c->m[i][i]; + if ((prev.flags[i] & SWS_COMP_ZERO) || next.unused[i]) + continue; + if (scale.q.den && av_cmp_q(s, scale.q)) + return false; + scale.q = s; + } + + if (scale.q.den) + *out_scale = scale; + return scale.q.den; +} + +/* Extracts an integer clear operation (subset) from the given linear op. 
*/ +static bool extract_constant_rows(SwsLinearOp *c, SwsComps prev, + SwsConst *out_clear) +{ + SwsConst clear = {0}; + bool ret = false; + + for (int i = 0; i < 4; i++) { + bool const_row = c->m[i][4].den == 1; /* offset is integer */ + for (int j = 0; j < 4; j++) { + const_row &= c->m[i][j].num == 0 || /* scalar is zero */ + (prev.flags[j] & SWS_COMP_ZERO); /* input is zero */ + } + if (const_row && (c->mask & SWS_MASK_ROW(i))) { + clear.q4[i] = c->m[i][4]; + for (int j = 0; j < 5; j++) + c->m[i][j] = Q(i == j); + c->mask &= ~SWS_MASK_ROW(i); + ret = true; + } + } + + if (ret) + *out_clear = clear; + return ret; +} + +/* Unswizzle a linear operation by aligning single-input rows with + * their corresponding diagonal */ +static bool extract_swizzle(SwsLinearOp *op, SwsComps prev, SwsSwizzleOp *out_swiz) +{ + SwsSwizzleOp swiz = SWS_SWIZZLE(0, 1, 2, 3); + SwsLinearOp c = *op; + + for (int i = 0; i < 4; i++) { + int idx = -1; + for (int j = 0; j < 4; j++) { + if (!c.m[i][j].num || (prev.flags[j] & SWS_COMP_ZERO)) + continue; + if (idx >= 0) + return false; /* multiple inputs */ + idx = j; + } + + if (idx >= 0 && idx != i) { + /* Move coefficient to the diagonal */ + c.m[i][i] = c.m[i][idx]; + c.m[i][idx] = Q(0); + swiz.in[i] = idx; + } + } + + if (swiz.mask == SWS_SWIZZLE(0, 1, 2, 3).mask) + return false; /* no swizzle was identified */ + + c.mask = ff_sws_linear_mask(c); + *out_swiz = swiz; + *op = c; + return true; +} + +int ff_sws_op_list_optimize(SwsOpList *ops) +{ + int ret; + +retry: + ff_sws_op_list_update_comps(ops); + + for (int n = 0; n < ops->num_ops;) { + SwsOp dummy = {0}; + SwsOp *op = &ops->ops[n]; + SwsOp *prev = n ? &ops->ops[n - 1] : &dummy; + SwsOp *next = n + 1 < ops->num_ops ? 
&ops->ops[n + 1] : &dummy; + + /* common helper variable */ + bool noop = true; + + switch (op->op) { + case SWS_OP_READ: + /* Optimized further into refcopy / memcpy */ + if (next->op == SWS_OP_WRITE && + next->rw.elems == op->rw.elems && + next->rw.packed == op->rw.packed && + next->rw.frac == op->rw.frac) + { + ff_sws_op_list_remove_at(ops, n, 2); + av_assert1(ops->num_ops == 0); + return 0; + } + + /* Skip reading extra unneeded components */ + if (!op->rw.packed) { + int needed = op->rw.elems; + while (needed > 0 && next->comps.unused[needed - 1]) + needed--; + if (op->rw.elems != needed) { + op->rw.elems = needed; + op->rw.packed &= op->rw.elems > 1; + goto retry; + } + } + break; + + case SWS_OP_SWAP_BYTES: + /* Redundant (double) swap */ + if (next->op == SWS_OP_SWAP_BYTES) { + ff_sws_op_list_remove_at(ops, n, 2); + goto retry; + } + break; + + case SWS_OP_UNPACK: + /* Redundant unpack+pack */ + if (next->op == SWS_OP_PACK && next->type == op->type && + next->pack.pattern[0] == op->pack.pattern[0] && + next->pack.pattern[1] == op->pack.pattern[1] && + next->pack.pattern[2] == op->pack.pattern[2] && + next->pack.pattern[3] == op->pack.pattern[3]) + { + ff_sws_op_list_remove_at(ops, n, 2); + goto retry; + } + break; + + case SWS_OP_LSHIFT: + case SWS_OP_RSHIFT: + /* Two shifts in the same direction */ + if (next->op == op->op) { + op->c.u += next->c.u; + ff_sws_op_list_remove_at(ops, n + 1, 1); + goto retry; + } + + /* No-op shift */ + if (!op->c.u) { + ff_sws_op_list_remove_at(ops, n, 1); + goto retry; + } + break; + + case SWS_OP_CLEAR: + for (int i = 0; i < 4; i++) { + if (!op->c.q4[i].den) + continue; + + if ((prev->comps.flags[i] & SWS_COMP_ZERO) && + !(prev->comps.flags[i] & SWS_COMP_GARBAGE) && + op->c.q4[i].num == 0) + { + /* Redundant clear-to-zero of zero component */ + op->c.q4[i].den = 0; + } else if (next->comps.unused[i]) { + /* Unnecessary clear of unused component */ + op->c.q4[i] = (AVRational) {0, 0}; + } else if (op->c.q4[i].den) { + noop = 
false; + } + } + + if (noop) { + ff_sws_op_list_remove_at(ops, n, 1); + goto retry; + } + + /* Transitive clear */ + if (next->op == SWS_OP_CLEAR) { + for (int i = 0; i < 4; i++) { + if (next->c.q4[i].den) + op->c.q4[i] = next->c.q4[i]; + } + ff_sws_op_list_remove_at(ops, n + 1, 1); + goto retry; + } + + /* Prefer to clear as late as possible, to avoid doing + * redundant work */ + if ((op_type_is_independent(next->op) && next->op != SWS_OP_SWAP_BYTES) || + next->op == SWS_OP_SWIZZLE) + { + if (next->op == SWS_OP_CONVERT) + op->type = next->convert.to; + ff_sws_apply_op_q(next, op->c.q4); + FFSWAP(SwsOp, *op, *next); + goto retry; + } + break; + + case SWS_OP_SWIZZLE: { + bool seen[4] = {0}; + bool has_duplicates = false; + for (int i = 0; i < 4; i++) { + if (next->comps.unused[i]) + continue; + if (op->swizzle.in[i] != i) + noop = false; + has_duplicates |= seen[op->swizzle.in[i]]; + seen[op->swizzle.in[i]] = true; + } + + /* Identity swizzle */ + if (noop) { + ff_sws_op_list_remove_at(ops, n, 1); + goto retry; + } + + /* Transitive swizzle */ + if (next->op == SWS_OP_SWIZZLE) { + const SwsSwizzleOp orig = op->swizzle; + for (int i = 0; i < 4; i++) + op->swizzle.in[i] = orig.in[next->swizzle.in[i]]; + ff_sws_op_list_remove_at(ops, n + 1, 1); + goto retry; + } + + /* Try to push swizzles with duplicates towards the output */ + if (has_duplicates && op_type_is_independent(next->op)) { + if (next->op == SWS_OP_CONVERT) + op->type = next->convert.to; + if (next->op == SWS_OP_MIN || next->op == SWS_OP_MAX) { + /* Un-swizzle the next operation */ + const SwsConst c = next->c; + for (int i = 0; i < 4; i++) { + if (!next->comps.unused[i]) + next->c.q4[op->swizzle.in[i]] = c.q4[i]; + } + } + FFSWAP(SwsOp, *op, *next); + goto retry; + } + + /* Move swizzle out of the way between two converts so that + * they may be merged */ + if (prev->op == SWS_OP_CONVERT && next->op == SWS_OP_CONVERT) { + op->type = next->convert.to; + FFSWAP(SwsOp, *op, *next); + goto retry; + } + 
break; + } + + case SWS_OP_CONVERT: + /* No-op conversion */ + if (op->type == op->convert.to) { + ff_sws_op_list_remove_at(ops, n, 1); + goto retry; + } + + /* Transitive conversion */ + if (next->op == SWS_OP_CONVERT && + op->convert.expand == next->convert.expand) + { + av_assert1(op->convert.to == next->type); + op->convert.to = next->convert.to; + ff_sws_op_list_remove_at(ops, n + 1, 1); + goto retry; + } + + /* Conversion followed by integer expansion */ + if (next->op == SWS_OP_SCALE && + !av_cmp_q(next->c.q, expand_factor(op->type, op->convert.to))) + { + op->convert.expand = true; + ff_sws_op_list_remove_at(ops, n + 1, 1); + goto retry; + } + break; + + case SWS_OP_MIN: + for (int i = 0; i < 4; i++) { + if (next->comps.unused[i] || !op->c.q4[i].den) + continue; + if (av_cmp_q(op->c.q4[i], prev->comps.max[i]) < 0) + noop = false; + } + + if (noop) { + ff_sws_op_list_remove_at(ops, n, 1); + goto retry; + } + break; + + case SWS_OP_MAX: + for (int i = 0; i < 4; i++) { + if (next->comps.unused[i] || !op->c.q4[i].den) + continue; + if (av_cmp_q(prev->comps.min[i], op->c.q4[i]) < 0) + noop = false; + } + + if (noop) { + ff_sws_op_list_remove_at(ops, n, 1); + goto retry; + } + break; + + case SWS_OP_DITHER: + for (int i = 0; i < 4; i++) { + noop &= (prev->comps.flags[i] & SWS_COMP_EXACT) || + next->comps.unused[i]; + } + + if (noop) { + ff_sws_op_list_remove_at(ops, n, 1); + goto retry; + } + break; + + case SWS_OP_LINEAR: { + SwsSwizzleOp swizzle; + SwsConst c; + + /* No-op (identity) linear operation */ + if (!op->lin.mask) { + ff_sws_op_list_remove_at(ops, n, 1); + goto retry; + } + + if (next->op == SWS_OP_LINEAR) { + /* 5x5 matrix multiplication after appending [ 0 0 0 0 1 ] */ + const SwsLinearOp m1 = op->lin; + const SwsLinearOp m2 = next->lin; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 5; j++) { + AVRational sum = Q(0); + for (int k = 0; k < 4; k++) + sum = av_add_q(sum, av_mul_q(m2.m[i][k], m1.m[k][j])); + if (j == 4) /* m1.m[4][j] == 1 */ + 
sum = av_add_q(sum, m2.m[i][4]); + op->lin.m[i][j] = sum; + } + } + op->lin.mask = ff_sws_linear_mask(op->lin); + ff_sws_op_list_remove_at(ops, n + 1, 1); + goto retry; + } + + /* Optimize away zero columns */ + for (int j = 0; j < 4; j++) { + const uint32_t col = SWS_MASK_COL(j); + if (!(prev->comps.flags[j] & SWS_COMP_ZERO) || !(op->lin.mask & col)) + continue; + for (int i = 0; i < 4; i++) + op->lin.m[i][j] = Q(i == j); + op->lin.mask &= ~col; + goto retry; + } + + /* Optimize away unused rows */ + for (int i = 0; i < 4; i++) { + const uint32_t row = SWS_MASK_ROW(i); + if (!next->comps.unused[i] || !(op->lin.mask & row)) + continue; + for (int j = 0; j < 5; j++) + op->lin.m[i][j] = Q(i == j); + op->lin.mask &= ~row; + goto retry; + } + + /* Convert constant rows to explicit clear instruction */ + if (extract_constant_rows(&op->lin, prev->comps, &c)) { + RET(ff_sws_op_list_insert_at(ops, n + 1, &(SwsOp) { + .op = SWS_OP_CLEAR, + .type = op->type, + .comps = op->comps, + .c = c, + })); + goto retry; + } + + /* Multiplication by scalar constant */ + if (extract_scalar(&op->lin, prev->comps, next->comps, &c)) { + op->op = SWS_OP_SCALE; + op->c = c; + goto retry; + } + + /* Swizzle by fixed pattern */ + if (extract_swizzle(&op->lin, prev->comps, &swizzle)) { + RET(ff_sws_op_list_insert_at(ops, n, &(SwsOp) { + .op = SWS_OP_SWIZZLE, + .type = op->type, + .swizzle = swizzle, + })); + goto retry; + } + break; + } + + case SWS_OP_SCALE: { + const int factor2 = exact_log2_q(op->c.q); + + /* No-op scaling */ + if (op->c.q.num == 1 && op->c.q.den == 1) { + ff_sws_op_list_remove_at(ops, n, 1); + goto retry; + } + + /* Scaling by integer before conversion to int */ + if (op->c.q.den == 1 && + next->op == SWS_OP_CONVERT && + ff_sws_pixel_type_is_int(next->convert.to)) + { + op->type = next->convert.to; + FFSWAP(SwsOp, *op, *next); + goto retry; + } + + /* Scaling by exact power of two */ + if (factor2 && ff_sws_pixel_type_is_int(op->type)) { + op->op = factor2 > 0 ? 
SWS_OP_LSHIFT : SWS_OP_RSHIFT; + op->c.u = FFABS(factor2); + goto retry; + } + break; + } + } + + /* No optimization triggered, move on to next operation */ + n++; + } + + return 0; +} -- 2.49.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".