This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit dbe961b4cdb9e669890424f02082d90d4483faa3 Author: Niklas Haas <[email protected]> AuthorDate: Tue Jun 9 01:29:19 2026 +0200 Commit: Niklas Haas <[email protected]> CommitDate: Tue Jun 9 18:27:20 2026 +0200 swscale/uops: add SWS_UOP_LINEAR_FMA and SWS_UOP_FLAG_FMA This is like SWS_UOP_LINEAR but parametrized by which matrix entries can use FMA instead of bitexact IEEE mul/add instructions. I decided to make these a separate uop to avoid bogging down the reference backend with arch-specific details like FMA. However, I think FMA ops are quite common/universal so I pre-emptively split it into its own separate flag rather than defining something like SWS_UOP_FLAG_X86. Signed-off-by: Niklas Haas <[email protected]> --- libswscale/uops.c | 60 +++++++++++++++++++++++++++-- libswscale/uops.h | 4 ++ libswscale/uops_macros.h | 98 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 159 insertions(+), 3 deletions(-) diff --git a/libswscale/uops.c b/libswscale/uops.c index 369e429020..9d3c7a71f5 100644 --- a/libswscale/uops.c +++ b/libswscale/uops.c @@ -69,6 +69,7 @@ static const struct { UOP_NAME(TO_F32, "to_f32"), UOP_NAME(SCALE, "scale"), UOP_NAME(LINEAR, "linear"), + UOP_NAME(LINEAR_FMA, "linear_fma"), UOP_NAME(ADD, "add"), UOP_NAME(MIN, "min"), UOP_NAME(MAX, "max"), @@ -174,6 +175,7 @@ void ff_sws_uop_name(const SwsUOp *op, char buf[SWS_UOP_NAME_MAX]) } break; case SWS_UOP_LINEAR: + case SWS_UOP_LINEAR_FMA: for (int i = 0; i < 4; i++) { if (!SWS_COMP_TEST(op->mask, i)) continue; @@ -183,6 +185,8 @@ void ff_sws_uop_name(const SwsUOp *op, char buf[SWS_UOP_NAME_MAX]) av_bprint_chars(&bp, '1', 1); else if (par->lin.zero & SWS_MASK(i, j)) av_bprint_chars(&bp, '0', 1); + else if (par->lin.exact & SWS_MASK(i, j)) + av_bprint_chars(&bp, 'X', 1); else av_bprint_chars(&bp, 'x', 1); } @@ -235,8 +239,11 @@ static int generate_entry_struct(void *opaque, void *key) par->clear.one, par->clear.zero); break; case SWS_UOP_LINEAR: + case SWS_UOP_LINEAR_FMA: av_bprintf(bp, ", .par.lin.one = 0x%x, .par.lin.zero = 0x%x", par->lin.one, par->lin.zero); + if (uop->uop == SWS_UOP_LINEAR_FMA) + av_bprintf(bp, ", .par.lin.exact = 0x%x", par->lin.exact); break; case SWS_UOP_DITHER: av_bprintf(bp, ", .par.dither = { .y_offset = {%u, %u, %u, %u}, .size_log2 = %u }", @@ -282,7 +289,10 @@ static int generate_entry_args(void *opaque, void *key) av_bprintf(bp, ", 0x%05x, 0x%05x", par->clear.one, par->clear.zero); break; case SWS_UOP_LINEAR: + case SWS_UOP_LINEAR_FMA: av_bprintf(bp, ", 0x%05x, 0x%05x", par->lin.one, par->lin.zero); + if (uop->uop == SWS_UOP_LINEAR_FMA) + av_bprintf(bp, ", 0x%05x", par->lin.exact); break; case SWS_UOP_DITHER: av_bprintf(bp, ", %u, %u, %u, %u, %u", @@ -364,6 +374,35 @@ static SwsPixelType pixel_type_to_int(const SwsPixelType type) return SWS_PIXEL_NONE; } +static bool exact_product_f32(float a, float b) +{ + volatile float prod = a * b; + volatile float result = b ? prod / b : 0.0f; + return !b || result == a; +} + +static bool exact_prod(SwsPixelType type, SwsPixel coef, + const SwsComps *comps, int idx) +{ + const AVRational minq = comps->min[idx]; + const AVRational maxq = comps->max[idx]; + if (ff_sws_pixel_type_is_int(type)) + return true; + else if (!minq.den || !maxq.den) + return false; /* unknown bounds */ + + const SwsPixel min = pixel_from_q(type, minq); + const SwsPixel max = pixel_from_q(type, maxq); + switch (type) { + case SWS_PIXEL_F32: + return exact_product_f32(coef.f32, min.f32) && + exact_product_f32(coef.f32, max.f32); + } + + av_unreachable("Invalid pixel type!"); + return false; +} + static int translate_rw_op(SwsUOpList *ops, const SwsOp *op) { SwsUOp uop = { @@ -499,26 +538,40 @@ static int translate_dither_op(SwsUOpList *ops, const SwsOp *op) return ff_sws_uop_list_append(ops, &uop); } -static int translate_linear_op(SwsUOpList *ops, const SwsOp *op) +static int translate_linear_op(SwsContext *ctx, SwsUOpList *ops, + SwsUOpFlags flags, const SwsOp *op, + const SwsComps *input) { SwsUOp uop = { .type = op->type, .uop = SWS_UOP_LINEAR, }; + const bool bitexact = ctx->flags & SWS_BITEXACT; + uint32_t exact = 0; + for (int i = 0; i < 4; i++) { if (SWS_OP_NEEDED(op, i) && (op->lin.mask & SWS_MASK_ROW(i))) uop.mask |= SWS_COMP(i); for (int j = 0; j < 5; j++) { const AVRational k = op->lin.m[i][j]; - uop.data.mat4[i][j] = Q2PIXEL(k); + const SwsPixel px = Q2PIXEL(k); + uop.data.mat4[i][j] = px; if (k.num == 0) uop.par.lin.zero |= SWS_MASK(i, j); else if (k.num == k.den) uop.par.lin.one |= SWS_MASK(i, j); + else if (j < 4 && (!bitexact || exact_prod(uop.type, px, input, j))) + exact |= SWS_MASK(i, j); } } + if (flags & SWS_UOP_FLAG_FMA) { + /* multiplication by 1 and 0 are always exact by definition */ + uop.uop = SWS_UOP_LINEAR_FMA; + uop.par.lin.exact = exact | uop.par.lin.zero | uop.par.lin.one; + } + return ff_sws_uop_list_append(ops, &uop); } @@ -555,7 +608,7 @@ static int translate_op(SwsContext *ctx, SwsUOpList *uops, SwsUOpFlags flags, case SWS_OP_DITHER: return translate_dither_op(uops, op); case SWS_OP_LINEAR: - return translate_linear_op(uops, op); + return translate_linear_op(ctx, uops, flags, op, input); default: break; } @@ -699,6 +752,7 @@ fail: static const SwsUOpFlags uop_flags[] = { 0, + SWS_UOP_FLAG_FMA, /* x86 backend */ }; static int register_uops(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out) diff --git a/libswscale/uops.h b/libswscale/uops.h index 01197fd09f..4983552eb7 100644 --- a/libswscale/uops.h +++ b/libswscale/uops.h @@ -127,6 +127,7 @@ typedef enum SwsUOpType { SWS_UOP_RSHIFT, /* mask = components to shift */ SWS_UOP_CLEAR, /* mask = components to clear */ SWS_UOP_LINEAR, /* mask = non-trivial output rows */ + SWS_UOP_LINEAR_FMA, /* with SWS_UOP_FLAG_FMA */ SWS_UOP_DITHER, /* mask = components to dither */ /* Platform-specific uops would go here */ @@ -153,6 +154,9 @@ typedef struct SwsClearUOp { typedef struct SwsLinearUOp { uint32_t one; /* mask of coefficients equal to one */ uint32_t zero; /* mask of coefficients equal to zero */ + + /* for SWS_UOP_LINEAR_FMA only */ + uint32_t exact; /* mask of coefficients whose product is exact */ } SwsLinearUOp; typedef struct SwsDitherUOp { diff --git a/libswscale/uops_macros.h b/libswscale/uops_macros.h index a2fccd08fa..f3774243a4 100644 --- a/libswscale/uops_macros.h +++ b/libswscale/uops_macros.h @@ -281,6 +281,8 @@ MACRO(__VA_ARGS__, u8_clear_yzw_xx1 , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR , .mask = 0xe, .par.clear.one = 0x8, .par.clear.zero = 0x0) #define SWS_FOR_U8_LINEAR(MACRO, ...) #define SWS_FOR_STRUCT_U8_LINEAR(MACRO, ...) +#define SWS_FOR_U8_LINEAR_FMA(MACRO, ...) +#define SWS_FOR_STRUCT_U8_LINEAR_FMA(MACRO, ...) #define SWS_FOR_U8_DITHER(MACRO, ...) #define SWS_FOR_STRUCT_U8_DITHER(MACRO, ...) #define SWS_FOR_U16_READ_PLANAR(MACRO, ...) \ @@ -547,6 +549,8 @@ MACRO(__VA_ARGS__, u16_clear_yzw_xx1 , .type = SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR , .mask = 0xe, .par.clear.one = 0x8, .par.clear.zero = 0x0) #define SWS_FOR_U16_LINEAR(MACRO, ...) #define SWS_FOR_STRUCT_U16_LINEAR(MACRO, ...) +#define SWS_FOR_U16_LINEAR_FMA(MACRO, ...) +#define SWS_FOR_STRUCT_U16_LINEAR_FMA(MACRO, ...) #define SWS_FOR_U16_DITHER(MACRO, ...) #define SWS_FOR_STRUCT_U16_DITHER(MACRO, ...) #define SWS_FOR_U32_READ_PLANAR(MACRO, ...) \ @@ -757,6 +761,8 @@ MACRO(__VA_ARGS__, u32_clear_xzw_xxx , .type = SWS_PIXEL_U32, .uop = SWS_UOP_CLEAR , .mask = 0xd, .par.clear.one = 0x0, .par.clear.zero = 0x0) #define SWS_FOR_U32_LINEAR(MACRO, ...) #define SWS_FOR_STRUCT_U32_LINEAR(MACRO, ...) +#define SWS_FOR_U32_LINEAR_FMA(MACRO, ...) +#define SWS_FOR_STRUCT_U32_LINEAR_FMA(MACRO, ...) #define SWS_FOR_U32_DITHER(MACRO, ...) #define SWS_FOR_STRUCT_U32_DITHER(MACRO, ...) #define SWS_FOR_F32_READ_PLANAR(MACRO, ...) @@ -955,6 +961,98 @@ MACRO(__VA_ARGS__, f32_linear_xyzw_xxx0x_xxx0x_xxx0x_000x0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xba108) \ MACRO(__VA_ARGS__, f32_linear_xyzw_x0x0x_xxx0x_xx00x_000x0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xbb10a) \ MACRO(__VA_ARGS__, f32_linear_xyzw_x0000_0x000_00x00_000x0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xbefbe) +#define SWS_FOR_F32_LINEAR_FMA(MACRO, ...) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_x000x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefae, 0xfffee) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_X000x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefae, 0xfffef) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xxx00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefb8, 0xffff8) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_XXx00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefb8, 0xffffb) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_XxX00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefb8, 0xffffd) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xXX00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefb8, 0xffffe) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_XXX00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefb8, 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xxx01 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41050, 0xbefa8, 0xffff8) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_XXx01 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41050, 0xbefa8, 0xffffb) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xXX01 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41050, 0xbefa8, 0xffffe) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_XXX01 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41050, 0xbefa8, 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_x0001 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41050, 0xbefae, 0xffffe) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_X0001 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41050, 0xbefae, 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_y_0x000 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x2, 0x41001, 0xbefbe, 0xfffbf) \ + MACRO(__VA_ARGS__, f32_linear_fma_y_0X000 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x2, 0x41001, 0xbefbe, 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_xxx0x_xxx0x_xxx0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba108, 0xfa108) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXx0x_XXx0x_XXx0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba108, 0xfad6b) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX0x_XxX0x_XXX0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba108, 0xfbdaf) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX0x_xXX0x_xXX0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba108, 0xfb9ce) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX0x_XXX0x_XXX0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba108, 0xfbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0x0x_xxx0x_xx00x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbb10a, 0xfb10a) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0X0x_XXX0x_XX00x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbb10a, 0xfbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_xxx00_xxx0x_xxx0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba118, 0xfa118) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXx00_XXx0x_XXx0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba118, 0xfad7b) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX00_XxX0x_XXX0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba118, 0xfbdbf) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX00_xXX0x_xXX0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba118, 0xfb9de) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX00_XXX0x_XXX0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba118, 0xfbdff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_x000x_0x00x_00x0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbadae, 0xfadae) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_X000x_0X00x_00X0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbadae, 0xfbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0000_0x000_00x00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbefbe, 0xfefbe) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0000_0X000_00X00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbefbe, 0xffffe) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0000_0X000_00x00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbefbe, 0xfefff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0000_0X000_00X00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbefbe, 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0001_0X00x_00X01 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x44010, 0xbadae, 0xffdff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0001_0X001_00X01 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x44210, 0xbadae, 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_10X0x_1XX0x_1X00x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40421, 0xbb10a, 0xfbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_w_000X0 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x8, 0x01041, 0xbefbe, 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xw_x000x_000x0 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x9, 0x01040, 0xbefae, 0xbffee) \ + MACRO(__VA_ARGS__, f32_linear_fma_xw_X000x_000X0 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x9, 0x01040, 0xbefae, 0xfffef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xw_xxx00_000x0 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x9, 0x01040, 0xbefb8, 0xbfff8) \ + MACRO(__VA_ARGS__, f32_linear_fma_xw_XXX00_000X0 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x9, 0x01040, 0xbefb8, 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyzw_xxx0x_xxx0x_xxx0x_000x0, SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0xf, 0x00000, 0xba108, 0xba108) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyzw_XXX0x_XXX0x_XXX0x_000X0, SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0xf, 0x00000, 0xba108, 0xfbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyzw_X0X0x_XXX0x_XX00x_000X0, SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0xf, 0x00000, 0xbb10a, 0xfbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyzw_X0000_0X000_00X00_000X0, SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0xf, 0x00000, 0xbefbe, 0xfffff) +#define SWS_FOR_STRUCT_F32_LINEAR_FMA(MACRO, ...) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_x000x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefae, .par.lin.exact = 0xfffee) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_X000x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefae, .par.lin.exact = 0xfffef) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xxx00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xffff8) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_XXx00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xffffb) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_XxX00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xffffd) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xXX00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xffffe) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_XXX00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xxx01 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41050, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xffff8) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_XXx01 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41050, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xffffb) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xXX01 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41050, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xffffe) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_XXX01 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41050, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_x0001 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41050, .par.lin.zero = 0xbefae, .par.lin.exact = 0xffffe) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_X0001 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41050, .par.lin.zero = 0xbefae, .par.lin.exact = 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_y_0x000 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x2, .par.lin.one = 0x41001, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfffbf) \ + MACRO(__VA_ARGS__, f32_linear_fma_y_0X000 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x2, .par.lin.one = 0x41001, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_xxx0x_xxx0x_xxx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba108, .par.lin.exact = 0xfa108) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXx0x_XXx0x_XXx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba108, .par.lin.exact = 0xfad6b) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX0x_XxX0x_XXX0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba108, .par.lin.exact = 0xfbdaf) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX0x_xXX0x_xXX0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba108, .par.lin.exact = 0xfb9ce) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX0x_XXX0x_XXX0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba108, .par.lin.exact = 0xfbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0x0x_xxx0x_xx00x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbb10a, .par.lin.exact = 0xfb10a) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0X0x_XXX0x_XX00x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbb10a, .par.lin.exact = 0xfbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_xxx00_xxx0x_xxx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfa118) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXx00_XXx0x_XXx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfad7b) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX00_XxX0x_XXX0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfbdbf) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX00_xXX0x_xXX0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfb9de) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX00_XXX0x_XXX0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfbdff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_x000x_0x00x_00x0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbadae, .par.lin.exact = 0xfadae) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_X000x_0X00x_00X0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbadae, .par.lin.exact = 0xfbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0000_0x000_00x00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfefbe) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0000_0X000_00X00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xffffe) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0000_0X000_00x00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfefff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0000_0X000_00X00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0001_0X00x_00X01 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x44010, .par.lin.zero = 0xbadae, .par.lin.exact = 0xffdff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0001_0X001_00X01 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x44210, .par.lin.zero = 0xbadae, .par.lin.exact = 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_10X0x_1XX0x_1X00x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40421, .par.lin.zero = 0xbb10a, .par.lin.exact = 0xfbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_w_000X0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x8, .par.lin.one = 0x1041, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xw_x000x_000x0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x9, .par.lin.one = 0x1040, .par.lin.zero = 0xbefae, .par.lin.exact = 0xbffee) \ + MACRO(__VA_ARGS__, f32_linear_fma_xw_X000x_000X0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x9, .par.lin.one = 0x1040, .par.lin.zero = 0xbefae, .par.lin.exact = 0xfffef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xw_xxx00_000x0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x9, .par.lin.one = 0x1040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xbfff8) \ + MACRO(__VA_ARGS__, f32_linear_fma_xw_XXX00_000X0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x9, .par.lin.one = 0x1040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyzw_xxx0x_xxx0x_xxx0x_000x0, .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xba108, .par.lin.exact = 0xba108) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyzw_XXX0x_XXX0x_XXX0x_000X0, .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xba108, .par.lin.exact = 0xfbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyzw_X0X0x_XXX0x_XX00x_000X0, .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xbb10a, .par.lin.exact = 0xfbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyzw_X0000_0X000_00X00_000X0, .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfffff) #define SWS_FOR_F32_DITHER(MACRO, ...) \ MACRO(__VA_ARGS__, f32_dither_x_0_16x16 , SWS_PIXEL_F32, SWS_UOP_DITHER , 0x1, 0, 0, 0, 0, 4) \ MACRO(__VA_ARGS__, f32_dither_y_3_16x16 , SWS_PIXEL_F32, SWS_UOP_DITHER , 0x2, 0, 3, 0, 0, 4) \ _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
