This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 36004d681f9aa9dcd32af8abef0dec589e3717c9 Author: Niklas Haas <[email protected]> AuthorDate: Tue Jun 9 01:48:24 2026 +0200 Commit: Niklas Haas <[email protected]> CommitDate: Tue Jun 9 18:27:20 2026 +0200 swscale/uops: add SWS_UOP_MOVE for optimal register-register swizzles This decomposes a swizzle mask into a series of optimal register-register moves, using at most two temporary scratch registers. This is a better match for ASM-style backends than the existing PERMUTE/COPY uops that are designed for the needs of the C backend (or other backends which either apply the swizzle mask directly or permute pointers). I originally had logic equivalent to this written in NASM macros, but it was just such a complicated mess that I think it's better to rewrite it in C and have the resulting metadata be an explicit part of the uop definition. This commit only adds the uop, I'll update the x86 implementation in the next step. Co-authored-by: Ramiro Polla <[email protected]> Signed-off-by: Niklas Haas <[email protected]> --- libswscale/uops.c | 118 ++++++++++++++++++++++++++++- libswscale/uops.h | 13 ++++ libswscale/uops_macros.h | 188 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 316 insertions(+), 3 deletions(-) diff --git a/libswscale/uops.c b/libswscale/uops.c index 66cefcd55f..7af8a8af51 100644 --- a/libswscale/uops.c +++ b/libswscale/uops.c @@ -60,6 +60,7 @@ static const struct { UOP_NAME(WRITE_BIT, "write_bit"), UOP_NAME(PERMUTE, "permute"), UOP_NAME(COPY, "copy"), + UOP_NAME(MOVE, "move"), UOP_NAME(SWAP_BYTES, "swap_bytes"), UOP_NAME(EXPAND_BIT, "expand_bit"), UOP_NAME(EXPAND_PAIR, "expand_pair"), @@ -161,6 +162,14 @@ void ff_sws_uop_name(const SwsUOp *op, char buf[SWS_UOP_NAME_MAX]) av_bprint_chars(&bp, "xyzw"[par->swizzle.in[i]], 1); } break; + case SWS_UOP_MOVE: + av_bprint_chars(&bp, '_', 1); + for (int i = 0; i < par->move.num_moves; i++) + av_bprint_chars(&bp, "txyzw"[par->move.dst[i] + 1], 1); + av_bprint_chars(&bp, '_', 1); + for (int 
i = 0; i < par->move.num_moves; i++) + av_bprint_chars(&bp, "txyzw"[par->move.src[i] + 1], 1); /* the +1 offset maps register index -1 (scratch) to the letter t and 0..3 to x, y, z, w */ + break; case SWS_UOP_PACK: case SWS_UOP_UNPACK: av_bprint_chars(&bp, '_', 1); @@ -239,6 +248,15 @@ static int generate_entry_struct(void *opaque, void *key) par->swizzle.in[0], par->swizzle.in[1], par->swizzle.in[2], par->swizzle.in[3]); break; + case SWS_UOP_MOVE: /* prints num_moves and all six dst/src slots as designated initializers, regardless of whether a slot index is below num_moves */ + av_bprintf(bp, ", .par.move.num_moves = %d", par->move.num_moves); + av_bprintf(bp, ", .par.move.dst = {%d, %d, %d, %d, %d, %d}", + par->move.dst[0], par->move.dst[1], par->move.dst[2], + par->move.dst[3], par->move.dst[4], par->move.dst[5]); + av_bprintf(bp, ", .par.move.src = {%d, %d, %d, %d, %d, %d}", + par->move.src[0], par->move.src[1], par->move.src[2], + par->move.src[3], par->move.src[4], par->move.src[5]); + break; case SWS_UOP_PACK: case SWS_UOP_UNPACK: av_bprintf(bp, ", .par.pack.pattern = {%d, %d, %d, %d}", @@ -295,6 +313,15 @@ static int generate_entry_args(void *opaque, void *key) par->swizzle.in[0], par->swizzle.in[1], par->swizzle.in[2], par->swizzle.in[3]); break; + case SWS_UOP_MOVE: /* same values as a flat comma-separated list: num_moves, then six dst values, then six src values */ + av_bprintf(bp, ", %d", par->move.num_moves); + av_bprintf(bp, ", %d, %d, %d, %d, %d, %d", + par->move.dst[0], par->move.dst[1], par->move.dst[2], + par->move.dst[3], par->move.dst[4], par->move.dst[5]); + av_bprintf(bp, ", %d, %d, %d, %d, %d, %d", + par->move.src[0], par->move.src[1], par->move.src[2], + par->move.src[3], par->move.src[4], par->move.src[5]); + break; case SWS_UOP_PACK: case SWS_UOP_UNPACK: av_bprintf(bp, ", %d, %d, %d, %d", @@ -480,8 +507,93 @@ static int translate_rw_op(SwsContext *ctx, SwsUOpList *ops, SwsUOpFlags flags, return ff_sws_uop_list_append(ops, &uop); } -static int translate_swizzle(SwsUOpList *ops, const SwsOp *op) /* count_idx: returns how many of the first size entries of arr are equal to val. */ +static int count_idx(const int *arr, size_t size, int val) +{ + int num = 0; + for (size_t i = 0; i < size; i++) { + if (arr[i] == val) + num++; + } + + return num; +} + /* translate_move: expresses the swizzle in op as one SWS_UOP_MOVE uop and appends it to ops. The uop carries an ordered list of moves dst[i] <- src[i] over registers 0..3 plus one scratch register, which is stored as index -1. Returns whatever ff_sws_uop_list_append() returns. */ +static int translate_move(SwsUOpList *ops, const SwsOp *op) { + SwsUOp uop = { + .uop = 
SWS_UOP_MOVE, + .type = pixel_type_to_int(op->type), + }; /* fields not named in this initializer, including the whole move list, start out zeroed */ + SwsMoveUOp *par = &uop.par.move; + + /* Mask of components that are not yet satisfied */ + SwsCompMask todo = ff_sws_comp_mask_needed(op); + for (int i = 0; i < 4; i++) { + if (op->swizzle.in[i] == i) /* identity mapping: nothing to move for this component */ + todo &= ~SWS_COMP(i); + } + + /* Mask of components whose value is required for the final output */ /* note: this mask is indexed by source component (swizzle.in[i]), not by output slot */ + SwsCompMask needed = 0; + for (int i = 0; i < 4; i++) { + if (SWS_OP_NEEDED(op, i)) + needed |= SWS_COMP(op->swizzle.in[i]); + } + + /* Current mapping of registers to components */ /* idx[r] is the original component currently held by register r; slot 4 stands for the scratch register and starts at -1, i.e. it holds no component yet */ + int idx[4 + 1] = { 0, 1, 2, 3, -1 }; /* +1 for tmp */ + + /* Decompose the swizzle mask into a series of register-register moves */ /* each pass of this loop appends exactly one move to par */ + while (todo) { + int dst = -1, src = -1; + + /* Find next unsatisfied dst <- src move that doesn't clobber a value */ + for (dst = 0; dst < 4; dst++) { + if (!SWS_COMP_TEST(todo, dst)) + continue; /* already satisfied */ + const int cur = idx[dst]; + if (count_idx(idx, FF_ARRAY_ELEMS(idx), cur) == 1 && SWS_COMP_TEST(needed, cur)) + continue; /* clobbers last remaining, still-needed value */ + for (src = 0; src < FF_ARRAY_ELEMS(idx); src++) { /* lowest-numbered register (scratch slot last) that currently holds the wanted component */ + if (idx[src] == op->swizzle.in[dst]) { + /* Prevent read-after-write dependency. */ /* i.e. if the value would be read from the register written by the previous move, read from the source of that move instead. NOTE(review): par->src[] stores the scratch register as -1 rather than 4; if the forwarded source were ever the scratch register, src would become -1 and the idx[src] lookup at the bottom of the loop would read out of bounds -- confirm this state cannot occur. */ + if (par->num_moves > 0 && src == par->dst[par->num_moves - 1]) + src = par->src[par->num_moves - 1]; + break; + } + } + av_assert1(src < FF_ARRAY_ELEMS(idx)); + todo &= ~SWS_COMP(dst); + break; + } + + if (dst == 4) { /* the scan above ran to completion without finding a move that is safe to emit */ + /* Stuck in a cycle, break it by saving to the scratch register */ + dst = 4; /* no-op, dst is already 4 here; index 4 is the scratch slot of idx[] */ + for (src = 0; src < 4; src++) { /* pick the first component that is still in todo */ + if (SWS_COMP_TEST(todo, src)) { + needed &= ~SWS_COMP(idx[src]); /* the value being saved stops counting as still-needed, so the clobber test above no longer protects its register */ + break; + } + } + av_assert1(src < 4); + } + + av_assert0(par->num_moves < SWS_UOP_MOVE_MAX); /* checked before writing into the dst[]/src[] arrays, which have SWS_UOP_MOVE_MAX entries */ + par->dst[par->num_moves] = dst > 3 ? -1 : dst; /* the scratch slot (index 4) is stored as -1 */ + par->src[par->num_moves] = src > 3 ? 
-1 : src; + par->num_moves++; + idx[dst] = idx[src]; /* bookkeeping: dst now holds whatever component src held */ + } + + return ff_sws_uop_list_append(ops, &uop); +} + +static int translate_swizzle(SwsUOpList *ops, SwsUOpFlags flags, const SwsOp *op) +{ /* when the caller passes SWS_UOP_FLAG_MOVE the swizzle is emitted as a move list; otherwise the pre-existing code path below is used */ + if (flags & SWS_UOP_FLAG_MOVE) + return translate_move(ops, op); + SwsUOp uop = { .type = pixel_type_to_int(op->type), .uop = SWS_UOP_PERMUTE, @@ -645,7 +757,7 @@ static int translate_op(SwsContext *ctx, SwsUOpList *uops, SwsUOpFlags flags, case SWS_OP_WRITE: return translate_rw_op(ctx, uops, flags, op); case SWS_OP_SWIZZLE: - return translate_swizzle(uops, op); + return translate_swizzle(uops, flags, op); case SWS_OP_DITHER: return translate_dither_op(uops, op); case SWS_OP_LINEAR: @@ -793,7 +905,7 @@ fail: static const SwsUOpFlags uop_flags[] = { 0, - SWS_UOP_FLAG_FMA, /* x86 backend */ + SWS_UOP_FLAG_FMA | SWS_UOP_FLAG_MOVE, /* x86 backend */ }; static int register_uops(SwsContext *ctx, const SwsOpList *ops, diff --git a/libswscale/uops.h b/libswscale/uops.h index d69c35053d..b2e9af30a4 100644 --- a/libswscale/uops.h +++ b/libswscale/uops.h @@ -82,6 +82,7 @@ typedef uint32_t SwsUOpFlags; typedef enum SwsUOpFlagBits { SWS_UOP_FLAG_NONE = 0, SWS_UOP_FLAG_FMA = (1 << 0), /* platform supports FMA ops */ + SWS_UOP_FLAG_MOVE = (1 << 1), /* platform supports SWS_UOP_MOVE */ } SwsUOpFlagBits; typedef enum SwsUOpType { @@ -104,6 +105,7 @@ typedef enum SwsUOpType { /* Data rearrangement uops; mask = non-trivial and needed components */ SWS_UOP_PERMUTE, /* rearrange components (no duplicates) */ SWS_UOP_COPY, /* copy/duplicate components */ + SWS_UOP_MOVE, /* series of register-register assignments */ /* NOTE(review): translate_move() in uops.c never assigns .mask for this uop, so it appears to stay 0, unlike what the group comment above describes -- confirm this is intended */ /* Data conversion / manipulation uops; mask = affected components */ SWS_UOP_SWAP_BYTES, /* swap byte order in components */ @@ -147,6 +149,16 @@ typedef struct SwsSwizzleUOp { uint8_t in[4]; /* input component for each output component */ } SwsSwizzleUOp; +typedef struct SwsMoveUOp { + /* The worst case number of moves (for two independent cycles) */ + #define SWS_UOP_MOVE_MAX 6 + 
int num_moves; + + /* This may involve a temporary register (index -1) */ + int8_t dst[SWS_UOP_MOVE_MAX]; /* destination register index */ + int8_t src[SWS_UOP_MOVE_MAX]; /* source register index */ +} SwsMoveUOp; + typedef struct SwsPackUOp { uint8_t pattern[4]; /* bit depth pattern, from MSB to LSB */ } SwsPackUOp; @@ -179,6 +191,7 @@ typedef union SwsUOpParams { SwsFilterUOp filter; /* for SWS_UOP_READ_*_FV/FH */ SwsShiftUOp shift; SwsSwizzleUOp swizzle; + SwsMoveUOp move; SwsPackUOp pack; SwsClearUOp clear; SwsLinearUOp lin; diff --git a/libswscale/uops_macros.h b/libswscale/uops_macros.h index 9ab1858577..f63d046aa3 100644 --- a/libswscale/uops_macros.h +++ b/libswscale/uops_macros.h @@ -173,6 +173,76 @@ MACRO(__VA_ARGS__, u8_copy_yzw_xxx , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_COPY , .mask = 0xe, .par.swizzle.in = {0, 0, 0, 0}) \ MACRO(__VA_ARGS__, u8_copy_yzw_xxy , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_COPY , .mask = 0xe, .par.swizzle.in = {0, 0, 0, 1}) \ MACRO(__VA_ARGS__, u8_copy_xyzw_yxxx , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_COPY , .mask = 0xf, .par.swizzle.in = {1, 0, 0, 0}) +#define SWS_FOR_U8_MOVE(MACRO, ...) 
\ + MACRO(__VA_ARGS__, u8_move_x_y , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_x_z , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_x_w , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_y_x , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_y_w , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 1, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_z_x , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_w_x , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_xy_yw , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 2, 0, 1, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_xy_zw , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 2, 0, 1, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_yx_xw , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 2, 1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_yz_xx , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_wz_zx , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 2, 3, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_xyz_yzw , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 3, 0, 1, 2, 0, 0, 0, 1, 2, 3, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_xzy_zyw , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 3, 0, 2, 1, 0, 0, 0, 2, 1, 3, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_yzw_xxx , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 3, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_zwy_xyx , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 3, 2, 3, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_wyz_yzx , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 3, 3, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_wzy_zyx , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 3, 3, 
2, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_txy_xyt , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 3, -1, 0, 1, 0, 0, 0, 0, 1, -1, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_txz_xzt , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 3, -1, 0, 2, 0, 0, 0, 0, 2, -1, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_tyz_yzt , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 3, -1, 1, 2, 0, 0, 0, 1, 2, -1, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_tyw_ywt , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 3, -1, 1, 3, 0, 0, 0, 1, 3, -1, 0, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_xtyz_wyzt , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 4, 0, -1, 1, 2, 0, 0, 3, 1, 2, -1, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_zxyw_xyzz , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 4, 2, 0, 1, 3, 0, 0, 0, 1, 2, 2, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_wtyz_xyzt , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 4, 3, -1, 1, 2, 0, 0, 0, 1, 2, -1, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_txyz_xyzt , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 4, -1, 0, 1, 2, 0, 0, 0, 1, 2, -1, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_txzy_xzyt , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 4, -1, 0, 2, 1, 0, 0, 0, 2, 1, -1, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_txzw_xzwt , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 4, -1, 0, 2, 3, 0, 0, 0, 2, 3, -1, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_txwz_xwzt , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 4, -1, 0, 3, 2, 0, 0, 0, 3, 2, -1, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_tyzw_yzwt , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 4, -1, 1, 2, 3, 0, 0, 1, 2, 3, -1, 0, 0) \ + MACRO(__VA_ARGS__, u8_move_txyzw_xyzwt , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 5, -1, 0, 1, 2, 3, 0, 0, 1, 2, 3, -1, 0) \ + MACRO(__VA_ARGS__, u8_move_txwyz_xwyzt , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 5, -1, 0, 3, 1, 2, 0, 0, 3, 1, 2, -1, 0) \ + MACRO(__VA_ARGS__, u8_move_txwzy_xwzyt , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 5, -1, 0, 3, 2, 1, 0, 0, 3, 2, 1, -1, 0) \ + MACRO(__VA_ARGS__, u8_move_txwtyz_xwtyzt , SWS_PIXEL_U8 , SWS_UOP_MOVE , 0x0, 6, -1, 0, 3, -1, 1, 2, 0, 3, -1, 1, 2, -1) +#define SWS_FOR_STRUCT_U8_MOVE(MACRO, ...) 
\ + MACRO(__VA_ARGS__, u8_move_x_y , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {0, 0, 0, 0, 0, 0}, .par.move.src = {1, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_x_z , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {0, 0, 0, 0, 0, 0}, .par.move.src = {2, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_x_w , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {0, 0, 0, 0, 0, 0}, .par.move.src = {3, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_y_x , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {1, 0, 0, 0, 0, 0}, .par.move.src = {0, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_y_w , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {1, 0, 0, 0, 0, 0}, .par.move.src = {3, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_z_x , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {2, 0, 0, 0, 0, 0}, .par.move.src = {0, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_w_x , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {3, 0, 0, 0, 0, 0}, .par.move.src = {0, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_xy_yw , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 2, .par.move.dst = {0, 1, 0, 0, 0, 0}, .par.move.src = {1, 3, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_xy_zw , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 2, .par.move.dst = {0, 1, 0, 0, 0, 0}, .par.move.src = {2, 3, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_yx_xw , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 2, .par.move.dst = {1, 0, 0, 0, 0, 0}, .par.move.src = {0, 3, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_yz_xx , .type = SWS_PIXEL_U8 , .uop = 
SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 2, .par.move.dst = {1, 2, 0, 0, 0, 0}, .par.move.src = {0, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_wz_zx , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 2, .par.move.dst = {3, 2, 0, 0, 0, 0}, .par.move.src = {2, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_xyz_yzw , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {0, 1, 2, 0, 0, 0}, .par.move.src = {1, 2, 3, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_xzy_zyw , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {0, 2, 1, 0, 0, 0}, .par.move.src = {2, 1, 3, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_yzw_xxx , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {1, 2, 3, 0, 0, 0}, .par.move.src = {0, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_zwy_xyx , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {2, 3, 1, 0, 0, 0}, .par.move.src = {0, 1, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_wyz_yzx , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {3, 1, 2, 0, 0, 0}, .par.move.src = {1, 2, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_wzy_zyx , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {3, 2, 1, 0, 0, 0}, .par.move.src = {2, 1, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_txy_xyt , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {-1, 0, 1, 0, 0, 0}, .par.move.src = {0, 1, -1, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_txz_xzt , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {-1, 0, 2, 0, 0, 0}, .par.move.src = {0, 2, -1, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_tyz_yzt , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, 
.par.move.num_moves = 3, .par.move.dst = {-1, 1, 2, 0, 0, 0}, .par.move.src = {1, 2, -1, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_tyw_ywt , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {-1, 1, 3, 0, 0, 0}, .par.move.src = {1, 3, -1, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_xtyz_wyzt , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {0, -1, 1, 2, 0, 0}, .par.move.src = {3, 1, 2, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_zxyw_xyzz , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {2, 0, 1, 3, 0, 0}, .par.move.src = {0, 1, 2, 2, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_wtyz_xyzt , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {3, -1, 1, 2, 0, 0}, .par.move.src = {0, 1, 2, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_txyz_xyzt , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {-1, 0, 1, 2, 0, 0}, .par.move.src = {0, 1, 2, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_txzy_xzyt , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {-1, 0, 2, 1, 0, 0}, .par.move.src = {0, 2, 1, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_txzw_xzwt , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {-1, 0, 2, 3, 0, 0}, .par.move.src = {0, 2, 3, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_txwz_xwzt , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {-1, 0, 3, 2, 0, 0}, .par.move.src = {0, 3, 2, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_tyzw_yzwt , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {-1, 1, 2, 3, 0, 0}, .par.move.src = {1, 2, 3, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u8_move_txyzw_xyzwt , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, 
.par.move.num_moves = 5, .par.move.dst = {-1, 0, 1, 2, 3, 0}, .par.move.src = {0, 1, 2, 3, -1, 0}) \ + MACRO(__VA_ARGS__, u8_move_txwyz_xwyzt , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 5, .par.move.dst = {-1, 0, 3, 1, 2, 0}, .par.move.src = {0, 3, 1, 2, -1, 0}) \ + MACRO(__VA_ARGS__, u8_move_txwzy_xwzyt , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 5, .par.move.dst = {-1, 0, 3, 2, 1, 0}, .par.move.src = {0, 3, 2, 1, -1, 0}) \ + MACRO(__VA_ARGS__, u8_move_txwtyz_xwtyzt , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 6, .par.move.dst = {-1, 0, 3, -1, 1, 2}, .par.move.src = {0, 3, -1, 1, 2, -1}) #define SWS_FOR_U8_SWAP_BYTES(MACRO, ...) #define SWS_FOR_STRUCT_U8_SWAP_BYTES(MACRO, ...) #define SWS_FOR_U8_EXPAND_BIT(MACRO, ...) \ @@ -421,6 +491,56 @@ #define SWS_FOR_STRUCT_U16_COPY(MACRO, ...) \ MACRO(__VA_ARGS__, u16_copy_yz_xx , .type = SWS_PIXEL_U16, .uop = SWS_UOP_COPY , .mask = 0x6, .par.swizzle.in = {0, 0, 0, 3}) \ MACRO(__VA_ARGS__, u16_copy_yzw_xxy , .type = SWS_PIXEL_U16, .uop = SWS_UOP_COPY , .mask = 0xe, .par.swizzle.in = {0, 0, 0, 1}) +#define SWS_FOR_U16_MOVE(MACRO, ...) 
\ + MACRO(__VA_ARGS__, u16_move_x_y , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_x_z , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_x_w , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_y_x , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_y_w , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 1, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_w_x , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_xz_zw , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 2, 0, 2, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_yx_xw , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 2, 1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_yz_xx , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_wz_zx , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 2, 3, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_xyz_yzw , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 3, 0, 1, 2, 0, 0, 0, 1, 2, 3, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_xzy_zyw , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 3, 0, 2, 1, 0, 0, 0, 2, 1, 3, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_zwy_xyx , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 3, 2, 3, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_wzy_zyx , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 3, 3, 2, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_txy_xyt , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 3, -1, 0, 1, 0, 0, 0, 0, 1, -1, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_txz_xzt , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 3, -1, 0, 2, 0, 0, 0, 0, 2, -1, 0, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_xtyz_wyzt , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 4, 0, -1, 1, 2, 0, 0, 3, 1, 2, -1, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_wtyz_xyzt , 
SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 4, 3, -1, 1, 2, 0, 0, 0, 1, 2, -1, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_txyz_xyzt , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 4, -1, 0, 1, 2, 0, 0, 0, 1, 2, -1, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_txzy_xzyt , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 4, -1, 0, 2, 1, 0, 0, 0, 2, 1, -1, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_tyzw_yzwt , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 4, -1, 1, 2, 3, 0, 0, 1, 2, 3, -1, 0, 0) \ + MACRO(__VA_ARGS__, u16_move_txyzw_xyzwt , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 5, -1, 0, 1, 2, 3, 0, 0, 1, 2, 3, -1, 0) \ + MACRO(__VA_ARGS__, u16_move_txwzy_xwzyt , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 5, -1, 0, 3, 2, 1, 0, 0, 3, 2, 1, -1, 0) \ + MACRO(__VA_ARGS__, u16_move_txwtyz_xwtyzt , SWS_PIXEL_U16, SWS_UOP_MOVE , 0x0, 6, -1, 0, 3, -1, 1, 2, 0, 3, -1, 1, 2, -1) +#define SWS_FOR_STRUCT_U16_MOVE(MACRO, ...) \ + MACRO(__VA_ARGS__, u16_move_x_y , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {0, 0, 0, 0, 0, 0}, .par.move.src = {1, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_x_z , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {0, 0, 0, 0, 0, 0}, .par.move.src = {2, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_x_w , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {0, 0, 0, 0, 0, 0}, .par.move.src = {3, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_y_x , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {1, 0, 0, 0, 0, 0}, .par.move.src = {0, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_y_w , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {1, 0, 0, 0, 0, 0}, .par.move.src = {3, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_w_x , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {3, 0, 0, 0, 0, 0}, .par.move.src = {0, 0, 0, 
0, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_xz_zw , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 2, .par.move.dst = {0, 2, 0, 0, 0, 0}, .par.move.src = {2, 3, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_yx_xw , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 2, .par.move.dst = {1, 0, 0, 0, 0, 0}, .par.move.src = {0, 3, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_yz_xx , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 2, .par.move.dst = {1, 2, 0, 0, 0, 0}, .par.move.src = {0, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_wz_zx , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 2, .par.move.dst = {3, 2, 0, 0, 0, 0}, .par.move.src = {2, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_xyz_yzw , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {0, 1, 2, 0, 0, 0}, .par.move.src = {1, 2, 3, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_xzy_zyw , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {0, 2, 1, 0, 0, 0}, .par.move.src = {2, 1, 3, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_zwy_xyx , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {2, 3, 1, 0, 0, 0}, .par.move.src = {0, 1, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_wzy_zyx , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {3, 2, 1, 0, 0, 0}, .par.move.src = {2, 1, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_txy_xyt , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {-1, 0, 1, 0, 0, 0}, .par.move.src = {0, 1, -1, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_txz_xzt , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {-1, 0, 2, 0, 0, 0}, .par.move.src = {0, 2, -1, 0, 0, 0}) \ + MACRO(__VA_ARGS__, 
u16_move_xtyz_wyzt , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {0, -1, 1, 2, 0, 0}, .par.move.src = {3, 1, 2, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_wtyz_xyzt , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {3, -1, 1, 2, 0, 0}, .par.move.src = {0, 1, 2, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_txyz_xyzt , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {-1, 0, 1, 2, 0, 0}, .par.move.src = {0, 1, 2, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_txzy_xzyt , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {-1, 0, 2, 1, 0, 0}, .par.move.src = {0, 2, 1, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_tyzw_yzwt , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {-1, 1, 2, 3, 0, 0}, .par.move.src = {1, 2, 3, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u16_move_txyzw_xyzwt , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 5, .par.move.dst = {-1, 0, 1, 2, 3, 0}, .par.move.src = {0, 1, 2, 3, -1, 0}) \ + MACRO(__VA_ARGS__, u16_move_txwzy_xwzyt , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 5, .par.move.dst = {-1, 0, 3, 2, 1, 0}, .par.move.src = {0, 3, 2, 1, -1, 0}) \ + MACRO(__VA_ARGS__, u16_move_txwtyz_xwtyzt , .type = SWS_PIXEL_U16, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 6, .par.move.dst = {-1, 0, 3, -1, 1, 2}, .par.move.src = {0, 3, -1, 1, 2, -1}) #define SWS_FOR_U16_SWAP_BYTES(MACRO, ...) 
\ MACRO(__VA_ARGS__, u16_swap_bytes_x , SWS_PIXEL_U16, SWS_UOP_SWAP_BYTES , 0x1) \ MACRO(__VA_ARGS__, u16_swap_bytes_y , SWS_PIXEL_U16, SWS_UOP_SWAP_BYTES , 0x2) \ @@ -691,6 +811,72 @@ MACRO(__VA_ARGS__, u32_copy_yz_xx , .type = SWS_PIXEL_U32, .uop = SWS_UOP_COPY , .mask = 0x6, .par.swizzle.in = {0, 0, 0, 3}) \ MACRO(__VA_ARGS__, u32_copy_yzw_xxx , .type = SWS_PIXEL_U32, .uop = SWS_UOP_COPY , .mask = 0xe, .par.swizzle.in = {0, 0, 0, 0}) \ MACRO(__VA_ARGS__, u32_copy_yzw_xxy , .type = SWS_PIXEL_U32, .uop = SWS_UOP_COPY , .mask = 0xe, .par.swizzle.in = {0, 0, 0, 1}) +#define SWS_FOR_U32_MOVE(MACRO, ...) \ + MACRO(__VA_ARGS__, u32_move_x_y , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_x_z , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_x_w , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_y_x , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_y_w , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 1, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_z_x , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_w_x , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_w_y , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 1, 3, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_xz_zw , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 2, 0, 2, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_yz_xx , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_zx_xw , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 2, 2, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_wx_xy , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 2, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_wy_yx 
, SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 2, 3, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_wz_zx , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 2, 3, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_xyz_yzw , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 3, 0, 1, 2, 0, 0, 0, 1, 2, 3, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_xzy_zyw , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 3, 0, 2, 1, 0, 0, 0, 2, 1, 3, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_yzw_xxx , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 3, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_zwy_xyx , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 3, 2, 3, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_wyz_yzx , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 3, 3, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_wzy_zyx , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 3, 3, 2, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_txy_xyt , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 3, -1, 0, 1, 0, 0, 0, 0, 1, -1, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_txz_xzt , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 3, -1, 0, 2, 0, 0, 0, 0, 2, -1, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_tyz_yzt , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 3, -1, 1, 2, 0, 0, 0, 1, 2, -1, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_tyw_ywt , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 3, -1, 1, 3, 0, 0, 0, 1, 3, -1, 0, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_xtyz_wyzt , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 4, 0, -1, 1, 2, 0, 0, 3, 1, 2, -1, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_wtyz_xyzt , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 4, 3, -1, 1, 2, 0, 0, 0, 1, 2, -1, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_txyz_xyzt , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 4, -1, 0, 1, 2, 0, 0, 0, 1, 2, -1, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_txzy_xzyt , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 4, -1, 0, 2, 1, 0, 0, 0, 2, 1, -1, 0, 0) \ + MACRO(__VA_ARGS__, u32_move_txyzw_xyzwt , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 5, -1, 0, 1, 2, 3, 0, 0, 1, 2, 3, -1, 0) \ + MACRO(__VA_ARGS__, 
u32_move_txwyz_xwyzt , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 5, -1, 0, 3, 1, 2, 0, 0, 3, 1, 2, -1, 0) \ + MACRO(__VA_ARGS__, u32_move_txwzy_xwzyt , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 5, -1, 0, 3, 2, 1, 0, 0, 3, 2, 1, -1, 0) \ + MACRO(__VA_ARGS__, u32_move_txwtyz_xwtyzt , SWS_PIXEL_U32, SWS_UOP_MOVE , 0x0, 6, -1, 0, 3, -1, 1, 2, 0, 3, -1, 1, 2, -1) +#define SWS_FOR_STRUCT_U32_MOVE(MACRO, ...) \ + MACRO(__VA_ARGS__, u32_move_x_y , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {0, 0, 0, 0, 0, 0}, .par.move.src = {1, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_x_z , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {0, 0, 0, 0, 0, 0}, .par.move.src = {2, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_x_w , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {0, 0, 0, 0, 0, 0}, .par.move.src = {3, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_y_x , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {1, 0, 0, 0, 0, 0}, .par.move.src = {0, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_y_w , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {1, 0, 0, 0, 0, 0}, .par.move.src = {3, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_z_x , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {2, 0, 0, 0, 0, 0}, .par.move.src = {0, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_w_x , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {3, 0, 0, 0, 0, 0}, .par.move.src = {0, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_w_y , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 1, .par.move.dst = {3, 0, 0, 0, 0, 0}, .par.move.src = {1, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_xz_zw , .type = SWS_PIXEL_U32, 
.uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 2, .par.move.dst = {0, 2, 0, 0, 0, 0}, .par.move.src = {2, 3, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_yz_xx , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 2, .par.move.dst = {1, 2, 0, 0, 0, 0}, .par.move.src = {0, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_zx_xw , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 2, .par.move.dst = {2, 0, 0, 0, 0, 0}, .par.move.src = {0, 3, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_wx_xy , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 2, .par.move.dst = {3, 0, 0, 0, 0, 0}, .par.move.src = {0, 1, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_wy_yx , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 2, .par.move.dst = {3, 1, 0, 0, 0, 0}, .par.move.src = {1, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_wz_zx , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 2, .par.move.dst = {3, 2, 0, 0, 0, 0}, .par.move.src = {2, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_xyz_yzw , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {0, 1, 2, 0, 0, 0}, .par.move.src = {1, 2, 3, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_xzy_zyw , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {0, 2, 1, 0, 0, 0}, .par.move.src = {2, 1, 3, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_yzw_xxx , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {1, 2, 3, 0, 0, 0}, .par.move.src = {0, 0, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_zwy_xyx , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {2, 3, 1, 0, 0, 0}, .par.move.src = {0, 1, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_wyz_yzx , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, 
.par.move.num_moves = 3, .par.move.dst = {3, 1, 2, 0, 0, 0}, .par.move.src = {1, 2, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_wzy_zyx , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {3, 2, 1, 0, 0, 0}, .par.move.src = {2, 1, 0, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_txy_xyt , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {-1, 0, 1, 0, 0, 0}, .par.move.src = {0, 1, -1, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_txz_xzt , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {-1, 0, 2, 0, 0, 0}, .par.move.src = {0, 2, -1, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_tyz_yzt , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {-1, 1, 2, 0, 0, 0}, .par.move.src = {1, 2, -1, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_tyw_ywt , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 3, .par.move.dst = {-1, 1, 3, 0, 0, 0}, .par.move.src = {1, 3, -1, 0, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_xtyz_wyzt , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {0, -1, 1, 2, 0, 0}, .par.move.src = {3, 1, 2, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_wtyz_xyzt , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {3, -1, 1, 2, 0, 0}, .par.move.src = {0, 1, 2, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_txyz_xyzt , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {-1, 0, 1, 2, 0, 0}, .par.move.src = {0, 1, 2, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_txzy_xzyt , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 4, .par.move.dst = {-1, 0, 2, 1, 0, 0}, .par.move.src = {0, 2, 1, -1, 0, 0}) \ + MACRO(__VA_ARGS__, u32_move_txyzw_xyzwt , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, 
.par.move.num_moves = 5, .par.move.dst = {-1, 0, 1, 2, 3, 0}, .par.move.src = {0, 1, 2, 3, -1, 0}) \ + MACRO(__VA_ARGS__, u32_move_txwyz_xwyzt , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 5, .par.move.dst = {-1, 0, 3, 1, 2, 0}, .par.move.src = {0, 3, 1, 2, -1, 0}) \ + MACRO(__VA_ARGS__, u32_move_txwzy_xwzyt , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 5, .par.move.dst = {-1, 0, 3, 2, 1, 0}, .par.move.src = {0, 3, 2, 1, -1, 0}) \ + MACRO(__VA_ARGS__, u32_move_txwtyz_xwtyzt , .type = SWS_PIXEL_U32, .uop = SWS_UOP_MOVE , .mask = 0x0, .par.move.num_moves = 6, .par.move.dst = {-1, 0, 3, -1, 1, 2}, .par.move.src = {0, 3, -1, 1, 2, -1}) #define SWS_FOR_U32_SWAP_BYTES(MACRO, ...) \ MACRO(__VA_ARGS__, u32_swap_bytes_x , SWS_PIXEL_U32, SWS_UOP_SWAP_BYTES , 0x1) \ MACRO(__VA_ARGS__, u32_swap_bytes_xy , SWS_PIXEL_U32, SWS_UOP_SWAP_BYTES , 0x3) \ @@ -837,6 +1023,8 @@ #define SWS_FOR_STRUCT_F32_PERMUTE(MACRO, ...) #define SWS_FOR_F32_COPY(MACRO, ...) #define SWS_FOR_STRUCT_F32_COPY(MACRO, ...) +#define SWS_FOR_F32_MOVE(MACRO, ...) +#define SWS_FOR_STRUCT_F32_MOVE(MACRO, ...) #define SWS_FOR_F32_SWAP_BYTES(MACRO, ...) #define SWS_FOR_STRUCT_F32_SWAP_BYTES(MACRO, ...) #define SWS_FOR_F32_EXPAND_BIT(MACRO, ...) _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
