The AVX512 vpblendm* instructions implement cmpsel exactly, taking a predicate (mask register) as the selector input. This pairs naturally with the AVX512 comparison instructions, which produce their result as a predicate.
Signed-off-by: Richard Henderson <richard.hender...@linaro.org> --- tcg/i386/tcg-target.c.inc | 46 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc index 2a3ae28e85..8c363b7bfc 100644 --- a/tcg/i386/tcg-target.c.inc +++ b/tcg/i386/tcg-target.c.inc @@ -413,6 +413,10 @@ static bool tcg_target_const_match(int64_t val, int ct, #define OPC_UD2 (0x0b | P_EXT) #define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16) #define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16) +#define OPC_VPBLENDMB (0x66 | P_EXT38 | P_DATA16 | P_EVEX) +#define OPC_VPBLENDMW (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) +#define OPC_VPBLENDMD (0x64 | P_EXT38 | P_DATA16 | P_EVEX) +#define OPC_VPBLENDMQ (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) #define OPC_VPCMPB (0x3f | P_EXT3A | P_DATA16 | P_EVEX) #define OPC_VPCMPUB (0x3e | P_EXT3A | P_DATA16 | P_EVEX) #define OPC_VPCMPW (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) @@ -738,6 +742,16 @@ static void tcg_out_vex_modrm_type(TCGContext *s, int opc, tcg_out_vex_modrm(s, opc, r, v, rm); } +static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v, + int rm, int aaa, bool z, TCGType type) +{ + if (type == TCG_TYPE_V256) { + opc |= P_VEXL; + } + tcg_out_evex_opc(s, opc, r, v, rm, 0, aaa, z); + tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); +} + /* Output an opcode with a full "rm + (index<<shift) + offset" address mode. We handle either RM and INDEX missing with a negative value. 
In 64-bit mode for absolute addresses, ~RM is the size of the immediate operand @@ -3183,6 +3197,33 @@ static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece, } } +static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece, + TCGReg v0, TCGReg c1, TCGReg c2, + TCGReg v3, TCGReg v4, TCGCond cond) +{ + static const int vpblendm_insn[] = { + OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ + }; + bool z = false; + + /* + * We have already eliminated !V3 && !V4. + * Swap to place constant in V4 to take advantage of zero-masking. + */ + if (!v3) { + z = true; + v3 = v4; + cond = tcg_invert_cond(cond); + } else if (!v4) { + z = true; + v4 = v3; + } + + tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond); + tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3, + /* k1 */1, z, type); +} + static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg v0, TCGReg c1, TCGReg c2, TCGReg v3, TCGReg v4, TCGCond cond) @@ -3196,6 +3237,11 @@ static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece, return; } + if (vece <= MO_16 ? have_avx512bw : have_avx512vl) { + tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond); + return; + } + if (tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond)) { TCGReg swap = v3; v3 = v4; -- 2.43.0