On Mon, May 13, 2024 at 5:57 AM Roger Sayle <ro...@nextmovesoftware.com> wrote: > > > This patch improves the way that the x86 backend recognizes and > expands AVX512's bitwise ternary logic (vpternlog) instructions. I like the patch.
1 file changed, 25 insertions(+), 1 deletion(-) gcc/config/i386/i386-expand.cc | 26 +++++++++++++++++++++++++- modified gcc/config/i386/i386-expand.cc @@ -25601,6 +25601,7 @@ ix86_gen_bcst_mem (machine_mode mode, rtx x) int ix86_ternlog_idx (rtx op, rtx *args) { + /* Nice dynamic programming:) */ int idx0, idx1; if (!op) @@ -25651,6 +25652,7 @@ ix86_ternlog_idx (rtx op, rtx *args) return 0xaa; } /* Maximum of one volatile memory reference per expression. */ + /* According to comments, it should be && ? */ if (side_effects_p (op) || side_effects_p (args[2])) return -1; if (rtx_equal_p (op, args[2])) @@ -25666,6 +25668,8 @@ ix86_ternlog_idx (rtx op, rtx *args) case SUBREG: if (!VECTOR_MODE_P (GET_MODE (SUBREG_REG (op))) + /* It could be TI/OI/XImode since it's just bit operations, + So no need for VECTOR_MODE_P? */ || GET_MODE_SIZE (GET_MODE (SUBREG_REG (op))) != GET_MODE_SIZE (GET_MODE (op))) return -1; @@ -25701,7 +25705,7 @@ ix86_ternlog_idx (rtx op, rtx *args) case UNSPEC: if (XINT (op, 1) != UNSPEC_VTERNLOG || XVECLEN (op, 0) != 4 - || CONST_INT_P (XVECEXP (op, 0, 3))) + || !CONST_INT_P (XVECEXP (op, 0, 3))) return -1; /* TODO: Handle permuted operands. */ @@ -25778,10 +25782,13 @@ ix86_ternlog_operand_p (rtx op) /* Prefer pxor. */ if (ix86_ternlog_leaf_p (XEXP (op, 0), mode) && (ix86_ternlog_leaf_p (op1, mode) + /* Add some comments, it's because we already have <mask_codefor>one_cmpl<mode>2<mask_name>. */ || vector_all_ones_operand (op1, mode))) return false; break; + /* Wouldn't pternlog match (SUBREG: (REG))???,and it should also be excluded. + Similar for SUBREG: (AND/IOR/XOR)? */ default: break; } @@ -25865,25 +25872,35 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, case 0x0a: /* ~a&c */ if ((!op1 || !side_effects_p (op1)) + /* shouldn't op1 always be register_operand with no side effects when it exists? + <avx512>_vternlog<mode>_mask only supports register_operand for op1. + ix86_ternlog_idx only assigns REG to args[1]. + Ditto for op0, also we should add op2 && register_operand (op2, mode) + to avoid segment fault? */ && register_operand (op0, mode) && register_operand (op2, mode)) return ix86_expand_ternlog_andnot (mode, op0, op1, target); + /* op2 instead of op1??? */ break; case 0x0c: /* ~a&b */ if ((!op2 || !side_effects_p (op2)) && register_operand (op0, mode) && register_operand (op1, mode)) + /* If op0 and op1 exist, they must be register_operand? So just op0 && op1? */ return ix86_expand_ternlog_andnot (mode, op0, op1, target); break; case 0x0f: /* ~a */ if ((!op1 || !side_effects_p (op1)) + /* No need for !side_effects for op1? */ + /* Ditto. */ && (!op2 || !side_effects_p (op2))) { if (GET_MODE (op0) != mode) op0 = gen_lowpart (mode, op0); if (!TARGET_64BIT && !register_operand (op0, mode)) + /* It must be register_operand for op0 when it exists, no? */ op0 = force_reg (mode, op0); emit_move_insn (target, gen_rtx_XOR (mode, op0, CONSTM1_RTX (mode))); return target; @@ -25894,6 +25911,7 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, if ((!op0 || !side_effects_p (op0)) && register_operand (op1, mode) && register_operand (op2, mode)) + /* op1 && op2 && register_operand (op2, mode)?? */ return ix86_expand_ternlog_andnot (mode, op1, op2, target); break; @@ -25901,12 +25919,14 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, if ((!op2 || !side_effects_p (op2)) && register_operand (op0, mode) && register_operand (op1, mode)) + /* op0 && op1? */ return ix86_expand_ternlog_andnot (mode, op1, op0, target); break; case 0x33: /* ~b */ if ((!op0 || !side_effects_p (op0)) && (!op2 || !side_effects_p (op2))) + /* op1 && (!op2 || !side_effects_p (op2)) ? */ { if (GET_MODE (op1) != mode) op1 = gen_lowpart (mode, op1); @@ -26051,6 +26071,10 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, tmp2 = ix86_gen_bcst_mem (mode, op2); if (!tmp2) tmp2 = validize_mem (force_const_mem (mode, op2)); + /* Can we use ix86_expand_vector_move here, it will try move integer to gpr, + and broadcast gpr to the vector register. + It should be faster than a constant pool, and PR115021 should be solved by + another way instead of this walkaround. */ } else tmp2 = op2; -- BR, Hongtao