On Mon, May 13, 2024 at 5:57 AM Roger Sayle <ro...@nextmovesoftware.com> wrote:
>
>
> This patch improves the way that the x86 backend recognizes and
> expands AVX512's bitwise ternary logic (vpternlog) instructions.
I like the patch.

1 file changed, 25 insertions(+), 1 deletion(-)
gcc/config/i386/i386-expand.cc | 26 +++++++++++++++++++++++++-

modified   gcc/config/i386/i386-expand.cc
@@ -25601,6 +25601,7 @@ ix86_gen_bcst_mem (machine_mode mode, rtx x)
 int
 ix86_ternlog_idx (rtx op, rtx *args)
 {
+  /* Nice dynamic programming:)  */
   int idx0, idx1;

   if (!op)
@@ -25651,6 +25652,7 @@ ix86_ternlog_idx (rtx op, rtx *args)
    return 0xaa;
  }
       /* Maximum of one volatile memory reference per expression.  */
+      /* According to comments, it should be && ?  */
       if (side_effects_p (op) || side_effects_p (args[2]))
  return -1;
       if (rtx_equal_p (op, args[2]))
@@ -25666,6 +25668,8 @@ ix86_ternlog_idx (rtx op, rtx *args)

     case SUBREG:
       if (!VECTOR_MODE_P (GET_MODE (SUBREG_REG (op)))
+   /* It could be TI/OI/XImode since it's just bit operations,
+      So no need for VECTOR_MODE_P?  */
    || GET_MODE_SIZE (GET_MODE (SUBREG_REG (op)))
       != GET_MODE_SIZE (GET_MODE (op)))
  return -1;
@@ -25701,7 +25705,7 @@ ix86_ternlog_idx (rtx op, rtx *args)
     case UNSPEC:
       if (XINT (op, 1) != UNSPEC_VTERNLOG
    || XVECLEN (op, 0) != 4
-   || CONST_INT_P (XVECEXP (op, 0, 3)))
+   || !CONST_INT_P (XVECEXP (op, 0, 3)))
  return -1;

       /* TODO: Handle permuted operands.  */
@@ -25778,10 +25782,13 @@ ix86_ternlog_operand_p (rtx op)
       /* Prefer pxor.  */
       if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
    && (ix86_ternlog_leaf_p (op1, mode)
+       /* Add some comments, it's because we already have
<mask_codefor>one_cmpl<mode>2<mask_name>.  */
        || vector_all_ones_operand (op1, mode)))
  return false;
       break;

+      /* Wouldn't pternlog match (SUBREG: (REG))???,and it should
also be excluded.
+        Similar for SUBREG: (AND/IOR/XOR)?   */
     default:
       break;
     }
@@ -25865,25 +25872,35 @@ ix86_expand_ternlog (machine_mode mode, rtx
op0, rtx op1, rtx op2, int idx,

     case 0x0a: /* ~a&c */
       if ((!op1 || !side_effects_p (op1))
+   /* shouldn't op1 always be register_operand with no side effects
when it exists?
+      <avx512>_vternlog<mode>_mask only supports register_operand for op1.
+      ix86_ternlog_idx only assigns REG to args[1].
+      Ditto for op0, also we should add op2 && register_operand (op2, mode)
+      to avoid segment fault?   */
    && register_operand (op0, mode)
    && register_operand (op2, mode))
  return ix86_expand_ternlog_andnot (mode, op0, op1, target);
+      /* op2 instead of op1??? */
       break;

     case 0x0c: /* ~a&b */
       if ((!op2 || !side_effects_p (op2))
    && register_operand (op0, mode)
    && register_operand (op1, mode))
+ /* If op0 and op1 exist, they must be register_operand? So just op0
&& op1?  */
  return ix86_expand_ternlog_andnot (mode, op0, op1, target);
       break;

     case 0x0f:  /* ~a */
       if ((!op1 || !side_effects_p (op1))
+   /* No need for !side_effects for op1?  */
+   /* Ditto.  */
    && (!op2 || !side_effects_p (op2)))
  {
    if (GET_MODE (op0) != mode)
      op0 = gen_lowpart (mode, op0);
    if (!TARGET_64BIT && !register_operand (op0, mode))
+     /* It must be register_operand for op0 when it exists, no? */
      op0 = force_reg (mode, op0);
    emit_move_insn (target, gen_rtx_XOR (mode, op0, CONSTM1_RTX (mode)));
    return target;
@@ -25894,6 +25911,7 @@ ix86_expand_ternlog (machine_mode mode, rtx
op0, rtx op1, rtx op2, int idx,
       if ((!op0 || !side_effects_p (op0))
    && register_operand (op1, mode)
    && register_operand (op2, mode))
+ /* op1 && op2 && register_operand (op2, mode)??  */
  return ix86_expand_ternlog_andnot (mode, op1, op2, target);
       break;

@@ -25901,12 +25919,14 @@ ix86_expand_ternlog (machine_mode mode, rtx
op0, rtx op1, rtx op2, int idx,
       if ((!op2 || !side_effects_p (op2))
    && register_operand (op0, mode)
    && register_operand (op1, mode))
+ /* op0 && op1? */
  return ix86_expand_ternlog_andnot (mode, op1, op0, target);
       break;

     case 0x33:  /* ~b */
       if ((!op0 || !side_effects_p (op0))
    && (!op2 || !side_effects_p (op2)))
+ /* op1 && (!op2 || !side_effects_p (op2)) ?  */
  {
    if (GET_MODE (op1) != mode)
      op1 = gen_lowpart (mode, op1);
@@ -26051,6 +26071,10 @@ ix86_expand_ternlog (machine_mode mode, rtx
op0, rtx op1, rtx op2, int idx,
       tmp2 = ix86_gen_bcst_mem (mode, op2);
       if (!tmp2)
  tmp2 = validize_mem (force_const_mem (mode, op2));
+      /* Can we use ix86_expand_vector_move here, it will try move
integer to gpr,
+ and broadcast gpr to the vector register.
+ It should be faster than a constant pool, and PR115021 should be solved by
+ another way instead of this walkaround.  */
     }
   else
     tmp2 = op2;




-- 
BR,
Hongtao

Reply via email to