On Wed, Oct 28, 2020 at 10:54 AM Hongyu Wang <wwwhhhyyy...@gmail.com> wrote:
>
> Hi Uros,
>
> Thanks for the example. We've update the patterns with new expanders
> and predicates like vzeroall.
> Now the generated insn for "encodekey128u32"  is like
>
> (insn 7 6 8 2 (parallel [
>             (set (reg:SI 84 [ <retval> ])
>                 (unspec_volatile:SI [
>                         (reg:SI 85)
>                         (reg:V2DI 20 xmm0)
>                     ] UNSPECV_ENCODEKEY128U32))
>             (set (reg:V2DI 20 xmm0)
>                 (unspec_volatile:V2DI [
>                         (const_int 0 [0])
>                     ] UNSPECV_ENCODEKEY128U32))
>             (set (reg:V2DI 21 xmm1)
>                 (unspec_volatile:V2DI [
>                         (const_int 0 [0])
>                     ] UNSPECV_ENCODEKEY128U32))
>             (set (reg:V2DI 22 xmm2)
>                 (unspec_volatile:V2DI [
>                         (const_int 0 [0])
>                     ] UNSPECV_ENCODEKEY128U32))
>             (set (reg:V2DI 24 xmm4)
>                 (const_vector:V2DI [
>                         (const_int 0 [0]) repeated x2
>                     ]))
>             (set (reg:V2DI 25 xmm5)
>                 (const_vector:V2DI [
>                         (const_int 0 [0]) repeated x2
>                     ]))
>             (set (reg:V2DI 26 xmm6)
>                 (const_vector:V2DI [
>                         (const_int 0 [0]) repeated x2
>                     ]))
>             (clobber (reg:CC 17 flags))
>
> Rebased on 2020-10-27 trunk and update patch.

Yes, this looks much better.

+#define OPTION_MASK_ISA2_AVX2_UNSET OPTION_MASK_ISA2_AVX512F_UNSET
+#define OPTION_MASK_ISA2_AVX_UNSET OPTION_MASK_ISA2_AVX2_UNSET
+#define OPTION_MASK_ISA2_SSE4_2_UNSET OPTION_MASK_ISA2_AVX_UNSET
+#define OPTION_MASK_ISA2_SSE4_1_UNSET OPTION_MASK_ISA2_SSE4_2_UNSET
+#define OPTION_MASK_ISA2_SSE4_UNSET OPTION_MASK_ISA2_SSE4_1_UNSET
+#define OPTION_MASK_ISA2_SSSE3_UNSET OPTION_MASK_ISA2_SSE4_1_UNSET
+#define OPTION_MASK_ISA2_SSE3_UNSET OPTION_MASK_ISA2_SSSE3_UNSET
+#define OPTION_MASK_ISA2_SSE2_UNSET \
+  (OPTION_MASK_ISA2_SSE3_UNSET | OPTION_MASK_ISA2_KL_UNSET)
+#define OPTION_MASK_ISA2_SSE_UNSET OPTION_MASK_ISA2_SSE2_UNSET

is there a reason to introduce all these (with corresponding changes)?
SSE options live in ISA bitmap, so it is kind of strange you need to
handle them in ISA2 bitmap. Option handling is not exactly my area,
please ask HJ to comment and review this part.

Eventually, some option could be moved from ISA to ISA2 to accommodate
KL options in ISA bitmap.

+      pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
+            const0_rtx);
+      emit_move_insn (target, pat);

emit_insn (gen_rtx_SET (target, pat));

+    case IX86_BUILTIN_ENCODEKEY128U32:
+      {
+    rtx op, xmm_regs[7];
+
+    arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
+    arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
+    arg2 = CALL_EXPR_ARG (exp, 2); // void *h
+
+    op0 = expand_normal (arg0);
+    op1 = expand_normal (arg1);
+    op2 = expand_normal (arg2);
+
+    if (!REG_P (op0))
+      op0 = copy_to_mode_reg (SImode, op0);
+
+    op1 = copy_to_suggested_reg (op1,
+                     gen_rtx_REG (V2DImode,
+                          GET_SSE_REGNO (0)),
+                     V2DImode);
+
+    xmm_regs[0] = op1;

this is no better than:

reg = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
emit_move_insn (reg, op1)

+    xmm_regs[0] = op1;
+    for (i = 1; i < 3; i++)
+      xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));

The first line is dead code, copy_to_suggested reg generated (reg
xmm0) RTX for op1. Just use:

    for (i = 0; i < 3; i++)
      xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));

Similar comments for:

+   case IX86_BUILTIN_ENCODEKEY256U32:

...

+(define_expand "encodekey128u32"
+  [(match_par_dup 3
+    [(set (match_operand:SI 0 "register_operand")
+      (unspec_volatile:SI
+        [(match_operand:SI   1 "register_operand")
+         (match_operand:V2DI 2 "register_operand")]

It is better to use hard register in this particular case, (reg:V2DI XMM0_REG).

+            UNSPECV_ENCODEKEY128U32))])]
+  "TARGET_KL"
+{
+  rtx xmm_regs[7];
+  rtx tmp_unspec;
+  unsigned i;
+
+  /* parallel rtx for encodekey128 predicate */
+  operands[3] = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (8));
+
+  xmm_regs[0] = operands[2];
+  for (i = 1; i < 7; i++)
+      xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));

for (i = 0; i < 7; i++)

+  tmp_unspec
+    = gen_rtx_UNSPEC_VOLATILE (SImode,
+                   gen_rtvec (2, operands[1], xmm_regs[0]),
+                   UNSPECV_ENCODEKEY128U32);
+
+  XVECEXP (operands[3], 0, 0)
+    = gen_rtx_SET (operands[0], tmp_unspec);
+
+  for (i = 0; i < 3; i++)
+    {
+      tmp_unspec
+    = gen_rtx_UNSPEC_VOLATILE (V2DImode,
+                   gen_rtvec (1, const0_rtx),
+                   UNSPECV_ENCODEKEY128U32);

Please move the above out of the loop.

+      XVECEXP (operands[3], 0, i + 1)
+    = gen_rtx_SET (xmm_regs[i], tmp_unspec);
+    }
+
+  for (i = 4; i < 7; i++)
+      XVECEXP (operands[3], 0, i)
+    = gen_rtx_SET (xmm_regs[i], CONST0_RTX (V2DImode));
+
+  XVECEXP (operands[3], 0, 7)
+    = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+})
+
+(define_insn "*encodekey128u32"
+  [(match_parallel 3 "encodekey128_operation"
+    [(set (match_operand:SI 0 "register_operand" "=r")
+      (unspec_volatile:SI
+        [(match_operand:SI   1 "register_operand" "r")
+         (match_operand:V2DI 2 "register_operand" "Yz")]

Again, please use hard register: (reg:V2DI XMM0_REG).

+            UNSPECV_ENCODEKEY128U32))])]
+  "TARGET_KL"
+  "encodekey128\t{%1, %0|%0, %1}"
+  [(set_attr "type" "other")])
+
+(define_expand "encodekey256u32"
+  [(match_par_dup 4
+    [(set (match_operand:SI 0 "register_operand")
+      (unspec_volatile:SI
+        [(match_operand:SI   1 "register_operand")
+         (match_operand:V2DI 2 "register_operand")
+         (match_operand:V2DI 3 "sse_reg_operand")]

Use hard registers here instead of operands 2 and 3: (reg:V2DI
XMM0_REG) and (reg:V2DI XMM1_REG).

+        UNSPECV_ENCODEKEY256U32))])]
+  "TARGET_KL"
+{
+  rtx xmm_regs[7];
+  rtx tmp_unspec;
+  unsigned i;
+
+  /* parallel rtx for encodekey256 predicate */
+  operands[4] = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (9));
+
+  xmm_regs[0] = operands[2];
+  xmm_regs[1] = operands[3];
+  for (i = 2; i < 7; i++)
+      xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));

for (i = 0; i < 7; i++)

+  tmp_unspec
+    = gen_rtx_UNSPEC_VOLATILE (SImode,
+                   gen_rtvec (3, operands[1],
+                      xmm_regs[0], xmm_regs[1]),
+                   UNSPECV_ENCODEKEY256U32);
+
+  XVECEXP (operands[4], 0, 0)
+    = gen_rtx_SET (operands[0], tmp_unspec);
+
+  for (i = 0; i < 4; i++)
+    {
+      tmp_unspec
+    = gen_rtx_UNSPEC_VOLATILE (V2DImode,
+                   gen_rtvec (1, const0_rtx),
+                   UNSPECV_ENCODEKEY256U32);

Please move the above out of the loop.

+      XVECEXP (operands[4], 0, i + 1)
+    = gen_rtx_SET (xmm_regs[i], tmp_unspec);
+    }
+
+  for (i = 4; i < 7; i++)
+      XVECEXP (operands[4], 0, i + 1)
+    = gen_rtx_SET (xmm_regs[i], CONST0_RTX (V2DImode));
+
+  XVECEXP (operands[4], 0, 8)
+    = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+})
+
+(define_insn "*encodekey256u32"
+  [(match_parallel 4 "encodekey256_operation"
+    [(set (match_operand:SI 0 "register_operand" "=r")
+      (unspec_volatile:SI
+        [(match_operand:SI   1 "register_operand" "r")
+         (match_operand:V2DI 2 "register_operand" "Yz")
+         (match_operand:V2DI 3 "sse_reg_operand")]

Here lies the reason to use hard registers. "sse_reg_operand" is not
enough for register allocator. We have no constraint for first SSE
reg, so again
use hard registers here instead of operands 2 and 3: (reg:V2DI
XMM0_REG) and (reg:V2DI XMM1_REG).

+        UNSPECV_ENCODEKEY256U32))])]
+  "TARGET_KL"
+  "encodekey256\t{%1, %0|%0, %1}"
+  [(set_attr "type" "other")])
+

Uros.

Reply via email to