On Mon, Jun 15, 2020 at 09:29:29PM +0800, Hongtao Liu via Gcc-patches wrote:
>   Basically i "copy" this optimization from clang i386 backend, Refer
> to pr95524 for details.
>   Bootstrap is ok, regression test on i386/x86-64 backend is ok.
> 
> gcc/ChangeLog:
>         PR target/95524
>         * gcc/config/i386/i386-expand.c
>         (ix86_expand_vec_shift_qihi_constant): New function.

No gcc/ prefix in gcc/ChangeLog (git push would be refused).
And with or without the prefix it fits, so:
        * config/i386/i386-expand.c (ix86_expand_vec_shift_qihi_constant): New
        function.

>         * gcc/config/i386/i386-protos.h: Declare.

This needs to mention the function name again.

>         * gcc/config/i386/sse.md: Optimize shift V*QImode by constant.

And thus needs to mention the define_expand, (<shift_insn><mode>3)
in this case.

+  machine_mode qimode, himode;
+  unsigned int shift_constant, and_constant, xor_constant;
+  rtx vec_const_and, vec_const_xor;
+  rtx tmp, op1_subreg;
+  rtx (*gen_shift) (rtx, rtx, rtx);
+  rtx (*gen_and) (rtx, rtx, rtx);
+  rtx (*gen_xor) (rtx, rtx, rtx);
+  rtx (*gen_sub) (rtx, rtx, rtx);
+
+  /* Only optimize shift by constant.  */
+  if (!CONST_INT_P (op2))
+    return false;
+
+  qimode = GET_MODE (dest);
+  shift_constant = INTVAL (op2);

I wonder if shift_constant shouldn't be unsigned HOST_WIDE_INT
instead, and which >= 8 values you should try to do something about rather
than punt on optimizing.  If this is just from normal C shifts, then
perhaps anything >= 32U would be invalid, if it is from intrinsics,
perhaps there is a different behavior (masked with something?).

+  /* Shift constant greater equal 8 result into 0.  */
+  if (shift_constant > 7)
+    {
+      if (code == ASHIFT || code == LSHIFTRT)
+       {
+         emit_move_insn (dest, CONST0_RTX (qimode));
+         return true;
+       }
+      /* Sign bit not known.  */
+      else if (code == ASHIFTRT)
+       return false;
+      else
+       gcc_unreachable ();
+    }
+
+  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+  /* Record sign bit.  */
+  xor_constant = 1 << (8 - shift_constant - 1);
+
+  /* Zero upper/lower bits shift from left/right element.  */
+  and_constant = code == ASHIFT ? 256 - (1 << shift_constant) :
+    (1 << (8 - shift_constant)) - 1;

Formatting.  Should be:
  and_constant
    = (code == ASHIFT ? 256 - (1 << shift_constant)
                      : (1 << (8 - shift_constant)) - 1);
or so.
+
+  switch (qimode)
+    {
+    case V16QImode:
+      himode = V8HImode;
+      gen_shift = (code == ASHIFT) ? gen_ashlv8hi3 :
+       (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3;

Similarly.  : or ? should just never appear at the end of line.
And probably no reason to wrap the comparisons in parens,
instead wrap the whole expression.  So:
      gen_shift
        = (code == ASHIFT
           ? gen_ashlv8hi3
           : code == ASHIFTRT ? gen_ashrv8hi3 : gen_lshrv8hi3);
or so?

+      gen_and = gen_andv16qi3;
+      gen_xor = gen_xorv16qi3;
+      gen_sub = gen_subv16qi3;
+      break;
+    case V32QImode:
+      himode = V16HImode;
+      gen_shift = (code == ASHIFT) ? gen_ashlv16hi3 :
+       (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3;

Ditto.

+      gen_and = gen_andv32qi3;
+      gen_xor = gen_xorv32qi3;
+      gen_sub = gen_subv32qi3;
+      break;
+    case V64QImode:
+      himode = V32HImode;
+      gen_shift = (code == ASHIFT) ? gen_ashlv32hi3 :
+       (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3;

Ditto.

+  tmp = gen_reg_rtx (himode);
+  vec_const_and = gen_reg_rtx (qimode);
+  op1_subreg = simplify_gen_subreg (himode, op1, qimode, 0);

Just use lowpart_subreg?
+
+  /* For ASHIFT and LSHIFTRT, perform operation like
+     vpsllw/vpsrlw $shift_constant, %op1, %dest.
+     vpand %vec_const_and, %dest.  */
+  emit_insn (gen_shift (tmp, op1_subreg, op2));
+  emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
+  emit_move_insn (vec_const_and,
+                 ix86_build_const_vector (qimode, true,
+                                          GEN_INT (and_constant)));
+  emit_insn (gen_and (dest, dest, vec_const_and));
+
+  /* For ASHIFTRT, perform extra operation like
+     vpxor %vec_const_xor, %dest, %dest
+     vpsubb %vec_const_xor, %dest, %dest  */
+  if (code == ASHIFTRT)
+    {
+      vec_const_xor = gen_reg_rtx (qimode);
+      emit_move_insn (vec_const_xor,
+                     ix86_build_const_vector (qimode, true,
+                                              GEN_INT (xor_constant)));
+      emit_insn (gen_xor (dest, dest, vec_const_xor));
+      emit_insn (gen_sub (dest, dest, vec_const_xor));
+    }
+  return true;
+}
+
 /* Expand a vector operation CODE for a V*QImode in terms of the
    same operation on V*HImode.  */
 
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index f5320494fa1..7c2ce618f3f 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -206,6 +206,7 @@ extern void ix86_expand_round_sse4 (rtx, rtx);
 
 extern bool ix86_expand_vecmul_qihi (rtx, rtx, rtx);
 extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
+extern bool ix86_expand_vec_shift_qihi_constant (enum rtx_code, rtx, rtx, rtx);
 
 extern rtx ix86_split_stack_guard (void);
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index aa9fdc87c68..b466950af40 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -19863,6 +19863,9 @@
       gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
       emit_insn (gen (operands[0], operands[1], tmp));
     }
+  else if (ix86_expand_vec_shift_qihi_constant (<CODE>, operands[0],
+                                               operands[1], operands[2]))
+    DONE;

Perhaps do instead:
  else if (!ix86_expand_vec_shift_qihi_constant (<CODE>, operands[0],
                                                 operands[1], operands[2]))
    ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
  DONE;
?

        Jakub

Reply via email to