Hi!

The code we emit on the following testcases is really terrible, with both
-mavx512f -mno-avx512bw as well as -mavx512bw, rather than doing e.g.
        vpinsrw $0, %edi, %xmm0, %xmm1
        vinserti32x4    $0, %xmm1, %zmm0, %zmm0
when trying to insert into the low 128 bits, or
        vextracti32x4   $1, %zmm0, %xmm1
        vpinsrw $3, %edi, %xmm1, %xmm1
        vinserti32x4    $1, %xmm1, %zmm0, %zmm0
when trying to insert into one of the other 128-bit lanes, we emit:
        pushq   %rbp
        vmovq   %xmm0, %rax
        movzwl  %di, %edi
        xorw    %ax, %ax
        movq    %rsp, %rbp
        orq     %rdi, %rax
        andq    $-64, %rsp
        vmovdqa64       %zmm0, -64(%rsp)
        movq    %rax, -64(%rsp)
        vmovdqa64       -64(%rsp), %zmm0
        leave
and furthermore there is some RA bug it triggers, so we miscompile it.
All this is because while at least for AVX512BW we have ix86_expand_vector_set
implemented for V64QImode and V32HImode, we actually don't have a pattern
for it.  Fixed by adding those modes to V, which is used only by this
vec_set pattern and some xop pattern, which is misusing it anyway (it really
can't handle 512-bit vectors, so could result in assembly failures etc.
with -mxop -mavx512f).  Furthermore, for AVX512F we can use the above
extraction/insertion/insertion sequence, similarly to what we do for 256-bit
insertions, just we have halves there rather than quarters.
The splitters are added to match what the vec_set_lo_* define_insn_and_split
patterns do: if we are trying to extract the low 128-bit lane of a 512-bit
vector, we really don't need a vextracti32x4 instruction; we can use a simple
move, or even nothing at all (if source and destination are the same register,
or post-RA register renaming can arrange that).

The RA bug still should be fixed, but at least this patch makes it latent
and improves the code a lot.  Bootstrapped/regtested on x86_64-linux and
i686-linux, ok for trunk? 

2018-03-30  Jakub Jelinek  <ja...@redhat.com>

        PR middle-end/85090
        * config/i386/sse.md (V): Add V64QI and V32HI for TARGET_AVX512F.
        (V_128_256): New mode iterator.
        (*avx512dq_vextract<shuffletype>64x2_1 splitter): New define_split.
        (*avx512f_vextract<shuffletype>32x4_1 splitter): Likewise.
        (xop_pcmov_<mode><avxsizesuffix>): Use V_128_256 mode iterator instead
        of V.
        * config/i386/i386.c (ix86_expand_vector_set): Improve V32HImode and
        V64QImode expansion for !TARGET_AVX512BW && TARGET_AVX512F.

        * gcc.target/i386/avx512f-pr85090-1.c: New test.
        * gcc.target/i386/avx512f-pr85090-2.c: New test.
        * gcc.target/i386/avx512f-pr85090-3.c: New test.
        * gcc.target/i386/avx512bw-pr85090-2.c: New test.
        * gcc.target/i386/avx512bw-pr85090-3.c: New test.

--- gcc/config/i386/sse.md.jj   2018-03-29 12:38:32.088512220 +0200
+++ gcc/config/i386/sse.md      2018-03-30 14:56:19.949549653 +0200
@@ -229,8 +229,8 @@ (define_mode_iterator VI1_AVX512VL
 
 ;; All vector modes
 (define_mode_iterator V
-  [(V32QI "TARGET_AVX") V16QI
-   (V16HI "TARGET_AVX") V8HI
+  [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
+   (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
    (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
    (V8DI "TARGET_AVX512F")  (V4DI "TARGET_AVX") V2DI
    (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
@@ -244,6 +244,10 @@ (define_mode_iterator V_128
 (define_mode_iterator V_256
   [V32QI V16HI V8SI V4DI V8SF V4DF])
 
+;; All 128bit and 256bit vector modes
+(define_mode_iterator V_128_256
+  [V32QI V16QI V16HI V8HI V8SI V4SI V4DI V2DI V8SF V4SF V4DF V2DF])
+
 ;; All 512bit vector modes
 (define_mode_iterator V_512 [V64QI V32HI V16SI V8DI V16SF V8DF])
 
@@ -7351,6 +7355,15 @@ (define_insn "<mask_codefor>avx512dq_vex
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_split
+  [(set (match_operand:<ssequartermode> 0 "nonimmediate_operand")
+       (vec_select:<ssequartermode>
+         (match_operand:V8FI 1 "register_operand")
+         (parallel [(const_int 0) (const_int 1)])))]
+  "TARGET_AVX512DQ && reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+  "operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]);")
+
 (define_insn "<mask_codefor>avx512f_vextract<shuffletype>32x4_1<mask_name>"
   [(set (match_operand:<ssequartermode> 0 "<store_mask_predicate>" 
"=<store_mask_constraint>")
        (vec_select:<ssequartermode>
@@ -7374,6 +7387,16 @@ (define_insn "<mask_codefor>avx512f_vext
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_split
+  [(set (match_operand:<ssequartermode> 0 "nonimmediate_operand")
+       (vec_select:<ssequartermode>
+         (match_operand:V16FI 1 "register_operand")
+         (parallel [(const_int 0) (const_int 1)
+                    (const_int 2) (const_int 3)])))]
+  "TARGET_AVX512F && reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+  "operands[1] = gen_lowpart (<ssequartermode>mode, operands[1]);")
+
 (define_mode_attr extract_type_2
   [(V16SF "avx512dq") (V16SI "avx512dq") (V8DF "avx512f") (V8DI "avx512f")])
 
@@ -16478,11 +16501,11 @@ (define_insn "xop_p<madcs>wd"
 
 ;; XOP parallel XMM conditional moves
 (define_insn "xop_pcmov_<mode><avxsizesuffix>"
-  [(set (match_operand:V 0 "register_operand" "=x,x")
-       (if_then_else:V
-         (match_operand:V 3 "nonimmediate_operand" "x,m")
-         (match_operand:V 1 "register_operand" "x,x")
-         (match_operand:V 2 "nonimmediate_operand" "xm,x")))]
+  [(set (match_operand:V_128_256 0 "register_operand" "=x,x")
+       (if_then_else:V_128_256
+         (match_operand:V_128_256 3 "nonimmediate_operand" "x,m")
+         (match_operand:V_128_256 1 "register_operand" "x,x")
+         (match_operand:V_128_256 2 "nonimmediate_operand" "xm,x")))]
   "TARGET_XOP"
   "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "sse4arg")])
--- gcc/config/i386/i386.c.jj   2018-03-28 21:14:29.533744671 +0200
+++ gcc/config/i386/i386.c      2018-03-29 21:16:06.154318265 +0200
@@ -44085,21 +44085,69 @@ half:
       break;
 
     case E_V32HImode:
-      if (TARGET_AVX512F && TARGET_AVX512BW)
+      if (TARGET_AVX512BW)
        {
          mmode = SImode;
          gen_blendm = gen_avx512bw_blendmv32hi;
        }
+      else if (TARGET_AVX512F)
+       {
+         half_mode = E_V8HImode;
+         n = 8;
+         goto quarter;
+       }
       break;
 
     case E_V64QImode:
-      if (TARGET_AVX512F && TARGET_AVX512BW)
+      if (TARGET_AVX512BW)
        {
          mmode = DImode;
          gen_blendm = gen_avx512bw_blendmv64qi;
        }
+      else if (TARGET_AVX512F)
+       {
+         half_mode = E_V16QImode;
+         n = 16;
+         goto quarter;
+       }
       break;
 
+quarter:
+      /* Compute offset.  */
+      i = elt / n;
+      elt %= n;
+
+      gcc_assert (i <= 3);
+
+      {
+       /* Extract the quarter.  */
+       tmp = gen_reg_rtx (V4SImode);
+       rtx tmp2 = gen_lowpart (V16SImode, target);
+       rtx mask = gen_reg_rtx (QImode);
+
+       emit_move_insn (mask, constm1_rtx);
+       emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
+                                                  tmp, mask));
+
+       tmp2 = gen_reg_rtx (half_mode);
+       emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
+       tmp = tmp2;
+
+       /* Put val in tmp at elt.  */
+       ix86_expand_vector_set (false, tmp, val, elt);
+
+       /* Put it back.  */
+       tmp2 = gen_reg_rtx (V16SImode);
+       rtx tmp3 = gen_lowpart (V16SImode, target);
+       mask = gen_reg_rtx (HImode);
+       emit_move_insn (mask, constm1_rtx);
+       tmp = gen_lowpart (V4SImode, tmp);
+       emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
+                                                 tmp3, mask));
+       emit_move_insn (target, gen_lowpart (mode, tmp2));
+      }
+      return;
+
     default:
       break;
     }
--- gcc/testsuite/gcc.target/i386/avx512f-pr85090-1.c.jj        2018-03-30 
15:04:47.964730406 +0200
+++ gcc/testsuite/gcc.target/i386/avx512f-pr85090-1.c   2018-03-30 
15:04:26.722722711 +0200
@@ -0,0 +1,35 @@
+/* PR middle-end/85090 */
+/* { dg-do run { target int128 } } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-options "-O2 -fno-tree-dominator-opts -mavx512f 
-fira-algorithm=priority" } */
+
+#include "avx512f-check.h"
+
+typedef unsigned short U __attribute__ ((vector_size (64)));
+typedef unsigned int V __attribute__ ((vector_size (64)));
+typedef unsigned __int128 W __attribute__ ((vector_size (64)));
+
+V h;
+W d, e, g;
+U f;
+
+static __attribute__((noipa)) U
+foo (U i)
+{
+  f >>= ((U)d > f) & 1;
+  i[0] <<= 1;
+  e = (7 & -d) << (7 & -(g & 7));
+  return i;
+}
+
+void
+avx512f_test (void)
+{
+  U x;
+  for (unsigned i = 0; i < 32; i++)
+    x[i] = i;
+  x = foo (x);
+  for (unsigned i = 0; i < 32; i++)
+    if (x[i] != i)
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx512f-pr85090-2.c.jj        2018-03-30 
16:18:01.744300954 +0200
+++ gcc/testsuite/gcc.target/i386/avx512f-pr85090-2.c   2018-03-30 
16:17:26.674285394 +0200
@@ -0,0 +1,37 @@
+/* PR middle-end/85090 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mno-avx512bw -mtune=intel -masm=att" } */
+
+typedef short V __attribute__((vector_size (64)));
+
+V
+f1 (V x, int y)
+{
+  x[0] = y;
+  return x;
+}
+
+V
+f2 (V x, int y)
+{
+  x[7] = y;
+  return x;
+}
+
+V
+f3 (V x, int y)
+{
+  x[11] = y;
+  return x;
+}
+
+V
+f4 (V x, int y)
+{
+  x[29] = y;
+  return x;
+}
+
+/* { dg-final { scan-assembler-times "vpinsrw\t" 4 } } */
+/* { dg-final { scan-assembler-times "vextracti32x4\t" 2 } } */
+/* { dg-final { scan-assembler-times "vinserti32x4\t" 4 } } */
--- gcc/testsuite/gcc.target/i386/avx512f-pr85090-3.c.jj        2018-03-30 
16:21:01.358380633 +0200
+++ gcc/testsuite/gcc.target/i386/avx512f-pr85090-3.c   2018-03-30 
16:21:27.633392302 +0200
@@ -0,0 +1,37 @@
+/* PR middle-end/85090 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mno-avx512bw -mtune=intel -masm=att" } */
+
+typedef signed char V __attribute__((vector_size (64)));
+
+V
+f1 (V x, int y)
+{
+  x[0] = y;
+  return x;
+}
+
+V
+f2 (V x, int y)
+{
+  x[15] = y;
+  return x;
+}
+
+V
+f3 (V x, int y)
+{
+  x[22] = y;
+  return x;
+}
+
+V
+f4 (V x, int y)
+{
+  x[59] = y;
+  return x;
+}
+
+/* { dg-final { scan-assembler-times "vpinsrb\t" 4 } } */
+/* { dg-final { scan-assembler-times "vextracti32x4\t" 2 } } */
+/* { dg-final { scan-assembler-times "vinserti32x4\t" 4 } } */
--- gcc/testsuite/gcc.target/i386/avx512bw-pr85090-2.c.jj       2018-03-30 
16:19:34.986342321 +0200
+++ gcc/testsuite/gcc.target/i386/avx512bw-pr85090-2.c  2018-03-30 
16:19:07.650330188 +0200
@@ -0,0 +1,35 @@
+/* PR middle-end/85090 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw -mtune=intel -masm=att" } */
+
+typedef short V __attribute__((vector_size (64)));
+
+V
+f1 (V x, int y)
+{
+  x[0] = y;
+  return x;
+}
+
+V
+f2 (V x, int y)
+{
+  x[7] = y;
+  return x;
+}
+
+V
+f3 (V x, int y)
+{
+  x[11] = y;
+  return x;
+}
+
+V
+f4 (V x, int y)
+{
+  x[29] = y;
+  return x;
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastw\t" 4 } } */
--- gcc/testsuite/gcc.target/i386/avx512bw-pr85090-3.c.jj       2018-03-30 
16:20:00.359353581 +0200
+++ gcc/testsuite/gcc.target/i386/avx512bw-pr85090-3.c  2018-03-30 
16:20:44.638373224 +0200
@@ -0,0 +1,35 @@
+/* PR middle-end/85090 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw -mtune=intel -masm=att" } */
+
+typedef signed char V __attribute__((vector_size (64)));
+
+V
+f1 (V x, int y)
+{
+  x[0] = y;
+  return x;
+}
+
+V
+f2 (V x, int y)
+{
+  x[15] = y;
+  return x;
+}
+
+V
+f3 (V x, int y)
+{
+  x[22] = y;
+  return x;
+}
+
+V
+f4 (V x, int y)
+{
+  x[59] = y;
+  return x;
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\t" 4 } } */

        Jakub

Reply via email to