On Tue, Feb 04, 2020 at 12:24:10PM +0100, Uros Bizjak wrote:
> > A && is missing in the split condition to inherit TARGET_AVX.
> 
> Also, you don't need to emit "#" in output template. This is just for
> extra safety, we can live without. Please see e.g. "*tzcnt<mode>_1".

I was worried that -fipa-ra would then give wrong answers.
The -O2 -mabi=ms -mavx512f -fno-schedule-insns2 case I was worried about
isn't a problem though, because while the split4 pass is disabled there, the
split3 pass is scheduled instead before regstack.
I can do
-O2 -mabi=ms -mavx512f -fdisable-rtl-split4
or
-O2 -mabi=ms -mavx512f -fno-schedule-insns2 -fdisable-rtl-split3
though, and then we ICE with the patch as is (with just && epilogue_completed ...);
the ICE is
cc1: note: disable pass rtl-split4 for functions in the range of [0, 4294967295]
pr92190.c: In function ‘foo’:
pr92190.c:19:1: error: could not split insn
   19 | }
      | ^
(insn:TI 23 18 10 2 (parallel [
            (unspec_volatile [
                    (const_int 0 [0])
                ] UNSPECV_VZEROUPPER)
            (clobber (reg:V2DI 20 xmm0))
            (clobber (reg:V2DI 21 xmm1))
            (clobber (reg:V2DI 22 xmm2))
            (clobber (reg:V2DI 23 xmm3))
            (clobber (reg:V2DI 24 xmm4))
            (clobber (reg:V2DI 25 xmm5))
            (set (reg:V2DI 26 xmm6)
                (reg:V2DI 26 xmm6))
            (clobber (reg:V2DI 27 xmm7))
            (clobber (reg:V2DI 44 xmm8))
            (clobber (reg:V2DI 45 xmm9))
            (clobber (reg:V2DI 46 xmm10))
            (clobber (reg:V2DI 47 xmm11))
            (clobber (reg:V2DI 48 xmm12))
            (clobber (reg:V2DI 49 xmm13))
            (clobber (reg:V2DI 50 xmm14))
            (clobber (reg:V2DI 51 xmm15))
        ]) "pr92190.c":17:3 4895 {*avx_vzeroupper}
     (nil))
because, as the splitter is written, it modifies the insn in place, and so
later in try_split
      if (INSN_P (insn_last)
          && rtx_equal_p (PATTERN (insn_last), pat))
        return trial;
rtx_equal_p is true because of that in-place modification: pat is the
original insn's pattern, which the splitter has just rewritten into exactly
the pattern it emits via (match_dup 0), so try_split treats the split as a
no-op and returns the unsplit insn.
Now, if I slightly modify the patch so that it does
  operands[0] = gen_rtx_PARALLEL (VOIDmode, vec);
instead of
  XVEC (operands[0], 0) = vec;
and thus doesn't modify the insn in place, it ICEs in a different spot:
pr92190.c: In function ‘foo’:
pr92190.c:19:1: internal compiler error: in final_scan_insn_1, at final.c:3078
   19 | }
      | ^
i.e. on
            /* If we have a length attribute, this instruction should have
               been split in shorten_branches, to ensure that we would have
               valid length info for the splitees.  */
            gcc_assert (!HAVE_ATTR_length);
Though I guess that is not really specific to this patch: with
-fdisable-rtl-split{3,4,5} we effectively disable all possibilities of
splitting before final, so anything that isn't split earlier will ICE that
way.
So, if we wanted to remove that class of ICEs, we should just fatal_error if
somebody disables the last splitter pass before final and HAVE_ATTR_length
is defined.
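
Just to illustrate the idea, a rough sketch of such a check could look like
the following (not part of the patch below; the pass_disabled_by_user_p
helper is made up for illustration, the real check would have to ask the
pass manager whether the last split pass was disabled):

  /* Hypothetical sketch: give a hard error if the user disabled the last
     split pass before final on a target that has a length attribute,
     because final_scan_insn_1 would otherwise hit the gcc_assert above on
     any insn that still needs splitting.  */
  if (HAVE_ATTR_length
      && pass_disabled_by_user_p ("rtl-split5"))  /* hypothetical helper */
    fatal_error (input_location,
                 "disabling the last split pass before final is not "
                 "supported on targets with a %<length%> insn attribute");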

Here is the patch with && before epilogue_completed, without "#", and
avoiding modification of the insn in place.  We don't ICE; we could just
generate wrong code with -fipa-ra, but the user asked for that with
-fdisable-rtl-split{3,4,5}, so I guess that is ok.

2020-02-04  Jakub Jelinek  <ja...@redhat.com>

        PR target/92190
        * config/i386/i386-features.c (ix86_add_reg_usage_to_vzeroupper): Only
        include sets and not clobbers in the vzeroupper pattern.
        * config/i386/sse.md (*avx_vzeroupper): Change from define_insn to
        define_insn_and_split, split if epilogue_completed and not all
        xmm0-xmm15 registers are mentioned in the pattern and add clobbers
        for the missing registers at that point.

        * gcc.target/i386/pr92190.c: New test.

--- gcc/config/i386/i386-features.c.jj  2020-02-04 11:40:58.755611428 +0100
+++ gcc/config/i386/i386-features.c     2020-02-04 11:51:33.602148491 +0100
@@ -1764,29 +1764,32 @@ convert_scalars_to_vector (bool timode_p
 
      (set (reg:V2DF R) (reg:V2DF R))
 
-   which preserves the low 128 bits but clobbers the upper bits.
-   For a dead register we just use:
-
-     (clobber (reg:V2DF R))
-
-   which invalidates any previous contents of R and stops R from becoming
-   live across the vzeroupper in future.  */
+   which preserves the low 128 bits but clobbers the upper bits.  */
 
 static void
 ix86_add_reg_usage_to_vzeroupper (rtx_insn *insn, bitmap live_regs)
 {
   rtx pattern = PATTERN (insn);
   unsigned int nregs = TARGET_64BIT ? 16 : 8;
-  rtvec vec = rtvec_alloc (nregs + 1);
-  RTVEC_ELT (vec, 0) = XVECEXP (pattern, 0, 0);
+  unsigned int npats = nregs;
   for (unsigned int i = 0; i < nregs; ++i)
     {
       unsigned int regno = GET_SSE_REGNO (i);
+      if (!bitmap_bit_p (live_regs, regno))
+       npats--;
+    }
+  if (npats == 0)
+    return;
+  rtvec vec = rtvec_alloc (npats + 1);
+  RTVEC_ELT (vec, 0) = XVECEXP (pattern, 0, 0);
+  for (unsigned int i = 0, j = 0; i < nregs; ++i)
+    {
+      unsigned int regno = GET_SSE_REGNO (i);
+      if (!bitmap_bit_p (live_regs, regno))
+       continue;
       rtx reg = gen_rtx_REG (V2DImode, regno);
-      if (bitmap_bit_p (live_regs, regno))
-       RTVEC_ELT (vec, i + 1) = gen_rtx_SET (reg, reg);
-      else
-       RTVEC_ELT (vec, i + 1) = gen_rtx_CLOBBER (VOIDmode, reg);
+      ++j;
+      RTVEC_ELT (vec, j) = gen_rtx_SET (reg, reg);
     }
   XVEC (pattern, 0) = vec;
   df_insn_rescan (insn);
--- gcc/config/i386/sse.md.jj   2020-02-04 11:40:58.813610563 +0100
+++ gcc/config/i386/sse.md      2020-02-04 13:21:50.088340875 +0100
@@ -19815,11 +19815,38 @@ (define_expand "avx_vzeroupper"
   [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
   "TARGET_AVX")
 
-(define_insn "*avx_vzeroupper"
+(define_insn_and_split "*avx_vzeroupper"
   [(match_parallel 0 "vzeroupper_pattern"
      [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
   "TARGET_AVX"
   "vzeroupper"
+  "&& epilogue_completed
+   && XVECLEN (operands[0], 0) != (TARGET_64BIT ? 16 : 8) + 1"
+  [(match_dup 0)]
+{
+  /* For IPA-RA purposes, make it clear the instruction clobbers
+     even XMM registers not mentioned explicitly in the pattern.  */
+  unsigned int nregs = TARGET_64BIT ? 16 : 8;
+  unsigned int npats = XVECLEN (operands[0], 0);
+  rtvec vec = rtvec_alloc (nregs + 1);
+  RTVEC_ELT (vec, 0) = XVECEXP (operands[0], 0, 0);
+  for (unsigned int i = 0, j = 1; i < nregs; ++i)
+    {
+      unsigned int regno = GET_SSE_REGNO (i);
+      if (j < npats
+         && REGNO (SET_DEST (XVECEXP (operands[0], 0, j))) == regno)
+       {
+         RTVEC_ELT (vec, i + 1) = XVECEXP (operands[0], 0, j);
+         j++;
+       }
+      else
+       {
+         rtx reg = gen_rtx_REG (V2DImode, regno);
+         RTVEC_ELT (vec, i + 1) = gen_rtx_CLOBBER (VOIDmode, reg);
+       }
+    }
+  operands[0] = gen_rtx_PARALLEL (VOIDmode, vec);
+}
   [(set_attr "type" "sse")
    (set_attr "modrm" "0")
    (set_attr "memory" "none")
--- gcc/testsuite/gcc.target/i386/pr92190.c.jj  2020-02-04 11:51:33.608148402 +0100
+++ gcc/testsuite/gcc.target/i386/pr92190.c     2020-02-04 11:51:33.608148402 +0100
@@ -0,0 +1,19 @@
+/* PR target/92190 */
+/* { dg-do compile { target { *-*-linux* && lp64 } } } */
+/* { dg-options "-mabi=ms -O2 -mavx512f" } */
+
+typedef char VC __attribute__((vector_size (16)));
+typedef int VI __attribute__((vector_size (16 * sizeof 0)));
+VC a;
+VI b;
+void bar (VI);
+void baz (VC);
+
+void
+foo (void)
+{
+  VC k = a;
+  VI n = b;
+  bar (n);
+  baz (k);
+}


        Jakub
