The following patch fixes a performance issue when loading/storing/moving
TImode values when using -mvsx-timode -mcpu=power7 with LRA.  The problem is
that the vsx_le_permute_<mode> and vsx_le_perm_{load,store}_<mode> patterns
do not support TImode values in GPRs, and LRA is using these patterns to
fixup constraints, which ends up leading to really bad code gen as seen by
the test cases in the bug report.

This patch fixes the bug by adding GPR support to the above patterns, as
well as a couple of peepholes that improve the code for loads and stores
to/from GPRs.

This passed bootstrapping and regtesting with no regressions and Mike
ran this on SPEC2006 and found no performance regressions with it.

Ok for trunk?  Do we want this on the GCC 7 branch where LRA is on by default?

Peter


gcc/
        * config/rs6000/vsx.md (*vsx_le_permute_<mode>): Add support for
        operands residing in integer registers.
        (*vsx_le_perm_load_<mode>): Likewise.
        (*vsx_le_perm_store_<mode>): Likewise.
        (define_peephole2): Add peepholes to optimize the above.

gcc/testsuite/
        * gcc.target/powerpc/pr72804.c: New test.

Index: gcc/config/rs6000/vsx.md
===================================================================
--- gcc/config/rs6000/vsx.md    (revision 250918)
+++ gcc/config/rs6000/vsx.md    (working copy)
@@ -759,17 +759,20 @@ (define_split
 ;; special V1TI container class, which it is not appropriate to use vec_select
 ;; for the type.
 (define_insn "*vsx_le_permute_<mode>"
-  [(set (match_operand:VSX_TI 0 "nonimmediate_operand" "=<VSa>,<VSa>,Z")
+  [(set (match_operand:VSX_TI 0 "nonimmediate_operand" "=<VSa>,<VSa>,Z,&r,&r,Q")
        (rotate:VSX_TI
-        (match_operand:VSX_TI 1 "input_operand" "<VSa>,Z,<VSa>")
+        (match_operand:VSX_TI 1 "input_operand" "<VSa>,Z,<VSa>,r,Q,r")
         (const_int 64)))]
   "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
   "@
    xxpermdi %x0,%x1,%x1,2
    lxvd2x %x0,%y1
-   stxvd2x %x1,%y0"
-  [(set_attr "length" "4")
-   (set_attr "type" "vecperm,vecload,vecstore")])
+   stxvd2x %x1,%y0
+   mr %0,%L1; mr %L0,%1
+   ld%U1%X1 %0,%L1; ld%U1%X1 %L0,%1
+   std%U0%X0 %L1,%0; std%U0%X0 %1,%L0"
+  [(set_attr "length" "4,4,4,8,8,8")
+   (set_attr "type" "vecperm,vecload,vecstore,*,load,store")])
 
 (define_insn_and_split "*vsx_le_undo_permute_<mode>"
   [(set (match_operand:VSX_TI 0 "vsx_register_operand" "=<VSa>,<VSa>")
@@ -795,10 +798,12 @@ (define_insn_and_split "*vsx_le_undo_per
    (set_attr "type" "veclogical")])
 
 (define_insn_and_split "*vsx_le_perm_load_<mode>"
-  [(set (match_operand:VSX_LE_128 0 "vsx_register_operand" "=<VSa>")
-        (match_operand:VSX_LE_128 1 "memory_operand" "Z"))]
+  [(set (match_operand:VSX_LE_128 0 "vsx_register_operand" "=<VSa>,r")
+        (match_operand:VSX_LE_128 1 "memory_operand" "Z,Q"))]
   "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
-  "#"
+  "@
+   #
+   #"
   "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
   [(const_int 0)]
   "
@@ -811,16 +816,18 @@ (define_insn_and_split "*vsx_le_perm_loa
   DONE;
 }
   "
-  [(set_attr "type" "vecload")
-   (set_attr "length" "8")])
+  [(set_attr "type" "vecload,load")
+   (set_attr "length" "8,8")])
 
 (define_insn "*vsx_le_perm_store_<mode>"
-  [(set (match_operand:VSX_LE_128 0 "memory_operand" "=Z")
-        (match_operand:VSX_LE_128 1 "vsx_register_operand" "+<VSa>"))]
+  [(set (match_operand:VSX_LE_128 0 "memory_operand" "=Z,Q")
+        (match_operand:VSX_LE_128 1 "vsx_register_operand" "+<VSa>,r"))]
   "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
-  "#"
-  [(set_attr "type" "vecstore")
-   (set_attr "length" "12")])
+  "@
+   #
+   #"
+  [(set_attr "type" "vecstore,store")
+   (set_attr "length" "12,8")])
 
 (define_split
   [(set (match_operand:VSX_LE_128 0 "memory_operand" "")
@@ -836,6 +843,31 @@ (define_split
   DONE;
 })
 
+;; Peepholes to catch loads and stores for TImode if TImode landed in
+;; GPR registers on a little endian system.
+(define_peephole2
+  [(set (match_operand:VSX_TI 0 "int_reg_operand" "")
+       (rotate:VSX_TI (match_operand:VSX_TI 1 "memory_operand" "")
+                      (const_int 64)))
+   (set (match_operand:VSX_TI 2 "int_reg_operand" "")
+       (rotate:VSX_TI (match_dup 0)
+                      (const_int 64)))]
+  "!BYTES_BIG_ENDIAN && TARGET_VSX && TARGET_VSX_TIMODE && !TARGET_P9_VECTOR
+   && (rtx_equal_p (operands[0], operands[2])
+       || peep2_reg_dead_p (2, operands[0]))"
+   [(set (match_dup 2) (match_dup 1))])
+
+(define_peephole2
+  [(set (match_operand:VSX_TI 0 "int_reg_operand" "")
+       (rotate:VSX_TI (match_operand:VSX_TI 1 "int_reg_operand" "")
+                      (const_int 64)))
+   (set (match_operand:VSX_TI 2 "memory_operand" "")
+       (rotate:VSX_TI (match_dup 0)
+                      (const_int 64)))]
+  "!BYTES_BIG_ENDIAN && TARGET_VSX && TARGET_VSX_TIMODE && !TARGET_P9_VECTOR
+   && peep2_reg_dead_p (2, operands[0])"
+   [(set (match_dup 2) (match_dup 1))])
+
 ;; Peephole to catch memory to memory transfers for TImode if TImode landed in
 ;; VSX registers on a little endian system.  The vector types and IEEE 128-bit
 ;; floating point are handled by the more generic swap elimination pass.
Index: gcc/testsuite/gcc.target/powerpc/pr72804.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr72804.c  (nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/pr72804.c  (working copy)
@@ -0,0 +1,25 @@
+/* { dg-do compile { target { powerpc64*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+__int128_t
+foo (__int128_t *src)
+{
+  return ~*src;
+}
+
+void
+bar (__int128_t *dst, __int128_t src)
+{
+  *dst =  ~src;
+}
+
+/* { dg-final { scan-assembler-times "not " 4 } } */
+/* { dg-final { scan-assembler-times "std " 2 } } */
+/* { dg-final { scan-assembler-times "ld " 2 } } */
+/* { dg-final { scan-assembler-not "lxvd2x" } } */
+/* { dg-final { scan-assembler-not "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+/* { dg-final { scan-assembler-not "mfvsrd" } } */
+/* { dg-final { scan-assembler-not "mfvsrd" } } */

Reply via email to