The following patch fixes a performance issue when loading, storing, or moving TImode values with -mvsx-timode -mcpu=power7 and LRA. The problem is that the vsx_le_permute_<mode> and vsx_le_perm_{load,store}_<mode> patterns do not support TImode values in GPRs, and LRA is using these patterns to fix up constraints, which ends up leading to really bad code generation, as seen in the test cases in the bug report.
This patch fixes the bug by adding GPR support to the above patterns, as well as a couple of peepholes that improve the code for loads and stores to/from GPRs. It passed bootstrap and regression testing with no regressions, and Mike ran it on SPEC2006 and found no performance regressions. Ok for trunk? Do we want this on the GCC 7 branch, where LRA is on by default? Peter gcc/ * config/rs6000/vsx.md (*vsx_le_permute_<mode>): Add support for operands residing in integer registers. (*vsx_le_perm_load_<mode>): Likewise. (*vsx_le_perm_store_<mode>): Likewise. (define_peephole2): Add peepholes to optimize the above. gcc/testsuite/ * gcc.target/powerpc/pr72804.c: New test. Index: gcc/config/rs6000/vsx.md =================================================================== --- gcc/config/rs6000/vsx.md (revision 250918) +++ gcc/config/rs6000/vsx.md (working copy) @@ -759,17 +759,20 @@ (define_split ;; special V1TI container class, which it is not appropriate to use vec_select ;; for the type. (define_insn "*vsx_le_permute_<mode>" - [(set (match_operand:VSX_TI 0 "nonimmediate_operand" "=<VSa>,<VSa>,Z") + [(set (match_operand:VSX_TI 0 "nonimmediate_operand" "=<VSa>,<VSa>,Z,&r,&r,Q") (rotate:VSX_TI - (match_operand:VSX_TI 1 "input_operand" "<VSa>,Z,<VSa>") + (match_operand:VSX_TI 1 "input_operand" "<VSa>,Z,<VSa>,r,Q,r") (const_int 64)))] "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR" "@ xxpermdi %x0,%x1,%x1,2 lxvd2x %x0,%y1 - stxvd2x %x1,%y0" - [(set_attr "length" "4") - (set_attr "type" "vecperm,vecload,vecstore")]) + stxvd2x %x1,%y0 + mr %0,%L1; mr %L0,%1 + ld%U1%X1 %0,%L1; ld%U1%X1 %L0,%1 + std%U0%X0 %L1,%0; std%U0%X0 %1,%L0" + [(set_attr "length" "4,4,4,8,8,8") + (set_attr "type" "vecperm,vecload,vecstore,*,load,store")]) (define_insn_and_split "*vsx_le_undo_permute_<mode>" [(set (match_operand:VSX_TI 0 "vsx_register_operand" "=<VSa>,<VSa>") @@ -795,10 +798,12 @@ (define_insn_and_split "*vsx_le_undo_per (set_attr "type" "veclogical")]) (define_insn_and_split 
"*vsx_le_perm_load_<mode>" - [(set (match_operand:VSX_LE_128 0 "vsx_register_operand" "=<VSa>") - (match_operand:VSX_LE_128 1 "memory_operand" "Z"))] + [(set (match_operand:VSX_LE_128 0 "vsx_register_operand" "=<VSa>,r") + (match_operand:VSX_LE_128 1 "memory_operand" "Z,Q"))] "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR" - "#" + "@ + # + #" "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR" [(const_int 0)] " @@ -811,16 +816,18 @@ (define_insn_and_split "*vsx_le_perm_loa DONE; } " - [(set_attr "type" "vecload") - (set_attr "length" "8")]) + [(set_attr "type" "vecload,load") + (set_attr "length" "8,8")]) (define_insn "*vsx_le_perm_store_<mode>" - [(set (match_operand:VSX_LE_128 0 "memory_operand" "=Z") - (match_operand:VSX_LE_128 1 "vsx_register_operand" "+<VSa>"))] + [(set (match_operand:VSX_LE_128 0 "memory_operand" "=Z,Q") + (match_operand:VSX_LE_128 1 "vsx_register_operand" "+<VSa>,r"))] "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR" - "#" - [(set_attr "type" "vecstore") - (set_attr "length" "12")]) + "@ + # + #" + [(set_attr "type" "vecstore,store") + (set_attr "length" "12,8")]) (define_split [(set (match_operand:VSX_LE_128 0 "memory_operand" "") @@ -836,6 +843,31 @@ (define_split DONE; }) +;; Peepholes to catch loads and stores for TImode if TImode landed in +;; GPR registers on a little endian system. 
+(define_peephole2 + [(set (match_operand:VSX_TI 0 "int_reg_operand" "") + (rotate:VSX_TI (match_operand:VSX_TI 1 "memory_operand" "") + (const_int 64))) + (set (match_operand:VSX_TI 2 "int_reg_operand" "") + (rotate:VSX_TI (match_dup 0) + (const_int 64)))] + "!BYTES_BIG_ENDIAN && TARGET_VSX && TARGET_VSX_TIMODE && !TARGET_P9_VECTOR + && (rtx_equal_p (operands[0], operands[2]) + || peep2_reg_dead_p (2, operands[0]))" + [(set (match_dup 2) (match_dup 1))]) + +(define_peephole2 + [(set (match_operand:VSX_TI 0 "int_reg_operand" "") + (rotate:VSX_TI (match_operand:VSX_TI 1 "int_reg_operand" "") + (const_int 64))) + (set (match_operand:VSX_TI 2 "memory_operand" "") + (rotate:VSX_TI (match_dup 0) + (const_int 64)))] + "!BYTES_BIG_ENDIAN && TARGET_VSX && TARGET_VSX_TIMODE && !TARGET_P9_VECTOR + && peep2_reg_dead_p (2, operands[0])" + [(set (match_dup 2) (match_dup 1))]) + ;; Peephole to catch memory to memory transfers for TImode if TImode landed in ;; VSX registers on a little endian system. The vector types and IEEE 128-bit ;; floating point are handled by the more generic swap elimination pass. 
Index: gcc/testsuite/gcc.target/powerpc/pr72804.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/pr72804.c (nonexistent) +++ gcc/testsuite/gcc.target/powerpc/pr72804.c (working copy) @@ -0,0 +1,25 @@ +/* { dg-do compile { target { powerpc64*-*-* } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } } */ +/* { dg-require-effective-target powerpc_vsx_ok } */ +/* { dg-options "-O2 -mvsx" } */ + +__int128_t +foo (__int128_t *src) +{ + return ~*src; +} + +void +bar (__int128_t *dst, __int128_t src) +{ + *dst = ~src; +} + +/* { dg-final { scan-assembler-times "not " 4 } } */ +/* { dg-final { scan-assembler-times "std " 2 } } */ +/* { dg-final { scan-assembler-times "ld " 2 } } */ +/* { dg-final { scan-assembler-not "lxvd2x" } } */ +/* { dg-final { scan-assembler-not "stxvd2x" } } */ +/* { dg-final { scan-assembler-not "xxpermdi" } } */ +/* { dg-final { scan-assembler-not "mfvsrd" } } */ +/* { dg-final { scan-assembler-not "mfvsrd" } } */