Hi,

I previously added POWER9 support for lxvx and stxvx to replace the load-swap and swap-store patterns used on POWER8. However, I missed the fact that we have separate patterns for loads and stores of 128-bit floats and other scalars. This patch expands the previous POWER9 override to catch those cases as well, and disables the P8 swap patterns when P9 vector support is available.
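Just for illustration (this snippet and its names are mine, not part of the patch): on little-endian POWER8, a 128-bit float copy like the one below is implemented with a load-swap (lxvd2x + xxpermdi) and a swap-store (xxpermdi + stxvd2x); with -mcpu=power9 the endian-neutral lxvx/stxvx should be used instead, which is what the new test verifies for a full loop.

  /* Hypothetical example, not from the patch: a __float128 copy that
     exercises the 128-bit scalar load and store patterns.  */
  __float128 src, dst;

  void
  copy_f128 (void)
  {
    /* P8 LE: lxvd2x + xxpermdi to load, xxpermdi + stxvd2x to store.
       P9 LE: lxvx to load, stxvx to store.  */
    dst = src;
  }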
Bootstrapped and tested on powerpc64le-unknown-linux-gnu with no
regressions.  Ok for trunk?

Thanks,
Bill

[gcc]

2016-01-06  Bill Schmidt  <wschm...@linux.vnet.ibm.com>

	* config/rs6000/vsx.md (*p9_vecload_<mode>): Replace VSX_M mode
	iterator with VSX_M2.
	(*p9_vecstore_<mode>): Likewise.
	(*vsx_le_permute_<mode>): Restrict to !TARGET_P9_VECTOR.
	(*vsx_le_perm_load_<mode> for VSX_LE_128): Likewise.
	(*vsx_le_perm_store_<mode> for VSX_LE_128): Likewise.
	(define_split for VSX_LE_128 stores): Likewise.
	(define_peephole2 for TImode LE swaps): Likewise.
	(define_split for VSX_LE_128 post-reload stores): Likewise.

[gcc/testsuite]

2016-01-06  Bill Schmidt  <wschm...@linux.vnet.ibm.com>

	* gcc.target/powerpc/p9-lxvx-stxvx-3.c: New test.

Index: gcc/config/rs6000/vsx.md
===================================================================
--- gcc/config/rs6000/vsx.md	(revision 232077)
+++ gcc/config/rs6000/vsx.md	(working copy)
@@ -304,8 +304,8 @@
 
 ;; VSX (P9) moves
 (define_insn "*p9_vecload_<mode>"
-  [(set (match_operand:VSX_M 0 "vsx_register_operand" "=<VSa>")
-        (match_operand:VSX_M 1 "memory_operand" "Z"))]
+  [(set (match_operand:VSX_M2 0 "vsx_register_operand" "=<VSa>")
+        (match_operand:VSX_M2 1 "memory_operand" "Z"))]
   "TARGET_P9_VECTOR"
   "lxvx %x0,%y1"
   [(set_attr "type" "vecload")
@@ -312,8 +312,8 @@
    (set_attr "length" "4")])
 
 (define_insn "*p9_vecstore_<mode>"
-  [(set (match_operand:VSX_M 0 "memory_operand" "=Z")
-        (match_operand:VSX_M 1 "vsx_register_operand" "<VSa>"))]
+  [(set (match_operand:VSX_M2 0 "memory_operand" "=Z")
+        (match_operand:VSX_M2 1 "vsx_register_operand" "<VSa>"))]
   "TARGET_P9_VECTOR"
   "stxvx %x1,%y0"
   [(set_attr "type" "vecstore")
@@ -680,7 +680,7 @@
 	(rotate:VSX_LE_128
 	 (match_operand:VSX_LE_128 1 "input_operand" "<VSa>,Z,<VSa>")
 	 (const_int 64)))]
-  "!BYTES_BIG_ENDIAN && TARGET_VSX"
+  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
   "@
    xxpermdi %x0,%x1,%x1,2
    lxvd2x %x0,%y1
@@ -714,9 +714,9 @@
 (define_insn_and_split "*vsx_le_perm_load_<mode>"
   [(set (match_operand:VSX_LE_128 0 "vsx_register_operand" "=<VSa>")
         (match_operand:VSX_LE_128 1 "memory_operand" "Z"))]
-  "!BYTES_BIG_ENDIAN && TARGET_VSX"
+  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
   "#"
-  "!BYTES_BIG_ENDIAN && TARGET_VSX"
+  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
   [(set (match_dup 2)
 	(rotate:VSX_LE_128 (match_dup 1)
 			   (const_int 64)))
@@ -735,7 +735,7 @@
 (define_insn "*vsx_le_perm_store_<mode>"
   [(set (match_operand:VSX_LE_128 0 "memory_operand" "=Z")
         (match_operand:VSX_LE_128 1 "vsx_register_operand" "+<VSa>"))]
-  "!BYTES_BIG_ENDIAN && TARGET_VSX"
+  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
   "#"
   [(set_attr "type" "vecstore")
    (set_attr "length" "12")])
@@ -743,7 +743,7 @@
 (define_split
   [(set (match_operand:VSX_LE_128 0 "memory_operand" "")
         (match_operand:VSX_LE_128 1 "vsx_register_operand" ""))]
-  "!BYTES_BIG_ENDIAN && TARGET_VSX && !reload_completed"
+  "!BYTES_BIG_ENDIAN && TARGET_VSX && !reload_completed && !TARGET_P9_VECTOR"
   [(set (match_dup 2)
 	(rotate:VSX_LE_128 (match_dup 1)
 			   (const_int 64)))
@@ -765,7 +765,7 @@
    (set (match_operand:TI 2 "vsx_register_operand" "")
 	(rotate:TI (match_dup 0)
 		   (const_int 64)))]
-  "!BYTES_BIG_ENDIAN && TARGET_VSX && TARGET_VSX_TIMODE
+  "!BYTES_BIG_ENDIAN && TARGET_VSX && TARGET_VSX_TIMODE && !TARGET_P9_VECTOR
    && (rtx_equal_p (operands[0], operands[2])
        || peep2_reg_dead_p (2, operands[0]))"
   [(set (match_dup 2) (match_dup 1))])
@@ -775,7 +775,7 @@
 (define_split
   [(set (match_operand:VSX_LE_128 0 "memory_operand" "")
         (match_operand:VSX_LE_128 1 "vsx_register_operand" ""))]
-  "!BYTES_BIG_ENDIAN && TARGET_VSX && reload_completed"
+  "!BYTES_BIG_ENDIAN && TARGET_VSX && reload_completed && !TARGET_P9_VECTOR"
   [(set (match_dup 1)
 	(rotate:VSX_LE_128 (match_dup 1)
 			   (const_int 64)))
Index: gcc/testsuite/gcc.target/powerpc/p9-lxvx-stxvx-3.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/p9-lxvx-stxvx-3.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/p9-lxvx-stxvx-3.c	(working copy)
@@ -0,0 +1,30 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
+/* { dg-options "-mcpu=power9 -O3" } */
+/* { dg-final { scan-assembler "lxvx" } } */
+/* { dg-final { scan-assembler "stxvx" } } */
+/* { dg-final { scan-assembler-not "lxvd2x" } } */
+/* { dg-final { scan-assembler-not "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+/* Verify P9 vector loads and stores are used rather than the
+   load-swap/swap-store workarounds for P8.  */
+#define SIZE (16384/sizeof(__float128))
+
+static __float128 x[SIZE] __attribute__ ((aligned (16)));
+static __float128 y[SIZE] __attribute__ ((aligned (16)));
+static __float128 a;
+
+void obfuscate(void *a, ...);
+
+void __attribute__((noinline)) do_one(void)
+{
+  unsigned long i;
+
+  obfuscate(x, y, &a);
+
+  for (i = 0; i < SIZE; i++)
+    y[i] = a * x[i];
+
+  obfuscate(x, y, &a);
+}