Hi, The patch 2 enables 16-byte by pieces move on rs6000. This patch fixes the regression cases caused by previous patch. For sra-17/18, the long array with 4 elements can be loaded by one 16-byte by pieces move on 32-bit platform. So the array is not be constructed in LC0 and SRA optimization is unable to be taken. "no-vsx" option is added for 32-bit platform, as it sets the MOVE_MAX_PIECES to 4-byte on 32-bit platform and the array can't be loaded by one by pieces move.
Another regression is on P8 LE. The 16-byte memory to memory is implemented by two TImode load/store. The TImode load/store is finally split to two DImode load/store on P8 LE as it doesn't have unaligned vector load/store instructions. Actually, 16-byte memory to memory move can be implement by two V2DI reversed load/store on P8 LE. The patch creates a insn_and_split pattern for this optimization. Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no regressions. Is this OK for trunk? Thanks Gui Haochen ChangeLog rs6000: Enable 16-byte by pieces move This patch enables 16-byte by pieces move. The 16-byte move is generated with TImode and finally implemented by vector instructions. There are several regression cases after the enablement. 16-byte TImode memory to memory move is originally implemented by two pairs of DImode load/store on P8 LE as there is no unalignment vsx load/store on it. The patch fixes the problem by creating an insn_and_split pattern and converts it to one pair of reversed load/store. Two SRA cases lost the SRA optimization as the array can be loaded by one 16-byte move so that not be initialized in LC0 on 32-bit platform. So fixes them by adding no-vsx option. gcc/ PR target/111449 * config/rs6000/vsx.md (*vsx_le_mem_to_mem_mov_ti): New. gcc/testsuite/ PR target/111449 * gcc.dg/tree-ssa/sra-17.c: Add no-vsx option for powerpc ilp32. * gcc.dg/tree-ssa/sra-18.c: Likewise. * gcc.target/powerpc/pr111449-1.c: New. patch.diff diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index f3b40229094..9f6bc49998a 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -414,6 +414,27 @@ (define_mode_attr VM3_char [(V2DI "d") ;; VSX moves +;; TImode memory to memory move optimization on LE with p8vector +(define_insn_and_split "*vsx_le_mem_to_mem_mov_ti" + [(set (match_operand:TI 0 "indexed_or_indirect_operand" "=Z") + (match_operand:TI 1 "indexed_or_indirect_operand" "Z"))] + "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR + && !MEM_VOLATILE_P (operands[0]) + && !MEM_VOLATILE_P (operands[1]) + && !reload_completed" + "#" + "&& 1" + [(const_int 0)] +{ + rtx tmp = gen_reg_rtx (V2DImode); + rtx src = adjust_address (operands[1], V2DImode, 0); + emit_insn (gen_vsx_ld_elemrev_v2di (tmp, src)); + rtx dest = adjust_address (operands[0], V2DImode, 0); + emit_insn (gen_vsx_st_elemrev_v2di (dest, tmp)); + DONE; +} + [(set_attr "length" "16")]) + ;; The patterns for LE permuted loads and stores come before the general ;; VSX moves so they match first. (define_insn_and_split "*vsx_le_perm_load_<mode>" diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-17.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-17.c index 221d96b6cd9..36d72c9256b 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/sra-17.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-17.c @@ -1,6 +1,7 @@ /* { dg-do run { target { aarch64*-*-* alpha*-*-* arm*-*-* hppa*-*-* powerpc*-*-* s390*-*-* } } } */ /* { dg-options "-O2 -fdump-tree-esra --param sra-max-scalarization-size-Ospeed=32" } */ /* { dg-additional-options "-mcpu=ev4" { target alpha*-*-* } } */ +/* { dg-additional-options "-mno-vsx" { target powerpc*-*-* && ilp32 } } */ extern void abort (void); diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-18.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-18.c index f5e6a21c2ae..3682a9a8c29 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/sra-18.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-18.c @@ -1,6 +1,7 @@ /* { dg-do run { target { aarch64*-*-* alpha*-*-* arm*-*-* hppa*-*-* powerpc*-*-* s390*-*-* } } } */ /* { dg-options "-O2 -fdump-tree-esra --param sra-max-scalarization-size-Ospeed=32" } */ /* { dg-additional-options "-mcpu=ev4" { target alpha*-*-* } } */ +/* { dg-additional-options "-mno-vsx" { target powerpc*-*-* && ilp32 } } */ extern void abort (void); struct foo { long x; }; diff --git a/gcc/testsuite/gcc.target/powerpc/pr111449-2.c b/gcc/testsuite/gcc.target/powerpc/pr111449-2.c new file mode 100644 index 00000000000..7003bdc0208 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr111449-2.c @@ -0,0 +1,18 @@ +/* { dg-do compile { target { has_arch_pwr8 } } } */ +/* { dg-require-effective-target powerpc_p8vector_ok } */ +/* { dg-options "-mvsx -O2" } */ + +/* Ensure 16-byte by pieces move is enabled. */ + +void move1 (void *s1, void *s2) +{ + __builtin_memcpy (s1, s2, 16); +} + +void move2 (void *s1) +{ + __builtin_memcpy (s1, "0123456789012345", 16); +} + +/* { dg-final { scan-assembler-times {\mlxvd2x\M|\mp?lxv\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mstxvd2x\M|\mstxv\M} 2 } } */