On Wed, Aug 10, 2016 at 9:26 AM, H.J. Lu <hjl.to...@gmail.com> wrote:
> On Wed, Aug 10, 2016 at 8:55 AM, Uros Bizjak <ubiz...@gmail.com> wrote:
>> On Wed, Aug 10, 2016 at 5:32 PM, H.J. Lu <hongjiu...@intel.com> wrote:
>>> Use TImode for piecewise move in 64-bit mode.  When vector register
>>> is used for piecewise move, we don't increase stack_alignment_needed
>>> since vector register spill isn't required for piecewise move.  Since
>>> stack_realign_needed is set to true by checking stack_alignment_estimated
>>> set by pseudo vector register usage, we also need to check
>>> stack_realign_needed to eliminate frame pointer.
>>
>> Why only in 64-bit mode? We can use SSE moves also in 32-bit mode.
>
> I will extend it to 32-bit mode.

It doesn't work in 32-bit mode due to

#define MAX_FIXED_MODE_SIZE GET_MODE_BITSIZE (TARGET_64BIT ? TImode : DImode):

/export/build/gnu/gcc/build-x86_64-linux/gcc/xgcc
-B/export/build/gnu/gcc/build-x86_64-linux/gcc/ -O2
-fno-asynchronous-unwind-tables -m32 -S -o x.s x.i
x.i: In function ‘foo’:
x.i:6:10: internal compiler error: in by_pieces_ninsns, at expr.c:799
   return __builtin_mempcpy (dst, src, 32);
          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0xa5a5a5 by_pieces_ninsns(unsigned long, unsigned int, unsigned int,
by_pieces_operation)
/export/gnu/import/git/sources/gcc/gcc/expr.c:799
0xe88e57 default_use_by_pieces_infrastructure_p(unsigned long,
unsigned int, by_pieces_operation, bool)
/export/gnu/import/git/sources/gcc/gcc/targhooks.c:1516
0xa5a3c2 can_do_by_pieces
/export/gnu/import/git/sources/gcc/gcc/expr.c:739
0xa5a3ee can_move_by_pieces(unsigned long, unsigned int)
/export/gnu/import/git/sources/gcc/gcc/expr.c:749
0x8d85c8 expand_builtin_mempcpy_args
/export/gnu/import/git/sources/gcc/gcc/builtins.c:3152
0x8d8092 expand_builtin_mempcpy
/export/gnu/import/git/sources/gcc/gcc/builtins.c:3044
0x8e1bf1 expand_builtin(tree_node*, rtx_def*, rtx_def*, machine_mode, int)
/export/gnu/import/git/sources/gcc/gcc/builtins.c:6146
0xa7ca9e expand_expr_real_1(tree_node*, rtx_def*, machine_mode,
expand_modifier, rtx_def**, bool)
/export/gnu/import/git/sources/gcc/gcc/expr.c:10733
0xa70d9d expand_expr_real(tree_node*, rtx_def*, machine_mode,
expand_modifier, rtx_def**, bool)
/export/gnu/import/git/sources/gcc/gcc/expr.c:8088
0xa67c2b store_expr_with_bounds(tree_node*, rtx_def*, int, bool, bool,
tree_node*)
/export/gnu/import/git/sources/gcc/gcc/expr.c:5547
0xa6683b expand_assignment(tree_node*, tree_node*, bool)
/export/gnu/import/git/sources/gcc/gcc/expr.c:5316
0x917582 expand_call_stmt
/export/gnu/import/git/sources/gcc/gcc/cfgexpand.c:2665
0x91a7b2 expand_gimple_stmt_1
/export/gnu/import/git/sources/gcc/gcc/cfgexpand.c:3580
0x91aea8 expand_gimple_stmt
/export/gnu/import/git/sources/gcc/gcc/cfgexpand.c:3746
0x91afa7 expand_gimple_tailcall
/export/gnu/import/git/sources/gcc/gcc/cfgexpand.c:3793
0x92265e expand_gimple_basic_block
/export/gnu/import/git/sources/gcc/gcc/cfgexpand.c:5730
0x92429d execute
/export/gnu/import/git/sources/gcc/gcc/cfgexpand.c:6367
Please submit a full bug report,
with preprocessed source if appropriate.
Please include the complete backtrace with any bug report.
See <http://gcc.gnu.org/bugs.html> for instructions.
Makefile:23: recipe for target 'x.s' failed
make: *** [x.s] Error 1

I tried to fix it:

https://gcc.gnu.org/ml/gcc-patches/2016-04/msg01506.html

But there is a concern for simplify_immed_subreg:

https://gcc.gnu.org/ml/gcc-patches/2016-04/msg01527.html

>> I don't think we can handle crtl->stack_realign_needed in this way. If
>> there are other insns with 32byte vector registers in use in the same
>> (large) function as converted __builtin_memcpy, we will *still* need
>> realigned stack.
>
> We don't align stack in leaf functions using SSE/AVX/AVX512 vector
> registers if there is no spill when __builtin_memcpy isn't optimized with
> SSE load/store.  My change just extends it to __builtin_memcpy optimized
> with SSE load/store with or without 32byte vector registers in use.
>
>> Uros.
>>
>>
>>> Tested on x86-64.  OK for trunk?
>>>
>>> H.J.
>>> ---
>>> gcc/
>>>
>>>         * config/i386/i386.c (ix86_finalize_stack_realign_flags): Also
>>>         check stack_realign_needed for stack realignment.
>>>         * config/i386/i386.h (MOVE_MAX_PIECES): Set to 16 in 64-bit mode
>>>         if unaligned SSE load and store are optimal.
>>>
>>> gcc/testsuite/
>>>
>>>         * gcc.target/i386/pieces-memcpy-1.c: New test.
>>>         * gcc.target/i386/pieces-memcpy-2.c: Likewise.
>>>         * gcc.target/i386/pieces-memcpy-3.c: Likewise.
>>>         * gcc.target/i386/pieces-memcpy-4.c: Likewise.
>>>         * gcc.target/i386/pieces-memcpy-5.c: Likewise.
>>>         * gcc.target/i386/pieces-memcpy-6.c: Likewise.
>>> ---
>>>  gcc/config/i386/i386.c                          | 11 +++++++++--
>>>  gcc/config/i386/i386.h                          |  6 +++++-
>>>  gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c | 17 +++++++++++++++++
>>>  gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c | 17 +++++++++++++++++
>>>  gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c | 17 +++++++++++++++++
>>>  gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c | 17 +++++++++++++++++
>>>  gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c | 17 +++++++++++++++++
>>>  gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c | 17 +++++++++++++++++
>>>  8 files changed, 116 insertions(+), 3 deletions(-)
>>>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c
>>>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c
>>>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c
>>>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c
>>>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c
>>>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
>>>
>>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>>> index 93eaab1..60dc160 100644
>>> --- a/gcc/config/i386/i386.c
>>> +++ b/gcc/config/i386/i386.c
>>> @@ -13286,8 +13286,15 @@ ix86_finalize_stack_realign_flags (void)
>>>    /* If the only reason for frame_pointer_needed is that we conservatively
>>>       assumed stack realignment might be needed, but in the end nothing that
>>>       needed the stack alignment had been spilled, clear 
>>> frame_pointer_needed
>>> -     and say we don't need stack realignment.  */
>>> -  if (stack_realign
>>> +     and say we don't need stack realignment.
>>> +
>>> +     When vector register is used for piecewise move and store, we don't
>>> +     increase stack_alignment_needed as there is no register spill for
>>> +     piecewise move and store.  Since stack_realign_needed is set to true
>>> +     by checking stack_alignment_estimated which is updated by pseudo
>>> +     vector register usage, we also need to check stack_realign_needed to
>>> +     eliminate frame pointer.  */
>>> +  if ((stack_realign || crtl->stack_realign_needed)
>>>        && frame_pointer_needed
>>>        && crtl->is_leaf
>>>        && flag_omit_frame_pointer
>>> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
>>> index 9b66264..24db855 100644
>>> --- a/gcc/config/i386/i386.h
>>> +++ b/gcc/config/i386/i386.h
>>> @@ -1951,7 +1951,11 @@ typedef struct ix86_args {
>>>  /* MOVE_MAX_PIECES is the number of bytes at a time which we can
>>>     move efficiently, as opposed to  MOVE_MAX which is the maximum
>>>     number of bytes we can move with a single instruction.  */
>>> -#define MOVE_MAX_PIECES UNITS_PER_WORD
>>> +#define MOVE_MAX_PIECES \
>>> +  ((TARGET_64BIT \
>>> +    && TARGET_SSE2 \
>>> +    && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
>>> +    && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) ? 16 : UNITS_PER_WORD)
>>>
>>>  /* If a memory-to-memory move would take MOVE_RATIO or more simple
>>>     move-instruction pairs, we will do a movmem or libcall instead.
>>> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c 
>>> b/gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c
>>> new file mode 100644
>>> index 0000000..adc0aa8
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c
>>> @@ -0,0 +1,17 @@
>>> +/* { dg-do compile { target { ! ia32 } } } */
>>> +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
>>> +
>>> +extern char *dst, *src;
>>> +
>>> +void
>>> +foo (void)
>>> +{
>>> +  __builtin_memcpy (dst, src, 64);
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 4 } } */
>>> +/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 4 } } */
>>> +/* No need to dynamically realign the stack here.  */
>>> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
>>> +/* Nor use a frame pointer.  */
>>> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
>>> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c 
>>> b/gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c
>>> new file mode 100644
>>> index 0000000..c52c1d9
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c
>>> @@ -0,0 +1,17 @@
>>> +/* { dg-do compile { target { ! ia32 } } } */
>>> +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
>>> +
>>> +extern char *dst, *src;
>>> +
>>> +void
>>> +foo (void)
>>> +{
>>> +  __builtin_memcpy (dst, src, 33);
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
>>> +/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 2 } } */
>>> +/* No need to dynamically realign the stack here.  */
>>> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
>>> +/* Nor use a frame pointer.  */
>>> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
>>> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c 
>>> b/gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c
>>> new file mode 100644
>>> index 0000000..c532bbd
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c
>>> @@ -0,0 +1,17 @@
>>> +/* { dg-do compile { target { ! ia32 } } } */
>>> +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
>>> +
>>> +extern char *dst, *src;
>>> +
>>> +void
>>> +foo (void)
>>> +{
>>> +  __builtin_memcpy (dst, src, 17);
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
>>> +/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 1 } } */
>>> +/* No need to dynamically realign the stack here.  */
>>> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
>>> +/* Nor use a frame pointer.  */
>>> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
>>> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c 
>>> b/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c
>>> new file mode 100644
>>> index 0000000..4ef763d
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c
>>> @@ -0,0 +1,17 @@
>>> +/* { dg-do compile { target { ! ia32 } } } */
>>> +/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */
>>> +
>>> +extern char *dst, *src;
>>> +
>>> +void
>>> +foo (void)
>>> +{
>>> +  __builtin_memcpy (dst, src, 18);
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } 
>>> */
>>> +/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 1 } } 
>>> */
>>> +/* No need to dynamically realign the stack here.  */
>>> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
>>> +/* Nor use a frame pointer.  */
>>> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
>>> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c 
>>> b/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c
>>> new file mode 100644
>>> index 0000000..2687560
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c
>>> @@ -0,0 +1,17 @@
>>> +/* { dg-do compile { target { ! ia32 } } } */
>>> +/* { dg-options "-O2 -mavx512f -mtune=generic" } */
>>> +
>>> +extern char *dst, *src;
>>> +
>>> +void
>>> +foo (void)
>>> +{
>>> +  __builtin_memcpy (dst, src, 19);
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } 
>>> */
>>> +/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 1 } } 
>>> */
>>> +/* No need to dynamically realign the stack here.  */
>>> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
>>> +/* Nor use a frame pointer.  */
>>> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
>>> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c 
>>> b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
>>> new file mode 100644
>>> index 0000000..a205f83
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
>>> @@ -0,0 +1,17 @@
>>> +/* { dg-do compile { target { ! ia32 } } } */
>>> +/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */
>>> +
>>> +extern char *dst, *src;
>>> +
>>> +void
>>> +foo (void)
>>> +{
>>> +  __builtin_memcpy (dst, src, 33);
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } 
>>> */
>>> +/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 2 } } 
>>> */
>>> +/* No need to dynamically realign the stack here.  */
>>> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
>>> +/* Nor use a frame pointer.  */
>>> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
>>> --
>>> 2.7.4
>>>
>
>
>
> --
> H.J.



-- 
H.J.

Reply via email to