https://gcc.gnu.org/g:050b1708ea532ea4840e97d85fad4ca63d4cd631
commit r16-1588-g050b1708ea532ea4840e97d85fad4ca63d4cd631 Author: H.J. Lu <hjl.to...@gmail.com> Date: Thu Jun 19 05:03:48 2025 +0800 x86: Get the widest vector mode from MOVE_MAX Since MOVE_MAX defines the maximum number of bytes that an instruction can move quickly between memory and registers, use it to get the widest vector mode in vector loop when inlining memcpy and memset. gcc/ PR target/120708 * config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): Use MOVE_MAX to get the widest vector mode in vector loop. gcc/testsuite/ PR target/120708 * gcc.target/i386/memcpy-pr120708-1.c: New test. * gcc.target/i386/memcpy-pr120708-2.c: Likewise. * gcc.target/i386/memcpy-pr120708-3.c: Likewise. * gcc.target/i386/memcpy-pr120708-4.c: Likewise. * gcc.target/i386/memcpy-pr120708-5.c: Likewise. * gcc.target/i386/memcpy-pr120708-6.c: Likewise. * gcc.target/i386/memset-pr120708-1.c: Likewise. * gcc.target/i386/memset-pr120708-2.c: Likewise. * gcc.target/i386/memcpy-strategy-1.c: Drop dg-skip-if. Replace -march=atom with -mno-avx -msse2 -mtune=generic -mtune-ctrl=^sse_typeless_stores. * gcc.target/i386/memcpy-strategy-2.c: Likewise. * gcc.target/i386/memcpy-vector_loop-1.c: Likewise. * gcc.target/i386/memcpy-vector_loop-2.c: Likewise. * gcc.target/i386/memset-vector_loop-1.c: Likewise. * gcc.target/i386/memset-vector_loop-2.c: Likewise. Signed-off-by: H.J. Lu <hjl.to...@gmail.com> Diff: --- gcc/config/i386/i386-expand.cc | 31 +++++++--------------- gcc/testsuite/gcc.target/i386/memcpy-pr120708-1.c | 11 ++++++++ gcc/testsuite/gcc.target/i386/memcpy-pr120708-2.c | 11 ++++++++ gcc/testsuite/gcc.target/i386/memcpy-pr120708-3.c | 11 ++++++++ gcc/testsuite/gcc.target/i386/memcpy-pr120708-4.c | 11 ++++++++ gcc/testsuite/gcc.target/i386/memcpy-pr120708-5.c | 15 +++++++++++ gcc/testsuite/gcc.target/i386/memcpy-pr120708-6.c | 15 +++++++++++ gcc/testsuite/gcc.target/i386/memcpy-strategy-1.c | 3 +-- gcc/testsuite/gcc.target/i386/memcpy-strategy-2.c | 3 +-- .../gcc.target/i386/memcpy-vector_loop-1.c | 3 +-- .../gcc.target/i386/memcpy-vector_loop-2.c | 5 ++-- gcc/testsuite/gcc.target/i386/memset-pr120708-1.c | 10 +++++++ gcc/testsuite/gcc.target/i386/memset-pr120708-2.c | 10 +++++++ .../gcc.target/i386/memset-vector_loop-1.c | 3 +-- .../gcc.target/i386/memset-vector_loop-2.c | 3 +-- 15 files changed, 110 insertions(+), 35 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 4946f87a1317..423fc632003d 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -9351,7 +9351,6 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, bool need_zero_guard = false; bool noalign; machine_mode move_mode = VOIDmode; - machine_mode wider_mode; int unroll_factor = 1; /* TODO: Once value ranges are available, fill in proper data. */ unsigned HOST_WIDE_INT min_size = 0; @@ -9427,6 +9426,7 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, unroll_factor = 1; move_mode = word_mode; + int nunits; switch (alg) { case libcall: @@ -9447,27 +9447,14 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, case vector_loop: need_zero_guard = true; unroll_factor = 4; - /* Find the widest supported mode. */ - move_mode = word_mode; - while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode) - && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing) - move_mode = wider_mode; - - if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128) - move_mode = TImode; - if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256) - move_mode = OImode; - - /* Find the corresponding vector mode with the same size as MOVE_MODE. - MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */ - if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) - { - int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); - if (!mode_for_vector (word_mode, nunits).exists (&move_mode) - || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing) - move_mode = word_mode; - } - gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing); + /* Get the vector mode to move MOVE_MAX bytes. */ + nunits = MOVE_MAX / GET_MODE_SIZE (word_mode); + if (nunits > 1) + { + move_mode = mode_for_vector (word_mode, nunits).require (); + gcc_assert (optab_handler (mov_optab, move_mode) + != CODE_FOR_nothing); + } break; case rep_prefix_8_byte: move_mode = DImode; diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120708-1.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-1.c new file mode 100644 index 000000000000..d4fe2adc7ffd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-1.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ +/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-final { scan-assembler-not "movdqa" } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120708-2.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-2.c new file mode 100644 index 000000000000..9a6fcfd171b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-2.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ +/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-final { scan-assembler-not "movdqa" } } */ + +char *a; +char *b; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120708-3.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-3.c new file mode 100644 index 000000000000..010ac24d50f8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-3.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ +/* { dg-final { scan-assembler-not "movdqa" } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120708-4.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-4.c new file mode 100644 index 000000000000..87a58ef369a0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-4.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-not "movdqa" } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120708-5.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-5.c new file mode 100644 index 000000000000..19e060075cf6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-5.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mprefer-vector-width=128 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */ + +#define SIZE (16 + 1) * 16 + +char dest[SIZE]; +char src[SIZE]; + +void +foo (void) +{ + __builtin_memcpy (dest, src, SIZE); +} + +/* { dg-final { scan-assembler-times "vmovdqa\[ \t]\+\[^\n\r]*%xmm\[0-9\]\+" 10 } } */ diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120708-6.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-6.c new file mode 100644 index 000000000000..17b101f130ec --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-6.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mprefer-vector-width=256 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */ + +#define SIZE (16 + 1) * 32 + +char dest[SIZE]; +char src[SIZE]; + +void +foo (void) +{ + __builtin_memcpy (dest, src, SIZE); +} + +/* { dg-final { scan-assembler-times "vmovdqa\[ \t]\+\[^\n\r]*%ymm\[0-9\]\+" 10 } } */ diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-1.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-1.c index 6ac80c910533..b29867388928 100644 --- a/gcc/testsuite/gcc.target/i386/memcpy-strategy-1.c +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-1.c @@ -1,6 +1,5 @@ /* { dg-do compile } */ -/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ -/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -mtune-ctrl=^sse_typeless_stores -mmemcpy-strategy=vector_loop:-1:align" } */ /* { dg-final { scan-assembler-times "movdqa" 8 } } */ char a[2048]; diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-2.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-2.c index c103896a1106..18e260b0191a 100644 --- a/gcc/testsuite/gcc.target/i386/memcpy-strategy-2.c +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-2.c @@ -1,6 +1,5 @@ /* { dg-do compile } */ -/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ -/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -mtune-ctrl=^sse_typeless_stores -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ /* { dg-final { scan-assembler-times "movdqa" 8 } } */ char a[2048]; diff --git a/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-1.c b/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-1.c index 93f428acc859..cec8c90e5655 100644 --- a/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-1.c +++ b/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-1.c @@ -1,6 +1,5 @@ /* { dg-do compile } */ -/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ -/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -mtune-ctrl=^sse_typeless_stores -minline-all-stringops -mstringop-strategy=vector_loop" } */ /* { dg-final { scan-assembler-times "movdqa" 8 } } */ char a[2048]; diff --git a/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-2.c b/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-2.c index ab235401972f..314eb3d5b53e 100644 --- a/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-2.c +++ b/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-2.c @@ -1,7 +1,6 @@ /* { dg-do compile } */ -/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ -/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */ -/* { dg-final { scan-assembler-times "movdqa" 4} } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -mtune-ctrl=^sse_typeless_stores -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-final { scan-assembler-times "movdqa" 4 } } */ char *a; char *b; diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120708-1.c b/gcc/testsuite/gcc.target/i386/memset-pr120708-1.c new file mode 100644 index 000000000000..fba05883db5c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120708-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mprefer-vector-width=128 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 254); +} + +/* { dg-final { scan-assembler "vmovdqu\[ \t]\+%xmm\[0-9\]+, \\(\[^\n\r]*\\)" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120708-2.c b/gcc/testsuite/gcc.target/i386/memset-pr120708-2.c new file mode 100644 index 000000000000..d9a3e7ebc673 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120708-2.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mprefer-vector-width=256 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 254); +} + +/* { dg-final { scan-assembler "vmovdqu\[ \t]\+%ymm\[0-9\]+, \\(\[^\n\r]*\\)" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c b/gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c index d6fdc9819081..5bb30a844eab 100644 --- a/gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c +++ b/gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c @@ -1,6 +1,5 @@ /* { dg-do compile } */ -/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ -/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -mtune-ctrl=^sse_typeless_stores -minline-all-stringops -mstringop-strategy=vector_loop" } */ /* { dg-final { scan-assembler-times "movdqa" 4 } } */ char a[2048]; diff --git a/gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c b/gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c index bce8be0ffae2..6e31070ee86c 100644 --- a/gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c +++ b/gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c @@ -1,6 +1,5 @@ /* { dg-do compile } */ -/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ -/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -mtune-ctrl=^sse_typeless_stores -mstringop-strategy=vector_loop" } */ /* { dg-final { scan-assembler-times "movdqa" 4} } */ char *a;