This patch avoids generating vblendps with ymm16+ registers: vblendps only has a VEX encoding, so it cannot address the EVEX-only registers. Bootstrapped and tested on x86_64-pc-linux-gnu{-m32,-m64}. Ok for trunk?
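For reference, a condensed version of the new gcc.target/i386/pr112435-1.c test (below) shows the problematic situation: one shuffle input is pinned to an EVEX-only register while the insn template prints vblendps. The comments describe the intended before/after behavior; the exact registers chosen for the other operands will of course depend on register allocation.

/* Compile with the test's options, e.g. -Ofast -march=sapphirerapids.  */
#include <x86intrin.h>

__m256i
f (__m256i a, __m256i b)
{
  /* Force one input into an EVEX-only register.  */
  register __m256i t __asm__ ("ymm17") = a;
  asm ("" : "+v" (t));
  /* Without the check added by this patch, this shuffle could be
     output as vblendps, which cannot encode ymm17; with the check,
     the pattern falls back to vshufi32x4.  */
  return _mm256_shuffle_i32x4 (t, b, 2);
}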
gcc/ChangeLog:

	PR target/112435
	* config/i386/sse.md: Don't emit vblendps when an EVEX-only
	SSE register is mentioned in the operands.

gcc/testsuite/ChangeLog:

	PR target/112435
	* gcc.target/i386/pr112435-1.c: New test.
	* gcc.target/i386/pr112435-2.c: Ditto.
	* gcc.target/i386/pr112435-3.c: Ditto.
---
 gcc/config/i386/sse.md                     | 28 +++++---
 gcc/testsuite/gcc.target/i386/pr112435-1.c | 14 ++++
 gcc/testsuite/gcc.target/i386/pr112435-2.c | 64 ++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr112435-3.c | 79 ++++++++++++++++++++++
 4 files changed, 175 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112435-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112435-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112435-3.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 33198756bb0..666f931c88d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -19254,7 +19254,8 @@
   mask = INTVAL (operands[3]) / 2;
   mask |= (INTVAL (operands[5]) - 4) / 2 << 1;
   operands[3] = GEN_INT (mask);
-  if (INTVAL (operands[3]) == 2 && !<mask_applied>)
+  if (INTVAL (operands[3]) == 2 && !<mask_applied>
+      && !x86_evex_reg_mentioned_p (operands, 3))
     return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
   return "vshuf<shuffletype>64x2\t{%3, %2, %1, %0<mask_operand7>|%0<mask_operand7>, %1, %2, %3}";
 }
@@ -19414,7 +19415,8 @@
   mask |= (INTVAL (operands[7]) - 8) / 4 << 1;
   operands[3] = GEN_INT (mask);
 
-  if (INTVAL (operands[3]) == 2 && !<mask_applied>)
+  if (INTVAL (operands[3]) == 2 && !<mask_applied>
+      && !x86_evex_reg_mentioned_p (operands, 3))
     return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
 
   return "vshuf<shuffletype>32x4\t{%3, %2, %1, %0<mask_operand11>|%0<mask_operand11>, %1, %2, %3}";
@@ -26776,10 +26778,13 @@
       else
 	return "vmovaps\t{%2, %0|%0, %2}";
     }
-  if ((mask & 0xbb) == 18)
-    return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
-  if ((mask & 0xbb) == 48)
-    return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+  if (!x86_evex_reg_mentioned_p (operands, 3))
+    {
+      if ((mask & 0xbb) == 18)
+	return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+      if ((mask & 0xbb) == 48)
+	return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+    }
   return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}";
 }
   [(set_attr "type" "sselog")
@@ -27433,10 +27438,13 @@
    && avx_vperm2f128_parallel (operands[3], <MODE>mode)"
 {
   int mask = avx_vperm2f128_parallel (operands[3], <MODE>mode) - 1;
-  if ((mask & 0xbb) == 0x12)
-    return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
-  if ((mask & 0xbb) == 0x30)
-    return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+  if (!x86_evex_reg_mentioned_p (operands, 3))
+    {
+      if ((mask & 0xbb) == 0x12)
+	return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+      if ((mask & 0xbb) == 0x30)
+	return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+    }
   if ((mask & 0xbb) == 0x20)
     return "vinsert<i128>\t{$1, %x2, %1, %0|%0, %1, %x2, 1}";
   operands[3] = GEN_INT (mask);
diff --git a/gcc/testsuite/gcc.target/i386/pr112435-1.c b/gcc/testsuite/gcc.target/i386/pr112435-1.c
new file mode 100644
index 00000000000..ff56523b4e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112435-1.c
@@ -0,0 +1,14 @@
+/* PR target/112435 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-Ofast -march=sapphirerapids" } */
+/* { dg-final { scan-assembler-not "vblendps" } } */
+
+#include<x86intrin.h>
+
+__m256i
+f(__m256i a, __m256i b)
+{
+  register __m256i t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_shuffle_i32x4 (t, b, 2);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr112435-2.c b/gcc/testsuite/gcc.target/i386/pr112435-2.c
new file mode 100644
index 00000000000..27ba80b1e68
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112435-2.c
@@ -0,0 +1,64 @@
+/* PR target/112435 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-Ofast -march=sapphirerapids" } */
+/* { dg-final { scan-assembler-not "vblendps.*ymm17\$" } } */
+
+#include<x86intrin.h>
+
+/* Vpermi128/Vpermf128 */
+__m256i
+perm0 (__m256i a, __m256i b)
+{
+  register __m256i t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_permute2x128_si256 (t, b, 50);
+}
+
+__m256i
+perm1 (__m256i a, __m256i b)
+{
+  register __m256i t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_permute2x128_si256 (t, b, 18);
+}
+
+__m256i
+perm2 (__m256i a, __m256i b)
+{
+  register __m256i t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_permute2x128_si256 (t, b, 48);
+}
+
+/* vshuf{i,f}{32x4,64x2} ymm.  */
+__m256i
+shuff0 (__m256i a, __m256i b)
+{
+  register __m256i t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_shuffle_i32x4(t, b, 2);
+}
+
+__m256
+shuff1 (__m256 a, __m256 b)
+{
+  register __m256 t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_shuffle_f32x4(t, b, 2);
+}
+
+__m256i
+shuff2 (__m256i a, __m256i b)
+{
+  register __m256i t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_shuffle_i64x2(t, b, 2);
+}
+
+__m256d
+shuff3 (__m256d a, __m256d b)
+{
+  register __m256d t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_shuffle_f64x2(t, b, 2);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr112435-3.c b/gcc/testsuite/gcc.target/i386/pr112435-3.c
new file mode 100644
index 00000000000..f39820d4f37
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112435-3.c
@@ -0,0 +1,79 @@
+/* PR target/112435 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-Ofast -march=sapphirerapids" } */
+/* { dg-final { scan-assembler-not "vblendps.*ymm17\$" } } */
+
+#include<x86intrin.h>
+
+/* Vpermf128 */
+__m256
+perm0 (__m256 a, __m256 b)
+{
+  register __m256 t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_permute2f128_ps (t, b, 50);
+}
+
+__m256
+perm1 (__m256 a, __m256 b)
+{
+  register __m256 t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_permute2f128_ps (t, b, 18);
+}
+
+__m256
+perm2 (__m256 a, __m256 b)
+{
+  register __m256 t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_permute2f128_ps (t, b, 48);
+}
+
+__m256i
+perm3 (__m256i a, __m256i b)
+{
+  register __m256i t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_permute2f128_si256 (t, b, 50);
+}
+
+__m256i
+perm4 (__m256i a, __m256i b)
+{
+  register __m256i t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_permute2f128_si256 (t, b, 18);
+}
+
+__m256i
+perm5 (__m256i a, __m256i b)
+{
+  register __m256i t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_permute2f128_si256 (t, b, 48);
+}
+
+__m256d
+perm6 (__m256d a, __m256d b)
+{
+  register __m256d t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_permute2f128_pd (t, b, 50);
+}
+
+__m256d
+perm7 (__m256d a, __m256d b)
+{
+  register __m256d t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_permute2f128_pd (t, b, 18);
+}
+
+__m256d
+perm8 (__m256d a, __m256d b)
+{
+  register __m256d t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_permute2f128_pd (t, b, 48);
+}
-- 
2.31.1