On Tue, Apr 18, 2023 at 2:52 PM Hu, Lin1 via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > Hi, all > > The patch aims to optimize vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128. > And it has been regtested on x86_64-pc-linux-gnu. OK for trunk? Ok. > > Thanks. > Lin > > vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm are 3 clk. > We can optimize them to vblend, vmovaps when there's no cross-lane. > > gcc/ChangeLog: > > * config/i386/sse.md: Modify insn vperm{i,f} > and vshuf{i,f}. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/avx512vl-vshuff32x4-1.c: Modify test. > * gcc.target/i386/avx512vl-vshuff64x2-1.c: Ditto. > * gcc.target/i386/avx512vl-vshufi32x4-1.c: Ditto. > * gcc.target/i386/avx512vl-vshufi64x2-1.c: Ditto. > * gcc.target/i386/opt-vperm-vshuf-1.c: New test. > * gcc.target/i386/opt-vperm-vshuf-2.c: Ditto. > * gcc.target/i386/opt-vperm-vshuf-3.c: Ditto. > --- > gcc/config/i386/sse.md | 36 ++++++++-- > .../gcc.target/i386/avx512vl-vshuff32x4-1.c | 2 +- > .../gcc.target/i386/avx512vl-vshuff64x2-1.c | 2 +- > .../gcc.target/i386/avx512vl-vshufi32x4-1.c | 2 +- > .../gcc.target/i386/avx512vl-vshufi64x2-1.c | 2 +- > .../gcc.target/i386/opt-vperm-vshuf-1.c | 51 ++++++++++++++ > .../gcc.target/i386/opt-vperm-vshuf-2.c | 68 +++++++++++++++++++ > .../gcc.target/i386/opt-vperm-vshuf-3.c | 63 +++++++++++++++++ > 8 files changed, 218 insertions(+), 8 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index 513960e8f33..5b6b2427460 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -18437,6 +18437,8 @@ > mask = INTVAL (operands[3]) / 2; > mask |= (INTVAL (operands[5]) - 4) / 2 << 1; > operands[3] = GEN_INT (mask); > + if (INTVAL (operands[3]) == 2 && !<mask_applied>) > + return "vblendps\t{$240, %2, %1, %0|%0, %1, 
%2, 240}"; > return "vshuf<shuffletype>64x2\t{%3, %2, %1, > %0<mask_operand7>|%0<mask_operand7>, %1, %2, %3}"; > } > [(set_attr "type" "sselog") > @@ -18595,6 +18597,9 @@ > mask |= (INTVAL (operands[7]) - 8) / 4 << 1; > operands[3] = GEN_INT (mask); > > + if (INTVAL (operands[3]) == 2 && !<mask_applied>) > + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; > + > return "vshuf<shuffletype>32x4\t{%3, %2, %1, > %0<mask_operand11>|%0<mask_operand11>, %1, %2, %3}"; > } > [(set_attr "type" "sselog") > @@ -25663,7 +25668,28 @@ > (match_operand:SI 3 "const_0_to_255_operand")] > UNSPEC_VPERMTI))] > "TARGET_AVX2" > - "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + { > + int mask = INTVAL (operands[3]); > + if ((mask & 0xbb) == 16) > + { > + if (rtx_equal_p (operands[0], operands[1])) > + return ""; > + else > + return "vmovaps\t{%1, %0|%0, %1}"; > + } > + if ((mask & 0xbb) == 50) > + { > + if (rtx_equal_p (operands[0], operands[2])) > + return ""; > + else > + return "vmovaps\t{%2, %0|%0, %2}"; > + } > + if ((mask & 0xbb) == 18) > + return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; > + if ((mask & 0xbb) == 48) > + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; > + return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"; > + } > [(set_attr "type" "sselog") > (set_attr "prefix" "vex") > (set_attr "mode" "OI")]) > @@ -26226,9 +26252,11 @@ > && avx_vperm2f128_parallel (operands[3], <MODE>mode)" > { > int mask = avx_vperm2f128_parallel (operands[3], <MODE>mode) - 1; > - if (mask == 0x12) > - return "vinsert<i128>\t{$0, %x2, %1, %0|%0, %1, %x2, 0}"; > - if (mask == 0x20) > + if ((mask & 0xbb) == 0x12) > + return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; > + if ((mask & 0xbb) == 0x30) > + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; > + if ((mask & 0xbb) == 0x20) > return "vinsert<i128>\t{$1, %x2, %1, %0|%0, %1, %x2, 1}"; > operands[3] = GEN_INT (mask); > return "vperm2<i128>\t{%3, %2, %1, %0|%0, %1, %2, %3}"; > diff --git 
a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c > b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c > index 6c2fb2f184a..02aecf4edce 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c > @@ -12,7 +12,7 @@ volatile __mmask8 m; > void extern > avx512vl_test (void) > { > - x = _mm256_shuffle_f32x4 (x, x, 2); > + x = _mm256_shuffle_f32x4 (x, x, 3); > x = _mm256_mask_shuffle_f32x4 (x, m, x, x, 2); > x = _mm256_maskz_shuffle_f32x4 (m, x, x, 2); > } > diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c > b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c > index 1191b400134..563ded5d9df 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c > @@ -12,7 +12,7 @@ volatile __mmask8 m; > void extern > avx512vl_test (void) > { > - x = _mm256_shuffle_f64x2 (x, x, 2); > + x = _mm256_shuffle_f64x2 (x, x, 3); > x = _mm256_mask_shuffle_f64x2 (x, m, x, x, 2); > x = _mm256_maskz_shuffle_f64x2 (m, x, x, 2); > } > diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c > b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c > index ef9a441e7a5..e89c4140d37 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c > @@ -12,7 +12,7 @@ volatile __mmask8 m; > void extern > avx512vl_test (void) > { > - x = _mm256_shuffle_i32x4 (x, x, 2); > + x = _mm256_shuffle_i32x4 (x, x, 3); > x = _mm256_mask_shuffle_i32x4 (x, m, x, x, 2); > x = _mm256_maskz_shuffle_i32x4 (m, x, x, 2); > } > diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c > b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c > index 0bd117e85d4..8e8e47eda38 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c > @@ -12,7 +12,7 @@ volatile __mmask8 m; > void extern > 
avx512vl_test (void) > { > - x = _mm256_shuffle_i64x2 (x, x, 2); > + x = _mm256_shuffle_i64x2 (x, x, 3); > x = _mm256_mask_shuffle_i64x2 (x, m, x, x, 2); > x = _mm256_maskz_shuffle_i64x2 (m, x, x, 2); > } > diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c > b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c > new file mode 100644 > index 00000000000..1ee00b6b4a1 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c > @@ -0,0 +1,51 @@ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -march=sapphirerapids" } */ > +/* { dg-final { scan-assembler-times "vmovaps" 1 } } */ > +/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 1 } } */ > +/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 5 } } */ > + > +#include<x86intrin.h> > + > +/* Vpermi128/Vpermf128 */ > +__m256i > +perm0 (__m256i a, __m256i b) > +{ > + return _mm256_permute2x128_si256 (a, b, 50); > +} > + > +__m256i > +perm1 (__m256i a, __m256i b) > +{ > + return _mm256_permute2x128_si256 (a, b, 18); > +} > + > +__m256i > +perm2 (__m256i a, __m256i b) > +{ > + return _mm256_permute2x128_si256 (a, b, 48); > +} > + > +/* vshuf{i,f}{32x4,64x2} ymm .*/ > +__m256i > +shuff0 (__m256i a, __m256i b) > +{ > + return _mm256_shuffle_i32x4(a, b, 2); > +} > + > +__m256 > +shuff1 (__m256 a, __m256 b) > +{ > + return _mm256_shuffle_f32x4(a, b, 2); > +} > + > +__m256i > +shuff2 (__m256i a, __m256i b) > +{ > + return _mm256_shuffle_i64x2(a, b, 2); > +} > + > +__m256d > +shuff3 (__m256d a, __m256d b) > +{ > + return _mm256_shuffle_f64x2(a, b, 2); > +} > diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c > b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c > new file mode 100644 > index 00000000000..9775072b97a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c > @@ -0,0 +1,68 @@ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -march=sapphirerapids" } */ > +/* { dg-final { scan-assembler-not "vmovaps" } } */ > +/* { dg-final { 
scan-assembler-not "vblendps" } } */ > +/* { dg-final { scan-assembler-not "vperm2i128" } } */ > +/* { dg-final { scan-assembler-not "vperm2f128" } } */ > + > +#include<x86intrin.h> > + > +__m256i > +perm0 (__m256i a, __m256i b) > +{ > + return _mm256_permute2x128_si256 (a, b, 16); > +} > + > +__m256d > +perm1 (__m256d a, __m256d b) > +{ > + return _mm256_permute2f128_pd (a, b, 16); > +} > + > +__m256 > +perm2 (__m256 a, __m256 b) > +{ > + return _mm256_permute2f128_ps (a, b, 16); > +} > + > +__m256i > +perm3 (__m256i a, __m256i b) > +{ > + return _mm256_permute2f128_si256 (a, b, 16); > +} > + > +__m256i > +perm4 (__m256i a, __m256i b) > +{ > + return _mm256_permute2x128_si256 (a, b, 20); > +} > + > +__m256d > +perm5 (__m256d a, __m256d b) > +{ > + return _mm256_permute2f128_pd (a, b, 20); > +} > + > +__m256i > +perm6 (__m256i a, __m256i b) > +{ > + return _mm256_permute2x128_si256 (a, b, 80); > +} > + > +__m256d > +perm7 (__m256d a, __m256d b) > +{ > + return _mm256_permute2f128_pd (a, b, 80); > +} > + > +__m256i > +perm8 (__m256i a, __m256i b) > +{ > + return _mm256_permute2x128_si256 (a, b, 84); > +} > + > +__m256d > +perm9 (__m256d a, __m256d b) > +{ > + return _mm256_permute2f128_pd (a, b, 84); > +} > diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c > b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c > new file mode 100644 > index 00000000000..a330b14caca > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c > @@ -0,0 +1,63 @@ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -march=sapphirerapids" } */ > +/* { dg-final { scan-assembler-times "vmov..." 
3 } } */ > +/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 3 } } */ > +/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 3 } } */ > +/* { dg-final { scan-assembler-not "vperm2f128" } } */ > + > +#include<x86intrin.h> > + > +/* Vpermf128 */ > +__m256 > +perm0 (__m256 a, __m256 b) > +{ > + return _mm256_permute2f128_ps (a, b, 50); > +} > + > +__m256 > +perm1 (__m256 a, __m256 b) > +{ > + return _mm256_permute2f128_ps (a, b, 18); > +} > + > +__m256 > +perm2 (__m256 a, __m256 b) > +{ > + return _mm256_permute2f128_ps (a, b, 48); > +} > + > +__m256i > +perm3 (__m256i a, __m256i b) > +{ > + return _mm256_permute2f128_si256 (a, b, 50); > +} > + > +__m256i > +perm4 (__m256i a, __m256i b) > +{ > + return _mm256_permute2f128_si256 (a, b, 18); > +} > + > +__m256i > +perm5 (__m256i a, __m256i b) > +{ > + return _mm256_permute2f128_si256 (a, b, 48); > +} > + > +__m256d > +perm6 (__m256d a, __m256d b) > +{ > + return _mm256_permute2f128_pd (a, b, 50); > +} > + > +__m256d > +perm7 (__m256d a, __m256d b) > +{ > + return _mm256_permute2f128_pd (a, b, 18); > +} > + > +__m256d > +perm8 (__m256d a, __m256d b) > +{ > + return _mm256_permute2f128_pd (a, b, 48); > +} > -- > 2.31.1 >
-- BR, Hongtao