On Tue, Aug 22, 2023 at 5:05 PM Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > The PRs ask for optimizing of > > _1 = BIT_FIELD_REF <b_3(D), 64, 64>; > result_4 = BIT_INSERT_EXPR <a_2(D), _1, 64>; > > to a vector permutation. The following implements this as > match.pd pattern, improving code generation on x86_64. > > On the RTL level we face the issue that backend patterns inconsistently > use vec_merge and vec_select of vec_concat to represent permutes. > > I think using a (supported) permute is almost always better > than an extract plus insert, maybe excluding the case we extract > element zero and that's aliased to a register that can be used > directly for insertion (not sure how to query that). > > The patch FAILs one case in gcc.target/i386/avx512fp16-vmovsh-1a.c > where we now expand from > > __A_28 = VEC_PERM_EXPR <x2.8_9, x1.9_10, { 0, 9, 10, 11, 12, 13, 14, 15 }>; > > instead of > > _28 = BIT_FIELD_REF <x2.8_9, 16, 0>; > __A_29 = BIT_INSERT_EXPR <x1.9_10, _28, 0>; > > producing a vpblendw instruction instead of the expected vmovsh. That's > either a missed vec_perm_const expansion optimization or even better, > an improvement - Zen4 for example has 4 ports to execute vpblendw > but only 3 for executing vmovsh and both instructions have the same size. Looks like Sapphire rapids only have 2 ports for executing vpblendw but 3 for vmovsh. I guess we may need a micro-architecture tuning for this specific permutation. for vmovss/vpblendd, they're equivalent on SPR, both are 3. The change for the testcase is ok, I'll handle it with an incremental patch. > > The patch XFAILs the sub-testcase - is that OK or should I update > the expected instruction to a vpblend? > > Bootstrapped and tested on x86_64-unknown-linux-gnu. > > Thanks, > Richard. > > PR tree-optimization/94864 > PR tree-optimization/94865 > PR tree-optimization/93080 > * match.pd (bit_insert @0 (BIT_FIELD_REF @1 ..) ..): New pattern > for vector insertion from vector extraction. > > * gcc.target/i386/pr94864.c: New testcase. > * gcc.target/i386/pr94865.c: Likewise. > * gcc.target/i386/avx512fp16-vmovsh-1a.c: XFAIL. > * gcc.dg/tree-ssa/forwprop-40.c: Likewise. > * gcc.dg/tree-ssa/forwprop-41.c: Likewise. > --- > gcc/match.pd | 25 +++++++++++++++++++ > gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c | 14 +++++++++++ > gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c | 16 ++++++++++++ > .../gcc.target/i386/avx512fp16-vmovsh-1a.c | 2 +- > gcc/testsuite/gcc.target/i386/pr94864.c | 13 ++++++++++ > gcc/testsuite/gcc.target/i386/pr94865.c | 13 ++++++++++ > 6 files changed, 82 insertions(+), 1 deletion(-) > create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c > create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94864.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94865.c > > diff --git a/gcc/match.pd b/gcc/match.pd > index 86fdc606a79..6e083021b27 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -8006,6 +8006,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > wi::to_wide (@ipos) + isize)) > (BIT_FIELD_REF @0 @rsize @rpos))))) > > +/* Simplify vector inserts of other vector extracts to a permute. */ > +(simplify > + (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos) > + (if (VECTOR_TYPE_P (type) > + && types_match (@0, @1) > + && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2)) > + && TYPE_VECTOR_SUBPARTS (type).is_constant ()) > + (with > + { > + unsigned HOST_WIDE_INT elsz > + = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1)))); > + poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz); > + poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz); > + unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant (); > + vec_perm_builder builder; > + builder.new_vector (nunits, nunits, 1); > + for (unsigned i = 0; i < nunits; ++i) > + builder.quick_push (known_eq (ielt, i) ? nunits + relt : i); > + vec_perm_indices sel (builder, 2, nunits); > + } > + (if (!VECTOR_MODE_P (TYPE_MODE (type)) > + || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, > false)) > + (vec_perm @0 @1 { vec_perm_indices_to_tree > + (build_vector_type (ssizetype, nunits), sel); }))))) > + > (if (canonicalize_math_after_vectorization_p ()) > (for fmas (FMA) > (simplify > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c > b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c > new file mode 100644 > index 00000000000..7513497f552 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c > @@ -0,0 +1,14 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-optimized -Wno-psabi -w" } */ > + > +#define vector __attribute__((__vector_size__(16) )) > + > +vector int g(vector int a) > +{ > + int b = a[0]; > + a[0] = b; > + return a; > +} > + > +/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 0 "optimized" } } */ > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" } } */ > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c > b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c > new file mode 100644 > index 00000000000..b1e75797a90 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c > @@ -0,0 +1,16 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-optimized -Wno-psabi -w" } */ > + > +#define vector __attribute__((__vector_size__(16) )) > + > +vector int g(vector int a, int c) > +{ > + int b = a[2]; > + a[2] = b; > + a[1] = c; > + return a; > +} > + > +/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 1 "optimized" } } */ > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" } } */ > +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 0 "optimized" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vmovsh-1a.c > b/gcc/testsuite/gcc.target/i386/avx512fp16-vmovsh-1a.c > index ba10096aa20..38bf5cc0395 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vmovsh-1a.c > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vmovsh-1a.c > @@ -3,7 +3,7 @@ > /* { dg-final { scan-assembler-times "vmovsh\[ > \\t\]+%xmm\[0-9\]+\[^\n\r\]*%\[er\]\[ad]x+\[^\n\r]*\{%k\[0-9\]\}(?:\n|\[ > \\t\]+#)" 1 } } */ > /* { dg-final { scan-assembler-times "vmovsh\[ > \\t\]+\[^\n\r\]*%\[er\]\[ad]x+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ > \\t\]+#)" 1 } } */ > /* { dg-final { scan-assembler-times "vmovsh\[ > \\t\]+\[^\n\r\]*%\[er\]\[ad]x+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ > \\t\]+#)" 1 } } */ > -/* { dg-final { scan-assembler-times "vmovsh\[ > \\t\]+%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > \\t\]+#)" 1 } } */ > +/* { dg-final { scan-assembler-times "vmovsh\[ > \\t\]+%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > \\t\]+#)" 1 { xfail *-*-* } } } */ > /* { dg-final { scan-assembler-times "vmovsh\[ > \\t\]+%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^z\n\r]*(?:\n|\[ > \\t\]+#)" 1 } } */ > /* { dg-final { scan-assembler-times "vmovsh\[ > \\t\]+%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ > \\t\]+#)" 1 } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr94864.c > b/gcc/testsuite/gcc.target/i386/pr94864.c > new file mode 100644 > index 00000000000..69cb481fcfe > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr94864.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse2 -mno-avx" } */ > + > +typedef double v2df __attribute__((vector_size(16))); > + > +v2df move_sd(v2df a, v2df b) > +{ > + v2df result = a; > + result[0] = b[1]; > + return result; > +} > + > +/* { dg-final { scan-assembler "unpckhpd\[\\t \]%xmm0, %xmm1" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr94865.c > b/gcc/testsuite/gcc.target/i386/pr94865.c > new file mode 100644 > index 00000000000..84065ac2467 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr94865.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse2 -mno-avx" } */ > + > +typedef double v2df __attribute__((vector_size(16))); > + > +v2df move_sd(v2df a, v2df b) > +{ > + v2df result = a; > + result[1] = b[1]; > + return result; > +} > + > +/* { dg-final { scan-assembler "shufpd\[\\t \]*.2, %xmm1, %xmm0" } } */ > -- > 2.35.3
-- BR, Hongtao