On Tue, Aug 22, 2023 at 5:05 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> The PRs ask for optimizing of
>
>   _1 = BIT_FIELD_REF <b_3(D), 64, 64>;
>   result_4 = BIT_INSERT_EXPR <a_2(D), _1, 64>;
>
> to a vector permutation.  The following implements this as
> match.pd pattern, improving code generation on x86_64.
>
> On the RTL level we face the issue that backend patterns inconsistently
> use vec_merge and vec_select of vec_concat to represent permutes.
>
> I think using a (supported) permute is almost always better
> than an extract plus insert, maybe excluding the case we extract
> element zero and that's aliased to a register that can be used
> directly for insertion (not sure how to query that).
>
> The patch FAILs one case in gcc.target/i386/avx512fp16-vmovsh-1a.c
> where we now expand from
>
>  __A_28 = VEC_PERM_EXPR <x2.8_9, x1.9_10, { 0, 9, 10, 11, 12, 13, 14, 15 }>;
>
> instead of
>
>  _28 = BIT_FIELD_REF <x2.8_9, 16, 0>;
>  __A_29 = BIT_INSERT_EXPR <x1.9_10, _28, 0>;
>
> producing a vpblendw instruction instead of the expected vmovsh.  That's
> either a missed vec_perm_const expansion optimization or even better,
> an improvement - Zen4 for example has 4 ports to execute vpblendw
> but only 3 for executing vmovsh and both instructions have the same size.
Looks like Sapphire rapids only have 2 ports for executing vpblendw
but 3 for vmovsh. I guess we may need a micro-architecture tuning for
this specific permutation.
for vmovss/vpblendd, they're equivalent on SPR, both are 3.
The change for the testcase is ok, I'll handle it with an incremental patch.
>
> The patch XFAILs the sub-testcase - is that OK or should I update
> the expected instruction to a vpblend?
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
> Thanks,
> Richard.
>
>         PR tree-optimization/94864
>         PR tree-optimization/94865
>         PR tree-optimization/93080
>         * match.pd (bit_insert @0 (BIT_FIELD_REF @1 ..) ..): New pattern
>         for vector insertion from vector extraction.
>
>         * gcc.target/i386/pr94864.c: New testcase.
>         * gcc.target/i386/pr94865.c: Likewise.
>         * gcc.target/i386/avx512fp16-vmovsh-1a.c: XFAIL.
>         * gcc.dg/tree-ssa/forwprop-40.c: Likewise.
>         * gcc.dg/tree-ssa/forwprop-41.c: Likewise.
> ---
>  gcc/match.pd                                  | 25 +++++++++++++++++++
>  gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c   | 14 +++++++++++
>  gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c   | 16 ++++++++++++
>  .../gcc.target/i386/avx512fp16-vmovsh-1a.c    |  2 +-
>  gcc/testsuite/gcc.target/i386/pr94864.c       | 13 ++++++++++
>  gcc/testsuite/gcc.target/i386/pr94865.c       | 13 ++++++++++
>  6 files changed, 82 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr94864.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr94865.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 86fdc606a79..6e083021b27 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8006,6 +8006,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>                       wi::to_wide (@ipos) + isize))
>      (BIT_FIELD_REF @0 @rsize @rpos)))))
>
> +/* Simplify vector inserts of other vector extracts to a permute.  */
> +(simplify
> + (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
> + (if (VECTOR_TYPE_P (type)
> +      && types_match (@0, @1)
> +      && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
> +      && TYPE_VECTOR_SUBPARTS (type).is_constant ())
> +  (with
> +   {
> +     unsigned HOST_WIDE_INT elsz
> +       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1))));
> +     poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
> +     poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
> +     unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
> +     vec_perm_builder builder;
> +     builder.new_vector (nunits, nunits, 1);
> +     for (unsigned i = 0; i < nunits; ++i)
> +       builder.quick_push (known_eq (ielt, i) ? nunits + relt : i);
> +     vec_perm_indices sel (builder, 2, nunits);
> +   }
> +   (if (!VECTOR_MODE_P (TYPE_MODE (type))
> +       || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, 
> false))
> +    (vec_perm @0 @1 { vec_perm_indices_to_tree
> +                        (build_vector_type (ssizetype, nunits), sel); })))))
> +
>  (if (canonicalize_math_after_vectorization_p ())
>   (for fmas (FMA)
>    (simplify
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c
> new file mode 100644
> index 00000000000..7513497f552
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized -Wno-psabi -w" } */
> +
> +#define vector __attribute__((__vector_size__(16) ))
> +
> +vector int g(vector int a)
> +{
> +  int b = a[0];
> +  a[0] = b;
> +  return a;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 0 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c
> new file mode 100644
> index 00000000000..b1e75797a90
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized -Wno-psabi -w" } */
> +
> +#define vector __attribute__((__vector_size__(16) ))
> +
> +vector int g(vector int a, int c)
> +{
> +  int b = a[2];
> +  a[2] = b;
> +  a[1] = c;
> +  return a;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 0 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vmovsh-1a.c 
> b/gcc/testsuite/gcc.target/i386/avx512fp16-vmovsh-1a.c
> index ba10096aa20..38bf5cc0395 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vmovsh-1a.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vmovsh-1a.c
> @@ -3,7 +3,7 @@
>  /* { dg-final { scan-assembler-times "vmovsh\[ 
> \\t\]+%xmm\[0-9\]+\[^\n\r\]*%\[er\]\[ad]x+\[^\n\r]*\{%k\[0-9\]\}(?:\n|\[ 
> \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovsh\[ 
> \\t\]+\[^\n\r\]*%\[er\]\[ad]x+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ 
> \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovsh\[ 
> \\t\]+\[^\n\r\]*%\[er\]\[ad]x+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
>  \\t\]+#)" 1 } } */
> -/* { dg-final { scan-assembler-times "vmovsh\[ 
> \\t\]+%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
> \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovsh\[ 
> \\t\]+%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
> \\t\]+#)" 1 { xfail *-*-* } } } */
>  /* { dg-final { scan-assembler-times "vmovsh\[ 
> \\t\]+%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^z\n\r]*(?:\n|\[
>  \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovsh\[ 
> \\t\]+%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
>  \\t\]+#)" 1 } } */
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr94864.c 
> b/gcc/testsuite/gcc.target/i386/pr94864.c
> new file mode 100644
> index 00000000000..69cb481fcfe
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94864.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx" } */
> +
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df move_sd(v2df a, v2df b)
> +{
> +    v2df result = a;
> +    result[0] = b[1];
> +    return result;
> +}
> +
> +/* { dg-final { scan-assembler "unpckhpd\[\\t \]%xmm0, %xmm1" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr94865.c 
> b/gcc/testsuite/gcc.target/i386/pr94865.c
> new file mode 100644
> index 00000000000..84065ac2467
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr94865.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx" } */
> +
> +typedef double v2df __attribute__((vector_size(16)));
> +
> +v2df move_sd(v2df a, v2df b)
> +{
> +    v2df result = a;
> +    result[1] = b[1];
> +    return result;
> +}
> +
> +/* { dg-final { scan-assembler "shufpd\[\\t \]*.2, %xmm1, %xmm0" } } */
> --
> 2.35.3



-- 
BR,
Hongtao

Reply via email to