On Fri, 15 Nov 2024, Jennifer Schmitz wrote:

> 
> 
> > On 7 Nov 2024, at 13:47, Richard Biener <rguent...@suse.de> wrote:
> > 
> > External email: Use caution opening links or attachments
> > 
> > 
> > On Tue, 5 Nov 2024, Jennifer Schmitz wrote:
> > 
> >> We are working on a patch to improve the codegen for the following test 
> >> case:
> >> uint64x2_t foo (uint64x2_t r) {
> >>    uint32x4_t a = vreinterpretq_u32_u64 (r);
> >>    uint32_t t;
> >>    t = a[0]; a[0] = a[1]; a[1] = t;
> >>    t = a[2]; a[2] = a[3]; a[3] = t;
> >>    return vreinterpretq_u64_u32 (a);
> >> }
> >> that GCC currently compiles to (-O1):
> >> foo:
> >>        mov     v31.16b, v0.16b
> >>        ins     v0.s[0], v0.s[1]
> >>        ins     v0.s[1], v31.s[0]
> >>        ins     v0.s[2], v31.s[3]
> >>        ins     v0.s[3], v31.s[2]
> >>        ret
> >> whereas LLVM produces the preferable sequence
> >> foo:
> >>      rev64   v0.4s, v0.4s
> >>        ret
> >> 
> >> On gimple level, we currently have:
> >>  _1 = VIEW_CONVERT_EXPR<uint32x4_t>(r_3(D));
> >>  t_4 = BIT_FIELD_REF <r_3(D), 32, 0>;
> >>  a_5 = VEC_PERM_EXPR <_1, _1, { 1, 1, 2, 3 }>;
> >>  a_6 = BIT_INSERT_EXPR <a_5, t_4, 32 (32 bits)>;
> >>  t_7 = BIT_FIELD_REF <r_3(D), 32, 64>;
> >>  _2 = BIT_FIELD_REF <r_3(D), 32, 96>;
> >>  a_8 = BIT_INSERT_EXPR <a_6, _2, 64 (32 bits)>;
> >>  a_9 = BIT_INSERT_EXPR <a_8, t_7, 96 (32 bits)>;
> >>  _10 = VIEW_CONVERT_EXPR<uint64x2_t>(a_9);
> >>  return _10;
> >> 
> >> whereas the desired sequence is:
> >>  _1 = VIEW_CONVERT_EXPR<uint32x4_t>(r_2(D));
> >>  a_3 = VEC_PERM_EXPR <_1, _1, { 1, 0, 3, 2 }>;
> >>  _4 = VIEW_CONVERT_EXPR<uint64x2_t>(a_3);
> >>  return _4;
> >> 
> >> If we remove the casts from the test case, the forwprop1 dump shows that
> >> a series of match.pd is applied (repeatedly, only showing the first
> >> iteration here):
> >> Applying pattern match.pd:10881, gimple-match-1.cc:25213
> >> Applying pattern match.pd:11099, gimple-match-1.cc:25714
> >> Applying pattern match.pd:9549, gimple-match-1.cc:24274
> >> gimple_simplified to a_7 = VEC_PERM_EXPR <r_3(D), r_3(D), { 1, 0, 2, 3 }>;
> >> 
> >> The reason why these patterns cannot be applied with casts seems to be
> >> the failing types_match (@0, @1) in the following pattern:
> >> /* Simplify vector inserts of other vector extracts to a permute.  */
> >> (simplify
> >> (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
> >> (if (VECTOR_TYPE_P (type)
> >>      && (VECTOR_MODE_P (TYPE_MODE (type))
> >>          || optimize_vectors_before_lowering_p ())
> >>      && types_match (@0, @1)
> >>      && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
> >>      && TYPE_VECTOR_SUBPARTS (type).is_constant ()
> >>      && multiple_p (wi::to_poly_offset (@rpos),
> >>                     wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (type)))))
> >>  (with
> >>   {
> >>     [...]
> >>   }
> >>   (if (!VECTOR_MODE_P (TYPE_MODE (type))
> >>        || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, 
> >> false))
> >>    (vec_perm @0 @1 { vec_perm_indices_to_tree
> >>                        (build_vector_type (ssizetype, nunits), sel); })))))
> >> 
> >> The types_match fails, because the following pattern has already removed 
> >> the
> >> view_convert expression, thereby changing the type of @0:
> >> (simplify
> >> (BIT_FIELD_REF (view_convert @0) @1 @2)
> >>  [...]
> >>  (BIT_FIELD_REF @0 @1 @2)))
> >> 
> >> One attempt to make the types_match true was to add a single_use flag to
> >> the view_convert expression in the pattern above, preventing it from
> >> being applied.
> >> While this actually fixed the test case and produced the intended
> >> instruction sequence, it caused another test to fail that relies on 
> >> application
> >> of the pattern with multiple use of the view_convert expression
> >> (gcc.target/i386/vect-strided-3.c).
> >> 
> >> Hence, the RFC: How can we make the types_match work with view_convert
> >> expressions in the arguments?
> > 
> > You could remove the types_match (@0, @1) with
> > 
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index 00988241348..820a589b577 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -9539,7 +9539,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >  (if (VECTOR_TYPE_P (type)
> >       && (VECTOR_MODE_P (TYPE_MODE (type))
> >          || optimize_vectors_before_lowering_p ())
> > -      && types_match (@0, @1)
> > +      && operand_equal_p (TYPE_SIZE (TREE_TYPE (@0)),
> > +                         TYPE_SIZE (TREE_TYPE (@1)), 0)
> >       && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
> >       && TYPE_VECTOR_SUBPARTS (type).is_constant ()
> >       && multiple_p (wi::to_poly_offset (@rpos),
> > @@ -9547,7 +9548,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >   (with
> >    {
> >      unsigned HOST_WIDE_INT elsz
> > -       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1))));
> > +       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@0))));
> >      poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
> >      poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
> >      unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
> > @@ -9559,7 +9560,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >    }
> >    (if (!VECTOR_MODE_P (TYPE_MODE (type))
> >        || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel,
> > false))
> > -    (vec_perm @0 @1 { vec_perm_indices_to_tree
> > +    (vec_perm @0 (view_convert @1) { vec_perm_indices_to_tree
> >                         (build_vector_type (ssizetype, nunits), sel);
> > })))))
> > 
> > (if (canonicalize_math_after_vectorization_p ())
> > 
> > or alternatively avoid the BIT_FIELD_REF (view_convert @) transform
> > iff the original ref type-wise matches a vector element extract
> > and the result with the view_convert does not.
> 
> Dear Richard,
> thank you for the helpful feedback. I made the changes as suggested and added 
> you as co-author.
> Best,
> Jennifer
> 
> This patch improves the codegen for the following test case:
> uint64x2_t foo (uint64x2_t r) {
>     uint32x4_t a = vreinterpretq_u32_u64 (r);
>     uint32_t t;
>     t = a[0]; a[0] = a[1]; a[1] = t;
>     t = a[2]; a[2] = a[3]; a[3] = t;
>     return vreinterpretq_u64_u32 (a);
> }
> from (-O1):
> foo:
>         mov     v31.16b, v0.16b
>         ins     v0.s[0], v0.s[1]
>         ins     v0.s[1], v31.s[0]
>         ins     v0.s[2], v31.s[3]
>         ins     v0.s[3], v31.s[2]
>         ret
> to:
> foo:
>       rev64   v0.4s, v0.4s
>         ret
> 
> This is achieved by extending the following match.pd pattern to account
> for type differences between @0 and @1 due to view converts.
> /* Simplify vector inserts of other vector extracts to a permute.  */
> (simplify
>  (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
> 
> The patch was bootstrapped and regtested on aarch64-linux-gnu and
> x86_64-linux-gnu, no regression.
> OK for mainline?

OK.

Thanks,
Richard.

> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
> Co-authored-by: Richard Biener <rguent...@suse.de>
> 
> gcc/
>       PR tree-optimization/117093
>       * match.pd: Extend
>       (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos) to allow
>       type differences between @0 and @1 due to view converts.
> 
> gcc/testsuite/
>       PR tree-optimization/117093
>       * gcc.dg/tree-ssa/pr117093.c: New test.
> ---
>  gcc/match.pd                             | 13 ++++++++-----
>  gcc/testsuite/gcc.dg/tree-ssa/pr117093.c | 17 +++++++++++++++++
>  2 files changed, 25 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 9107e6a95ca..af6205cd9a1 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -9526,7 +9526,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (if (VECTOR_TYPE_P (type)
>        && (VECTOR_MODE_P (TYPE_MODE (type))
>         || optimize_vectors_before_lowering_p ())
> -      && types_match (@0, @1)
> +      && operand_equal_p (TYPE_SIZE (TREE_TYPE (@0)),
> +                       TYPE_SIZE (TREE_TYPE (@1)), 0)
>        && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
>        && TYPE_VECTOR_SUBPARTS (type).is_constant ()
>        && multiple_p (wi::to_poly_offset (@rpos),
> @@ -9534,7 +9535,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>    (with
>     {
>       unsigned HOST_WIDE_INT elsz
> -       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1))));
> +       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@0))));
>       poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
>       poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
>       unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
> @@ -9545,9 +9546,11 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>       vec_perm_indices sel (builder, 2, nunits);
>     }
>     (if (!VECTOR_MODE_P (TYPE_MODE (type))
> -     || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, 
> false))
> -    (vec_perm @0 @1 { vec_perm_indices_to_tree
> -                        (build_vector_type (ssizetype, nunits), sel); })))))
> +     || can_vec_perm_const_p (TYPE_MODE (type),
> +                              TYPE_MODE (type), sel, false))
> +    (vec_perm @0 (view_convert @1)
> +     { vec_perm_indices_to_tree (build_vector_type (ssizetype, nunits),
> +                              sel); })))))
>  
>  (if (canonicalize_math_after_vectorization_p ())
>   (for fmas (FMA)
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
> new file mode 100644
> index 00000000000..0fea32919dd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
> @@ -0,0 +1,17 @@
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* { dg-options "-O1" } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foo:
> +**   rev64   v0\.4s, v0\.4s
> +**   ret
> +*/
> +uint64x2_t foo (uint64x2_t r) {
> +    uint32x4_t a = vreinterpretq_u32_u64 (r);
> +    uint32_t t;
> +    t = a[0]; a[0] = a[1]; a[1] = t;
> +    t = a[2]; a[2] = a[3]; a[3] = t;
> +    return vreinterpretq_u64_u32 (a);
> +}
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to