We are working on a patch to improve the codegen for the following test case:
uint64x2_t foo (uint64x2_t r) {
    uint32x4_t a = vreinterpretq_u32_u64 (r);
    uint32_t t;
    t = a[0]; a[0] = a[1]; a[1] = t;
    t = a[2]; a[2] = a[3]; a[3] = t;
    return vreinterpretq_u64_u32 (a);
}
that GCC currently compiles to (-O1):
foo:
        mov     v31.16b, v0.16b
        ins     v0.s[0], v0.s[1]
        ins     v0.s[1], v31.s[0]
        ins     v0.s[2], v31.s[3]
        ins     v0.s[3], v31.s[2]
        ret
whereas LLVM produces the preferable sequence
foo:
        rev64   v0.4s, v0.4s
        ret

On gimple level, we currently have:
  _1 = VIEW_CONVERT_EXPR<uint32x4_t>(r_3(D));
  t_4 = BIT_FIELD_REF <r_3(D), 32, 0>;
  a_5 = VEC_PERM_EXPR <_1, _1, { 1, 1, 2, 3 }>;
  a_6 = BIT_INSERT_EXPR <a_5, t_4, 32 (32 bits)>;
  t_7 = BIT_FIELD_REF <r_3(D), 32, 64>;
  _2 = BIT_FIELD_REF <r_3(D), 32, 96>;
  a_8 = BIT_INSERT_EXPR <a_6, _2, 64 (32 bits)>;
  a_9 = BIT_INSERT_EXPR <a_8, t_7, 96 (32 bits)>;
  _10 = VIEW_CONVERT_EXPR<uint64x2_t>(a_9);
  return _10;

whereas the desired sequence is:
  _1 = VIEW_CONVERT_EXPR<uint32x4_t>(r_2(D));
  a_3 = VEC_PERM_EXPR <_1, _1, { 1, 0, 3, 2 }>;
  _4 = VIEW_CONVERT_EXPR<uint64x2_t>(a_3);
  return _4;

If we remove the casts from the test case, the forwprop1 dump shows that
a series of match.pd is applied (repeatedly, only showing the first
iteration here):
Applying pattern match.pd:10881, gimple-match-1.cc:25213
Applying pattern match.pd:11099, gimple-match-1.cc:25714
Applying pattern match.pd:9549, gimple-match-1.cc:24274
gimple_simplified to a_7 = VEC_PERM_EXPR <r_3(D), r_3(D), { 1, 0, 2, 3 }>;

The reason why these patterns cannot be applied with casts seems to be
the failing types_match (@0, @1) in the following pattern:
/* Simplify vector inserts of other vector extracts to a permute.  */
(simplify
 (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
 (if (VECTOR_TYPE_P (type)
      && (VECTOR_MODE_P (TYPE_MODE (type))
          || optimize_vectors_before_lowering_p ())
      && types_match (@0, @1)
      && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
      && TYPE_VECTOR_SUBPARTS (type).is_constant ()
      && multiple_p (wi::to_poly_offset (@rpos),
                     wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (type)))))
  (with
   {
     [...]
   }
   (if (!VECTOR_MODE_P (TYPE_MODE (type))
        || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, 
false))
    (vec_perm @0 @1 { vec_perm_indices_to_tree
                        (build_vector_type (ssizetype, nunits), sel); })))))

The types_match fails, because the following pattern has already removed the
view_convert expression, thereby changing the type of @0:
(simplify
 (BIT_FIELD_REF (view_convert @0) @1 @2)
  [...]
  (BIT_FIELD_REF @0 @1 @2)))

One attempt to make the types_match true was to add a single_use flag to
the view_convert expression in the pattern above, preventing it from
being applied.
While this actually fixed the test case and produced the intended
instruction sequence, it caused another test to fail that relies on application
of the pattern with multiple use of the view_convert expression
(gcc.target/i386/vect-strided-3.c).

Hence, the RFC: How can we make the types_match work with view_convert
expressions in the arguments?

---
 gcc/match.pd                             |  7 ++++---
 gcc/testsuite/gcc.dg/tree-ssa/pr117093.c | 17 +++++++++++++++++
 2 files changed, 21 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr117093.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 9107e6a95ca..d7957177027 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -9357,9 +9357,10 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (BIT_FIELD_REF @0 @3 { const_binop (PLUS_EXPR, bitsizetype, @2, @4); }))
 
 (simplify
- (BIT_FIELD_REF (view_convert @0) @1 @2)
- (if (! INTEGRAL_TYPE_P (TREE_TYPE (@0))
-      || type_has_mode_precision_p (TREE_TYPE (@0)))
+ (BIT_FIELD_REF (view_convert@3 @0) @1 @2)
+ (if ((! INTEGRAL_TYPE_P (TREE_TYPE (@0))
+       || type_has_mode_precision_p (TREE_TYPE (@0)))
+      && single_use (@3))
   (BIT_FIELD_REF @0 @1 @2)))
 
 (simplify
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
new file mode 100644
index 00000000000..0fea32919dd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
@@ -0,0 +1,17 @@
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-options "-O1" } */
+
+#include <arm_neon.h>
+
+/*
+** foo:
+**     rev64   v0\.4s, v0\.4s
+**     ret
+*/
+uint64x2_t foo (uint64x2_t r) {
+    uint32x4_t a = vreinterpretq_u32_u64 (r);
+    uint32_t t;
+    t = a[0]; a[0] = a[1]; a[1] = t;
+    t = a[2]; a[2] = a[3]; a[3] = t;
+    return vreinterpretq_u64_u32 (a);
+}
-- 
2.44.0

Attachment: smime.p7s
Description: S/MIME cryptographic signature

Reply via email to