GIMPLE code which performs a narrowing truncation on the result of a
vector concatenation currently results in an unnecessary XTN being
emitted after the UZP1 that concatenates the operands. In such cases,
the UZP1 should instead use a smaller arrangement specifier, making the
XTN instruction redundant. This can be seen in the following GIMPLE
example:

        int32x2_t foo (svint64_t a, svint64_t b)
        {
          vector(2) int vect__2.8;
          long int _1;
          long int _3;
          vector(2) long int _12;

          <bb 2> [local count: 1073741824]:
          _1 = svaddv_s64 ({ -1, 0, 0, 0, 0, 0, 0, 0, ... }, a_6(D));
          _3 = svaddv_s64 ({ -1, 0, 0, 0, 0, 0, 0, 0, ... }, b_7(D));
          _12 = {_1, _3};
          vect__2.8_13 = (vector(2) int) _12;
          return vect__2.8_13;

        }

Original assembly generated:

        foo:
                ptrue   p3.b, all
                uaddv   d0, p3, z0.d
                uaddv   d1, p3, z1.d
                uzp1    v0.2d, v0.2d, v1.2d
                xtn     v0.2s, v0.2d
                ret

This patch therefore defines the *aarch64_trunc_concat<mode> insn,
which truncates the result of the concatenation rather than
concatenating the truncated operands (as *aarch64_narrow_trunc<mode>
does), so that the following optimised assembly is emitted instead:

        foo:
                ptrue   p3.b, all
                uaddv   d0, p3, z0.d
                uaddv   d1, p3, z1.d
                uzp1    v0.2s, v0.2s, v1.2s
                ret
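
The rewrite is sound because, on little-endian targets, UZP1 with a
32-bit arrangement selects the even-indexed 32-bit words of its inputs,
i.e. the low half of each 64-bit lane, which is exactly the value XTN
would have produced (the insn swaps the operands for big-endian). The
following self-contained C sketch (purely illustrative, not part of the
patch; the helper names are mine, and it assumes a little-endian host)
demonstrates the lane-wise equivalence:

        #include <assert.h>
        #include <stdint.h>

        /* XTN path: concatenate, then truncate each 64-bit lane.  */
        static void xtn_path (int64_t a, int64_t b, int32_t out[2])
        {
          out[0] = (int32_t) a;
          out[1] = (int32_t) b;
        }

        /* UZP1 .2s path: take the even-indexed 32-bit words of the
           concatenation, i.e. the low half of each 64-bit lane on a
           little-endian host.  */
        static void uzp1_path (int64_t a, int64_t b, int32_t out[2])
        {
          union { int64_t d[2]; int32_t s[4]; } u = { .d = { a, b } };
          out[0] = u.s[0];
          out[1] = u.s[2];
        }

        int main (void)
        {
          int32_t x[2], y[2];
          xtn_path (0x1122334455667788LL, -42, x);
          uzp1_path (0x1122334455667788LL, -42, y);
          assert (x[0] == y[0] && x[1] == y[1]);
          return 0;
        }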

This patch has been regression-tested on aarch64 with no new failures.
A supporting test for this optimisation is also included and passes.

OK for master? I do not have commit rights so I cannot push the patch
myself.

gcc/ChangeLog:

        * config/aarch64/aarch64-simd.md (*aarch64_trunc_concat<mode>):
        New insn definition.
        * config/aarch64/iterators.md (VDQHSD_F): New mode iterator.
        (VTRUNCD): New mode attribute for truncated modes.
        (Vtruncd): New mode attribute for the truncated arrangement
        specifier.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/sve/truncated_concatenation_1.c: New test
        for the above example and the int64x2_t version of it.
---
 gcc/config/aarch64/aarch64-simd.md            | 16 ++++++++++++++
 gcc/config/aarch64/iterators.md               | 12 ++++++++++
 .../aarch64/sve/truncated_concatenation_1.c   | 22 +++++++++++++++++++
 3 files changed, 50 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index cfe95bd4c31..de3dd444ecd 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1872,6 +1872,22 @@
   [(set_attr "type" "neon_permute<q>")]
 )
 
+(define_insn "*aarch64_trunc_concat<mode>"
+  [(set (match_operand:<VTRUNCD> 0 "register_operand" "=w")
+       (truncate:<VTRUNCD>
+         (vec_concat:VDQHSD_F
+           (match_operand:<VHALF> 1 "register_operand" "w")
+           (match_operand:<VHALF> 2 "register_operand" "w"))))]
+  "TARGET_SIMD"
+{
+  if (!BYTES_BIG_ENDIAN)
+    return "uzp1\\t%0.<Vtruncd>, %1.<Vtruncd>, %2.<Vtruncd>";
+  else
+    return "uzp1\\t%0.<Vtruncd>, %2.<Vtruncd>, %1.<Vtruncd>";
+}
+  [(set_attr "type" "neon_permute<q>")]
+)
+
 ;; Packing doubles.
 
 (define_expand "vec_pack_trunc_<mode>"
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index d7cb27e1885..3b28b2fae0c 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -290,6 +290,10 @@
 ;; Advanced SIMD modes for H, S and D types.
 (define_mode_iterator VDQHSD [V4HI V8HI V2SI V4SI V2DI])
 
+;; Advanced SIMD modes that can be truncated whilst preserving
+;; the number of vector elements.
+(define_mode_iterator VDQHSD_F [V8HI V4SI V2DI V2SF V4SF V2DF])
+
 (define_mode_iterator VDQHSD_V1DI [VDQHSD V1DI])
 
 ;; Advanced SIMD and scalar integer modes for H and S.
@@ -1722,6 +1726,14 @@
 (define_mode_attr Vnarrowq2 [(V8HI "v16qi") (V4SI "v8hi")
                             (V2DI "v4si")])
 
+;; Truncated Advanced SIMD modes which preserve the number of lanes.
+(define_mode_attr VTRUNCD [(V8HI "V8QI") (V4SI "V4HI")
+                          (V2SF "V2HF") (V4SF "V4HF")
+                          (V2DI "V2SI") (V2DF "V2SF")])
+(define_mode_attr Vtruncd [(V8HI "8b") (V4SI "4h")
+                          (V2SF "2h") (V4SF "4h")
+                          (V2DI "2s") (V2DF "2s")])
+
 ;; Narrowed modes of vector modes.
 (define_mode_attr VNARROW [(VNx8HI "VNx16QI")
                           (VNx4SI "VNx8HI") (VNx4SF "VNx8HF")
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c b/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c
new file mode 100644
index 00000000000..e0ad4209206
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -Wall -march=armv8.2-a+sve" } */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+int32x2_t foo (svint64_t a, svint64_t b) {
+    int32x2_t ab = vdup_n_s32 (0);
+    ab = vset_lane_s32 ((int)svaddv_s64 (svptrue_b64 (), a), ab, 0);
+    ab = vset_lane_s32 ((int)svaddv_s64 (svptrue_b64 (), b), ab, 1);
+    return ab;
+}
+
+int64x2_t bar (svint64_t a, svint64_t b) {
+    int64x2_t ab = vdupq_n_s64 (0);
+    ab = vsetq_lane_s64 ((int)svaddv_s64 (svptrue_b64 (), a), ab, 0);
+    ab = vsetq_lane_s64 ((int)svaddv_s64 (svptrue_b64 (), b), ab, 1);
+    return ab;
+}
+
+/* { dg-final { scan-assembler-not {\txtn\t} } } */
+/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.2s, v[0-9]+\.2s, v[0-9]+\.2s} 2 } } */
\ No newline at end of file
-- 
2.34.1
