From: Artemiy Volkov <[email protected]>

Presently, when compiling:

int16x8_t foo( int16x8_t x )
{
  return vcombine_s16( vget_high_s16( x ), vget_low_s16( x ) );
}

we produce:

foo:
        dup     d31, v0.d[1]
        uzp1    v0.2d, v31.2d, v0.2d
        ret

instead of the more efficient:

foo:
        ext     v0.16b, v0.16b, v0.16b, #8
        ret

This happens because the vec_select expression used to extract the upper
half of the vector does not get combined into an insn, and thus has to
be materialized in another register.  To fix this, add an insn pattern
for a vec_combine taking a vec_select as one of the arguments.
Additionally, provide an equivalent pattern for big-endian targets.

This patch also includes a new test file to cover this transformation.

Bootstrapped and regtested on aarch64-linux-gnu, and additionally
regtested on aarch64_be-linux-gnu, no issues.

gcc/ChangeLog:

        * config/aarch64/aarch64-simd.md
        (*aarch64_combine_high_low_internal<mode>): New insn.
        (*aarch64_combine_high_low_internal_be<mode>): Ditto.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/simd/combine_ext.c: New test.
---
 gcc/config/aarch64/aarch64-simd.md            | 27 +++++++++++
 .../gcc.target/aarch64/simd/combine_ext.c     | 47 +++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/combine_ext.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 0d5b02a739f..309c5ad3e3d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4423,6 +4423,33 @@
   }
 )
 
+;; Combine high half of operand 1 (extracted with vec_select) with
+;; low half of operand 2.
+
+(define_insn "*aarch64_combine_high_low_internal<mode>"
+  [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand" "=w")
+       (vec_concat:<VDBL>
+         (vec_select:VDC
+       (match_operand:<VDBL> 1 "register_operand" "w")
+       (match_operand:<VDBL> 3 "vect_par_cnst_hi_half"))
+         (match_operand:VDC 2 "register_operand" "w")))]
+  "TARGET_FLOAT && !BYTES_BIG_ENDIAN"
+  "ext\\t%0.16b, %1.16b, %2.16b, #8"
+  [(set_attr "type" "neon_ext<q>")]
+)
+
+(define_insn "*aarch64_combine_high_low_internal_be<mode>"
+  [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand" "=w")
+       (vec_concat:<VDBL>
+         (match_operand:VDC 1 "register_operand" "w")
+      (vec_select:VDC
+       (match_operand:<VDBL> 2 "register_operand" "w")
+       (match_operand:<VDBL> 3 "vect_par_cnst_hi_half"))))]
+  "TARGET_FLOAT && BYTES_BIG_ENDIAN"
+  "ext\\t%0.16b, %1.16b, %2.16b, #8"
+  [(set_attr "type" "neon_ext<q>")]
+)
+
 ;; In this insn, operand 1 should be low, and operand 2 the high part of the
 ;; dest vector.
 
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/combine_ext.c b/gcc/testsuite/gcc.target/aarch64/simd/combine_ext.c
new file mode 100644
index 00000000000..27bcf310e19
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/combine_ext.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+
+#include <arm_neon.h>
+
+#ifndef TEST_COMBINE_HIGH_LOW_1
+#define TEST_COMBINE_HIGH_LOW_1(TYPE, SUFF)                            \
+  TYPE rev_##TYPE##_1 (TYPE x)                                         \
+  {                                                                    \
+    return vcombine_##SUFF (vget_high_##SUFF (x), vget_low_##SUFF (x)); \
+  }
+#endif
+
+#ifndef TEST_COMBINE_HIGH_LOW_2
+#define TEST_COMBINE_HIGH_LOW_2(TYPE, SUFF)                            \
+  TYPE rev_##TYPE##_2 (TYPE x, TYPE y)                                 \
+  {                                                                    \
+    return vcombine_##SUFF (vget_high_##SUFF (x), vget_low_##SUFF (y)); \
+  }
+#endif
+
+
+TEST_COMBINE_HIGH_LOW_1 (int8x16_t, s8)
+TEST_COMBINE_HIGH_LOW_1 (int16x8_t, s16)
+TEST_COMBINE_HIGH_LOW_1 (int32x4_t, s32)
+TEST_COMBINE_HIGH_LOW_1 (int64x2_t, s64)
+TEST_COMBINE_HIGH_LOW_1 (uint8x16_t, u8)
+TEST_COMBINE_HIGH_LOW_1 (uint16x8_t, u16)
+TEST_COMBINE_HIGH_LOW_1 (uint32x4_t, u32)
+TEST_COMBINE_HIGH_LOW_1 (uint64x2_t, u64)
+TEST_COMBINE_HIGH_LOW_1 (float16x8_t, f16)
+TEST_COMBINE_HIGH_LOW_1 (float32x4_t, f32)
+
+TEST_COMBINE_HIGH_LOW_2 (int8x16_t, s8)
+TEST_COMBINE_HIGH_LOW_2 (int16x8_t, s16)
+TEST_COMBINE_HIGH_LOW_2 (int32x4_t, s32)
+TEST_COMBINE_HIGH_LOW_2 (int64x2_t, s64)
+TEST_COMBINE_HIGH_LOW_2 (uint8x16_t, u8)
+TEST_COMBINE_HIGH_LOW_2 (uint16x8_t, u16)
+TEST_COMBINE_HIGH_LOW_2 (uint32x4_t, u32)
+TEST_COMBINE_HIGH_LOW_2 (uint64x2_t, u64)
+TEST_COMBINE_HIGH_LOW_2 (float16x8_t, f16)
+TEST_COMBINE_HIGH_LOW_2 (float32x4_t, f32)
+
+/* { dg-final { scan-assembler-times {ext\tv0.16b, v0.16b, v0.16b} 10 } } */
+/* { dg-final { scan-assembler-times {ext\tv0.16b, v0.16b, v1.16b} 10 { target aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-times {ext\tv0.16b, v1.16b, v0.16b} 10 { target aarch64_big_endian } } } */
-- 
2.43.0

Reply via email to