vec_combine is really one instruction on aarch64, provided that the lowpart element is in the same register as the destination vector. This patch adds patterns for that.
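
As a concrete illustration (not part of the patch itself; it simply mirrors the
s32q_1 test added below, and the function name is invented for the example),
combining two values that are already in FPRs now needs only a single INS,
because the low half is tied to the destination register:

#include <arm_neon.h>

int32x4_t combine_example(int32x2_t a0, int32x2_t a1) {
  /* On little-endian lp64, a0 arrives in the low half of v0, which is also
     the result register, so the expected code with this patch is:
         ins     v0.d[1], v1.d[0]
         ret  */
  return vcombine_s32 (a0, a1);
}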

The patch fixes a regression from GCC 8.  Before the patch:

int64x2_t s64q_1(int64_t a0, int64_t a1) {
  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    return (int64x2_t) { a1, a0 };
  else
    return (int64x2_t) { a0, a1 };
}

generated:

        fmov    d0, x0
        ins     v0.d[1], x1
        ins     v0.d[1], x1
        ret

whereas GCC 8 generated the more respectable:

        dup     v0.2d, x0
        ins     v0.d[1], x1
        ret

gcc/
	* config/aarch64/predicates.md (aarch64_reg_or_mem_pair_operand): New
	predicate.
	* config/aarch64/aarch64-simd.md (*aarch64_combine_internal<mode>)
	(*aarch64_combine_internal_be<mode>): New patterns.

gcc/testsuite/
	* gcc.target/aarch64/vec-init-9.c: New test.
	* gcc.target/aarch64/vec-init-10.c: Likewise.
	* gcc.target/aarch64/vec-init-11.c: Likewise.
---
 gcc/config/aarch64/aarch64-simd.md            |  62 ++++
 gcc/config/aarch64/predicates.md              |   4 +
 .../gcc.target/aarch64/vec-init-10.c          |  15 +
 .../gcc.target/aarch64/vec-init-11.c          |  12 +
 gcc/testsuite/gcc.target/aarch64/vec-init-9.c | 267 ++++++++++++++++++
 5 files changed, 360 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vec-init-10.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vec-init-11.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vec-init-9.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index d6cd4c70fe7..ead80396e70 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4326,6 +4326,25 @@ (define_insn "load_pair_lanes<mode>"
   [(set_attr "type" "neon_load1_1reg_q")]
 )
 
+;; This STP pattern is a partial duplicate of the general vec_concat patterns
+;; below.  The reason for having both of them is that the alternatives of
+;; the later patterns do not have consistent register preferences: the STP
+;; alternatives have no preference between GPRs and FPRs (and if anything,
+;; the GPR form is more natural for scalar integers) whereas the other
+;; alternatives *require* an FPR for operand 1 and prefer one for operand 2.
+;;
+;; Using "*" to hide the STP alternatives from the RA penalizes cases in
+;; which the destination was always memory.  On the other hand, expressing
+;; the true preferences makes GPRs seem more palatable than they really are
+;; for register destinations.
+;;
+;; Despite that, we do still want the general form to have STP alternatives,
+;; in order to handle cases where a register destination is spilled.
+;;
+;; The best compromise therefore seemed to be to have a dedicated STP
+;; pattern to catch cases in which the destination was always memory.
+;; This dedicated pattern must come first.
+
 (define_insn "store_pair_lanes<mode>"
   [(set (match_operand:<VDBL> 0 "aarch64_mem_pair_lanes_operand" "=Umn, Umn")
 	(vec_concat:<VDBL>
@@ -4338,6 +4357,49 @@ (define_insn "store_pair_lanes<mode>"
   [(set_attr "type" "neon_stp, store_16")]
 )
 
+;; Form a vector whose least significant half comes from operand 1 and whose
+;; most significant half comes from operand 2.  The register alternatives
+;; tie the least significant half to the same register as the destination,
+;; so that only the other half needs to be handled explicitly.  For the
+;; reasons given above, the STP alternatives use ? for constraints that
+;; the register alternatives either don't accept or themselves disparage.
+
+(define_insn "*aarch64_combine_internal<mode>"
+  [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn")
+	(vec_concat:<VDBL>
+	  (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r")
+	  (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, w, ?r")))]
+  "TARGET_SIMD
+   && !BYTES_BIG_ENDIAN
+   && (register_operand (operands[0], <VDBL>mode)
+       || register_operand (operands[2], <MODE>mode))"
+  "@
+   ins\t%0.d[1], %2.d[0]
+   ins\t%0.d[1], %2
+   ld1\t{%0.d}[1], %2
+   stp\t%d1, %d2, %y0
+   stp\t%x1, %x2, %y0"
+  [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")]
+)
+
+(define_insn "*aarch64_combine_internal_be<mode>"
+  [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn")
+	(vec_concat:<VDBL>
+	  (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, ?w, ?r")
+	  (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r")))]
+  "TARGET_SIMD
+   && BYTES_BIG_ENDIAN
+   && (register_operand (operands[0], <VDBL>mode)
+       || register_operand (operands[2], <MODE>mode))"
+  "@
+   ins\t%0.d[1], %2.d[0]
+   ins\t%0.d[1], %2
+   ld1\t{%0.d}[1], %2
+   stp\t%d2, %d1, %y0
+   stp\t%x2, %x1, %y0"
+  [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")]
+)
+
 ;; In this insn, operand 1 should be low, and operand 2 the high part of the
 ;; dest vector.
 
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 7dc4c155ea8..c308015ac2c 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -254,6 +254,10 @@ (define_predicate "aarch64_mem_pair_lanes_operand"
 						  false,
 						  ADDR_QUERY_LDP_STP_N)")))
 
+(define_predicate "aarch64_reg_or_mem_pair_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "aarch64_mem_pair_lanes_operand")))
+
 (define_predicate "aarch64_prefetch_operand"
   (match_test "aarch64_address_valid_for_prefetch_p (op, false)"))
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-10.c b/gcc/testsuite/gcc.target/aarch64/vec-init-10.c
new file mode 100644
index 00000000000..f5dd83b94b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-10.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+int64x2_t f1(int64_t *x, int c) {
+  return c ? (int64x2_t) { x[0], x[2] } : (int64x2_t) { 0, 0 };
+}
+
+int64x2_t f2(int64_t *x, int i0, int i1, int c) {
+  return c ? (int64x2_t) { x[i0], x[i1] } : (int64x2_t) { 0, 0 };
+}
+
+/* { dg-final { scan-assembler-times {\t(?:ldr\td[0-9]+|ld1\t)} 4 } } */
+/* { dg-final { scan-assembler-not {\tldr\tx} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-11.c b/gcc/testsuite/gcc.target/aarch64/vec-init-11.c
new file mode 100644
index 00000000000..df242702c0c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-11.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+void f1(int64x2_t *res, int64_t *x, int c0, int c1) {
+  res[0] = (int64x2_t) { c0 ? x[0] : 0, c1 ? x[2] : 0 };
+}
+
+/* { dg-final { scan-assembler-times {\tldr\tx[0-9]+} 2 } } */
+/* { dg-final { scan-assembler {\tstp\tx[0-9]+, x[0-9]+} } } */
+/* { dg-final { scan-assembler-not {\tldr\td} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-9.c b/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
new file mode 100644
index 00000000000..8f68e06a559
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
@@ -0,0 +1,267 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+
+void ext();
+
+/*
+** s64q_1:
+**	fmov	d0, x0
+**	ins	v0\.d\[1\], x1
+**	ret
+*/
+int64x2_t s64q_1(int64_t a0, int64_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (int64x2_t) { a1, a0 };
+  else
+    return (int64x2_t) { a0, a1 };
+}
+/*
+** s64q_2:
+**	fmov	d0, x0
+**	ld1	{v0\.d}\[1\], \[x1\]
+**	ret
+*/
+int64x2_t s64q_2(int64_t a0, int64_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (int64x2_t) { ptr[0], a0 };
+  else
+    return (int64x2_t) { a0, ptr[0] };
+}
+/*
+** s64q_3:
+**	ldr	d0, \[x0\]
+**	ins	v0\.d\[1\], x1
+**	ret
+*/
+int64x2_t s64q_3(int64_t *ptr, int64_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (int64x2_t) { a1, ptr[0] };
+  else
+    return (int64x2_t) { ptr[0], a1 };
+}
+/*
+** s64q_4:
+**	stp	x1, x2, \[x0\]
+**	ret
+*/
+void s64q_4(int64x2_t *res, int64_t a0, int64_t a1) {
+  res[0] = (int64x2_t) { a0, a1 };
+}
+/*
+** s64q_5:
+**	stp	x1, x2, \[x0, #?8\]
+**	ret
+*/
+void s64q_5(uintptr_t res, int64_t a0, int64_t a1) {
+  *(int64x2_t *)(res + 8) = (int64x2_t) { a0, a1 };
+}
+/*
+** s64q_6:
+**	...
+**	stp	x0, x1, .*
+**	...
+**	ldr	q0, .*
+**	...
+**	ret
+*/
+int64x2_t s64q_6(int64_t a0, int64_t a1) {
+  int64x2_t res = { a0, a1 };
+  ext ();
+  return res;
+}
+
+/*
+** f64q_1:
+**	ins	v0\.d\[1\], v1\.d\[0\]
+**	ret
+*/
+float64x2_t f64q_1(float64_t a0, float64_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (float64x2_t) { a1, a0 };
+  else
+    return (float64x2_t) { a0, a1 };
+}
+/*
+** f64q_2:
+**	ld1	{v0\.d}\[1\], \[x0\]
+**	ret
+*/
+float64x2_t f64q_2(float64_t a0, float64_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (float64x2_t) { ptr[0], a0 };
+  else
+    return (float64x2_t) { a0, ptr[0] };
+}
+/*
+** f64q_3:
+**	ldr	d0, \[x0\]
+**	ins	v0\.d\[1\], v1\.d\[0\]
+**	ret
+*/
+float64x2_t f64q_3(float64_t a0, float64_t a1, float64_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (float64x2_t) { a1, ptr[0] };
+  else
+    return (float64x2_t) { ptr[0], a1 };
+}
+/*
+** f64q_4:
+**	stp	d0, d1, \[x0\]
+**	ret
+*/
+void f64q_4(float64x2_t *res, float64_t a0, float64_t a1) {
+  res[0] = (float64x2_t) { a0, a1 };
+}
+/*
+** f64q_5:
+**	stp	d0, d1, \[x0, #?8\]
+**	ret
+*/
+void f64q_5(uintptr_t res, float64_t a0, float64_t a1) {
+  *(float64x2_t *)(res + 8) = (float64x2_t) { a0, a1 };
+}
+/*
+** f64q_6:
+**	...
+**	stp	d0, d1, .*
+**	...
+**	ldr	q0, .*
+**	...
+**	ret
+*/
+float64x2_t f64q_6(float64_t a0, float64_t a1) {
+  float64x2_t res = { a0, a1 };
+  ext ();
+  return res;
+}
+
+/*
+** s32q_1:
+**	ins	v0\.d\[1\], v1\.d\[0\]
+**	ret
+*/
+int32x4_t s32q_1(int32x2_t a0, int32x2_t a1) {
+  return vcombine_s32 (a0, a1);
+}
+/*
+** s32q_2:
+**	ld1	{v0\.d}\[1\], \[x0\]
+**	ret
+*/
+int32x4_t s32q_2(int32x2_t a0, int32x2_t *ptr) {
+  return vcombine_s32 (a0, ptr[0]);
+}
+/*
+** s32q_3:
+**	ldr	d0, \[x0\]
+**	ins	v0\.d\[1\], v1\.d\[0\]
+**	ret
+*/
+int32x4_t s32q_3(int32x2_t a0, int32x2_t a1, int32x2_t *ptr) {
+  return vcombine_s32 (ptr[0], a1);
+}
+/*
+** s32q_4:
+**	stp	d0, d1, \[x0\]
+**	ret
+*/
+void s32q_4(int32x4_t *res, int32x2_t a0, int32x2_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    res[0] = vcombine_s32 (a1, a0);
+  else
+    res[0] = vcombine_s32 (a0, a1);
+}
+/*
+** s32q_5:
+**	stp	d0, d1, \[x0, #?8\]
+**	ret
+*/
+void s32q_5(uintptr_t res, int32x2_t a0, int32x2_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    *(int32x4_t *)(res + 8) = vcombine_s32 (a1, a0);
+  else
+    *(int32x4_t *)(res + 8) = vcombine_s32 (a0, a1);
+}
+/*
+** s32q_6:
+**	...
+**	stp	d0, d1, .*
+**	...
+**	ldr	q0, .*
+**	...
+**	ret
+*/
+int32x4_t s32q_6(int32x2_t a0, int32x2_t a1) {
+  int32x4_t res = (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+		   ? vcombine_s32 (a1, a0)
+		   : vcombine_s32 (a0, a1));
+  ext ();
+  return res;
+}
+
+/*
+** f32q_1:
+**	ins	v0\.d\[1\], v1\.d\[0\]
+**	ret
+*/
+float32x4_t f32q_1(float32x2_t a0, float32x2_t a1) {
+  return vcombine_f32 (a0, a1);
+}
+/*
+** f32q_2:
+**	ld1	{v0\.d}\[1\], \[x0\]
+**	ret
+*/
+float32x4_t f32q_2(float32x2_t a0, float32x2_t *ptr) {
+  return vcombine_f32 (a0, ptr[0]);
+}
+/*
+** f32q_3:
+**	ldr	d0, \[x0\]
+**	ins	v0\.d\[1\], v1\.d\[0\]
+**	ret
+*/
+float32x4_t f32q_3(float32x2_t a0, float32x2_t a1, float32x2_t *ptr) {
+  return vcombine_f32 (ptr[0], a1);
+}
+/*
+** f32q_4:
+**	stp	d0, d1, \[x0\]
+**	ret
+*/
+void f32q_4(float32x4_t *res, float32x2_t a0, float32x2_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    res[0] = vcombine_f32 (a1, a0);
+  else
+    res[0] = vcombine_f32 (a0, a1);
+}
+/*
+** f32q_5:
+**	stp	d0, d1, \[x0, #?8\]
+**	ret
+*/
+void f32q_5(uintptr_t res, float32x2_t a0, float32x2_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    *(float32x4_t *)(res + 8) = vcombine_f32 (a1, a0);
+  else
+    *(float32x4_t *)(res + 8) = vcombine_f32 (a0, a1);
+}
+/*
+** f32q_6:
+**	...
+**	stp	d0, d1, .*
+**	...
+**	ldr	q0, .*
+**	...
+**	ret
+*/
+float32x4_t f32q_6(float32x2_t a0, float32x2_t a1) {
+  float32x4_t res = (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+		     ? vcombine_f32 (a1, a0)
+		     : vcombine_f32 (a0, a1));
+  ext ();
+  return res;
+}
-- 
2.25.1