https://gcc.gnu.org/g:fcc766c82cf8e0473ba54f1660c8282a7ce3231c

commit r15-2883-gfcc766c82cf8e0473ba54f1660c8282a7ce3231c
Author: Kyrylo Tkachov <ktkac...@nvidia.com>
Date:   Mon Aug 5 11:29:44 2024 -0700

    aarch64: Emit ADD X, Y, Y instead of SHL X, Y, #1 for Advanced SIMD
    
    On many cores, including Neoverse V2, the throughput of vector ADD
    instructions is higher than that of vector shifts like SHL.  We can lean
    on that to emit code like:
      add     v0.4s, v0.4s, v0.4s
    instead of:
      shl     v0.4s, v0.4s, 1
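    
    For illustration, a minimal C snippet (not part of this patch; the
    function name is made up) that produces such a shift by one and, with
    this change, is expected to assemble to the ADD form above:
    
      typedef int v4si __attribute__ ((vector_size (16)));
    
      v4si
      double_elems (v4si a)
      {
        return a << 1;   /* vector left shift by one  */
      }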
    
    LLVM already does this trick.
    In RTL the code gets canonicalised from (plus x x) to (ashift x 1), so I
    opted to do this at the final assembly-printing stage instead, similar
    to how we emit CMLT instead of SSHR elsewhere in the backend.
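    
    As a second illustrative sketch (not part of this patch; the function
    name is made up): because of that canonicalisation, adding a vector to
    itself at the source level reaches the same (ashift x 1) RTL, so it too
    is expected to emit the ADD form rather than SHL:
    
      typedef int v4si __attribute__ ((vector_size (16)));
    
      v4si
      twice (v4si a)
      {
        return a + a;   /* canonicalised to (ashift x 1) in RTL  */
      }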
    
    I'd like to also do this for SVE shifts, but those will have to be
    separate patches.
    
    Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com>
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-simd.md
            (aarch64_simd_imm_shl<mode><vczle><vczbe>): Rewrite to new
            syntax.  Add =w,w,vs1 alternative.
            * config/aarch64/constraints.md (vs1): New constraint.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/advsimd_shl_add.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md                 | 12 ++--
 gcc/config/aarch64/constraints.md                  |  6 ++
 gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c | 64 ++++++++++++++++++++++
 3 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index cc612ec2ca0e..475f19766c38 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1352,12 +1352,14 @@
 )
 
 (define_insn "aarch64_simd_imm_shl<mode><vczle><vczbe>"
- [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-       (ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
-                  (match_operand:VDQ_I  2 "aarch64_simd_lshift_imm" "Dl")))]
+ [(set (match_operand:VDQ_I 0 "register_operand")
+       (ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand")
+                  (match_operand:VDQ_I  2 "aarch64_simd_lshift_imm")))]
  "TARGET_SIMD"
-  "shl\t%0.<Vtype>, %1.<Vtype>, %2"
-  [(set_attr "type" "neon_shift_imm<q>")]
+  {@ [ cons: =0, 1,  2   ; attrs: type       ]
+     [ w       , w,  vs1 ; neon_add<q>       ] add\t%0.<Vtype>, %1.<Vtype>, %1.<Vtype>
+     [ w       , w,  Dl  ; neon_shift_imm<q> ] shl\t%0.<Vtype>, %1.<Vtype>, %2
+  }
 )
 
 (define_insn "aarch64_simd_reg_sshl<mode><vczle><vczbe>"
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index a2878f580d90..f491e4bd6a06 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -667,6 +667,12 @@
    SMAX and SMIN operations."
  (match_operand 0 "aarch64_sve_vsm_immediate"))
 
+(define_constraint "vs1"
+  "@internal
+ A constraint that matches a vector of immediate one."
+ (and (match_code "const,const_vector")
+      (match_test "op == CONST1_RTX (GET_MODE (op))")))
+
 (define_constraint "vsA"
   "@internal
    A constraint that matches an immediate operand valid for SVE FADD
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c b/gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c
new file mode 100644
index 000000000000..a161f89a3acc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-additional-options "--save-temps -O1" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef __INT64_TYPE__ __attribute__ ((vector_size (16))) v2di;
+typedef int __attribute__ ((vector_size (16))) v4si;
+typedef short __attribute__ ((vector_size (16))) v8hi;
+typedef char __attribute__ ((vector_size (16))) v16qi;
+typedef short __attribute__ ((vector_size (8))) v4hi;
+typedef char __attribute__ ((vector_size (8))) v8qi;
+
+#define FUNC(S) \
+S               \
+foo_##S (S a)   \
+{ return a << 1; }
+
+/*
+** foo_v2di:
+**      add    v0.2d, v0.2d, v0.2d
+**      ret
+*/
+
+FUNC (v2di)
+
+/*
+** foo_v4si:
+**      add    v0.4s, v0.4s, v0.4s
+**      ret
+*/
+
+FUNC (v4si)
+
+/*
+** foo_v8hi:
+**      add    v0.8h, v0.8h, v0.8h
+**      ret
+*/
+
+FUNC (v8hi)
+
+/*
+** foo_v16qi:
+**      add    v0.16b, v0.16b, v0.16b
+**      ret
+*/
+
+FUNC (v16qi)
+
+/*
+** foo_v4hi:
+**      add    v0.4h, v0.4h, v0.4h
+**      ret
+*/
+
+FUNC (v4hi)
+
+/*
+** foo_v8qi:
+**      add    v0.8b, v0.8b, v0.8b
+**      ret
+*/
+
+FUNC (v8qi)
+
