[PATCH v2 10/16]AArch64: Add NEON RTL patterns for Complex Addition, Multiply and FMA.

Tamar Christina Fri, 25 Sep 2020 07:30:54 -0700

Hi All,

This adds implementation for the optabs for complex operations.  With this the
following C code:


  void f90 (float complex a[restrict N], float complex b[restrict N],
            float complex c[restrict N])
  {
    for (int i=0; i < N; i++)
      c[i] = a[i] + (b[i] * I);
  }

generates

  f90:
          mov     x3, 0
          .p2align 3,,7
  .L2:
          ldr     q0, [x0, x3]
          ldr     q1, [x1, x3]
          fcadd   v0.4s, v0.4s, v1.4s, #90
          str     q0, [x2, x3]
          add     x3, x3, 16
          cmp     x3, 1600
          bne     .L2
          ret

instead of

  f90:
          add     x3, x1, 1600
          .p2align 3,,7
  .L2:
          ld2     {v4.4s - v5.4s}, [x0], 32
          ld2     {v2.4s - v3.4s}, [x1], 32
          fsub    v0.4s, v4.4s, v3.4s
          fadd    v1.4s, v5.4s, v2.4s
          st2     {v0.4s - v1.4s}, [x2], 32
          cmp     x3, x1
          bne     .L2
          ret

It defined a new iterator VALL_ARITH which contains types for which we can do
general arithmetic (excludes bfloat16).

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64-simd.md (cadd<rot><mode>3,
        cml<fcmac1><rot_op><mode>4, cmul<rot_op><mode>3): New.
        * config/aarch64/iterators.md (VALL_ARITH, UNSPEC_FCMUL,
        UNSPEC_FCMUL180, UNSPEC_FCMLS, UNSPEC_FCMLS180, UNSPEC_CMLS,
        UNSPEC_CMLS180, UNSPEC_CMUL, UNSPEC_CMUL180, FCMLA_OP, FCMUL_OP, rot_op,
        rotsplit1, rotsplit2, fcmac1): New.
        (rot): Add UNSPEC_FCMLS, UNSPEC_FCMUL, UNSPEC_FCMUL180.

--

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 381a702eba003520d2e83e91065d2a808b9c6493..c2ddef19e4e433f7ca055e42d1222d9dad6bd6c2 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -449,6 +449,14 @@ (define_insn "aarch64_fcadd<rot><mode>"
   [(set_attr "type" "neon_fcadd")]
 )
 
+(define_expand "cadd<rot><mode>3"
+  [(set (match_operand:VHSDF 0 "register_operand")
+	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")
+		       (match_operand:VHSDF 2 "register_operand")]
+		       FCADD))]
+  "TARGET_COMPLEX"
+)
+
 (define_insn "aarch64_fcmla<rot><mode>"
   [(set (match_operand:VHSDF 0 "register_operand" "=w")
 	(plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0")
@@ -508,6 +516,45 @@ (define_insn "aarch64_fcmlaq_lane<rot><mode>"
   [(set_attr "type" "neon_fcmla")]
 )
 
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cml<fcmac1><rot_op><mode>4"
+  [(set (match_operand:VHSDF 0 "register_operand")
+	(plus:VHSDF (match_operand:VHSDF 1 "register_operand")
+		    (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand")
+				   (match_operand:VHSDF 3 "register_operand")]
+				   FCMLA_OP)))]
+  "TARGET_COMPLEX"
+{
+  emit_insn (gen_aarch64_fcmla<rotsplit1><mode> (operands[0], operands[1],
+						 operands[2], operands[3]));
+  emit_insn (gen_aarch64_fcmla<rotsplit2><mode> (operands[0], operands[0],
+						 operands[2], operands[3]));
+  DONE;
+})
+
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cmul<rot_op><mode>3"
+  [(set (match_operand:VHSDF 0 "register_operand")
+	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")
+		       (match_operand:VHSDF 2 "register_operand")]
+		       FCMUL_OP))]
+  "TARGET_COMPLEX"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_move_insn (tmp, CONST0_RTX (<MODE>mode));
+  emit_insn (gen_aarch64_fcmla<rotsplit1><mode> (operands[0], tmp,
+						 operands[1], operands[2]));
+  emit_insn (gen_aarch64_fcmla<rotsplit2><mode> (operands[0], operands[0],
+						 operands[1], operands[2]));
+  DONE;
+})
+
+
+
 ;; These instructions map to the __builtins for the Dot Product operations.
 (define_insn "aarch64_<sur>dot<vsi2qi>"
   [(set (match_operand:VS 0 "register_operand" "=w")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 054fd8515c6ebf136da699e2993f6ebb348c3b1a..98217c9fd3ee2b6063f7564193e400e9ef71c6ac 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -182,6 +182,11 @@ (define_mode_iterator V2F [V2SF V2DF])
 ;; All Advanced SIMD modes on which we support any arithmetic operations.
 (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for performing arithmetics.
+(define_mode_iterator VALL_ARITH [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+				  (V4HF "TARGET_SIMD_F16INST") (V8HF "TARGET_SIMD_F16INST")
+				  V2SF V4SF V2DF])
+
 ;; All Advanced SIMD modes suitable for moving, loading, and storing.
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
@@ -705,6 +710,10 @@ (define_c_enum "unspec"
     UNSPEC_FCMLA90	; Used in aarch64-simd.md.
     UNSPEC_FCMLA180	; Used in aarch64-simd.md.
     UNSPEC_FCMLA270	; Used in aarch64-simd.md.
+    UNSPEC_FCMUL	; Used in aarch64-simd.md.
+    UNSPEC_FCMUL180	; Used in aarch64-simd.md.
+    UNSPEC_FCMLS	; Used in aarch64-simd.md.
+    UNSPEC_FCMLS180	; Used in aarch64-simd.md.
     UNSPEC_ASRD		; Used in aarch64-sve.md.
     UNSPEC_ADCLB	; Used in aarch64-sve2.md.
     UNSPEC_ADCLT	; Used in aarch64-sve2.md.
@@ -723,6 +732,10 @@ (define_c_enum "unspec"
     UNSPEC_CMLA180	; Used in aarch64-sve2.md.
     UNSPEC_CMLA270	; Used in aarch64-sve2.md.
     UNSPEC_CMLA90	; Used in aarch64-sve2.md.
+    UNSPEC_CMLS		; Used in aarch64-sve2.md.
+    UNSPEC_CMLS180	; Used in aarch64-sve2.md.
+    UNSPEC_CMUL		; Used in aarch64-sve2.md.
+    UNSPEC_CMUL180	; Used in aarch64-sve2.md.
     UNSPEC_COND_FCVTLT	; Used in aarch64-sve2.md.
     UNSPEC_COND_FCVTNT	; Used in aarch64-sve2.md.
     UNSPEC_COND_FCVTX	; Used in aarch64-sve2.md.
@@ -2680,6 +2693,14 @@ (define_int_iterator FMMLA [UNSPEC_FMMLA])
 (define_int_iterator BF_MLA [UNSPEC_BFMLALB
 			     UNSPEC_BFMLALT])
 
+(define_int_iterator FCMLA_OP [UNSPEC_FCMLA
+			       UNSPEC_FCMLA180
+			       UNSPEC_FCMLS
+			       UNSPEC_FCMLS180])
+
+(define_int_iterator FCMUL_OP [UNSPEC_FCMUL
+			       UNSPEC_FCMUL180])
+
 ;; Iterators for atomic operations.
 
 (define_int_iterator ATOMIC_LDOP
@@ -3375,6 +3396,7 @@ (define_int_attr rot [(UNSPEC_CADD90 "90")
 		      (UNSPEC_CMLA270 "270")
 		      (UNSPEC_FCADD90 "90")
 		      (UNSPEC_FCADD270 "270")
+		      (UNSPEC_FCMLS "0")
 		      (UNSPEC_FCMLA "0")
 		      (UNSPEC_FCMLA90 "90")
 		      (UNSPEC_FCMLA180 "180")
@@ -3390,7 +3412,41 @@ (define_int_attr rot [(UNSPEC_CADD90 "90")
 		      (UNSPEC_COND_FCMLA "0")
 		      (UNSPEC_COND_FCMLA90 "90")
 		      (UNSPEC_COND_FCMLA180 "180")
-		      (UNSPEC_COND_FCMLA270 "270")])
+		      (UNSPEC_COND_FCMLA270 "270")
+		      (UNSPEC_FCMUL "0")
+		      (UNSPEC_FCMUL180 "180")])
+
+;; A conjucate is a rotation of 180* around the argand plane, or * I.
+(define_int_attr rot_op [(UNSPEC_FCMLS "")
+			 (UNSPEC_FCMLS180 "_conj")
+			 (UNSPEC_FCMLA "")
+			 (UNSPEC_FCMLA180 "_conj")
+			 (UNSPEC_FCMUL "")
+			 (UNSPEC_FCMUL180 "_conj")
+			 (UNSPEC_CMLS "")
+			 (UNSPEC_CMLA "")
+			 (UNSPEC_CMLA180 "_conj")
+			 (UNSPEC_CMUL "")
+			 (UNSPEC_CMUL180 "_conj")])
+
+(define_int_attr rotsplit1 [(UNSPEC_FCMLA "0")
+			    (UNSPEC_FCMLA180 "0")
+			    (UNSPEC_FCMUL "0")
+			    (UNSPEC_FCMUL180 "0")
+			    (UNSPEC_FCMLS "270")
+			    (UNSPEC_FCMLS180 "90")])
+
+(define_int_attr rotsplit2 [(UNSPEC_FCMLA "90")
+			    (UNSPEC_FCMLA180 "270")
+			    (UNSPEC_FCMUL "90")
+			    (UNSPEC_FCMUL180 "270")
+			    (UNSPEC_FCMLS "180")
+			    (UNSPEC_FCMLS180 "180")])
+
+(define_int_attr fcmac1 [(UNSPEC_FCMLA "a") (UNSPEC_FCMLA180 "a")
+			 (UNSPEC_FCMLS "s") (UNSPEC_FCMLS180 "s")
+			 (UNSPEC_CMLA "a") (UNSPEC_CMLA180 "a")
+			 (UNSPEC_CMLS "s") (UNSPEC_CMLS180 "s")])
 
 (define_int_attr sve_fmla_op [(UNSPEC_COND_FMLA "fmla")
 			      (UNSPEC_COND_FMLS "fmls")

[PATCH v2 10/16]AArch64: Add NEON RTL patterns for Complex Addition, Multiply and FMA.

Reply via email to