Implementing vmulx_* and vmulx_lane* NEON intrinsics

Hi all,

This series of patches focuses on the different vmulx_ and vmulx_lane NEON
intrinsic variants. All of the existing inline assembly implementations are
replaced with newly defined __builtin functions, and the missing intrinsics
are implemented with __builtins as well.

The rationale for the change from assembly to __builtin is that the compiler
would be able to do more optimisations like instruction scheduling. A new named
md pattern was added for the new fmulx __builtin.

Most vmulx_lane variants have been implemented as a combination of a vdup
followed by a vmulx_, rather than as separate __builtins.  The remaining
vmulx_lane intrinsics (vmulx(s|d)_lane*) were implemented using
__aarch64_vget_lane_any () and an appropriate vmulx. Four new nameless md
patterns were added so that the combine pass can match the different forms of
RTL generated by these combinations of intrinsics.

The rationale for this change is that in this way we are able to optimise
every use of a dup followed by an fmulx into the appropriate fmulx lane variant
instruction.

New test cases were added for all the implemented intrinsics. Also new tests
were added for the proper error reporting of out-of-bounds accesses to _lane
intrinsics.

Tested on targets aarch64-none-elf and aarch64_be-none-elf.

Dependencies: patch 2/3 depends on patch 1/3, and patch 3/3 depends on patch
2/3.

---

In this patch from the series, a single new md pattern is added: the one for
fmulx, from which all necessary __builtin functions are derived.

The intrinsics that already had an inline assembly implementation were
refactored to use the new __builtin functions. The rest, which had no
existing implementation, were also added. A single intrinsic was removed:
vmulx_lane_f32, since there was no test case that covered it and, moreover,
its implementation was wrong: it was in fact implementing vmulxq_lane_f32.

In addition, test cases for all new intrinsics were added. Tested on targets
aarch64-none-elf and aarch64_be-none-elf.

gcc/

2015-XX-XX  Bilyan Borisov  <bilyan.bori...@arm.com>

        * config/aarch64/aarch64-simd-builtins.def: BUILTIN declaration for
        fmulx...
        * config/aarch64/aarch64-simd.md: And its corresponding md pattern.
        * config/aarch64/arm_neon.h (vmulx_f32): Refactored to call fmulx
        __builtin, also moved.
        (vmulxq_f32): Same.
        (vmulx_f64): New, uses __builtin.
        (vmulxq_f64): Refactored to call fmulx __builtin, also moved.
        (vmulxs_f32): Same.
        (vmulxd_f64): Same.
        (vmulx_lane_f32): Removed, implementation was wrong.
        * config/aarch64/iterators.md: UNSPEC enum for fmulx.

gcc/testsuite/

2015-XX-XX  Bilyan Borisov  <bilyan.bori...@arm.com>

        * gcc.target/aarch64/simd/vmulx_f32_1.c: New.
        * gcc.target/aarch64/simd/vmulx_f64_1.c: New.
        * gcc.target/aarch64/simd/vmulxq_f32_1.c: New.
        * gcc.target/aarch64/simd/vmulxq_f64_1.c: New.
        * gcc.target/aarch64/simd/vmulxs_f32_1.c: New.
        * gcc.target/aarch64/simd/vmulxd_f64_1.c: New.

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 2c13cfb0823640254f02c202b19ddae78484d537..eed5f2b21997d4ea439dea828a0888cb253ad041 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -41,6 +41,7 @@
 
   BUILTIN_VDC (COMBINE, combine, 0)
   BUILTIN_VB (BINOP, pmul, 0)
+  BUILTIN_VALLF (BINOP, fmulx, 0)
   BUILTIN_VDQF_DF (UNOP, sqrt, 2)
   BUILTIN_VD_BHSI (BINOP, addp, 0)
   VAR1 (UNOP, addp, 0, di)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 541faf982effc7195a5f8d0d82738f76a7e04b4b..e7e8888bbd158d21691791a8d7db8a2616062e50 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2810,6 +2810,18 @@
   [(set_attr "type" "neon_mul_<Vetype><q>")]
 )
 
+;; fmulx.
+
+(define_insn "aarch64_fmulx<mode>"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+		       (match_operand:VALLF 2 "register_operand" "w")]
+		      UNSPEC_FMULX))]
+ "TARGET_SIMD"
+ "fmulx\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+ [(set_attr "type" "neon_fp_mul_<Vetype>")]
+)
+
 ;; <su>q<addsub>
 
 (define_insn "aarch64_<su_optab><optab><mode>"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 91ada618b79e038eb61e09ecd29af5129de81f51..4a3ef455b0945ed7e77fb3e78621d5010cd4c094 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8509,63 +8509,6 @@ vmulq_n_u32 (uint32x4_t a, uint32_t b)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmulx_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmulx %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmulx_lane_f32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("fmulx %0.2s,%1.2s,%2.s[%3]"                            \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vmulxd_f64 (float64_t a, float64_t b)
-{
-  float64_t result;
-  __asm__ ("fmulx %d0, %d1, %d2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulxq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmulx %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulxq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmulx %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 #define vmulxq_lane_f32(a, b, c)                                        \
   __extension__                                                         \
     ({                                                                  \
@@ -8592,17 +8535,6 @@ vmulxq_f64 (float64x2_t a, float64x2_t b)
        result;                                                          \
      })
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vmulxs_f32 (float32_t a, float32_t b)
-{
-  float32_t result;
-  __asm__ ("fmulx %s0, %s1, %s2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vmvn_p8 (poly8x8_t a)
 {
@@ -17778,6 +17710,43 @@ vmaxq_u32 (uint32x4_t __a, uint32x4_t __b)
   return (uint32x4_t) __builtin_aarch64_umaxv4si ((int32x4_t) __a,
 						  (int32x4_t) __b);
 }
+/* vmulx */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmulx_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2sf (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulxq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return __builtin_aarch64_fmulxv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vmulx_f64 (float64x1_t __a, float64x1_t __b)
+{
+  return (float64x1_t) {__builtin_aarch64_fmulxdf (__a[0], __b[0])};
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vmulxq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_fmulxv2df (__a, __b);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vmulxs_f32 (float32_t __a, float32_t __b)
+{
+  return __builtin_aarch64_fmulxsf (__a, __b);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vmulxd_f64 (float64_t __a, float64_t __b)
+{
+  return __builtin_aarch64_fmulxdf (__a, __b);
+}
 
 /* vpmax  */
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 38c5a2424e4989b962165640bcb7fe122c3648e8..d38b92b02fefd0bf9f0e1b9e6f16b0392dba7b61 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -239,6 +239,7 @@
     UNSPEC_SQDMULH	; Used in aarch64-simd.md.
     UNSPEC_SQRDMULH	; Used in aarch64-simd.md.
     UNSPEC_PMUL		; Used in aarch64-simd.md.
+    UNSPEC_FMULX	; Used in aarch64-simd.md.
     UNSPEC_USQADD	; Used in aarch64-simd.md.
     UNSPEC_SUQADD	; Used in aarch64-simd.md.
     UNSPEC_SQXTUN	; Used in aarch64-simd.md.
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulx_f32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulx_f32_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..d0d26c5695e4a565dcdef3209ec9cc6116c2aa3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulx_f32_1.c
@@ -0,0 +1,52 @@
+/* Test the vmulx_f32 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+void __attribute__ ((noinline))
+test_case (float32_t v1[2], float32_t v2[2], float32_t e[2])
+{
+  int i;
+  float32x2_t vec1_1 = vld1_f32 (v1);
+  float32x2_t vec1_2 = vld1_f32 (v2);
+
+  float32x2_t actual1 = vmulx_f32 (vec1_1, vec1_2);
+  float32_t actual[2];
+  vst1_f32 (actual, actual1);
+
+  for (i = 0; i < 2; ++i)
+    if (actual[i] != e[i])
+      abort ();
+}
+
+int
+main (void)
+{
+  float32_t v1 = 3.14159265359;
+  float32_t v2 = 1.383894;
+  float32_t v3 = -2.71828;
+  float32_t v4 = -3.4891931;
+
+  float32_t v1_1[] = {v1, v2};
+  float32_t v1_2[] = {v3, v4};
+  float32_t e1[] = {v1 * v3, v2 * v4};
+  test_case (v1_1, v1_2, e1);
+
+  float32_t v2_1[] = {0, -0.0};
+  float32_t v2_2[] = {__builtin_huge_valf (), __builtin_huge_valf ()};
+  float32_t e2[] = {2.0, -2.0};
+  test_case (v2_1, v2_2, e2);
+
+  float32_t v3_1[] = {0, -0.0};
+  float32_t v3_2[] = {-__builtin_huge_valf (), -__builtin_huge_valf ()};
+  float32_t e3[] = {-2.0, 2.0};
+  test_case (v3_1, v3_2, e3);
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.2s, ?\[vV\]\[0-9\]+\.2s, ?\[vV\]\[0-9\]+\.2s\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulx_f64_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulx_f64_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..5791d8a6b28b4f5bea54c8204c5fba0a67c09480
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulx_f64_1.c
@@ -0,0 +1,57 @@
+/* Test the vmulx_f64 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+void  __attribute__ ((noinline))
+test_case (float64_t v1[1], float64_t v2[1], float64_t e1[1])
+{
+  float64x1_t vec1_1 = vld1_f64 (v1);
+  float64x1_t vec1_2 = vld1_f64 (v2);
+
+  float64x1_t actual1 = vmulx_f64 (vec1_1, vec1_2);
+  float64_t actual[1];
+  vst1_f64 (actual, actual1);
+  if (actual[0] != e1[0])
+    abort ();
+}
+
+int
+main (void)
+{
+  float64_t v1 = 3.14159265359;
+  float64_t v2 = -2.71828;
+
+  float64_t v1_1[] = {v1};
+  float64_t v1_2[] = {v2};
+  float64_t e1[] = {v1 * v2};
+  test_case (v1_1, v1_2, e1);
+
+  float64_t v2_1[] = {0};
+  float64_t v2_2[] = {__builtin_huge_val ()};
+  float64_t e2[] = {2.0};
+  test_case (v2_1, v2_2, e2);
+
+  float64_t v3_1[] = {0};
+  float64_t v3_2[] = {-__builtin_huge_val ()};
+  float64_t e3[] = {-2.0};
+  test_case (v3_1, v3_2, e3);
+
+  float64_t v4_1[] = {-0.0};
+  float64_t v4_2[] = {__builtin_huge_val ()};
+  float64_t e4[] = {-2.0};
+  test_case (v4_1, v4_2, e4);
+
+  float64_t v5_1[] = {-0.0};
+  float64_t v5_2[] = {-__builtin_huge_val ()};
+  float64_t e5[] = {2.0};
+  test_case (v5_1, v5_2, e5);
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulxd_f64_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulxd_f64_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..d36e7428289c17799952aadb951c1cc9c964a3c3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulxd_f64_1.c
@@ -0,0 +1,36 @@
+/* Test the vmulxd_f64 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+void __attribute__ ((noinline))
+test_case (float64_t v1, float64_t v2, float64_t e1)
+{
+  float64_t actual1 = vmulxd_f64 (v1, v2);
+  if (actual1 != e1)
+    abort ();
+}
+
+int
+main (void)
+{
+  int i;
+  float64_t v1 = 3.14159265359;
+  float64_t v2 = 1.383894;
+  float64_t v3 = -2.71828;
+  float64_t v4 = -3.4891931;
+
+  test_case (v1, v2, v1 * v2);
+  test_case (0.0, __builtin_huge_val (), 2.0);
+  test_case (0.0, -__builtin_huge_val (), -2.0);
+  test_case (-0.0, __builtin_huge_val (), -2.0);
+  test_case (-0.0, -__builtin_huge_val (), 2.0);
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_f32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_f32_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..7f8dbd9c8deca7a7891bcf8ba890fc7af213ed78
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_f32_1.c
@@ -0,0 +1,48 @@
+/* Test the vmulxq_f32 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+void __attribute__ ((noinline))
+test_case (float32_t v1[4], float32_t v2[4], float32_t e[4])
+{
+  int i;
+  float32x4_t vec1_1 = vld1q_f32 (v1);
+  float32x4_t vec1_2 = vld1q_f32 (v2);
+
+  float32x4_t actual1 = vmulxq_f32 (vec1_1, vec1_2);
+  float32_t actual[4];
+  vst1q_f32 (actual, actual1);
+
+  for (i = 0; i < 4; ++i)
+    if (actual[i] != e[i])
+      abort ();
+}
+
+int
+main (void)
+{
+  float32_t v1 = 3.14159265359;
+  float32_t v2 = 1.383894;
+  float32_t v3 = -2.71828;
+  float32_t v4 = -3.4891931;
+
+  float32_t v1_1[] = {v1, v2, v3, v4};
+  float32_t v1_2[] = {v3, v4, v1, v2};
+  float32_t e1[] = {v1 * v3, v2 * v4, v3 * v1, v4 * v2};
+  test_case (v1_1, v1_2, e1);
+
+  float32_t v2_1[] = {0, -0.0, 0, -0.0};
+  float32_t v2_2[] = {-__builtin_huge_valf (), -__builtin_huge_valf (),
+		      __builtin_huge_valf (), __builtin_huge_valf () };
+  float32_t e2[] = {-2.0, 2.0, 2.0, -2.0};
+  test_case (v2_1, v2_2, e2);
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.4s, ?\[vV\]\[0-9\]+\.4s, ?\[vV\]\[0-9\]+\.4s\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_f64_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_f64_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..f306bd05301ddabf6cf6b01a235bf1b54b1fe5e8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulxq_f64_1.c
@@ -0,0 +1,51 @@
+/* Test the vmulxq_f64 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+void  __attribute__ ((noinline))
+test_case (float64_t v1[2], float64_t v2[2], float64_t e1[2])
+{
+  int i;
+  float64x2_t vec1_1 = vld1q_f64 (v1);
+  float64x2_t vec1_2 = vld1q_f64 (v2);
+
+  float64x2_t actual1 = vmulxq_f64 (vec1_1, vec1_2);
+  float64_t actual[2];
+  vst1q_f64 (actual, actual1);
+
+  for (i = 0; i < 2; ++i)
+    if (actual[i] != e1[i])
+      abort ();
+}
+
+int
+main (void)
+{
+  int i;
+  float64_t v1 = 3.14159265359;
+  float64_t v2 = -2.71828;
+
+  float64_t v1_1[] = {v1, v2};
+  float64_t v1_2[] = {v2, v1};
+  float64_t e1[] = {v1 * v2, v2* v1};
+  test_case (v1_1, v1_2, e1);
+
+  float64_t v2_1[] = {0, 0};
+  float64_t v2_2[] = {__builtin_huge_val (), -__builtin_huge_val ()};
+  float64_t e2[] = {2.0, -2.0};
+  test_case (v2_1, v2_2, e2);
+
+  float64_t v3_1[] = {-0.0, -0.0};
+  float64_t v3_2[] = {__builtin_huge_val (), -__builtin_huge_val ()};
+  float64_t e3[] = {-2.0, 2.0};
+  test_case (v3_1, v3_2, e3);
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[vV\]\[0-9\]+\.2\[dD\], ?\[vV\]\[0-9\]+\.2\[dD\], ?\[vV\]\[0-9\]+\.2\[dD\]\n" 1} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmulxs_f32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmulxs_f32_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..fc08e143b874b6501a1076720ca9467ddbb32ddb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmulxs_f32_1.c
@@ -0,0 +1,34 @@
+/* Test the vmulxs_f32 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-save-temps -O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+void __attribute__ ((noinline))
+test_case (float32_t v1, float32_t v2, float32_t e)
+{
+  float32_t actual = vmulxs_f32 (v1, v2);
+  if (actual != e)
+    abort ();
+}
+
+int
+main (void)
+{
+  float32_t v1 = 3.14159265359;
+  float32_t v2 = 1.383894;
+  float32_t v3 = -2.71828;
+  float32_t v4 = -3.4891931;
+
+  test_case (v1, v2, v1 * v2);
+  test_case (0.0, __builtin_huge_valf (), 2.0);
+  test_case (0.0, -__builtin_huge_valf (), -2.0);
+  test_case (-0.0, __builtin_huge_valf (), -2.0);
+  test_case (-0.0, -__builtin_huge_valf (), 2.0);
+
+  return 0;
+}
+/* { dg-final { scan-assembler-times "fmulx\[ \t\]+\[sS\]\[0-9\]+, ?\[sS\]\[0-9\]+, ?\[sS\]\[0-9\]+\n" 1 } } */

Reply via email to