this patch enable auto-vectorization for copysignf by using vector
bit selection instruction on arm32 when neon available.

for a simple testcase:

  for (i = 0; i < N; i++)
    r[i] = __builtin_copysignf (a[i], b[i]);


assuming vector factor be 4, the generated instruction sequences is:

        vmov.i32        q10, #2147483648  @ v4si
.L2:
        vld1.64 {d18-d19}, [ip:64]
        add     r3, r3, #16
        add     ip, ip, #16
        vldr    d16, [r3, #-16]
        vldr    d17, [r3, #-8]
        vbif    q8, q9, q10
        vst1.32 {q8}, [r1]
        add     r1, r1, #16
        cmp     r1, lr
        bne     .L2

ok to install?

thanks.

gcc/
  * config/arm/arm.c (NEON_COPYSIGNF): New enum.
  (arm_init_neon_builtins): Support NEON_COPYSIGNF.
  (arm_builtin_vectorized_function): Likewise.
  * config/arm/arm_neon_builtins.def: New macro for copysignf.
  * config/arm/neon.md (neon_copysignf<mode>): New pattern for vector copysignf.

gcc/testsuite/
  * gcc.target/arm/vect-copysignf.c: New testcase.
commit 533b209f1899a1070394506ab32cc640de6a58e3
Author: Jiong Wang <jiong.w...@arm.com>
Date:   Thu Aug 14 11:54:41 2014 +0100

    vect copysignf.

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 2f8d327..045c56e 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -23243,6 +23243,7 @@ typedef enum {
   NEON_SETLANE,
   NEON_CREATE,
   NEON_RINT,
+  NEON_COPYSIGNF,
   NEON_DUP,
   NEON_DUPLANE,
   NEON_COMBINE,
@@ -24237,6 +24238,22 @@ arm_init_neon_builtins (void)
 	    ftype = build_function_type_list (eltype, eltype, NULL);
 	    break;
 	}
+	case NEON_COPYSIGNF:
+	  {
+	    tree eltype = NULL_TREE;
+	    switch (insn_data[d->code].operand[1].mode)
+	      {
+	      case V2SFmode:
+		eltype = V2SF_type_node;
+		break;
+	      case V4SFmode:
+		eltype = V4SF_type_node;
+		break;
+	      default: gcc_unreachable ();
+	      }
+	    ftype = build_function_type_list (eltype, eltype, NULL);
+	    break;
+	  }
 	default:
 	  gcc_unreachable ();
 	}
@@ -25440,6 +25457,7 @@ arm_expand_neon_builtin (int fcode, tree exp, rtx target)
       return arm_expand_neon_args (target, icode, 1, type_mode, exp, fcode,
         NEON_ARG_COPY_TO_REG, NEON_ARG_STOP);
 
+    case NEON_COPYSIGNF:
     case NEON_COMBINE:
     case NEON_VTBL:
       return arm_expand_neon_args (target, icode, 1, type_mode, exp, fcode,
@@ -29984,27 +30002,34 @@ arm_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
             return ARM_FIND_VRINT_VARIANT (vrinta);
 #undef ARM_CHECK_BUILTIN_MODE
 #define ARM_CHECK_BUILTIN_MODE(C, N) \
-  (out_mode == N##Imode && out_n == C \
-   && in_mode == N##Imode && in_n == C)
+  (out_mode == N##mode && out_n == C \
+   && in_mode == N##mode && in_n == C)
           case BUILT_IN_BSWAP16:
-            if (ARM_CHECK_BUILTIN_MODE (4, H))
+            if (ARM_CHECK_BUILTIN_MODE (4, HI))
               return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv4hi, false);
-            else if (ARM_CHECK_BUILTIN_MODE (8, H))
+            else if (ARM_CHECK_BUILTIN_MODE (8, HI))
               return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv8hi, false);
             else
               return NULL_TREE;
           case BUILT_IN_BSWAP32:
-            if (ARM_CHECK_BUILTIN_MODE (2, S))
+            if (ARM_CHECK_BUILTIN_MODE (2, SI))
               return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv2si, false);
-            else if (ARM_CHECK_BUILTIN_MODE (4, S))
+            else if (ARM_CHECK_BUILTIN_MODE (4, SI))
               return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv4si, false);
             else
               return NULL_TREE;
           case BUILT_IN_BSWAP64:
-            if (ARM_CHECK_BUILTIN_MODE (2, D))
+            if (ARM_CHECK_BUILTIN_MODE (2, DI))
               return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv2di, false);
             else
               return NULL_TREE;
+	  case BUILT_IN_COPYSIGNF:
+	    if (ARM_CHECK_BUILTIN_MODE (2, SF))
+              return arm_builtin_decl (ARM_BUILTIN_NEON_copysignfv2sf, false);
+	    else if (ARM_CHECK_BUILTIN_MODE (4, SF))
+              return arm_builtin_decl (ARM_BUILTIN_NEON_copysignfv4sf, false);
+	    else
+	      return NULL_TREE;
 
           default:
             return NULL_TREE;
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index f4531f3..a2e1301 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -135,6 +135,7 @@ VAR1 (FLOAT_WIDEN, vcvtv4sf, v4hf),
 VAR1 (FLOAT_NARROW, vcvtv4hf, v4sf),
 VAR10 (SELECT, vbsl,
 	 v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di),
+VAR2 (COPYSIGNF, copysignf, v2sf, v4sf),
 VAR2 (RINT, vrintn, v2sf, v4sf),
 VAR2 (RINT, vrinta, v2sf, v4sf),
 VAR2 (RINT, vrintp, v2sf, v4sf),
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index dc364ee..7a21a42 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -2569,6 +2569,33 @@
   DONE;
 })
 
+(define_expand "neon_copysignf<mode>"
+  [(match_operand:VCVTF 0 "register_operand")
+   (match_operand:VCVTF 1 "register_operand")
+   (match_operand:VCVTF 2 "register_operand")]
+  "TARGET_NEON"
+  "{
+     rtx v_bitmask_cast;
+     rtx v_bitmask = gen_reg_rtx (<VCVTF:V_cmp_result>mode);
+     int i, n_elt = GET_MODE_NUNITS (<MODE>mode);
+     rtvec v = rtvec_alloc (n_elt);
+
+     /* Create bitmask for vector select.  */
+     for (i = 0; i < n_elt; ++i)
+       RTVEC_ELT (v, i) = GEN_INT (0x80000000);
+
+     emit_move_insn (v_bitmask,
+		     gen_rtx_CONST_VECTOR (<VCVTF:V_cmp_result>mode, v));
+     emit_move_insn (operands[0], operands[2]);
+     v_bitmask_cast = simplify_gen_subreg (<MODE>mode, v_bitmask,
+					   <VCVTF:V_cmp_result>mode, 0);
+     emit_insn (gen_neon_vbsl<mode> (operands[0], v_bitmask_cast, operands[0],
+				     operands[1]));
+
+     DONE;
+  }"
+)
+
 (define_insn "neon_vqneg<mode>"
   [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
 	(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
diff --git a/gcc/testsuite/gcc.target/arm/vect-copysignf.c b/gcc/testsuite/gcc.target/arm/vect-copysignf.c
new file mode 100644
index 0000000..42f5560
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-copysignf.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details" } */
+/* { dg-add-options "arm_neon" } */
+
+extern void abort ();
+
+#define N 16
+float a[N] = {-0.1f, -3.2f, -6.3f, -9.4f,
+	      -12.5f, -15.6f, -18.7f, -21.8f,
+	      24.9f, 27.1f, 30.2f, 33.3f,
+	      36.4f, 39.5f, 42.6f, 45.7f};
+float b[N] = {-1.2f, 3.4f, -5.6f, 7.8f,
+	      -9.0f, 1.0f, -2.0f, 3.0f,
+	      -4.0f, -5.0f, 6.0f, 7.0f,
+	      -8.0f, -9.0f, 10.0f, 11.0f};
+float r[N];
+
+int
+main (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    r[i] = __builtin_copysignf (a[i], b[i]);
+
+  /* check results:  */
+  for (i = 0; i < N; i++)
+    if (r[i] != __builtin_copysignf (a[i], b[i]))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */

Reply via email to