Hello!

We can use ROUNDSS/ROUNDSD in the round(a) expansion. Currently, with
-O2 -ffast-math, we expand round(a) as:

.LFB0:
        .cfi_startproc
        movsd   .LC1(%rip), %xmm1
        movapd  %xmm0, %xmm2
        movsd   .LC0(%rip), %xmm3
        andpd   %xmm1, %xmm2
        ucomisd %xmm2, %xmm3
        jbe     .L2
        addsd   .LC2(%rip), %xmm2
        andnpd  %xmm0, %xmm1
        movapd  %xmm1, %xmm0
        cvttsd2siq      %xmm2, %rax
        cvtsi2sdq       %rax, %xmm2
        orpd    %xmm2, %xmm0
.L2:
        rep
        ret

With -msse4 added, we now generate branchless code using roundsd
(immediate $1 selects rounding toward negative infinity, i.e. floor):

.LFB0:
        .cfi_startproc
        movsd   .LC0(%rip), %xmm2
        movapd  %xmm0, %xmm1
        andpd   %xmm2, %xmm1
        andnpd  %xmm0, %xmm2
        addsd   .LC1(%rip), %xmm1
        roundsd $1, %xmm1, %xmm1
        orpd    %xmm2, %xmm1
        movapd  %xmm1, %xmm0
        ret

The patch also simplifies a couple of checks in related patterns.

2011-08-14  Uros Bizjak  <ubiz...@gmail.com>

        * config/i386/i386.c (ix86_expand_round_sse4): New function.
        * config/i386/i386-protos.h (ix86_expand_round_sse4): New prototype.
        * config/i386/i386.md (round<mode>2): Use ix86_expand_round_sse4
        for TARGET_ROUND.

        (rint<mode>2): Simplify TARGET_ROUND check.
        (floor<mode>2): Ditto.
        (ceil<mode>2): Ditto.
        (btrunc<mode>2): Ditto.

Bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32},
will be committed to mainline soon.

Uros.
Index: i386.md
===================================================================
--- i386.md     (revision 177746)
+++ i386.md     (working copy)
@@ -14394,11 +14394,11 @@
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
       && !flag_trapping_math)
     {
-      if (!TARGET_ROUND && optimize_insn_for_size_p ())
-       FAIL;
       if (TARGET_ROUND)
        emit_insn (gen_sse4_1_round<mode>2
                   (operands[0], operands[1], GEN_INT (ROUND_MXCSR)));
+      else if (optimize_insn_for_size_p ())
+        FAIL;
       else
        ix86_expand_rint (operand0, operand1);
     }
@@ -14431,7 +14431,12 @@
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
       && !flag_trapping_math && !flag_rounding_math)
     {
-      if (TARGET_64BIT || (<MODE>mode != DFmode))
+      if (TARGET_ROUND)
+        {
+         operands[1] = force_reg (<MODE>mode, operands[1]);
+         ix86_expand_round_sse4 (operands[0], operands[1]);
+       }
+      else if (TARGET_64BIT || (<MODE>mode != DFmode))
        ix86_expand_round (operands[0], operands[1]);
       else
        ix86_expand_rounddf_32 (operands[0], operands[1]);
@@ -14663,14 +14668,13 @@
        && !flag_trapping_math)"
 {
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
-      && !flag_trapping_math
-      && (TARGET_ROUND || optimize_insn_for_speed_p ()))
+      && !flag_trapping_math)
     {
-      if (!TARGET_ROUND && optimize_insn_for_size_p ())
-       FAIL;
       if (TARGET_ROUND)
        emit_insn (gen_sse4_1_round<mode>2
                   (operands[0], operands[1], GEN_INT (ROUND_FLOOR)));
+      else if (optimize_insn_for_size_p ())
+        FAIL;
       else if (TARGET_64BIT || (<MODE>mode != DFmode))
        ix86_expand_floorceil (operand0, operand1, true);
       else
@@ -14922,8 +14926,7 @@
        && !flag_trapping_math)"
 {
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
-      && !flag_trapping_math
-      && (TARGET_ROUND || optimize_insn_for_speed_p ()))
+      && !flag_trapping_math)
     {
       if (TARGET_ROUND)
        emit_insn (gen_sse4_1_round<mode>2
@@ -15179,8 +15182,7 @@
        && !flag_trapping_math)"
 {
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
-      && !flag_trapping_math
-      && (TARGET_ROUND || optimize_insn_for_speed_p ()))
+      && !flag_trapping_math)
     {
       if (TARGET_ROUND)
        emit_insn (gen_sse4_1_round<mode>2
Index: i386-protos.h
===================================================================
--- i386-protos.h       (revision 177746)
+++ i386-protos.h       (working copy)
@@ -174,6 +174,7 @@
 extern void ix86_expand_rint (rtx, rtx);
 extern void ix86_expand_floorceil (rtx, rtx, bool);
 extern void ix86_expand_floorceildf_32 (rtx, rtx, bool);
+extern void ix86_expand_round_sse4 (rtx, rtx);
 extern void ix86_expand_round (rtx, rtx);
 extern void ix86_expand_rounddf_32 (rtx, rtx);
 extern void ix86_expand_trunc (rtx, rtx);
Index: i386.c
===================================================================
--- i386.c      (revision 177746)
+++ i386.c      (working copy)
@@ -32676,6 +32676,40 @@
 
   emit_move_insn (operand0, res);
 }
+
+/* Expand SSE sequence for computing round
+   from OP1 storing into OP0 using sse4 round insn.  */
+void
+ix86_expand_round_sse4 (rtx op0, rtx op1)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx e1, e2, e3, res, half, mask;
+
+  half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, mode);
+
+  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
+
+  /* e1 = fabs(op1) */
+  e1 = ix86_expand_sse_fabs (op1, &mask);
+
+  /* e2 = e1 + 0.5 */
+  half = force_reg (mode, half);
+  e2 = expand_simple_binop (mode, PLUS, e1, half, NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* e3 = floor(e2) */
+  e3 = gen_reg_rtx (mode);
+  emit_insn
+    (gen_rtx_SET (VOIDmode, e3,
+                 gen_rtx_UNSPEC (mode,
+                                 gen_rtvec (2, e2, GEN_INT (ROUND_FLOOR)),
+                                 UNSPEC_ROUND)));
+
+  /* res = copysign (e3, op1) */
+  res = gen_reg_rtx (mode);
+  ix86_sse_copysign_to_positive (res, e3, op1, mask);
+
+  emit_move_insn (op0, res);
+}
 
 
 /* Table of valid machine attributes.  */

Reply via email to