Hello! We can use ROUNDSP/ROUNDSD in round(a) expansion. Currently, we expand round(a) as (-O2 -ffast-math):
.LFB0: .cfi_startproc movsd .LC1(%rip), %xmm1 movapd %xmm0, %xmm2 movsd .LC0(%rip), %xmm3 andpd %xmm1, %xmm2 ucomisd %xmm2, %xmm3 jbe .L2 addsd .LC2(%rip), %xmm2 andnpd %xmm0, %xmm1 movapd %xmm1, %xmm0 cvttsd2siq %xmm2, %rax cvtsi2sdq %rax, %xmm2 orpd %xmm2, %xmm0 .L2: rep ret Adding -msse4, we now generate branchless code using roundsd: .LFB0: .cfi_startproc movsd .LC0(%rip), %xmm2 movapd %xmm0, %xmm1 andpd %xmm2, %xmm1 andnpd %xmm0, %xmm2 addsd .LC1(%rip), %xmm1 roundsd $1, %xmm1, %xmm1 orpd %xmm2, %xmm1 movapd %xmm1, %xmm0 ret The patch also simplifies a couple of checks in related patterns. 2011-08-14 Uros Bizjak <ubiz...@gmail.com> * config/i386/i386.c (ix86_expand_round_sse4): New function. * config/i386/i386-protos.h (ix86_expand_round_sse4): New prototype. * config/i386/i386.md (round<mode>2): Use ix86_expand_round_sse4 for TARGET_ROUND. (rint<mode>2): Simplify TARGET_ROUND check. (floor<mode>2): Ditto. (ceil<mode>2): Ditto. (btrunc<mode>2): Ditto. Bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32}, will be committed to mainline soon. Uros.
Index: i386.md =================================================================== --- i386.md (revision 177746) +++ i386.md (working copy) @@ -14394,11 +14394,11 @@ if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH && !flag_trapping_math) { - if (!TARGET_ROUND && optimize_insn_for_size_p ()) - FAIL; if (TARGET_ROUND) emit_insn (gen_sse4_1_round<mode>2 (operands[0], operands[1], GEN_INT (ROUND_MXCSR))); + else if (optimize_insn_for_size_p ()) + FAIL; else ix86_expand_rint (operand0, operand1); } @@ -14431,7 +14431,12 @@ if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH && !flag_trapping_math && !flag_rounding_math) { - if (TARGET_64BIT || (<MODE>mode != DFmode)) + if (TARGET_ROUND) + { + operands[1] = force_reg (<MODE>mode, operands[1]); + ix86_expand_round_sse4 (operands[0], operands[1]); + } + else if (TARGET_64BIT || (<MODE>mode != DFmode)) ix86_expand_round (operands[0], operands[1]); else ix86_expand_rounddf_32 (operands[0], operands[1]); @@ -14663,14 +14668,13 @@ && !flag_trapping_math)" { if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH - && !flag_trapping_math - && (TARGET_ROUND || optimize_insn_for_speed_p ())) + && !flag_trapping_math) { - if (!TARGET_ROUND && optimize_insn_for_size_p ()) - FAIL; if (TARGET_ROUND) emit_insn (gen_sse4_1_round<mode>2 (operands[0], operands[1], GEN_INT (ROUND_FLOOR))); + else if (optimize_insn_for_size_p ()) + FAIL; else if (TARGET_64BIT || (<MODE>mode != DFmode)) ix86_expand_floorceil (operand0, operand1, true); else @@ -14922,8 +14926,7 @@ && !flag_trapping_math)" { if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH - && !flag_trapping_math - && (TARGET_ROUND || optimize_insn_for_speed_p ())) + && !flag_trapping_math) { if (TARGET_ROUND) emit_insn (gen_sse4_1_round<mode>2 @@ -15179,8 +15182,7 @@ && !flag_trapping_math)" { if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH - && !flag_trapping_math - && (TARGET_ROUND || optimize_insn_for_speed_p ())) + && !flag_trapping_math) { if (TARGET_ROUND) emit_insn (gen_sse4_1_round<mode>2 Index: i386-protos.h =================================================================== --- i386-protos.h (revision 177746) +++ i386-protos.h (working copy) @@ -174,6 +174,7 @@ extern void ix86_expand_rint (rtx, rtx); extern void ix86_expand_floorceil (rtx, rtx, bool); extern void ix86_expand_floorceildf_32 (rtx, rtx, bool); +extern void ix86_expand_round_sse4 (rtx, rtx); extern void ix86_expand_round (rtx, rtx); extern void ix86_expand_rounddf_32 (rtx, rtx); extern void ix86_expand_trunc (rtx, rtx); Index: i386.c =================================================================== --- i386.c (revision 177746) +++ i386.c (working copy) @@ -32676,6 +32676,40 @@ emit_move_insn (operand0, res); } + +/* Expand SSE sequence for computing round + from OP1 storing into OP0 using sse4 round insn. */ +void +ix86_expand_round_sse4 (rtx op0, rtx op1) +{ + enum machine_mode mode = GET_MODE (op0); + rtx e1, e2, e3, res, half, mask; + + half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, mode); + + /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */ + + /* e1 = fabs(op1) */ + e1 = ix86_expand_sse_fabs (op1, &mask); + + /* e2 = e1 + 0.5 */ + half = force_reg (mode, half); + e2 = expand_simple_binop (mode, PLUS, e1, half, NULL_RTX, 0, OPTAB_DIRECT); + + /* e3 = floor(e2) */ + e3 = gen_reg_rtx (mode); + emit_insn + (gen_rtx_SET (VOIDmode, e3, + gen_rtx_UNSPEC (mode, + gen_rtvec (2, e2, GEN_INT (ROUND_FLOOR)), + UNSPEC_ROUND))); + + /* res = copysign (e3, op1) */ + res = gen_reg_rtx (mode); + ix86_sse_copysign_to_positive (res, e3, op1, mask); + + emit_move_insn (op0, res); +} /* Table of valid machine attributes. */