Hi, Currently GCC generates a store followed by a load for __builtin_signbit for both IBM long double and IEEE-128 floating point. This patch, originating with Michael Meissner, adds logic to instead use direct move instructions. The original patch also added direct-move support for single- and double-precision floating point, but Mike was concerned that performance on P8 may not have been improved. So for now I've removed those portions of the patch until we can do some studies.
This leads to one strange-looking bit of code in rs6000.md, where we have a mode attribute named Fsignbit that always maps to the wa constraint. Once the SF and DF support goes in, that will be more interesting, so I'd like to put it in this way to make it obvious how this should be modified later. The patch includes compile-only test variants for P8 and P9, and a runtime test that assumes only P7-level code generation so that software float128 is used. Bootstrapped and tested on powerpc64[le]-unknown-linux-gnu with no regressions. Is this ok for trunk, and later for 6.2? Thanks, Bill [gcc] 2016-06-28 Michael Meissner <meiss...@linux.vnet.ibm.com> Bill Schmidt <wschm...@linux.vnet.ibm.com> * config/rs6000/rs6000-protos.h (rs6000_split_signbit): New prototype. * config/rs6000/rs6000.c (rs6000_split_signbit): New function. * config/rs6000/rs6000.md (UNSPEC_SIGNBIT): New constant. (SIGNBIT): New mode iterator. (Fsignbit): New mode attribute. (signbit<mode>2): Change operand1 to match FLOAT128 instead of IBM128; dispatch to gen_signbit{kf,tf}2_dm for __float128 when direct moves are available. (signbit<mode>2_dm): New define_insn_and_split). (signbit<mode>2_dm2): New define_insn. [gcc/testsuite] 2016-06-28 Michael Meissner <meiss...@linux.vnet.ibm.com> Bill Schmidt <wschm...@linux.vnet.ibm.com> * gcc.target/powerpc/signbit-1.c: New test. * gcc.target/powerpc/signbit-2.c: New test. * gcc.target/powerpc/signbit-3.c: New test. Index: gcc/config/rs6000/rs6000-protos.h =================================================================== --- gcc/config/rs6000/rs6000-protos.h (revision 237802) +++ gcc/config/rs6000/rs6000-protos.h (working copy) @@ -135,6 +135,7 @@ extern bool rs6000_emit_set_const (rtx, rtx); extern int rs6000_emit_cmove (rtx, rtx, rtx, rtx); extern int rs6000_emit_vector_cond_expr (rtx, rtx, rtx, rtx, rtx, rtx); extern void rs6000_emit_minmax (rtx, enum rtx_code, rtx, rtx); +extern void rs6000_split_signbit (rtx, rtx); extern void rs6000_expand_atomic_compare_and_swap (rtx op[]); extern void rs6000_expand_atomic_exchange (rtx op[]); extern void rs6000_expand_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx); Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (revision 237802) +++ gcc/config/rs6000/rs6000.c (working copy) @@ -23135,6 +23135,68 @@ rs6000_emit_minmax (rtx dest, enum rtx_code code, emit_move_insn (dest, target); } +/* Split a signbit operation on 64-bit machines with direct move. Also allow + for the value to come from memory or if it is already loaded into a GPR. */ + +void +rs6000_split_signbit (rtx dest, rtx src) +{ + machine_mode d_mode = GET_MODE (dest); + machine_mode s_mode = GET_MODE (src); + rtx dest_di = (d_mode == DImode) ? dest : gen_lowpart (DImode, dest); + rtx shift_reg = dest_di; + + gcc_assert (REG_P (dest)); + gcc_assert (REG_P (src) || MEM_P (src)); + + if (MEM_P (src)) + { + rtx mem; + + if (s_mode == SFmode) + mem = gen_rtx_SIGN_EXTEND (DImode, adjust_address (src, SImode, 0)); + + else if (GET_MODE_SIZE (s_mode) == 16 && !WORDS_BIG_ENDIAN) + mem = adjust_address (src, DImode, 8); + + else + mem = adjust_address (src, DImode, 0); + + emit_insn (gen_rtx_SET (dest_di, mem)); + } + + else + { + unsigned int r = REGNO (src); + + /* If this is a VSX register, generate the special mfvsrd instruction + to get it in a GPR. */ + if (VSX_REGNO_P (r)) + emit_insn (gen_rtx_SET (dest_di, + gen_rtx_UNSPEC (DImode, + gen_rtvec (2, src, const0_rtx), + UNSPEC_SIGNBIT))); + + else + { + gcc_assert (INT_REGNO_P (r)); + + /* We need to sign extend SFmode if it lives in a GPR. */ + if (s_mode == SFmode) + { + emit_insn (gen_extendsidi2 (dest_di, gen_lowpart (SImode, src))); + shift_reg = dest_di; + } + + else + shift_reg = gen_highpart (DImode, src); + } + } + + emit_insn (gen_lshrdi3 (dest_di, shift_reg, GEN_INT (63))); + return; +} + /* A subroutine of the atomic operation splitters. Jump to LABEL if COND is true. Mark the jump as unlikely to be taken. */ Index: gcc/config/rs6000/rs6000.md =================================================================== --- gcc/config/rs6000/rs6000.md (revision 237802) +++ gcc/config/rs6000/rs6000.md (working copy) @@ -147,6 +147,7 @@ UNSPEC_ROUND_TO_ODD UNSPEC_IEEE128_MOVE UNSPEC_IEEE128_CONVERT + UNSPEC_SIGNBIT ]) ;; @@ -506,6 +507,13 @@ (IF "TARGET_FLOAT128") (TF "TARGET_LONG_DOUBLE_128")]) +; Iterator for signbit on 64-bit machines with direct move +(define_mode_iterator SIGNBIT [(KF "FLOAT128_VECTOR_P (KFmode)") + (TF "FLOAT128_VECTOR_P (TFmode)")]) + +(define_mode_attr Fsignbit [(KF "wa") + (TF "wa")]) + ; SF/DF suffix for traditional floating instructions (define_mode_attr Ftrad [(SF "s") (DF "")]) @@ -4559,7 +4567,7 @@ ;; when little-endian. (define_expand "signbit<mode>2" [(set (match_dup 2) - (float_truncate:DF (match_operand:IBM128 1 "gpc_reg_operand" ""))) + (float_truncate:DF (match_operand:FLOAT128 1 "gpc_reg_operand" ""))) (set (match_dup 3) (subreg:DI (match_dup 2) 0)) (set (match_dup 4) @@ -4567,8 +4575,25 @@ (set (match_operand:SI 0 "gpc_reg_operand" "") (match_dup 6))] "TARGET_HARD_FLOAT - && (TARGET_FPRS || TARGET_E500_DOUBLE)" + && (TARGET_FPRS || TARGET_E500_DOUBLE) + && (!FLOAT128_IEEE_P (<MODE>mode) + || (TARGET_POWERPC64 && TARGET_DIRECT_MOVE))" { + if (FLOAT128_IEEE_P (<MODE>mode)) + { + if (<MODE>mode == KFmode) + { + emit_insn (gen_signbitkf2_dm (operands[0], operands[1])); + DONE; + } + else if (<MODE>mode == TFmode) + { + emit_insn (gen_signbittf2_dm (operands[0], operands[1])); + DONE; + } + else + gcc_unreachable (); + } operands[2] = gen_reg_rtx (DFmode); operands[3] = gen_reg_rtx (DImode); if (TARGET_POWERPC64) @@ -4616,6 +4641,37 @@ operands[5] = CONST0_RTX (<MODE>mode); }) +;; Optimize signbit on 64-bit systems with direct move to avoid doing the store +;; and load +(define_insn_and_split "signbit<mode>2_dm" + [(set (match_operand:SI 0 "gpc_reg_operand" "=r,r,r") + (unspec:SI + [(match_operand:SIGNBIT 1 "input_operand" "<Fsignbit>,m,r")] + UNSPEC_SIGNBIT))] + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rs6000_split_signbit (operands[0], operands[1]); + DONE; +} + [(set_attr "length" "8,8,12") + (set_attr "type" "mftgpr,load,integer")]) + +;; MODES_TIEABLE_P doesn't allow DImode to be tied with the various floating +;; point types, which makes normal SUBREG's problematical. Instead use a +;; special pattern to avoid using a normal movdi. +(define_insn "signbit<mode>2_dm2" + [(set (match_operand:DI 0 "gpc_reg_operand" "=r") + (unspec:DI [(match_operand:SIGNBIT 1 "gpc_reg_operand" "<Fsignbit>") + (const_int 0)] + UNSPEC_SIGNBIT))] + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" + "mfvsrd %0,%x1" + [(set_attr "type" "mftgpr")]) + + ;; Use an unspec rather providing an if-then-else in RTL, to prevent the ;; compiler from optimizing -0.0 (define_insn "copysign<mode>3_fcpsgn" Index: gcc/testsuite/gcc.target/powerpc/signbit-1.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/signbit-1.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/signbit-1.c (working copy) @@ -0,0 +1,16 @@ +/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-require-effective-target powerpc_p8vector_ok } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O2 -mfloat128" } */ + +int do_signbit_kf (__float128 a) { return __builtin_signbit (a); } +int do_signbit_if (__ibm128 a) { return __builtin_signbit (a); } +int do_signbit_tf (long double a) { return __builtin_signbit (a); } + +/* { dg-final { scan-assembler-not "stxvd2x" } } */ +/* { dg-final { scan-assembler-not "stxvw4x" } } */ +/* { dg-final { scan-assembler-not "stxsd" } } */ +/* { dg-final { scan-assembler-not "stxsdx" } } */ +/* { dg-final { scan-assembler-times "mfvsrd" 3 } } */ +/* { dg-final { scan-assembler-times "srdi" 3 } } */ Index: gcc/testsuite/gcc.target/powerpc/signbit-2.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/signbit-2.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/signbit-2.c (working copy) @@ -0,0 +1,18 @@ +/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-require-effective-target powerpc_p9vector_ok } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */ +/* { dg-options "-mcpu=power9 -O2 -mfloat128" } */ + +int do_signbit_kf (__float128 *a) { return __builtin_signbit (*a); } + +/* { dg-final { scan-assembler-not "stxvd2x" } } */ +/* { dg-final { scan-assembler-not "stxvw4x" } } */ +/* { dg-final { scan-assembler-not "stxsd" } } */ +/* { dg-final { scan-assembler-not "stxsdx" } } */ +/* { dg-final { scan-assembler-not "lxvd2x" } } */ +/* { dg-final { scan-assembler-not "lxvw4x" } } */ +/* { dg-final { scan-assembler-not "lxsd" } } */ +/* { dg-final { scan-assembler-not "lxsdx" } } */ +/* { dg-final { scan-assembler-times "ld" 1 } } */ +/* { dg-final { scan-assembler-times "srdi" 1 } } */ Index: gcc/testsuite/gcc.target/powerpc/signbit-3.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/signbit-3.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/signbit-3.c (working copy) @@ -0,0 +1,172 @@ +/* { dg-do run { target { powerpc*-*-linux* } } } */ +/* { dg-require-effective-target ppc_float128_sw } */ +/* { dg-options "-mcpu=power7 -O2 -mfloat128 -static-libgcc -lm" } */ + +#ifdef DEBUG +#include <stdio.h> +#endif + +#include <stddef.h> +#include <stdint.h> +#include <inttypes.h> +#include <stdlib.h> +#include <math.h> + +#if defined(__BIG_ENDIAN__) +struct ieee128 { + uint64_t upper; + uint64_t lower; +}; + +#elif defined(__LITTLE_ENDIAN__) +struct ieee128 { + uint64_t lower; + uint64_t upper; +}; + +#else +#error "Unknown system" +#endif + +union ieee_union { + __float128 f128; + struct ieee128 st128; +}; + +#ifdef DEBUG +static int num_errors = 0; + +__attribute__((__noinline__)) +static void +failure (int expected, int got, __float128 x) +{ + unsigned sign; + unsigned exponent; + uint64_t mantissa1; + uint64_t mantissa2; + uint64_t upper; + uint64_t lower; + + union ieee_union u; + + u.f128 = x; + upper = u.st128.upper; + lower = u.st128.lower; + + sign = (unsigned)((upper >> 63) & 1); + exponent = (unsigned)((upper >> 48) & ((((uint64_t)1) << 16) - 1)); + mantissa1 = (upper & ((((uint64_t)1) << 48) - 1)); + mantissa2 = lower; + + printf ("Expected %d, got %d, %c 0x%.4x 0x%.12" PRIx64 " 0x%.16" PRIx64, + expected, got, + sign ? '-' : '+', + exponent, + mantissa1, + mantissa2); + + num_errors++; +} + +#else + +#define failure(E, G, F) abort () +#endif + +__attribute__((__noinline__)) +static void +test_signbit_arg (__float128 f128, int expected) +{ + int sign = __builtin_signbit (f128); + + if ((expected != 0 && sign == 0) + || (expected == 0 && sign != 0)) + failure (f128, expected, sign); +} + +__attribute__((__noinline__)) +static void +test_signbit_mem (__float128 *ptr, int expected) +{ + int sign = __builtin_signbit (*ptr); + + if ((expected != 0 && sign == 0) + || (expected == 0 && sign != 0)) + failure (*ptr, expected, sign); +} + +__attribute__((__noinline__)) +static void +test_signbit_gpr (__float128 *ptr, int expected) +{ + __float128 f128 = *ptr; + int sign; + + __asm__ (" # %0" : "+r" (f128)); + + sign = __builtin_signbit (f128); + if ((expected != 0 && sign == 0) + || (expected == 0 && sign != 0)) + failure (f128, expected, sign); +} + +__attribute__((__noinline__)) +static void +test_signbit (__float128 f128, int expected) +{ +#ifdef DEBUG + union ieee_union u; + u.f128 = f128; + printf ("Expecting %d, trying %-5g " + "(0x%.16" PRIx64 " 0x%.16" PRIx64 ")\n", + expected, (double)f128, + u.st128.upper, u.st128.lower); +#endif + + test_signbit_arg (f128, expected); + test_signbit_mem (&f128, expected); + test_signbit_gpr (&f128, expected); +} + +int +main (void) +{ + union ieee_union u; + + test_signbit (+0.0q, 0); + test_signbit (+1.0q, 0); + + test_signbit (-0.0q, 1); + test_signbit (-1.0q, 1); + + test_signbit (__builtin_copysign ((__float128)__builtin_inf (), +1.0q), 0); + test_signbit (__builtin_copysign ((__float128)__builtin_inf (), -1.0q), 1); + + test_signbit (__builtin_copysign ((__float128)__builtin_nan (""), +1.0q), 0); + test_signbit (__builtin_copysign ((__float128)__builtin_nan (""), -1.0q), 1); + + /* force the bottom double word to have specific bits in the 'sign' bit to + make sure we are picking the right word. */ + u.f128 = 1.0q; + u.st128.lower = 0ULL; + test_signbit (u.f128, 0); + + u.st128.lower = ~0ULL; + test_signbit (u.f128, 0); + + u.f128 = -1.0q; + u.st128.lower = 0ULL; + test_signbit (u.f128, 1); + + u.st128.lower = ~0ULL; + test_signbit (u.f128, 1); + +#ifdef DEBUG + printf ("%d error(s) were found\n", num_errors); + if (num_errors) + return num_errors; +#endif + + return 0; +} +