On Thu, 24 Nov 2022 at 09:23, Jakub Jelinek wrote: > > Hi! > > Upstream fast_float came up with a cheaper test for > fegetround () == FE_TONEAREST using one float addition, one subtraction > and one comparison. If we know we are rounding to nearest, we can use > the fast path in more cases than before. > The following patch merges those changes into libstdc++. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
OK, thanks. > > 2022-11-24 Jakub Jelinek <ja...@redhat.com> > > PR libstdc++/107468 > * src/c++17/fast_float/MERGE: Adjust for merge from upstream. > * src/c++17/fast_float/fast_float.h: Merge from fast_float > 2ef9abbcf6a11958b6fa685a89d0150022e82e78 commit. > > --- libstdc++-v3/src/c++17/fast_float/MERGE.jj 2022-11-07 15:17:14.035071694 > +0100 > +++ libstdc++-v3/src/c++17/fast_float/MERGE 2022-11-23 17:09:20.940866070 > +0100 > @@ -1,4 +1,4 @@ > -662497742fea7055f0e0ee27e5a7ddc382c2c38e > +2ef9abbcf6a11958b6fa685a89d0150022e82e78 > > The first line of this file holds the git revision number of the > last merge done from the master library sources. > --- libstdc++-v3/src/c++17/fast_float/fast_float.h.jj 2022-11-07 > 15:17:14.066071268 +0100 > +++ libstdc++-v3/src/c++17/fast_float/fast_float.h 2022-11-23 > 17:19:41.735693122 +0100 > @@ -99,11 +99,11 @@ from_chars_result from_chars_advanced(co > || defined(__MINGW64__) \ > || defined(__s390x__) \ > || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) > || defined(__PPC64LE__)) ) > -#define FASTFLOAT_64BIT > +#define FASTFLOAT_64BIT 1 > #elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) \ > || defined(__arm__) || defined(_M_ARM) \ > || defined(__MINGW32__) || defined(__EMSCRIPTEN__)) > -#define FASTFLOAT_32BIT > +#define FASTFLOAT_32BIT 1 > #else > // Need to check incrementally, since SIZE_MAX is a size_t, avoid overflow. > // We can never tell the register width, but the SIZE_MAX is a good > approximation. > @@ -111,9 +111,9 @@ from_chars_result from_chars_advanced(co > #if SIZE_MAX == 0xffff > #error Unknown platform (16-bit, unsupported) > #elif SIZE_MAX == 0xffffffff > - #define FASTFLOAT_32BIT > + #define FASTFLOAT_32BIT 1 > #elif SIZE_MAX == 0xffffffffffffffff > - #define FASTFLOAT_64BIT > + #define FASTFLOAT_64BIT 1 > #else > #error Unknown platform (not 32-bit, not 64-bit?) 
> #endif > @@ -359,10 +359,12 @@ template <typename T> struct binary_form > static inline constexpr int minimum_exponent(); > static inline constexpr int infinite_power(); > static inline constexpr int sign_index(); > + static inline constexpr int min_exponent_fast_path(); // used when > fegetround() == FE_TONEAREST > static inline constexpr int max_exponent_fast_path(); > static inline constexpr int max_exponent_round_to_even(); > static inline constexpr int min_exponent_round_to_even(); > static inline constexpr uint64_t max_mantissa_fast_path(int64_t power); > + static inline constexpr uint64_t max_mantissa_fast_path(); // used when > fegetround() == FE_TONEAREST > static inline constexpr int largest_power_of_ten(); > static inline constexpr int smallest_power_of_ten(); > static inline constexpr T exact_power_of_ten(int64_t power); > @@ -372,6 +374,22 @@ template <typename T> struct binary_form > static inline constexpr equiv_uint hidden_bit_mask(); > }; > > +template <> inline constexpr int > binary_format<double>::min_exponent_fast_path() { > +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) > + return 0; > +#else > + return -22; > +#endif > +} > + > +template <> inline constexpr int > binary_format<float>::min_exponent_fast_path() { > +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) > + return 0; > +#else > + return -10; > +#endif > +} > + > template <> inline constexpr int > binary_format<double>::mantissa_explicit_bits() { > return 52; > } > @@ -418,13 +436,18 @@ template <> inline constexpr int binary_ > template <> inline constexpr int > binary_format<float>::max_exponent_fast_path() { > return 10; > } > - > +template <> inline constexpr uint64_t > binary_format<double>::max_mantissa_fast_path() { > + return uint64_t(2) << mantissa_explicit_bits(); > +} > template <> inline constexpr uint64_t > binary_format<double>::max_mantissa_fast_path(int64_t power) { > // caller is responsible to ensure that > // power >= 0 && power <= 22 > // > return 
max_mantissa_double[power]; > } > +template <> inline constexpr uint64_t > binary_format<float>::max_mantissa_fast_path() { > + return uint64_t(2) << mantissa_explicit_bits(); > +} > template <> inline constexpr uint64_t > binary_format<float>::max_mantissa_fast_path(int64_t power) { > // caller is responsible to ensure that > // power >= 0 && power <= 10 > @@ -619,10 +642,6 @@ parsed_number_string parse_number_string > > uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad) > > - while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) { > - i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, > this will overflow, but that's ok > - p += 8; > - } > while ((p != pend) && is_integer(*p)) { > // a multiplication by 10 is cheaper than an arbitrary integer > // multiplication > @@ -1640,7 +1659,7 @@ namespace fast_float { > // we might have platforms where `CHAR_BIT` is not 8, so let's avoid > // doing `8 * sizeof(limb)`. > #if defined(FASTFLOAT_64BIT) && !defined(__sparc) > -#define FASTFLOAT_64BIT_LIMB > +#define FASTFLOAT_64BIT_LIMB 1 > typedef uint64_t limb; > constexpr size_t limb_bits = 64; > #else > @@ -2314,10 +2333,6 @@ parsed_number_string parse_number_string > > uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad) > > - while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) { > - i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, > this will overflow, but that's ok > - p += 8; > - } > while ((p != pend) && is_integer(*p)) { > // a multiplication by 10 is cheaper than an arbitrary integer > // multiplication > @@ -2892,6 +2907,48 @@ from_chars_result parse_infnan(const cha > return answer; > } > > +/** > + * Returns true if the floating-pointing rounding mode is to 'nearest'. > + * It is the default on most system. This function is meant to be > inexpensive. 
> + * Credit : @mwalcott3 > + */ > +fastfloat_really_inline bool rounds_to_nearest() noexcept { > + // See > + // A fast function to check your floating-point rounding mode > + // > https://lemire.me/blog/2022/11/16/a-fast-function-to-check-your-floating-point-rounding-mode/ > + // > + // This function is meant to be equivalent to : > + // prior: #include <cfenv> > + // return fegetround() == FE_TONEAREST; > + // However, it is expected to be much faster than the fegetround() > + // function call. > + // > + // The volatile keywoard prevents the compiler from computing the function > + // at compile-time. > + // There might be other ways to prevent compile-time optimizations (e.g., > asm). > + // The value does not need to be std::numeric_limits<float>::min(), any > small > + // value so that 1 + x should round to 1 would do (after accounting for > excess > + // precision, as in 387 instructions). > + static volatile float fmin = std::numeric_limits<float>::min(); > + float fmini = fmin; // we copy it so that it gets loaded at most once. > + // > + // Explanation: > + // Only when fegetround() == FE_TONEAREST do we have that > + // fmin + 1.0f == 1.0f - fmin. > + // > + // FE_UPWARD: > + // fmin + 1.0f > 1 > + // 1.0f - fmin == 1 > + // > + // FE_DOWNWARD or FE_TOWARDZERO: > + // fmin + 1.0f == 1 > + // 1.0f - fmin < 1 > + // > + // Note: This may fail to be accurate if fast-math has been > + // enabled, as rounding conventions may not apply. 
> + return (fmini + 1.0f == 1.0f - fmini); > +} > + > } // namespace detail > > template<typename T> > @@ -2919,12 +2976,45 @@ from_chars_result from_chars_advanced(co > } > answer.ec = std::errc(); // be optimistic > answer.ptr = pns.lastmatch; > - // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's > proposal > - if (pns.exponent >= 0 && pns.exponent <= > binary_format<T>::max_exponent_fast_path() && pns.mantissa > <=binary_format<T>::max_mantissa_fast_path(pns.exponent) && > !pns.too_many_digits) { > - value = T(pns.mantissa); > - value = value * binary_format<T>::exact_power_of_ten(pns.exponent); > - if (pns.negative) { value = -value; } > - return answer; > + // The implementation of the Clinger's fast path is convoluted because > + // we want round-to-nearest in all cases, irrespective of the rounding mode > + // selected on the thread. > + // We proceed optimistically, assuming that detail::rounds_to_nearest() > returns > + // true. > + if (binary_format<T>::min_exponent_fast_path() <= pns.exponent && > pns.exponent <= binary_format<T>::max_exponent_fast_path() && > !pns.too_many_digits) { > + // Unfortunately, the conventional Clinger's fast path is only possible > + // when the system rounds to the nearest float. > + // > + // We expect the next branch to almost always be selected. > + // We could check it first (before the previous branch), but > + // there might be performance advantages at having the check > + // be last. > + if(detail::rounds_to_nearest()) { > + // We have that fegetround() == FE_TONEAREST. > + // Next is Clinger's fast path. 
> + if (pns.mantissa <=binary_format<T>::max_mantissa_fast_path()) { > + value = T(pns.mantissa); > + if (pns.exponent < 0) { value = value / > binary_format<T>::exact_power_of_ten(-pns.exponent); } > + else { value = value * > binary_format<T>::exact_power_of_ten(pns.exponent); } > + if (pns.negative) { value = -value; } > + return answer; > + } > + } else { > + // We do not have that fegetround() == FE_TONEAREST. > + // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's > proposal > + if (pns.exponent >= 0 && pns.mantissa > <=binary_format<T>::max_mantissa_fast_path(pns.exponent)) { > +#if (defined(_WIN32) && defined(__clang__)) > + // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD > + if(pns.mantissa == 0) { > + value = 0; > + return answer; > + } > +#endif > + value = T(pns.mantissa) * > binary_format<T>::exact_power_of_ten(pns.exponent); > + if (pns.negative) { value = -value; } > + return answer; > + } > + } > } > adjusted_mantissa am = compute_float<binary_format<T>>(pns.exponent, > pns.mantissa); > if(pns.too_many_digits && am.power2 >= 0) { > > Jakub >