snprintf.c

Andrew Gierth Fri, 05 Oct 2018 20:37:49 -0700

>>>>> "Tom" == Tom Lane <t...@sss.pgh.pa.us> writes:


 >> Ryu is so blazing fast that with it, COPY of a table with 2million
 >> rows of 12 random float8 columns (plus id) becomes FASTER in text
 >> mode than in binary mode (rather than ~5x slower):

 Tom> Oh yeah?  Where's the code for this?

Upstream code is at https://github.com/ulfjack/ryu

Most of that is benchmarking, java, and other stuff not interesting to
us; the guts are under ryu/ and are dual-licensed under Boost 1.0 (which
I think we can use, since the only difference from BSD seems to be a
permissive one) as well as Apache 2.0 (which AFAIK we can't use).

I attach the patch I've used for testing, which has these changes from
upstream Ryu:

  - added ryu_ prefix to entry point functions
  - changed some #include file locations
  - added #define NDEBUG since there are a bunch of plain C assert()s

but I didn't touch the formatting or style of the Ryu code so it's all
C99 and // comments and OTB etc.

For testing purposes what I did was to change float[48]out to use the
Ryu code iff extra_float_digits > 0. This isn't likely what a final
version should do, just a convenience flag. The regression tests for
float8 fail of course since Ryu's output format differs (it always
includes an exponent, but the code for that part can be tweaked without
touching the main algorithm).

-- 
Andrew (irc:RhodiumToad)

diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index df35557b73..b7e44b2eba 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -21,6 +21,7 @@
 
 #include "catalog/pg_type.h"
 #include "common/int.h"
+#include "common/ryu.h"
 #include "libpq/pqformat.h"
 #include "utils/array.h"
 #include "utils/float.h"
@@ -245,6 +246,13 @@ float4out(PG_FUNCTION_ARGS)
 	float4		num = PG_GETARG_FLOAT4(0);
 	char	   *ascii;
 
+	if (extra_float_digits > 0)
+	{
+		ascii = (char *) palloc(24);
+		ryu_f2s_buffered(num, ascii);
+		PG_RETURN_CSTRING(ascii);
+	}
+
 	if (isnan(num))
 		PG_RETURN_CSTRING(pstrdup("NaN"));
 
@@ -481,6 +489,13 @@ float8out_internal(double num)
 {
 	char	   *ascii;
 
+	if (extra_float_digits > 0)
+	{
+		ascii = (char *) palloc(32);
+		ryu_d2s_buffered(num, ascii);
+		return ascii;
+	}
+
 	if (isnan(num))
 		return pstrdup("NaN");
 
diff --git a/src/common/Makefile b/src/common/Makefile
index ec8139f014..ae89dd8c5e 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -44,8 +44,8 @@ override CPPFLAGS += -DVAL_LIBS="\"$(LIBS)\""
 override CPPFLAGS := -DFRONTEND $(CPPFLAGS)
 LIBS += $(PTHREAD_LIBS)
 
-OBJS_COMMON = base64.o config_info.o controldata_utils.o exec.o file_perm.o \
-	ip.o keywords.o link-canary.o md5.o pg_lzcompress.o \
+OBJS_COMMON = base64.o config_info.o controldata_utils.o d2s.o exec.o f2s.o \
+	file_perm.o ip.o keywords.o link-canary.o md5.o pg_lzcompress.o \
 	pgfnames.o psprintf.o relpath.o \
 	rmtree.o saslprep.o scram-common.o string.o unicode_norm.o \
 	username.o wait_error.o
diff --git a/src/common/d2s.c b/src/common/d2s.c
new file mode 100644
index 0000000000..20fac52704
--- /dev/null
+++ b/src/common/d2s.c
@@ -0,0 +1,612 @@
+// Copyright 2018 Ulf Adams
+//
+// The contents of this file may be used under the terms of the Apache License,
+// Version 2.0.
+//
+//    (See accompanying file LICENSE-Apache or copy at
+//     http://www.apache.org/licenses/LICENSE-2.0)
+//
+// Alternatively, the contents of this file may be used under the terms of
+// the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE-Boost or copy at
+//     https://www.boost.org/LICENSE_1_0.txt)
+//
+// Unless required by applicable law or agreed to in writing, this software
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
+
+// Runtime compiler options:
+// -DRYU_DEBUG Generate verbose debugging output to stdout.
+//
+// -DRYU_ONLY_64_BIT_OPS Avoid using uint128_t or 64-bit intrinsics. Slower,
+//     depending on your compiler.
+//
+// -DRYU_OPTIMIZE_SIZE Use smaller lookup tables. Instead of storing every
+//     required power of 5, only store every 26th entry, and compute
+//     intermediate values with a multiplication. This reduces the lookup table
+//     size by about 10x (only one case, and only double) at the cost of some
+//     performance. Currently requires MSVC intrinsics.
+
+#define NDEBUG
+
+#include "common/ryu.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef RYU_DEBUG
+#include <inttypes.h>
+#include <stdio.h>
+#endif
+
+// ABSL avoids uint128_t on Win32 even if __SIZEOF_INT128__ is defined.
+// Let's do the same for now.
+#if defined(__SIZEOF_INT128__) && !defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS)
+#define HAS_UINT128
+#elif defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS) && defined(_M_X64) \
+  && !defined(__clang__) // https://bugs.llvm.org/show_bug.cgi?id=37755
+#define HAS_64_BIT_INTRINSICS
+#endif
+
+#include "ryu_common.h"
+#include "digit_table.h"
+#include "d2s.h"
+#include "d2s_intrinsics.h"
+
+static inline uint32_t pow5Factor(uint64_t value) {
+  uint32_t count = 0;
+  for (;;) {
+    assert(value != 0);
+    const uint64_t q = div5(value);
+    const uint32_t r = (uint32_t) (value - 5 * q);
+    if (r != 0) {
+      break;
+    }
+    value = q;
+    ++count;
+  }
+  return count;
+}
+
+// Returns true if value is divisible by 5^p.
+static inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) {
+  // I tried a case distinction on p, but there was no performance difference.
+  return pow5Factor(value) >= p;
+}
+
+// Returns true if value is divisible by 2^p.
+static inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) {
+  // return __builtin_ctzll(value) >= p;
+  return (value & ((1ull << p) - 1)) == 0;
+}
+
+// We need a 64x128-bit multiplication and a subsequent 128-bit shift.
+// Multiplication:
+//   The 64-bit factor is variable and passed in, the 128-bit factor comes
+//   from a lookup table. We know that the 64-bit factor only has 55
+//   significant bits (i.e., the 9 topmost bits are zeros). The 128-bit
+//   factor only has 124 significant bits (i.e., the 4 topmost bits are
+//   zeros).
+// Shift:
+//   In principle, the multiplication result requires 55 + 124 = 179 bits to
+//   represent. However, we then shift this value to the right by j, which is
+//   at least j >= 115, so the result is guaranteed to fit into 179 - 115 = 64
+//   bits. This means that we only need the topmost 64 significant bits of
+//   the 64x128-bit multiplication.
+//
+// There are several ways to do this:
+// 1. Best case: the compiler exposes a 128-bit type.
+//    We perform two 64x64-bit multiplications, add the higher 64 bits of the
+//    lower result to the higher result, and shift by j - 64 bits.
+//
+//    We explicitly cast from 64-bit to 128-bit, so the compiler can tell
+//    that these are only 64-bit inputs, and can map these to the best
+//    possible sequence of assembly instructions.
+//    x86-64 machines happen to have matching assembly instructions for
+//    64x64-bit multiplications and 128-bit shifts.
+//
+// 2. Second best case: the compiler exposes intrinsics for the x86-64 assembly
+//    instructions mentioned in 1.
+//
+// 3. We only have 64x64 bit instructions that return the lower 64 bits of
+//    the result, i.e., we have to use plain C.
+//    Our inputs are less than the full width, so we have three options:
+//    a. Ignore this fact and just implement the intrinsics manually.
+//    b. Split both into 31-bit pieces, which guarantees no internal overflow,
+//       but requires extra work upfront (unless we change the lookup table).
+//    c. Split only the first factor into 31-bit pieces, which also guarantees
+//       no internal overflow, but requires extra work since the intermediate
+//       results are not perfectly aligned.
+#if defined(HAS_UINT128)
+
+// Best case: use 128-bit type.
+static inline uint64_t mulShift(const uint64_t m, const uint64_t* const mul, const int32_t j) {
+  const uint128_t b0 = ((uint128_t) m) * mul[0];
+  const uint128_t b2 = ((uint128_t) m) * mul[1];
+  return (uint64_t) (((b0 >> 64) + b2) >> (j - 64));
+}
+
+static inline uint64_t mulShiftAll(
+    const uint64_t m, const uint64_t* const mul, const int32_t j, uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
+//  m <<= 2;
+//  uint128_t b0 = ((uint128_t) m) * mul[0]; // 0
+//  uint128_t b2 = ((uint128_t) m) * mul[1]; // 64
+//
+//  uint128_t hi = (b0 >> 64) + b2;
+//  uint128_t lo = b0 & 0xffffffffffffffffull;
+//  uint128_t factor = (((uint128_t) mul[1]) << 64) + mul[0];
+//  uint128_t vpLo = lo + (factor << 1);
+//  *vp = (uint64_t) ((hi + (vpLo >> 64)) >> (j - 64));
+//  uint128_t vmLo = lo - (factor << mmShift);
+//  *vm = (uint64_t) ((hi + (vmLo >> 64) - (((uint128_t) 1ull) << 64)) >> (j - 64));
+//  return (uint64_t) (hi >> (j - 64));
+  *vp = mulShift(4 * m + 2, mul, j);
+  *vm = mulShift(4 * m - 1 - mmShift, mul, j);
+  return mulShift(4 * m, mul, j);
+}
+
+#elif defined(HAS_64_BIT_INTRINSICS)
+
+static inline uint64_t mulShift(const uint64_t m, const uint64_t* const mul, const int32_t j) {
+  // m is maximum 55 bits
+  uint64_t high1;                                   // 128
+  const uint64_t low1 = umul128(m, mul[1], &high1); // 64
+  uint64_t high0;                                   // 64
+  umul128(m, mul[0], &high0);                       // 0
+  const uint64_t sum = high0 + low1;
+  if (sum < high0) {
+    ++high1; // overflow into high1
+  }
+  return shiftright128(sum, high1, j - 64);
+}
+
+static inline uint64_t mulShiftAll(
+    const uint64_t m, const uint64_t* const mul, const int32_t j, uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
+  *vp = mulShift(4 * m + 2, mul, j);
+  *vm = mulShift(4 * m - 1 - mmShift, mul, j);
+  return mulShift(4 * m, mul, j);
+}
+
+#else // !defined(HAS_UINT128) && !defined(HAS_64_BIT_INTRINSICS)
+
+static inline uint64_t mulShiftAll(
+    uint64_t m, const uint64_t* const mul, const int32_t j, uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
+  m <<= 1;
+  // m is maximum 55 bits
+  uint64_t tmp;
+  const uint64_t lo = umul128(m, mul[0], &tmp);
+  uint64_t hi;
+  const uint64_t mid = tmp + umul128(m, mul[1], &hi);
+  hi += mid < tmp; // overflow into hi
+
+  const uint64_t lo2 = lo + mul[0];
+  const uint64_t mid2 = mid + mul[1] + (lo2 < lo);
+  const uint64_t hi2 = hi + (mid2 < mid);
+  *vp = shiftright128(mid2, hi2, j - 64 - 1);
+
+  if (mmShift == 1) {
+    const uint64_t lo3 = lo - mul[0];
+    const uint64_t mid3 = mid - mul[1] - (lo3 > lo);
+    const uint64_t hi3 = hi - (mid3 > mid);
+    *vm = shiftright128(mid3, hi3, j - 64 - 1);
+  } else {
+    const uint64_t lo3 = lo + lo;
+    const uint64_t mid3 = mid + mid + (lo3 < lo);
+    const uint64_t hi3 = hi + hi + (mid3 < mid);
+    const uint64_t lo4 = lo3 - mul[0];
+    const uint64_t mid4 = mid3 - mul[1] - (lo4 > lo3);
+    const uint64_t hi4 = hi3 - (mid4 > mid3);
+    *vm = shiftright128(mid4, hi4, j - 64);
+  }
+
+  return shiftright128(mid, hi, j - 64 - 1);
+}
+
+#endif // HAS_64_BIT_INTRINSICS
+
+static inline uint32_t decimalLength(const uint64_t v) {
+  // This is slightly faster than a loop.
+  // The average output length is 16.38 digits, so we check high-to-low.
+  // Function precondition: v is not an 18, 19, or 20-digit number.
+  // (17 digits are sufficient for round-tripping.)
+  assert(v < 100000000000000000L);
+  if (v >= 10000000000000000L) { return 17; }
+  if (v >= 1000000000000000L) { return 16; }
+  if (v >= 100000000000000L) { return 15; }
+  if (v >= 10000000000000L) { return 14; }
+  if (v >= 1000000000000L) { return 13; }
+  if (v >= 100000000000L) { return 12; }
+  if (v >= 10000000000L) { return 11; }
+  if (v >= 1000000000L) { return 10; }
+  if (v >= 100000000L) { return 9; }
+  if (v >= 10000000L) { return 8; }
+  if (v >= 1000000L) { return 7; }
+  if (v >= 100000L) { return 6; }
+  if (v >= 10000L) { return 5; }
+  if (v >= 1000L) { return 4; }
+  if (v >= 100L) { return 3; }
+  if (v >= 10L) { return 2; }
+  return 1;
+}
+
+// A floating decimal representing m * 10^e.
+typedef struct floating_decimal_64 {
+  uint64_t mantissa;
+  int32_t exponent;
+} floating_decimal_64;
+
+static inline floating_decimal_64 d2d(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) {
+  const uint32_t bias = (1u << (DOUBLE_EXPONENT_BITS - 1)) - 1;
+
+  int32_t e2;
+  uint64_t m2;
+  if (ieeeExponent == 0) {
+    // We subtract 2 so that the bounds computation has 2 additional bits.
+    e2 = 1 - bias - DOUBLE_MANTISSA_BITS - 2;
+    m2 = ieeeMantissa;
+  } else {
+    e2 = ieeeExponent - bias - DOUBLE_MANTISSA_BITS - 2;
+    m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
+  }
+  const bool even = (m2 & 1) == 0;
+  const bool acceptBounds = even;
+
+#ifdef RYU_DEBUG
+  printf("-> %" PRIu64 " * 2^%d\n", m2, e2 + 2);
+#endif
+
+  // Step 2: Determine the interval of legal decimal representations.
+  const uint64_t mv = 4 * m2;
+  // Implicit bool -> int conversion. True is 1, false is 0.
+  const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
+  // We would compute mp and mm like this:
+  // uint64_t mp = 4 * m2 + 2;
+  // uint64_t mm = mv - 1 - mmShift;
+
+  // Step 3: Convert to a decimal power base using 128-bit arithmetic.
+  uint64_t vr, vp, vm;
+  int32_t e10;
+  bool vmIsTrailingZeros = false;
+  bool vrIsTrailingZeros = false;
+  if (e2 >= 0) {
+    // I tried special-casing q == 0, but there was no effect on performance.
+    // This expression is slightly faster than max(0, log10Pow2(e2) - 1).
+    const uint32_t q = log10Pow2(e2) - (e2 > 3);
+    e10 = q;
+    const int32_t k = DOUBLE_POW5_INV_BITCOUNT + pow5bits(q) - 1;
+    const int32_t i = -e2 + q + k;
+#if defined(RYU_OPTIMIZE_SIZE)
+    uint64_t pow5[2];
+    double_computeInvPow5(q, pow5);
+    vr = mulShiftAll(m2, pow5, i, &vp, &vm, mmShift);
+#else
+    vr = mulShiftAll(m2, DOUBLE_POW5_INV_SPLIT[q], i, &vp, &vm, mmShift);
+#endif
+#ifdef RYU_DEBUG
+    printf("%" PRIu64 " * 2^%d / 10^%u\n", mv, e2, q);
+    printf("V+=%" PRIu64 "\nV =%" PRIu64 "\nV-=%" PRIu64 "\n", vp, vr, vm);
+#endif
+    if (q <= 21) {
+      // This should use q <= 22, but I think 21 is also safe. Smaller values
+      // may still be safe, but it's more difficult to reason about them.
+      // Only one of mp, mv, and mm can be a multiple of 5, if any.
+      const uint32_t mvMod5 = (uint32_t) (mv - 5 * div5(mv));
+      if (mvMod5 == 0) {
+        vrIsTrailingZeros = multipleOfPowerOf5(mv, q);
+      } else if (acceptBounds) {
+        // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q
+        // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q
+        // <=> true && pow5Factor(mm) >= q, since e2 >= q.
+        vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q);
+      } else {
+        // Same as min(e2 + 1, pow5Factor(mp)) >= q.
+        vp -= multipleOfPowerOf5(mv + 2, q);
+      }
+    }
+  } else {
+    // This expression is slightly faster than max(0, log10Pow5(-e2) - 1).
+    const uint32_t q = log10Pow5(-e2) - (-e2 > 1);
+    e10 = q + e2;
+    const int32_t i = -e2 - q;
+    const int32_t k = pow5bits(i) - DOUBLE_POW5_BITCOUNT;
+    const int32_t j = q - k;
+#if defined(RYU_OPTIMIZE_SIZE)
+    uint64_t pow5[2];
+    double_computePow5(i, pow5);
+    vr = mulShiftAll(m2, pow5, j, &vp, &vm, mmShift);
+#else
+    vr = mulShiftAll(m2, DOUBLE_POW5_SPLIT[i], j, &vp, &vm, mmShift);
+#endif
+#ifdef RYU_DEBUG
+    printf("%" PRIu64 " * 5^%d / 10^%u\n", mv, -e2, q);
+    printf("%u %d %d %d\n", q, i, k, j);
+    printf("V+=%" PRIu64 "\nV =%" PRIu64 "\nV-=%" PRIu64 "\n", vp, vr, vm);
+#endif
+    if (q <= 1) {
+      // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
+      // mv = 4 * m2, so it always has at least two trailing 0 bits.
+      vrIsTrailingZeros = true;
+      if (acceptBounds) {
+        // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
+        vmIsTrailingZeros = mmShift == 1;
+      } else {
+        // mp = mv + 2, so it always has at least one trailing 0 bit.
+        --vp;
+      }
+    } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here.
+      // We need to compute min(ntz(mv), pow5Factor(mv) - e2) >= q - 1
+      // <=> ntz(mv) >= q - 1 && pow5Factor(mv) - e2 >= q - 1
+      // <=> ntz(mv) >= q - 1 (e2 is negative and -e2 >= q)
+      // <=> (mv & ((1 << (q - 1)) - 1)) == 0
+      // We also need to make sure that the left shift does not overflow.
+      vrIsTrailingZeros = multipleOfPowerOf2(mv, q - 1);
+#ifdef RYU_DEBUG
+      printf("vr is trailing zeros=%s\n", vrIsTrailingZeros ? "true" : "false");
+#endif
+    }
+  }
+#ifdef RYU_DEBUG
+  printf("e10=%d\n", e10);
+  printf("V+=%" PRIu64 "\nV =%" PRIu64 "\nV-=%" PRIu64 "\n", vp, vr, vm);
+  printf("vm is trailing zeros=%s\n", vmIsTrailingZeros ? "true" : "false");
+  printf("vr is trailing zeros=%s\n", vrIsTrailingZeros ? "true" : "false");
+#endif
+
+  // Step 4: Find the shortest decimal representation in the interval of legal representations.
+  uint32_t removed = 0;
+  uint8_t lastRemovedDigit = 0;
+  uint64_t output;
+  // On average, we remove ~2 digits.
+  if (vmIsTrailingZeros || vrIsTrailingZeros) {
+    // General case, which happens rarely (~0.7%).
+    for (;;) {
+      const uint64_t vpDiv10 = div10(vp);
+      const uint64_t vmDiv10 = div10(vm);
+      if (vpDiv10 <= vmDiv10) {
+        break;
+      }
+      const uint32_t vmMod10 = (uint32_t) (vm - 10 * vmDiv10);
+      const uint64_t vrDiv10 = div10(vr);
+      const uint32_t vrMod10 = (uint32_t) (vr - 10 * vrDiv10);
+      vmIsTrailingZeros &= vmMod10 == 0;
+      vrIsTrailingZeros &= lastRemovedDigit == 0;
+      lastRemovedDigit = (uint8_t) vrMod10;
+      vr = vrDiv10;
+      vp = vpDiv10;
+      vm = vmDiv10;
+      ++removed;
+    }
+#ifdef RYU_DEBUG
+    printf("V+=%" PRIu64 "\nV =%" PRIu64 "\nV-=%" PRIu64 "\n", vp, vr, vm);
+    printf("d-10=%s\n", vmIsTrailingZeros ? "true" : "false");
+#endif
+    if (vmIsTrailingZeros) {
+      for (;;) {
+        const uint64_t vmDiv10 = div10(vm);
+        const uint32_t vmMod10 = (uint32_t) (vm - 10 * vmDiv10);
+        if (vmMod10 != 0) {
+          break;
+        }
+        const uint64_t vpDiv10 = div10(vp);
+        const uint64_t vrDiv10 = div10(vr);
+        const uint32_t vrMod10 = (uint32_t) (vr - 10 * vrDiv10);
+        vrIsTrailingZeros &= lastRemovedDigit == 0;
+        lastRemovedDigit = (uint8_t) vrMod10;
+        vr = vrDiv10;
+        vp = vpDiv10;
+        vm = vmDiv10;
+        ++removed;
+      }
+    }
+#ifdef RYU_DEBUG
+    printf("%" PRIu64 " %d\n", vr, lastRemovedDigit);
+    printf("vr is trailing zeros=%s\n", vrIsTrailingZeros ? "true" : "false");
+#endif
+    if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
+      // Round even if the exact number is .....50..0.
+      lastRemovedDigit = 4;
+    }
+    // We need to take vr + 1 if vr is outside bounds or we need to round up.
+    output = vr +
+        ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
+  } else {
+    // Specialized for the common case (~99.3%). Percentages below are relative to this.
+    bool roundUp = false;
+    const uint64_t vpDiv100 = div100(vp);
+    const uint64_t vmDiv100 = div100(vm);
+    if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%).
+      const uint64_t vrDiv100 = div100(vr);
+      const uint32_t vrMod100 = (uint32_t) (vr - 100 * vrDiv100);
+      roundUp = vrMod100 >= 50;
+      vr = vrDiv100;
+      vp = vpDiv100;
+      vm = vmDiv100;
+      removed += 2;
+    }
+    // Loop iterations below (approximately), without optimization above:
+    // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02%
+    // Loop iterations below (approximately), with optimization above:
+    // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02%
+    for (;;) {
+      const uint64_t vpDiv10 = div10(vp);
+      const uint64_t vmDiv10 = div10(vm);
+      if (vpDiv10 <= vmDiv10) {
+        break;
+      }
+      const uint64_t vrDiv10 = div10(vr);
+      const uint32_t vrMod10 = (uint32_t) (vr - 10 * vrDiv10);
+      roundUp = vrMod10 >= 5;
+      vr = vrDiv10;
+      vp = vpDiv10;
+      vm = vmDiv10;
+      ++removed;
+    }
+#ifdef RYU_DEBUG
+    printf("%" PRIu64 " roundUp=%s\n", vr, roundUp ? "true" : "false");
+    printf("vr is trailing zeros=%s\n", vrIsTrailingZeros ? "true" : "false");
+#endif
+    // We need to take vr + 1 if vr is outside bounds or we need to round up.
+    output = vr + (vr == vm || roundUp);
+  }
+  const int32_t exp = e10 + removed;
+
+#ifdef RYU_DEBUG
+  printf("V+=%" PRIu64 "\nV =%" PRIu64 "\nV-=%" PRIu64 "\n", vp, vr, vm);
+  printf("O=%" PRIu64 "\n", output);
+  printf("EXP=%d\n", exp);
+#endif
+
+  floating_decimal_64 fd;
+  fd.exponent = exp;
+  fd.mantissa = output;
+  return fd;
+}
+
+static inline int to_chars(const floating_decimal_64 v, const bool sign, char* const result) {
+  // Step 5: Print the decimal representation.
+  int index = 0;
+  if (sign) {
+    result[index++] = '-';
+  }
+
+  uint64_t output = v.mantissa;
+  const uint32_t olength = decimalLength(output);
+
+#ifdef RYU_DEBUG
+  printf("DIGITS=%" PRIu64 "\n", v.mantissa);
+  printf("OLEN=%u\n", olength);
+  printf("EXP=%u\n", v.exponent + olength);
+#endif
+
+  // Print the decimal digits.
+  // The following code is equivalent to:
+  // for (uint32_t i = 0; i < olength - 1; ++i) {
+  //   const uint32_t c = output % 10; output /= 10;
+  //   result[index + olength - i] = (char) ('0' + c);
+  // }
+  // result[index] = '0' + output % 10;
+
+  uint32_t i = 0;
+  // We prefer 32-bit operations, even on 64-bit platforms.
+  // We have at most 17 digits, and uint32_t can store 9 digits.
+  // If output doesn't fit into uint32_t, we cut off 8 digits,
+  // so the rest will fit into uint32_t.
+  if ((output >> 32) != 0) {
+    // Expensive 64-bit division.
+    const uint64_t q = div100000000(output);
+    uint32_t output2 = (uint32_t) (output - 100000000 * q);
+    output = q;
+
+    const uint32_t c = output2 % 10000;
+    output2 /= 10000;
+    const uint32_t d = output2 % 10000;
+    const uint32_t c0 = (c % 100) << 1;
+    const uint32_t c1 = (c / 100) << 1;
+    const uint32_t d0 = (d % 100) << 1;
+    const uint32_t d1 = (d / 100) << 1;
+    memcpy(result + index + olength - i - 1, DIGIT_TABLE + c0, 2);
+    memcpy(result + index + olength - i - 3, DIGIT_TABLE + c1, 2);
+    memcpy(result + index + olength - i - 5, DIGIT_TABLE + d0, 2);
+    memcpy(result + index + olength - i - 7, DIGIT_TABLE + d1, 2);
+    i += 8;
+  }
+  uint32_t output2 = (uint32_t) output;
+  while (output2 >= 10000) {
+#ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217
+    const uint32_t c = output2 - 10000 * (output2 / 10000);
+#else
+    const uint32_t c = output2 % 10000;
+#endif
+    output2 /= 10000;
+    const uint32_t c0 = (c % 100) << 1;
+    const uint32_t c1 = (c / 100) << 1;
+    memcpy(result + index + olength - i - 1, DIGIT_TABLE + c0, 2);
+    memcpy(result + index + olength - i - 3, DIGIT_TABLE + c1, 2);
+    i += 4;
+  }
+  if (output2 >= 100) {
+    const uint32_t c = (output2 % 100) << 1;
+    output2 /= 100;
+    memcpy(result + index + olength - i - 1, DIGIT_TABLE + c, 2);
+    i += 2;
+  }
+  if (output2 >= 10) {
+    const uint32_t c = output2 << 1;
+    // We can't use memcpy here: the decimal dot goes between these two digits.
+    result[index + olength - i] = DIGIT_TABLE[c + 1];
+    result[index] = DIGIT_TABLE[c];
+  } else {
+    result[index] = (char) ('0' + output2);
+  }
+
+  // Print decimal point if needed.
+  if (olength > 1) {
+    result[index + 1] = '.';
+    index += olength + 1;
+  } else {
+    ++index;
+  }
+
+  // Print the exponent.
+  result[index++] = 'E';
+  int32_t exp = v.exponent + olength - 1;
+  if (exp < 0) {
+    result[index++] = '-';
+    exp = -exp;
+  }
+
+  if (exp >= 100) {
+    const int32_t c = exp % 10;
+    memcpy(result + index, DIGIT_TABLE + 2 * (exp / 10), 2);
+    result[index + 2] = (char) ('0' + c);
+    index += 3;
+  } else if (exp >= 10) {
+    memcpy(result + index, DIGIT_TABLE + 2 * exp, 2);
+    index += 2;
+  } else {
+    result[index++] = (char) ('0' + exp);
+  }
+
+  return index;
+}
+
+int ryu_d2s_buffered_n(double f, char* result) {
+  // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
+  const uint64_t bits = double_to_bits(f);
+
+#ifdef RYU_DEBUG
+  printf("IN=");
+  for (int32_t bit = 63; bit >= 0; --bit) {
+    printf("%d", (int) ((bits >> bit) & 1));
+  }
+  printf("\n");
+#endif
+
+  // Decode bits into sign, mantissa, and exponent.
+  const bool ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0;
+  const uint64_t ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1);
+  const uint32_t ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1));
+  // Case distinction; exit early for the easy cases.
+  if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) {
+    return copy_special_str(result, ieeeSign, ieeeExponent, ieeeMantissa);
+  }
+
+  const floating_decimal_64 v = d2d(ieeeMantissa, ieeeExponent);
+  return to_chars(v, ieeeSign, result);
+}
+
+void ryu_d2s_buffered(double f, char* result) {
+  const int index = ryu_d2s_buffered_n(f, result);
+
+  // Terminate the string.
+  result[index] = '\0';
+}
+
+char* ryu_d2s(double f) {
+  char* const result = (char*) malloc(25);
+  ryu_d2s_buffered(f, result);
+  return result;
+}
diff --git a/src/common/d2s.h b/src/common/d2s.h
new file mode 100644
index 0000000000..ce71293ad3
--- /dev/null
+++ b/src/common/d2s.h
@@ -0,0 +1,201 @@
+// Copyright 2018 Ulf Adams
+//
+// The contents of this file may be used under the terms of the Apache License,
+// Version 2.0.
+//
+//    (See accompanying file LICENSE-Apache or copy at
+//     http://www.apache.org/licenses/LICENSE-2.0)
+//
+// Alternatively, the contents of this file may be used under the terms of
+// the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE-Boost or copy at
+//     https://www.boost.org/LICENSE_1_0.txt)
+//
+// Unless required by applicable law or agreed to in writing, this software
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
+#ifndef RYU_D2S_H
+#define RYU_D2S_H
+
+#include <assert.h>
+#include <stdint.h>
+
+#include "ryu_common.h"
+
+// Only include the full table if we're not optimizing for size.
+#if !defined(RYU_OPTIMIZE_SIZE)
+#include "d2s_full_table.h"
+#endif
+
+#if defined(HAS_UINT128)
+typedef __uint128_t uint128_t;
+#else
+#include "d2s_intrinsics.h"
+#endif
+
+#define DOUBLE_MANTISSA_BITS 52
+#define DOUBLE_EXPONENT_BITS 11
+
+#define DOUBLE_POW5_INV_BITCOUNT 122
+#define DOUBLE_POW5_BITCOUNT 121
+
+#if defined(RYU_OPTIMIZE_SIZE)
+
+#define POW5_TABLE_SIZE 26
+static const uint64_t DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {
+1ull, 5ull, 25ull, 125ull, 625ull, 3125ull, 15625ull, 78125ull, 390625ull,
+1953125ull, 9765625ull, 48828125ull, 244140625ull, 1220703125ull, 6103515625ull,
+30517578125ull, 152587890625ull, 762939453125ull, 3814697265625ull,
+19073486328125ull, 95367431640625ull, 476837158203125ull,
+2384185791015625ull, 11920928955078125ull, 59604644775390625ull,
+298023223876953125ull //, 1490116119384765625ull
+};
+
+static const uint64_t DOUBLE_POW5_SPLIT2[13][2] = {
+ {                    0u,  72057594037927936u },
+ { 10376293541461622784u,  93132257461547851u },
+ { 15052517733678820785u, 120370621524202240u },
+ {  6258995034005762182u,  77787690973264271u },
+ { 14893927168346708332u, 100538234169297439u },
+ {  4272820386026678563u, 129942622070561240u },
+ {  7330497575943398595u,  83973451344588609u },
+ { 18377130505971182927u, 108533142064701048u },
+ { 10038208235822497557u, 140275798336537794u },
+ {  7017903361312433648u,  90651109995611182u },
+ {  6366496589810271835u, 117163813585596168u },
+ {  9264989777501460624u,  75715339914673581u },
+ { 17074144231291089770u,  97859783203563123u },
+};
+// Unfortunately, the results are sometimes off by one. We use an additional
+// lookup table to store those cases and adjust the result.
+static const uint32_t POW5_OFFSETS[13] = {
+0x00000000, 0x00000000, 0x00000000, 0x033c55be, 0x03db77d8, 0x0265ffb2,
+0x00000800, 0x01a8ff56, 0x00000000, 0x0037a200, 0x00004000, 0x03fffffc,
+0x00003ffe,
+};
+
+
+static const uint64_t DOUBLE_POW5_INV_SPLIT2[13][2] = {
+ {                    1u, 288230376151711744u },
+ {  7661987648932456967u, 223007451985306231u },
+ { 12652048002903177473u, 172543658669764094u },
+ {  5522544058086115566u, 266998379490113760u },
+ {  3181575136763469022u, 206579990246952687u },
+ {  4551508647133041040u, 159833525776178802u },
+ {  1116074521063664381u, 247330401473104534u },
+ { 17400360011128145022u, 191362629322552438u },
+ {  9297997190148906106u, 148059663038321393u },
+ { 11720143854957885429u, 229111231347799689u },
+ { 15401709288678291155u, 177266229209635622u },
+ {  3003071137298187333u, 274306203439684434u },
+ { 17516772882021341108u, 212234145163966538u },
+};
+static const uint32_t POW5_INV_OFFSETS[20] = {
+0x51505404, 0x55054514, 0x45555545, 0x05511411, 0x00505010, 0x00000004,
+0x00000000, 0x00000000, 0x55555040, 0x00505051, 0x00050040, 0x55554000,
+0x51659559, 0x00001000, 0x15000010, 0x55455555, 0x41404051, 0x00001010,
+0x00000014, 0x00000000,
+};
+
+#if defined(HAS_UINT128)
+
+// Computes 5^i in the form required by Ryu, and stores it in the given pointer.
+static inline void double_computePow5(const uint32_t i, uint64_t* const result) {
+  const uint32_t base = i / POW5_TABLE_SIZE;
+  const uint32_t base2 = base * POW5_TABLE_SIZE;
+  const uint32_t offset = i - base2;
+  const uint64_t* const mul = DOUBLE_POW5_SPLIT2[base];
+  if (offset == 0) {
+    result[0] = mul[0];
+    result[1] = mul[1];
+    return;
+  }
+  const uint64_t m = DOUBLE_POW5_TABLE[offset];
+  const uint128_t b0 = ((uint128_t) m) * mul[0];
+  const uint128_t b2 = ((uint128_t) m) * mul[1];
+  const uint32_t delta = pow5bits(i) - pow5bits(base2);
+  const uint128_t shiftedSum = (b0 >> delta) + (b2 << (64 - delta)) + ((POW5_OFFSETS[base] >> offset) & 1);
+  result[0] = (uint64_t) shiftedSum;
+  result[1] = (uint64_t) (shiftedSum >> 64);
+}
+
+// Computes 5^-i in the form required by Ryu, and stores it in the given pointer.
+static inline void double_computeInvPow5(const uint32_t i, uint64_t* const result) {
+  const uint32_t base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE;
+  const uint32_t base2 = base * POW5_TABLE_SIZE;
+  const uint32_t offset = base2 - i;
+  const uint64_t* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2
+  if (offset == 0) {
+    result[0] = mul[0];
+    result[1] = mul[1];
+    return;
+  }
+  const uint64_t m = DOUBLE_POW5_TABLE[offset]; // 5^offset
+  const uint128_t b0 = ((uint128_t) m) * (mul[0] - 1);
+  const uint128_t b2 = ((uint128_t) m) * mul[1]; // 1/5^base2 * 5^offset = 1/5^(base2-offset) = 1/5^i
+  const uint32_t delta = pow5bits(base2) - pow5bits(i);
+  const uint128_t shiftedSum =
+    ((b0 >> delta) + (b2 << (64 - delta))) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
+  result[0] = (uint64_t) shiftedSum;
+  result[1] = (uint64_t) (shiftedSum >> 64);
+}
+
+#else // defined(HAS_UINT128)
+
+// Computes 5^i in the form required by Ryu, and stores it in the given pointer.
+static inline void double_computePow5(const uint32_t i, uint64_t* const result) {
+  const uint32_t base = i / POW5_TABLE_SIZE;
+  const uint32_t base2 = base * POW5_TABLE_SIZE;
+  const uint32_t offset = i - base2;
+  const uint64_t* const mul = DOUBLE_POW5_SPLIT2[base];
+  if (offset == 0) {
+    result[0] = mul[0];
+    result[1] = mul[1];
+    return;
+  }
+  const uint64_t m = DOUBLE_POW5_TABLE[offset];
+  uint64_t high1;
+  const uint64_t low1 = umul128(m, mul[1], &high1);
+  uint64_t high0;
+  const uint64_t low0 = umul128(m, mul[0], &high0);
+  const uint64_t sum = high0 + low1;
+  if (sum < high0) {
+    ++high1; // overflow into high1
+  }
+  // high1 | sum | low0
+  const uint32_t delta = pow5bits(i) - pow5bits(base2);
+  result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[base] >> offset) & 1);
+  result[1] = shiftright128(sum, high1, delta);
+}
+
+// Computes 5^-i in the form required by Ryu, and stores it in the given pointer.
+static inline void double_computeInvPow5(const uint32_t i, uint64_t* const result) {
+  const uint32_t base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE;
+  const uint32_t base2 = base * POW5_TABLE_SIZE;
+  const uint32_t offset = base2 - i;
+  const uint64_t* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2
+  if (offset == 0) {
+    result[0] = mul[0];
+    result[1] = mul[1];
+    return;
+  }
+  const uint64_t m = DOUBLE_POW5_TABLE[offset];
+  uint64_t high1;
+  const uint64_t low1 = umul128(m, mul[1], &high1);
+  uint64_t high0;
+  const uint64_t low0 = umul128(m, mul[0] - 1, &high0);
+  const uint64_t sum = high0 + low1;
+  if (sum < high0) {
+    ++high1; // overflow into high1
+  }
+  // high1 | sum | low0
+  const uint32_t delta = pow5bits(base2) - pow5bits(i);
+  result[0] = shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
+  result[1] = shiftright128(sum, high1, delta);
+}
+
+#endif // defined(HAS_UINT128)
+
+#endif // defined(RYU_OPTIMIZE_SIZE)
+
+#endif // RYU_D2S_H
diff --git a/src/common/d2s_full_table.h b/src/common/d2s_full_table.h
new file mode 100644
index 0000000000..6f062b4595
--- /dev/null
+++ b/src/common/d2s_full_table.h
@@ -0,0 +1,338 @@
+// Copyright 2018 Ulf Adams
+//
+// The contents of this file may be used under the terms of the Apache License,
+// Version 2.0.
+//
+//    (See accompanying file LICENSE-Apache or copy at
+//     http://www.apache.org/licenses/LICENSE-2.0)
+//
+// Alternatively, the contents of this file may be used under the terms of
+// the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE-Boost or copy at
+//     https://www.boost.org/LICENSE_1_0.txt)
+//
+// Unless required by applicable law or agreed to in writing, this software
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
+#ifndef RYU_D2S_FULL_TABLE_H
+#define RYU_D2S_FULL_TABLE_H
+
+#include <stdint.h>
+
+// These tables are generated by PrintDoubleLookupTable.
+static const uint64_t DOUBLE_POW5_INV_SPLIT[292][2] = {
+ {                    1u, 288230376151711744u }, {  3689348814741910324u, 230584300921369395u },
+ {  2951479051793528259u, 184467440737095516u }, { 17118578500402463900u, 147573952589676412u },
+ { 12632330341676300947u, 236118324143482260u }, { 10105864273341040758u, 188894659314785808u },
+ { 15463389048156653253u, 151115727451828646u }, { 17362724847566824558u, 241785163922925834u },
+ { 17579528692795369969u, 193428131138340667u }, {  6684925324752475329u, 154742504910672534u },
+ { 18074578149087781173u, 247588007857076054u }, { 18149011334012135262u, 198070406285660843u },
+ {  3451162622983977240u, 158456325028528675u }, {  5521860196774363583u, 253530120045645880u },
+ {  4417488157419490867u, 202824096036516704u }, {  7223339340677503017u, 162259276829213363u },
+ {  7867994130342094503u, 259614842926741381u }, {  2605046489531765280u, 207691874341393105u },
+ {  2084037191625412224u, 166153499473114484u }, { 10713157136084480204u, 265845599156983174u },
+ { 12259874523609494487u, 212676479325586539u }, { 13497248433629505913u, 170141183460469231u },
+ { 14216899864323388813u, 272225893536750770u }, { 11373519891458711051u, 217780714829400616u },
+ {  5409467098425058518u, 174224571863520493u }, {  4965798542738183305u, 278759314981632789u },
+ {  7661987648932456967u, 223007451985306231u }, {  2440241304404055250u, 178405961588244985u },
+ {  3904386087046488400u, 285449538541191976u }, { 17880904128604832013u, 228359630832953580u },
+ { 14304723302883865611u, 182687704666362864u }, { 15133127457049002812u, 146150163733090291u },
+ { 16834306301794583852u, 233840261972944466u }, {  9778096226693756759u, 187072209578355573u },
+ { 15201174610838826053u, 149657767662684458u }, {  2185786488890659746u, 239452428260295134u },
+ {  5437978005854438120u, 191561942608236107u }, { 15418428848909281466u, 153249554086588885u },
+ {  6222742084545298729u, 245199286538542217u }, { 16046240111861969953u, 196159429230833773u },
+ {  1768945645263844993u, 156927543384667019u }, { 10209010661905972635u, 251084069415467230u },
+ {  8167208529524778108u, 200867255532373784u }, { 10223115638361732810u, 160693804425899027u },
+ {  1599589762411131202u, 257110087081438444u }, {  4969020624670815285u, 205688069665150755u },
+ {  3975216499736652228u, 164550455732120604u }, { 13739044029062464211u, 263280729171392966u },
+ {  7301886408508061046u, 210624583337114373u }, { 13220206756290269483u, 168499666669691498u },
+ { 17462981995322520850u, 269599466671506397u }, {  6591687966774196033u, 215679573337205118u },
+ { 12652048002903177473u, 172543658669764094u }, {  9175230360419352987u, 276069853871622551u },
+ {  3650835473593572067u, 220855883097298041u }, { 17678063637842498946u, 176684706477838432u },
+ { 13527506561580357021u, 282695530364541492u }, {  3443307619780464970u, 226156424291633194u },
+ {  6443994910566282300u, 180925139433306555u }, {  5155195928453025840u, 144740111546645244u },
+ { 15627011115008661990u, 231584178474632390u }, { 12501608892006929592u, 185267342779705912u },
+ {  2622589484121723027u, 148213874223764730u }, {  4196143174594756843u, 237142198758023568u },
+ { 10735612169159626121u, 189713759006418854u }, { 12277838550069611220u, 151771007205135083u },
+ { 15955192865369467629u, 242833611528216133u }, {  1696107848069843133u, 194266889222572907u },
+ { 12424932722681605476u, 155413511378058325u }, {  1433148282581017146u, 248661618204893321u },
+ { 15903913885032455010u, 198929294563914656u }, {  9033782293284053685u, 159143435651131725u },
+ { 14454051669254485895u, 254629497041810760u }, { 11563241335403588716u, 203703597633448608u },
+ { 16629290697806691620u, 162962878106758886u }, {   781423413297334329u, 260740604970814219u },
+ {  4314487545379777786u, 208592483976651375u }, {  3451590036303822229u, 166873987181321100u },
+ {  5522544058086115566u, 266998379490113760u }, {  4418035246468892453u, 213598703592091008u },
+ { 10913125826658934609u, 170878962873672806u }, { 10082303693170474728u, 273406340597876490u },
+ {  8065842954536379782u, 218725072478301192u }, { 17520720807854834795u, 174980057982640953u },
+ {  5897060404116273733u, 279968092772225526u }, {  1028299508551108663u, 223974474217780421u },
+ { 15580034865808528224u, 179179579374224336u }, { 17549358155809824511u, 286687326998758938u },
+ {  2971440080422128639u, 229349861599007151u }, { 17134547323305344204u, 183479889279205720u },
+ { 13707637858644275364u, 146783911423364576u }, { 14553522944347019935u, 234854258277383322u },
+ {  4264120725993795302u, 187883406621906658u }, { 10789994210278856888u, 150306725297525326u },
+ {  9885293106962350374u, 240490760476040522u }, {   529536856086059653u, 192392608380832418u },
+ {  7802327114352668369u, 153914086704665934u }, {  1415676938738538420u, 246262538727465495u },
+ {  1132541550990830736u, 197010030981972396u }, { 15663428499760305882u, 157608024785577916u },
+ { 17682787970132668764u, 252172839656924666u }, { 10456881561364224688u, 201738271725539733u },
+ { 15744202878575200397u, 161390617380431786u }, { 17812026976236499989u, 258224987808690858u },
+ {  3181575136763469022u, 206579990246952687u }, { 13613306553636506187u, 165263992197562149u },
+ { 10713244041592678929u, 264422387516099439u }, { 12259944048016053467u, 211537910012879551u },
+ {  6118606423670932450u, 169230328010303641u }, {  2411072648389671274u, 270768524816485826u },
+ { 16686253377679378312u, 216614819853188660u }, { 13349002702143502650u, 173291855882550928u },
+ { 17669055508687693916u, 277266969412081485u }, { 14135244406950155133u, 221813575529665188u },
+ {   240149081334393137u, 177450860423732151u }, { 11452284974360759988u, 283921376677971441u },
+ {  5472479164746697667u, 227137101342377153u }, { 11756680961281178780u, 181709681073901722u },
+ {  2026647139541122378u, 145367744859121378u }, { 18000030682233437097u, 232588391774594204u },
+ { 18089373360528660001u, 186070713419675363u }, {  3403452244197197031u, 148856570735740291u },
+ { 16513570034941246220u, 238170513177184465u }, { 13210856027952996976u, 190536410541747572u },
+ {  3189987192878576934u, 152429128433398058u }, {  1414630693863812771u, 243886605493436893u },
+ {  8510402184574870864u, 195109284394749514u }, { 10497670562401807014u, 156087427515799611u },
+ {  9417575270359070576u, 249739884025279378u }, { 14912757845771077107u, 199791907220223502u },
+ {  4551508647133041040u, 159833525776178802u }, { 10971762650154775986u, 255733641241886083u },
+ { 16156107749607641435u, 204586912993508866u }, {  9235537384944202825u, 163669530394807093u },
+ { 11087511001168814197u, 261871248631691349u }, { 12559357615676961681u, 209496998905353079u },
+ { 13736834907283479668u, 167597599124282463u }, { 18289587036911657145u, 268156158598851941u },
+ { 10942320814787415393u, 214524926879081553u }, { 16132554281313752961u, 171619941503265242u },
+ { 11054691591134363444u, 274591906405224388u }, { 16222450902391311402u, 219673525124179510u },
+ { 12977960721913049122u, 175738820099343608u }, { 17075388340318968271u, 281182112158949773u },
+ {  2592264228029443648u, 224945689727159819u }, {  5763160197165465241u, 179956551781727855u },
+ {  9221056315464744386u, 287930482850764568u }, { 14755542681855616155u, 230344386280611654u },
+ { 15493782960226403247u, 184275509024489323u }, {  1326979923955391628u, 147420407219591459u },
+ {  9501865507812447252u, 235872651551346334u }, { 11290841220991868125u, 188698121241077067u },
+ {  1653975347309673853u, 150958496992861654u }, { 10025058185179298811u, 241533595188578646u },
+ {  4330697733401528726u, 193226876150862917u }, { 14532604630946953951u, 154581500920690333u },
+ {  1116074521063664381u, 247330401473104534u }, {  4582208431592841828u, 197864321178483627u },
+ { 14733813189500004432u, 158291456942786901u }, { 16195403473716186445u, 253266331108459042u },
+ {  5577625149489128510u, 202613064886767234u }, {  8151448934333213131u, 162090451909413787u },
+ { 16731667109675051333u, 259344723055062059u }, { 17074682502481951390u, 207475778444049647u },
+ {  6281048372501740465u, 165980622755239718u }, {  6360328581260874421u, 265568996408383549u },
+ {  8777611679750609860u, 212455197126706839u }, { 10711438158542398211u, 169964157701365471u },
+ {  9759603424184016492u, 271942652322184754u }, { 11497031554089123517u, 217554121857747803u },
+ { 16576322872755119460u, 174043297486198242u }, { 11764721337440549842u, 278469275977917188u },
+ { 16790474699436260520u, 222775420782333750u }, { 13432379759549008416u, 178220336625867000u },
+ {  3045063541568861850u, 285152538601387201u }, { 17193446092222730773u, 228122030881109760u },
+ { 13754756873778184618u, 182497624704887808u }, { 18382503128506368341u, 145998099763910246u },
+ {  3586563302416817083u, 233596959622256395u }, {  2869250641933453667u, 186877567697805116u },
+ { 17052795772514404226u, 149502054158244092u }, { 12527077977055405469u, 239203286653190548u },
+ { 17400360011128145022u, 191362629322552438u }, {  2852241564676785048u, 153090103458041951u },
+ { 15631632947708587046u, 244944165532867121u }, {  8815957543424959314u, 195955332426293697u },
+ { 18120812478965698421u, 156764265941034957u }, { 14235904707377476180u, 250822825505655932u },
+ {  4010026136418160298u, 200658260404524746u }, { 17965416168102169531u, 160526608323619796u },
+ {  2919224165770098987u, 256842573317791675u }, {  2335379332616079190u, 205474058654233340u },
+ {  1868303466092863352u, 164379246923386672u }, {  6678634360490491686u, 263006795077418675u },
+ {  5342907488392393349u, 210405436061934940u }, {  4274325990713914679u, 168324348849547952u },
+ { 10528270399884173809u, 269318958159276723u }, { 15801313949391159694u, 215455166527421378u },
+ {  1573004715287196786u, 172364133221937103u }, { 17274202803427156150u, 275782613155099364u },
+ { 17508711057483635243u, 220626090524079491u }, { 10317620031244997871u, 176500872419263593u },
+ { 12818843235250086271u, 282401395870821749u }, { 13944423402941979340u, 225921116696657399u },
+ { 14844887537095493795u, 180736893357325919u }, { 15565258844418305359u, 144589514685860735u },
+ {  6457670077359736959u, 231343223497377177u }, { 16234182506113520537u, 185074578797901741u },
+ {  9297997190148906106u, 148059663038321393u }, { 11187446689496339446u, 236895460861314229u },
+ { 12639306166338981880u, 189516368689051383u }, { 17490142562555006151u, 151613094951241106u },
+ {  2158786396894637579u, 242580951921985771u }, { 16484424376483351356u, 194064761537588616u },
+ {  9498190686444770762u, 155251809230070893u }, { 11507756283569722895u, 248402894768113429u },
+ { 12895553841597688639u, 198722315814490743u }, { 17695140702761971558u, 158977852651592594u },
+ { 17244178680193423523u, 254364564242548151u }, { 10105994129412828495u, 203491651394038521u },
+ {  4395446488788352473u, 162793321115230817u }, { 10722063196803274280u, 260469313784369307u },
+ {  1198952927958798777u, 208375451027495446u }, { 15716557601334680315u, 166700360821996356u },
+ { 17767794532651667857u, 266720577315194170u }, { 14214235626121334286u, 213376461852155336u },
+ {  7682039686155157106u, 170701169481724269u }, {  1223217053622520399u, 273121871170758831u },
+ { 15735968901865657612u, 218497496936607064u }, { 16278123936234436413u, 174797997549285651u },
+ {   219556594781725998u, 279676796078857043u }, {  7554342905309201445u, 223741436863085634u },
+ {  9732823138989271479u, 178993149490468507u }, {   815121763415193074u, 286389039184749612u },
+ { 11720143854957885429u, 229111231347799689u }, { 13065463898708218666u, 183288985078239751u },
+ {  6763022304224664610u, 146631188062591801u }, {  3442138057275642729u, 234609900900146882u },
+ { 13821756890046245153u, 187687920720117505u }, { 11057405512036996122u, 150150336576094004u },
+ {  6623802375033462826u, 240240538521750407u }, { 16367088344252501231u, 192192430817400325u },
+ { 13093670675402000985u, 153753944653920260u }, {  2503129006933649959u, 246006311446272417u },
+ { 13070549649772650937u, 196805049157017933u }, { 17835137349301941396u, 157444039325614346u },
+ {  2710778055689733971u, 251910462920982955u }, {  2168622444551787177u, 201528370336786364u },
+ {  5424246770383340065u, 161222696269429091u }, {  1300097203129523457u, 257956314031086546u },
+ { 15797473021471260058u, 206365051224869236u }, {  8948629602435097724u, 165092040979895389u },
+ {  3249760919670425388u, 264147265567832623u }, {  9978506365220160957u, 211317812454266098u },
+ { 15361502721659949412u, 169054249963412878u }, {  2442311466204457120u, 270486799941460606u },
+ { 16711244431931206989u, 216389439953168484u }, { 17058344360286875914u, 173111551962534787u },
+ { 12535955717491360170u, 276978483140055660u }, { 10028764573993088136u, 221582786512044528u },
+ { 15401709288678291155u, 177266229209635622u }, {  9885339602917624555u, 283625966735416996u },
+ {  4218922867592189321u, 226900773388333597u }, { 14443184738299482427u, 181520618710666877u },
+ {  4175850161155765295u, 145216494968533502u }, { 10370709072591134795u, 232346391949653603u },
+ { 15675264887556728482u, 185877113559722882u }, {  5161514280561562140u, 148701690847778306u },
+ {   879725219414678777u, 237922705356445290u }, {   703780175531743021u, 190338164285156232u },
+ { 11631070584651125387u, 152270531428124985u }, {   162968861732249003u, 243632850284999977u },
+ { 11198421533611530172u, 194906280227999981u }, {  5269388412147313814u, 155925024182399985u },
+ {  8431021459435702103u, 249480038691839976u }, {  3055468352806651359u, 199584030953471981u },
+ { 17201769941212962380u, 159667224762777584u }, { 16454785461715008838u, 255467559620444135u },
+ { 13163828369372007071u, 204374047696355308u }, { 17909760324981426303u, 163499238157084246u },
+ {  2830174816776909822u, 261598781051334795u }, {  2264139853421527858u, 209279024841067836u },
+ { 16568707141704863579u, 167423219872854268u }, {  4373838538276319787u, 267877151796566830u },
+ {  3499070830621055830u, 214301721437253464u }, {  6488605479238754987u, 171441377149802771u },
+ {  3003071137298187333u, 274306203439684434u }, {  6091805724580460189u, 219444962751747547u },
+ { 15941491023890099121u, 175555970201398037u }, { 10748990379256517301u, 280889552322236860u },
+ {  8599192303405213841u, 224711641857789488u }, { 14258051472207991719u, 179769313486231590u }
+};
+
+static const uint64_t DOUBLE_POW5_SPLIT[326][2] = {
+ {                    0u,  72057594037927936u }, {                    0u,  90071992547409920u },
+ {                    0u, 112589990684262400u }, {                    0u, 140737488355328000u },
+ {                    0u,  87960930222080000u }, {                    0u, 109951162777600000u },
+ {                    0u, 137438953472000000u }, {                    0u,  85899345920000000u },
+ {                    0u, 107374182400000000u }, {                    0u, 134217728000000000u },
+ {                    0u,  83886080000000000u }, {                    0u, 104857600000000000u },
+ {                    0u, 131072000000000000u }, {                    0u,  81920000000000000u },
+ {                    0u, 102400000000000000u }, {                    0u, 128000000000000000u },
+ {                    0u,  80000000000000000u }, {                    0u, 100000000000000000u },
+ {                    0u, 125000000000000000u }, {                    0u,  78125000000000000u },
+ {                    0u,  97656250000000000u }, {                    0u, 122070312500000000u },
+ {                    0u,  76293945312500000u }, {                    0u,  95367431640625000u },
+ {                    0u, 119209289550781250u }, {  4611686018427387904u,  74505805969238281u },
+ { 10376293541461622784u,  93132257461547851u }, {  8358680908399640576u, 116415321826934814u },
+ {   612489549322387456u,  72759576141834259u }, { 14600669991935148032u,  90949470177292823u },
+ { 13639151471491547136u, 113686837721616029u }, {  3213881284082270208u, 142108547152020037u },
+ {  4314518811765112832u,  88817841970012523u }, {   781462496279003136u, 111022302462515654u },
+ { 10200200157203529728u, 138777878078144567u }, { 13292654125893287936u,  86736173798840354u },
+ {  7392445620511834112u, 108420217248550443u }, {  4628871007212404736u, 135525271560688054u },
+ { 16728102434789916672u,  84703294725430033u }, {  7075069988205232128u, 105879118406787542u },
+ { 18067209522111315968u, 132348898008484427u }, {  8986162942105878528u,  82718061255302767u },
+ {  6621017659204960256u, 103397576569128459u }, {  3664586055578812416u, 129246970711410574u },
+ { 16125424340018921472u,  80779356694631608u }, {  1710036351314100224u, 100974195868289511u },
+ { 15972603494424788992u, 126217744835361888u }, {  9982877184015493120u,  78886090522101180u },
+ { 12478596480019366400u,  98607613152626475u }, { 10986559581596820096u, 123259516440783094u },
+ {  2254913720070624656u,  77037197775489434u }, { 12042014186943056628u,  96296497219361792u },
+ { 15052517733678820785u, 120370621524202240u }, {  9407823583549262990u,  75231638452626400u },
+ { 11759779479436578738u,  94039548065783000u }, { 14699724349295723422u, 117549435082228750u },
+ {  4575641699882439235u,  73468396926392969u }, { 10331238143280436948u,  91835496157991211u },
+ {  8302361660673158281u, 114794370197489014u }, {  1154580038986672043u, 143492962746861268u },
+ {  9944984561221445835u,  89683101716788292u }, { 12431230701526807293u, 112103877145985365u },
+ {  1703980321626345405u, 140129846432481707u }, { 17205888765512323542u,  87581154020301066u },
+ { 12283988920035628619u, 109476442525376333u }, {  1519928094762372062u, 136845553156720417u },
+ { 12479170105294952299u,  85528470722950260u }, { 15598962631618690374u, 106910588403687825u },
+ {  5663645234241199255u, 133638235504609782u }, { 17374836326682913246u,  83523897190381113u },
+ {  7883487353071477846u, 104404871487976392u }, {  9854359191339347308u, 130506089359970490u },
+ { 10770660513014479971u,  81566305849981556u }, { 13463325641268099964u, 101957882312476945u },
+ {  2994098996302961243u, 127447352890596182u }, { 15706369927971514489u,  79654595556622613u },
+ {  5797904354682229399u,  99568244445778267u }, {  2635694424925398845u, 124460305557222834u },
+ {  6258995034005762182u,  77787690973264271u }, {  3212057774079814824u,  97234613716580339u },
+ { 17850130272881932242u, 121543267145725423u }, { 18073860448192289507u,  75964541966078389u },
+ {  8757267504958198172u,  94955677457597987u }, {  6334898362770359811u, 118694596821997484u },
+ { 13182683513586250689u,  74184123013748427u }, { 11866668373555425458u,  92730153767185534u },
+ {  5609963430089506015u, 115912692208981918u }, { 17341285199088104971u,  72445432630613698u },
+ { 12453234462005355406u,  90556790788267123u }, { 10954857059079306353u, 113195988485333904u },
+ { 13693571323849132942u, 141494985606667380u }, { 17781854114260483896u,  88434366004167112u },
+ {  3780573569116053255u, 110542957505208891u }, {   114030942967678664u, 138178696881511114u },
+ {  4682955357782187069u,  86361685550944446u }, { 15077066234082509644u, 107952106938680557u },
+ {  5011274737320973344u, 134940133673350697u }, { 14661261756894078100u,  84337583545844185u },
+ {  4491519140835433913u, 105421979432305232u }, {  5614398926044292391u, 131777474290381540u },
+ { 12732371365632458552u,  82360921431488462u }, {  6692092170185797382u, 102951151789360578u },
+ { 17588487249587022536u, 128688939736700722u }, { 15604490549419276989u,  80430587335437951u },
+ { 14893927168346708332u, 100538234169297439u }, { 14005722942005997511u, 125672792711621799u },
+ { 15671105866394830300u,  78545495444763624u }, {  1142138259283986260u,  98181869305954531u },
+ { 15262730879387146537u, 122727336632443163u }, {  7233363790403272633u,  76704585395276977u },
+ { 13653390756431478696u,  95880731744096221u }, {  3231680390257184658u, 119850914680120277u },
+ {  4325643253124434363u,  74906821675075173u }, { 10018740084832930858u,  93633527093843966u },
+ {  3300053069186387764u, 117041908867304958u }, { 15897591223523656064u,  73151193042065598u },
+ { 10648616992549794273u,  91438991302581998u }, {  4087399203832467033u, 114298739128227498u },
+ { 14332621041645359599u, 142873423910284372u }, { 18181260187883125557u,  89295889943927732u },
+ {  4279831161144355331u, 111619862429909666u }, { 14573160988285219972u, 139524828037387082u },
+ { 13719911636105650386u,  87203017523366926u }, {  7926517508277287175u, 109003771904208658u },
+ {   684774848491833161u, 136254714880260823u }, {  7345513307948477581u,  85159196800163014u },
+ { 18405263671790372785u, 106448996000203767u }, { 18394893571310578077u, 133061245000254709u },
+ { 13802651491282805250u,  83163278125159193u }, {  3418256308821342851u, 103954097656448992u },
+ {  4272820386026678563u, 129942622070561240u }, {  2670512741266674102u,  81214138794100775u },
+ { 17173198981865506339u, 101517673492625968u }, {  3019754653622331308u, 126897091865782461u },
+ {  4193189667727651020u,  79310682416114038u }, { 14464859121514339583u,  99138353020142547u },
+ { 13469387883465536574u, 123922941275178184u }, {  8418367427165960359u,  77451838296986365u },
+ { 15134645302384838353u,  96814797871232956u }, {   471562554271496325u, 121018497339041196u },
+ {  9518098633274461011u,  75636560836900747u }, {  7285937273165688360u,  94545701046125934u },
+ { 18330793628311886258u, 118182126307657417u }, {  4539216990053847055u,  73863828942285886u },
+ { 14897393274422084627u,  92329786177857357u }, {  4786683537745442072u, 115412232722321697u },
+ { 14520892257159371055u,  72132645451451060u }, { 18151115321449213818u,  90165806814313825u },
+ {  8853836096529353561u, 112707258517892282u }, {  1843923083806916143u, 140884073147365353u },
+ { 12681666973447792349u,  88052545717103345u }, {  2017025661527576725u, 110065682146379182u },
+ { 11744654113764246714u, 137582102682973977u }, {   422879793461572340u,  85988814176858736u },
+ {   528599741826965425u, 107486017721073420u }, {   660749677283706782u, 134357522151341775u },
+ {  7330497575943398595u,  83973451344588609u }, { 13774807988356636147u, 104966814180735761u },
+ {  3383451930163631472u, 131208517725919702u }, { 15949715511634433382u,  82005323578699813u },
+ {  6102086334260878016u, 102506654473374767u }, {  3015921899398709616u, 128133318091718459u },
+ { 18025852251620051174u,  80083323807324036u }, {  4085571240815512351u, 100104154759155046u },
+ { 14330336087874166247u, 125130193448943807u }, { 15873989082562435760u,  78206370905589879u },
+ { 15230800334775656796u,  97757963631987349u }, {  5203442363187407284u, 122197454539984187u },
+ {   946308467778435600u,  76373409087490117u }, {  5794571603150432404u,  95466761359362646u },
+ { 16466586540792816313u, 119333451699203307u }, {  7985773578781816244u,  74583407312002067u },
+ {  5370530955049882401u,  93229259140002584u }, {  6713163693812353001u, 116536573925003230u },
+ { 18030785363914884337u,  72835358703127018u }, { 13315109668038829614u,  91044198378908773u },
+ {  2808829029766373305u, 113805247973635967u }, { 17346094342490130344u, 142256559967044958u },
+ {  6229622945628943561u,  88910349979403099u }, {  3175342663608791547u, 111137937474253874u },
+ { 13192550366365765242u, 138922421842817342u }, {  3633657960551215372u,  86826513651760839u },
+ { 18377130505971182927u, 108533142064701048u }, {  4524669058754427043u, 135666427580876311u },
+ {  9745447189362598758u,  84791517238047694u }, {  2958436949848472639u, 105989396547559618u },
+ { 12921418224165366607u, 132486745684449522u }, { 12687572408530742033u,  82804216052780951u },
+ { 11247779492236039638u, 103505270065976189u }, {   224666310012885835u, 129381587582470237u },
+ {  2446259452971747599u,  80863492239043898u }, { 12281196353069460307u, 101079365298804872u },
+ { 15351495441336825384u, 126349206623506090u }, { 14206370669262903769u,  78968254139691306u },
+ {  8534591299723853903u,  98710317674614133u }, { 15279925143082205283u, 123387897093267666u },
+ { 14161639232853766206u,  77117435683292291u }, { 13090363022639819853u,  96396794604115364u },
+ { 16362953778299774816u, 120495993255144205u }, { 12532689120651053212u,  75309995784465128u },
+ { 15665861400813816515u,  94137494730581410u }, { 10358954714162494836u, 117671868413226763u },
+ {  4168503687137865320u,  73544917758266727u }, {   598943590494943747u,  91931147197833409u },
+ {  5360365506546067587u, 114913933997291761u }, { 11312142901609972388u, 143642417496614701u },
+ {  9375932322719926695u,  89776510935384188u }, { 11719915403399908368u, 112220638669230235u },
+ { 10038208235822497557u, 140275798336537794u }, { 10885566165816448877u,  87672373960336121u },
+ { 18218643725697949000u, 109590467450420151u }, { 18161618638695048346u, 136988084313025189u },
+ { 13656854658398099168u,  85617552695640743u }, { 12459382304570236056u, 107021940869550929u },
+ {  1739169825430631358u, 133777426086938662u }, { 14922039196176308311u,  83610891304336663u },
+ { 14040862976792997485u, 104513614130420829u }, {  3716020665709083144u, 130642017663026037u },
+ {  4628355925281870917u,  81651261039391273u }, { 10397130925029726550u, 102064076299239091u },
+ {  8384727637859770284u, 127580095374048864u }, {  5240454773662356427u,  79737559608780540u },
+ {  6550568467077945534u,  99671949510975675u }, {  3576524565420044014u, 124589936888719594u },
+ {  6847013871814915412u,  77868710555449746u }, { 17782139376623420074u,  97335888194312182u },
+ { 13004302183924499284u, 121669860242890228u }, { 17351060901807587860u,  76043662651806392u },
+ {  3242082053549933210u,  95054578314757991u }, { 17887660622219580224u, 118818222893447488u },
+ { 11179787888887237640u,  74261389308404680u }, { 13974734861109047050u,  92826736635505850u },
+ {  8245046539531533005u, 116033420794382313u }, { 16682369133275677888u,  72520887996488945u },
+ {  7017903361312433648u,  90651109995611182u }, { 17995751238495317868u, 113313887494513977u },
+ {  8659630992836983623u, 141642359368142472u }, {  5412269370523114764u,  88526474605089045u },
+ { 11377022731581281359u, 110658093256361306u }, {  4997906377621825891u, 138322616570451633u },
+ { 14652906532082110942u,  86451635356532270u }, {  9092761128247862869u, 108064544195665338u },
+ {  2142579373455052779u, 135080680244581673u }, { 12868327154477877747u,  84425425152863545u },
+ {  2250350887815183471u, 105531781441079432u }, {  2812938609768979339u, 131914726801349290u },
+ {  6369772649532999991u,  82446704250843306u }, { 17185587848771025797u, 103058380313554132u },
+ {  3035240737254230630u, 128822975391942666u }, {  6508711479211282048u,  80514359619964166u },
+ { 17359261385868878368u, 100642949524955207u }, { 17087390713908710056u, 125803686906194009u },
+ {  3762090168551861929u,  78627304316371256u }, {  4702612710689827411u,  98284130395464070u },
+ { 15101637925217060072u, 122855162994330087u }, { 16356052730901744401u,  76784476871456304u },
+ {  1998321839917628885u,  95980596089320381u }, {  7109588318324424010u, 119975745111650476u },
+ { 13666864735807540814u,  74984840694781547u }, { 12471894901332038114u,  93731050868476934u },
+ {  6366496589810271835u, 117163813585596168u }, {  3979060368631419896u,  73227383490997605u },
+ {  9585511479216662775u,  91534229363747006u }, {  2758517312166052660u, 114417786704683758u },
+ { 12671518677062341634u, 143022233380854697u }, {  1002170145522881665u,  89388895863034186u },
+ { 10476084718758377889u, 111736119828792732u }, { 13095105898447972362u, 139670149785990915u },
+ {  5878598177316288774u,  87293843616244322u }, { 16571619758500136775u, 109117304520305402u },
+ { 11491152661270395161u, 136396630650381753u }, {   264441385652915120u,  85247894156488596u },
+ {   330551732066143900u, 106559867695610745u }, {  5024875683510067779u, 133199834619513431u },
+ { 10058076329834874218u,  83249896637195894u }, {  3349223375438816964u, 104062370796494868u },
+ {  4186529219298521205u, 130077963495618585u }, { 14145795808130045513u,  81298727184761615u },
+ { 13070558741735168987u, 101623408980952019u }, { 11726512408741573330u, 127029261226190024u },
+ {  7329070255463483331u,  79393288266368765u }, { 13773023837756742068u,  99241610332960956u },
+ { 17216279797195927585u, 124052012916201195u }, {  8454331864033760789u,  77532508072625747u },
+ {  5956228811614813082u,  96915635090782184u }, {  7445286014518516353u, 121144543863477730u },
+ {  9264989777501460624u,  75715339914673581u }, { 16192923240304213684u,  94644174893341976u },
+ {  1794409976670715490u, 118305218616677471u }, {  8039035263060279037u,  73940761635423419u },
+ {  5437108060397960892u,  92425952044279274u }, { 16019757112352226923u, 115532440055349092u },
+ {   788976158365366019u,  72207775034593183u }, { 14821278253238871236u,  90259718793241478u },
+ {  9303225779693813237u, 112824648491551848u }, { 11629032224617266546u, 141030810614439810u },
+ { 11879831158813179495u,  88144256634024881u }, {  1014730893234310657u, 110180320792531102u },
+ { 10491785653397664129u, 137725400990663877u }, {  8863209042587234033u,  86078375619164923u },
+ {  6467325284806654637u, 107597969523956154u }, { 17307528642863094104u, 134497461904945192u },
+ { 10817205401789433815u,  84060913690590745u }, { 18133192770664180173u, 105076142113238431u },
+ { 18054804944902837312u, 131345177641548039u }, { 18201782118205355176u,  82090736025967524u },
+ {  4305483574047142354u, 102613420032459406u }, { 14605226504413703751u, 128266775040574257u },
+ {  2210737537617482988u,  80166734400358911u }, { 16598479977304017447u, 100208418000448638u },
+ { 11524727934775246001u, 125260522500560798u }, {  2591268940807140847u,  78287826562850499u },
+ { 17074144231291089770u,  97859783203563123u }, { 16730994270686474309u, 122324729004453904u },
+ { 10456871419179046443u,  76452955627783690u }, {  3847717237119032246u,  95566194534729613u },
+ {  9421332564826178211u, 119457743168412016u }, {  5888332853016361382u,  74661089480257510u },
+ { 16583788103125227536u,  93326361850321887u }, { 16118049110479146516u, 116657952312902359u },
+ { 16991309721690548428u,  72911220195563974u }, { 12015765115258409727u,  91139025244454968u },
+ { 15019706394073012159u, 113923781555568710u }, {  9551260955736489391u, 142404726944460888u },
+ {  5969538097335305869u,  89002954340288055u }, {  2850236603241744433u, 111253692925360069u }
+};
+
+#endif // RYU_D2S_FULL_TABLE_H
diff --git a/src/common/d2s_intrinsics.h b/src/common/d2s_intrinsics.h
new file mode 100644
index 0000000000..54ac7bb002
--- /dev/null
+++ b/src/common/d2s_intrinsics.h
@@ -0,0 +1,156 @@
+// Copyright 2018 Ulf Adams
+//
+// The contents of this file may be used under the terms of the Apache License,
+// Version 2.0.
+//
+//    (See accompanying file LICENSE-Apache or copy at
+//     http://www.apache.org/licenses/LICENSE-2.0)
+//
+// Alternatively, the contents of this file may be used under the terms of
+// the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE-Boost or copy at
+//     https://www.boost.org/LICENSE_1_0.txt)
+//
+// Unless required by applicable law or agreed to in writing, this software
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
+#ifndef RYU_D2S_INTRINSICS_H
+#define RYU_D2S_INTRINSICS_H
+
+#include <assert.h>
+#include <stdint.h>
+
+#include "ryu_common.h"
+
+#if defined(HAS_64_BIT_INTRINSICS)
+
+#include <intrin.h>
+
+static inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
+  return _umul128(a, b, productHi);
+}
+
+static inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
+  // For the __shiftright128 intrinsic, the shift value is always
+  // modulo 64.
+  // In the current implementation of the double-precision version
+  // of Ryu, the shift value is always < 64. (In the case
+  // RYU_OPTIMIZE_SIZE == 0, the shift value is in the range [49, 58].
+  // Otherwise in the range [2, 59].)
+  // Check this here in case a future change requires larger shift
+  // values. In this case this function needs to be adjusted.
+  assert(dist < 64);
+  return __shiftright128(lo, hi, (unsigned char) dist);
+}
+
+#else // defined(HAS_64_BIT_INTRINSICS)
+
+static inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
+  // The casts here help MSVC to avoid calls to the __allmul library function.
+  const uint32_t aLo = (uint32_t)a;
+  const uint32_t aHi = (uint32_t)(a >> 32);
+  const uint32_t bLo = (uint32_t)b;
+  const uint32_t bHi = (uint32_t)(b >> 32);
+
+  const uint64_t b00 = (uint64_t)aLo * bLo;
+  const uint64_t b01 = (uint64_t)aLo * bHi;
+  const uint64_t b10 = (uint64_t)aHi * bLo;
+  const uint64_t b11 = (uint64_t)aHi * bHi;
+
+  const uint32_t b00Lo = (uint32_t)b00;
+  const uint32_t b00Hi = (uint32_t)(b00 >> 32);
+
+  const uint64_t mid1 = b10 + b00Hi;
+  const uint32_t mid1Lo = (uint32_t)(mid1);
+  const uint32_t mid1Hi = (uint32_t)(mid1 >> 32);
+
+  const uint64_t mid2 = b01 + mid1Lo;
+  const uint32_t mid2Lo = (uint32_t)(mid2);
+  const uint32_t mid2Hi = (uint32_t)(mid2 >> 32);
+
+  const uint64_t pHi = b11 + mid1Hi + mid2Hi;
+  const uint64_t pLo = ((uint64_t)mid2Lo << 32) + b00Lo;
+
+  *productHi = pHi;
+  return pLo;
+}
+
+static inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
+  // We don't need to handle the case dist >= 64 here (see above).
+  assert(dist < 64);
+#if defined(RYU_OPTIMIZE_SIZE) || !defined(RYU_32_BIT_PLATFORM)
+  assert(dist > 0);
+  return (hi << (64 - dist)) | (lo >> dist);
+#else
+  // Avoid a 64-bit shift by taking advantage of the range of shift values.
+  assert(dist >= 32);
+  return (hi << (64 - dist)) | ((uint32_t)(lo >> 32) >> (dist - 32));
+#endif
+}
+
+#endif // defined(HAS_64_BIT_INTRINSICS)
+
+#ifdef RYU_32_BIT_PLATFORM
+
+// Returns the high 64 bits of the 128-bit product of a and b.
+static inline uint64_t umulh(const uint64_t a, const uint64_t b) {
+  // Reuse the umul128 implementation.
+  // Optimizers will likely eliminate the instructions used to compute the
+  // low part of the product.
+  uint64_t hi;
+  umul128(a, b, &hi);
+  return hi;
+}
+
+// On 32-bit platforms, compilers typically generate calls to library
+// functions for 64-bit divisions, even if the divisor is a constant.
+//
+// E.g.:
+// https://bugs.llvm.org/show_bug.cgi?id=37932
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=17958
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=37443
+//
+// The functions here perform division-by-constant using multiplications
+// in the same way as 64-bit compilers would do.
+//
+// NB:
+// The multipliers and shift values are the ones generated by clang x64
+// for expressions like x/5, x/10, etc.
+
+static inline uint64_t div5(const uint64_t x) {
+  return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 2;
+}
+
+static inline uint64_t div10(const uint64_t x) {
+  return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 3;
+}
+
+static inline uint64_t div100(const uint64_t x) {
+  return umulh(x >> 2, 0x28F5C28F5C28F5C3u) >> 2;
+}
+
+static inline uint64_t div100000000(const uint64_t x) {
+  return umulh(x, 0xABCC77118461CEFDu) >> 26;
+}
+
+#else // RYU_32_BIT_PLATFORM
+
+static inline uint64_t div5(const uint64_t x) {
+  return x / 5;
+}
+
+static inline uint64_t div10(const uint64_t x) {
+  return x / 10;
+}
+
+static inline uint64_t div100(const uint64_t x) {
+  return x / 100;
+}
+
+static inline uint64_t div100000000(const uint64_t x) {
+  return x / 100000000;
+}
+
+#endif // RYU_32_BIT_PLATFORM
+
+#endif // RYU_D2S_INTRINSICS_H
diff --git a/src/common/digit_table.h b/src/common/digit_table.h
new file mode 100644
index 0000000000..02219bc6d5
--- /dev/null
+++ b/src/common/digit_table.h
@@ -0,0 +1,35 @@
+// Copyright 2018 Ulf Adams
+//
+// The contents of this file may be used under the terms of the Apache License,
+// Version 2.0.
+//
+//    (See accompanying file LICENSE-Apache or copy at
+//     http://www.apache.org/licenses/LICENSE-2.0)
+//
+// Alternatively, the contents of this file may be used under the terms of
+// the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE-Boost or copy at
+//     https://www.boost.org/LICENSE_1_0.txt)
+//
+// Unless required by applicable law or agreed to in writing, this software
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
+#ifndef RYU_DIGIT_TABLE_H
+#define RYU_DIGIT_TABLE_H
+
+// A table of all two-digit numbers. This is used to speed up decimal digit
+// generation by copying pairs of digits into the final output.
+static const char DIGIT_TABLE[200] = {
+  '0','0','0','1','0','2','0','3','0','4','0','5','0','6','0','7','0','8','0','9',
+  '1','0','1','1','1','2','1','3','1','4','1','5','1','6','1','7','1','8','1','9',
+  '2','0','2','1','2','2','2','3','2','4','2','5','2','6','2','7','2','8','2','9',
+  '3','0','3','1','3','2','3','3','3','4','3','5','3','6','3','7','3','8','3','9',
+  '4','0','4','1','4','2','4','3','4','4','4','5','4','6','4','7','4','8','4','9',
+  '5','0','5','1','5','2','5','3','5','4','5','5','5','6','5','7','5','8','5','9',
+  '6','0','6','1','6','2','6','3','6','4','6','5','6','6','6','7','6','8','6','9',
+  '7','0','7','1','7','2','7','3','7','4','7','5','7','6','7','7','7','8','7','9',
+  '8','0','8','1','8','2','8','3','8','4','8','5','8','6','8','7','8','8','8','9',
+  '9','0','9','1','9','2','9','3','9','4','9','5','9','6','9','7','9','8','9','9'
+};
+
+#endif // RYU_DIGIT_TABLE_H
diff --git a/src/common/f2s.c b/src/common/f2s.c
new file mode 100644
index 0000000000..cd690d57b5
--- /dev/null
+++ b/src/common/f2s.c
@@ -0,0 +1,453 @@
+// Copyright 2018 Ulf Adams
+//
+// The contents of this file may be used under the terms of the Apache License,
+// Version 2.0.
+//
+//    (See accompanying file LICENSE-Apache or copy at
+//     http://www.apache.org/licenses/LICENSE-2.0)
+//
+// Alternatively, the contents of this file may be used under the terms of
+// the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE-Boost or copy at
+//     https://www.boost.org/LICENSE_1_0.txt)
+//
+// Unless required by applicable law or agreed to in writing, this software
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
+
+// Runtime compiler options:
+// -DRYU_DEBUG Generate verbose debugging output to stdout.
+
+#define NDEBUG
+
+#include "common/ryu.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#ifdef RYU_DEBUG
+#include <stdio.h>
+#endif
+
+#include "ryu_common.h"
+#include "digit_table.h"
+
+#define FLOAT_MANTISSA_BITS 23
+#define FLOAT_EXPONENT_BITS 8
+
+// This table is generated by PrintFloatLookupTable.
+#define FLOAT_POW5_INV_BITCOUNT 59
+static const uint64_t FLOAT_POW5_INV_SPLIT[31] = {
+  576460752303423489u, 461168601842738791u, 368934881474191033u, 295147905179352826u,
+  472236648286964522u, 377789318629571618u, 302231454903657294u, 483570327845851670u,
+  386856262276681336u, 309485009821345069u, 495176015714152110u, 396140812571321688u,
+  316912650057057351u, 507060240091291761u, 405648192073033409u, 324518553658426727u,
+  519229685853482763u, 415383748682786211u, 332306998946228969u, 531691198313966350u,
+  425352958651173080u, 340282366920938464u, 544451787073501542u, 435561429658801234u,
+  348449143727040987u, 557518629963265579u, 446014903970612463u, 356811923176489971u,
+  570899077082383953u, 456719261665907162u, 365375409332725730u
+};
+#define FLOAT_POW5_BITCOUNT 61
+static const uint64_t FLOAT_POW5_SPLIT[47] = {
+ 1152921504606846976u, 1441151880758558720u, 1801439850948198400u, 2251799813685248000u,
+ 1407374883553280000u, 1759218604441600000u, 2199023255552000000u, 1374389534720000000u,
+ 1717986918400000000u, 2147483648000000000u, 1342177280000000000u, 1677721600000000000u,
+ 2097152000000000000u, 1310720000000000000u, 1638400000000000000u, 2048000000000000000u,
+ 1280000000000000000u, 1600000000000000000u, 2000000000000000000u, 1250000000000000000u,
+ 1562500000000000000u, 1953125000000000000u, 1220703125000000000u, 1525878906250000000u,
+ 1907348632812500000u, 1192092895507812500u, 1490116119384765625u, 1862645149230957031u,
+ 1164153218269348144u, 1455191522836685180u, 1818989403545856475u, 2273736754432320594u,
+ 1421085471520200371u, 1776356839400250464u, 2220446049250313080u, 1387778780781445675u,
+ 1734723475976807094u, 2168404344971008868u, 1355252715606880542u, 1694065894508600678u,
+ 2117582368135750847u, 1323488980084844279u, 1654361225106055349u, 2067951531382569187u,
+ 1292469707114105741u, 1615587133892632177u, 2019483917365790221u
+};
+
+static inline uint32_t pow5Factor(uint32_t value) {
+  uint32_t count = 0;
+  for (;;) {
+    assert(value != 0);
+    const uint32_t q = value / 5;
+    const uint32_t r = value % 5;
+    if (r != 0) {
+      break;
+    }
+    value = q;
+    ++count;
+  }
+  return count;
+}
+
+// Returns true if value is divisible by 5^p.
+static inline bool multipleOfPowerOf5(const uint32_t value, const uint32_t p) {
+  return pow5Factor(value) >= p;
+}
+
+// Returns true if value is divisible by 2^p.
+static inline bool multipleOfPowerOf2(const uint32_t value, const uint32_t p) {
+  // return __builtin_ctz(value) >= p;
+  return (value & ((1u << p) - 1)) == 0;
+}
+
+// It seems to be slightly faster to avoid uint128_t here, although the
+// generated code for uint128_t looks slightly nicer.
+static inline uint32_t mulShift(const uint32_t m, const uint64_t factor, const int32_t shift) {
+  assert(shift > 32);
+
+  // The casts here help MSVC to avoid calls to the __allmul library
+  // function.
+  const uint32_t factorLo = (uint32_t)(factor);
+  const uint32_t factorHi = (uint32_t)(factor >> 32);
+  const uint64_t bits0 = (uint64_t)m * factorLo;
+  const uint64_t bits1 = (uint64_t)m * factorHi;
+
+#ifdef RYU_32_BIT_PLATFORM
+  // On 32-bit platforms we can avoid a 64-bit shift-right since we only
+  // need the upper 32 bits of the result and the shift value is > 32.
+  const uint32_t bits0Hi = (uint32_t)(bits0 >> 32);
+  uint32_t bits1Lo = (uint32_t)(bits1);
+  uint32_t bits1Hi = (uint32_t)(bits1 >> 32);
+  bits1Lo += bits0Hi;
+  bits1Hi += (bits1Lo < bits0Hi);
+  const int32_t s = shift - 32;
+  return (bits1Hi << (32 - s)) | (bits1Lo >> s);
+#else // RYU_32_BIT_PLATFORM
+  const uint64_t sum = (bits0 >> 32) + bits1;
+  const uint64_t shiftedSum = sum >> (shift - 32);
+  assert(shiftedSum <= UINT32_MAX);
+  return (uint32_t) shiftedSum;
+#endif // RYU_32_BIT_PLATFORM
+}
+
+static inline uint32_t mulPow5InvDivPow2(const uint32_t m, const uint32_t q, const int32_t j) {
+  return mulShift(m, FLOAT_POW5_INV_SPLIT[q], j);
+}
+
+static inline uint32_t mulPow5divPow2(const uint32_t m, const uint32_t i, const int32_t j) {
+  return mulShift(m, FLOAT_POW5_SPLIT[i], j);
+}
+
+static inline uint32_t decimalLength(const uint32_t v) {
+  // Function precondition: v is not a 10-digit number.
+  // (9 digits are sufficient for round-tripping.)
+  assert(v < 1000000000);
+  if (v >= 100000000) { return 9; }
+  if (v >= 10000000) { return 8; }
+  if (v >= 1000000) { return 7; }
+  if (v >= 100000) { return 6; }
+  if (v >= 10000) { return 5; }
+  if (v >= 1000) { return 4; }
+  if (v >= 100) { return 3; }
+  if (v >= 10) { return 2; }
+  return 1;
+}
+
+// A floating decimal representing m * 10^e.
+typedef struct floating_decimal_32 {
+  uint32_t mantissa;
+  int32_t exponent;
+} floating_decimal_32;
+
+static inline floating_decimal_32 f2d(const uint32_t ieeeMantissa, const uint32_t ieeeExponent) {
+  const uint32_t bias = (1u << (FLOAT_EXPONENT_BITS - 1)) - 1;
+
+  int32_t e2;
+  uint32_t m2;
+  if (ieeeExponent == 0) {
+    // We subtract 2 so that the bounds computation has 2 additional bits.
+    e2 = 1 - bias - FLOAT_MANTISSA_BITS - 2;
+    m2 = ieeeMantissa;
+  } else {
+    e2 = ieeeExponent - bias - FLOAT_MANTISSA_BITS - 2;
+    m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa;
+  }
+  const bool even = (m2 & 1) == 0;
+  const bool acceptBounds = even;
+
+#ifdef RYU_DEBUG
+  printf("-> %u * 2^%d\n", m2, e2 + 2);
+#endif
+
+  // Step 2: Determine the interval of legal decimal representations.
+  const uint32_t mv = 4 * m2;
+  const uint32_t mp = 4 * m2 + 2;
+  // Implicit bool -> int conversion. True is 1, false is 0.
+  const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
+  const uint32_t mm = 4 * m2 - 1 - mmShift;
+
+  // Step 3: Convert to a decimal power base using 64-bit arithmetic.
+  uint32_t vr, vp, vm;
+  int32_t e10;
+  bool vmIsTrailingZeros = false;
+  bool vrIsTrailingZeros = false;
+  uint8_t lastRemovedDigit = 0;
+  if (e2 >= 0) {
+    const uint32_t q = log10Pow2(e2);
+    e10 = q;
+    const int32_t k = FLOAT_POW5_INV_BITCOUNT + pow5bits(q) - 1;
+    const int32_t i = -e2 + q + k;
+    vr = mulPow5InvDivPow2(mv, q, i);
+    vp = mulPow5InvDivPow2(mp, q, i);
+    vm = mulPow5InvDivPow2(mm, q, i);
+#ifdef RYU_DEBUG
+    printf("%u * 2^%d / 10^%u\n", mv, e2, q);
+    printf("V+=%u\nV =%u\nV-=%u\n", vp, vr, vm);
+#endif
+    if (q != 0 && (vp - 1) / 10 <= vm / 10) {
+      // We need to know one removed digit even if we are not going to loop below. We could use
+      // q = X - 1 above, except that would require 33 bits for the result, and we've found that
+      // 32-bit arithmetic is faster even on 64-bit machines.
+      const int32_t l = FLOAT_POW5_INV_BITCOUNT + pow5bits(q - 1) - 1;
+      lastRemovedDigit = (uint8_t) (mulPow5InvDivPow2(mv, q - 1, -e2 + q - 1 + l) % 10);
+    }
+    if (q <= 9) {
+      // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well.
+      // Only one of mp, mv, and mm can be a multiple of 5, if any.
+      if (mv % 5 == 0) {
+        vrIsTrailingZeros = multipleOfPowerOf5(mv, q);
+      } else if (acceptBounds) {
+        vmIsTrailingZeros = multipleOfPowerOf5(mm, q);
+      } else {
+        vp -= multipleOfPowerOf5(mp, q);
+      }
+    }
+  } else {
+    const uint32_t q = log10Pow5(-e2);
+    e10 = q + e2;
+    const int32_t i = -e2 - q;
+    const int32_t k = pow5bits(i) - FLOAT_POW5_BITCOUNT;
+    int32_t j = q - k;
+    vr = mulPow5divPow2(mv, i, j);
+    vp = mulPow5divPow2(mp, i, j);
+    vm = mulPow5divPow2(mm, i, j);
+#ifdef RYU_DEBUG
+    printf("%u * 5^%d / 10^%u\n", mv, -e2, q);
+    printf("%u %d %d %d\n", q, i, k, j);
+    printf("V+=%u\nV =%u\nV-=%u\n", vp, vr, vm);
+#endif
+    if (q != 0 && (vp - 1) / 10 <= vm / 10) {
+      j = q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT);
+      lastRemovedDigit = (uint8_t) (mulPow5divPow2(mv, i + 1, j) % 10);
+    }
+    if (q <= 1) {
+      // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
+      // mv = 4 * m2, so it always has at least two trailing 0 bits.
+      vrIsTrailingZeros = true;
+      if (acceptBounds) {
+        // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
+        vmIsTrailingZeros = mmShift == 1;
+      } else {
+        // mp = mv + 2, so it always has at least one trailing 0 bit.
+        --vp;
+      }
+    } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here.
+      vrIsTrailingZeros = multipleOfPowerOf2(mv, q - 1);
+#ifdef RYU_DEBUG
+      printf("vr is trailing zeros=%s\n", vrIsTrailingZeros ? "true" : "false");
+#endif
+    }
+  }
+#ifdef RYU_DEBUG
+  printf("e10=%d\n", e10);
+  printf("V+=%u\nV =%u\nV-=%u\n", vp, vr, vm);
+  printf("vm is trailing zeros=%s\n", vmIsTrailingZeros ? "true" : "false");
+  printf("vr is trailing zeros=%s\n", vrIsTrailingZeros ? "true" : "false");
+#endif
+
+  // Step 4: Find the shortest decimal representation in the interval of legal representations.
+  uint32_t removed = 0;
+  uint32_t output;
+  if (vmIsTrailingZeros || vrIsTrailingZeros) {
+    // General case, which happens rarely (~4.0%).
+    while (vp / 10 > vm / 10) {
+#ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=23106
+      // The compiler does not realize that vm % 10 can be computed from vm / 10
+      // as vm - (vm / 10) * 10.
+      vmIsTrailingZeros &= vm - (vm / 10) * 10 == 0;
+#else
+      vmIsTrailingZeros &= vm % 10 == 0;
+#endif
+      vrIsTrailingZeros &= lastRemovedDigit == 0;
+      lastRemovedDigit = (uint8_t) (vr % 10);
+      vr /= 10;
+      vp /= 10;
+      vm /= 10;
+      ++removed;
+    }
+#ifdef RYU_DEBUG
+    printf("V+=%u\nV =%u\nV-=%u\n", vp, vr, vm);
+    printf("d-10=%s\n", vmIsTrailingZeros ? "true" : "false");
+#endif
+    if (vmIsTrailingZeros) {
+      while (vm % 10 == 0) {
+        vrIsTrailingZeros &= lastRemovedDigit == 0;
+        lastRemovedDigit = (uint8_t) (vr % 10);
+        vr /= 10;
+        vp /= 10;
+        vm /= 10;
+        ++removed;
+      }
+    }
+#ifdef RYU_DEBUG
+    printf("%u %d\n", vr, lastRemovedDigit);
+    printf("vr is trailing zeros=%s\n", vrIsTrailingZeros ? "true" : "false");
+#endif
+    if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
+      // Round even if the exact number is .....50..0.
+      lastRemovedDigit = 4;
+    }
+    // We need to take vr + 1 if vr is outside bounds or we need to round up.
+    output = vr +
+        ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
+  } else {
+    // Specialized for the common case (~96.0%). Percentages below are relative to this.
+    // Loop iterations below (approximately):
+    // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01%
+    while (vp / 10 > vm / 10) {
+      lastRemovedDigit = (uint8_t) (vr % 10);
+      vr /= 10;
+      vp /= 10;
+      vm /= 10;
+      ++removed;
+    }
+#ifdef RYU_DEBUG
+    printf("%u %d\n", vr, lastRemovedDigit);
+    printf("vr is trailing zeros=%s\n", vrIsTrailingZeros ? "true" : "false");
+#endif
+    // We need to take vr + 1 if vr is outside bounds or we need to round up.
+    output = vr + (vr == vm || lastRemovedDigit >= 5);
+  }
+  const int32_t exp = e10 + removed;
+
+#ifdef RYU_DEBUG
+  printf("V+=%u\nV =%u\nV-=%u\n", vp, vr, vm);
+  printf("O=%u\n", output);
+  printf("EXP=%d\n", exp);
+#endif
+
+  floating_decimal_32 fd;
+  fd.exponent = exp;
+  fd.mantissa = output;
+  return fd;
+}
+
+static inline int to_chars(const floating_decimal_32 v, const bool sign, char* const result) {
+  // Step 5: Print the decimal representation.
+  int index = 0;
+  if (sign) {
+    result[index++] = '-';
+  }
+
+  uint32_t output = v.mantissa;
+  const uint32_t olength = decimalLength(output);
+
+#ifdef RYU_DEBUG
+  printf("DIGITS=%u\n", v.mantissa);
+  printf("OLEN=%u\n", olength);
+  printf("EXP=%u\n", v.exponent + olength);
+#endif
+
+  // Print the decimal digits.
+  // The following code is equivalent to:
+  // for (uint32_t i = 0; i < olength - 1; ++i) {
+  //   const uint32_t c = output % 10; output /= 10;
+  //   result[index + olength - i] = (char) ('0' + c);
+  // }
+  // result[index] = '0' + output % 10;
+  uint32_t i = 0;
+  while (output >= 10000) {
+#ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217
+    const uint32_t c = output - 10000 * (output / 10000);
+#else
+    const uint32_t c = output % 10000;
+#endif
+    output /= 10000;
+    const uint32_t c0 = (c % 100) << 1;
+    const uint32_t c1 = (c / 100) << 1;
+    memcpy(result + index + olength - i - 1, DIGIT_TABLE + c0, 2);
+    memcpy(result + index + olength - i - 3, DIGIT_TABLE + c1, 2);
+    i += 4;
+  }
+  if (output >= 100) {
+    const uint32_t c = (output % 100) << 1;
+    output /= 100;
+    memcpy(result + index + olength - i - 1, DIGIT_TABLE + c, 2);
+    i += 2;
+  }
+  if (output >= 10) {
+    const uint32_t c = output << 1;
+    // We can't use memcpy here: the decimal dot goes between these two digits.
+    result[index + olength - i] = DIGIT_TABLE[c + 1];
+    result[index] = DIGIT_TABLE[c];
+  } else {
+    result[index] = (char) ('0' + output);
+  }
+
+  // Print decimal point if needed.
+  if (olength > 1) {
+    result[index + 1] = '.';
+    index += olength + 1;
+  } else {
+    ++index;
+  }
+
+  // Print the exponent.
+  result[index++] = 'E';
+  int32_t exp = v.exponent + olength - 1;
+  if (exp < 0) {
+    result[index++] = '-';
+    exp = -exp;
+  }
+
+  if (exp >= 10) {
+    memcpy(result + index, DIGIT_TABLE + 2 * exp, 2);
+    index += 2;
+  } else {
+    result[index++] = (char) ('0' + exp);
+  }
+
+  return index;
+}
+
+int ryu_f2s_buffered_n(float f, char* result) {
+  // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
+  const uint32_t bits = float_to_bits(f);
+
+#ifdef RYU_DEBUG
+  printf("IN=");
+  for (int32_t bit = 31; bit >= 0; --bit) {
+    printf("%u", (bits >> bit) & 1);
+  }
+  printf("\n");
+#endif
+
+  // Decode bits into sign, mantissa, and exponent.
+  const bool ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0;
+  const uint32_t ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1);
+  const uint32_t ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1);
+
+  // Case distinction; exit early for the easy cases.
+  if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) {
+    return copy_special_str(result, ieeeSign, ieeeExponent, ieeeMantissa);
+  }
+
+  const floating_decimal_32 v = f2d(ieeeMantissa, ieeeExponent);
+  return to_chars(v, ieeeSign, result);
+}
+
+void ryu_f2s_buffered(float f, char* result) {
+  const int index = ryu_f2s_buffered_n(f, result);
+
+  // Terminate the string.
+  result[index] = '\0';
+}
+
+char* ryu_f2s(float f) {
+  char* const result = (char*) malloc(16);
+  ryu_f2s_buffered(f, result);
+  return result;
+}
diff --git a/src/common/ryu_common.h b/src/common/ryu_common.h
new file mode 100644
index 0000000000..a88661d1e9
--- /dev/null
+++ b/src/common/ryu_common.h
@@ -0,0 +1,81 @@
+// Copyright 2018 Ulf Adams
+//
+// The contents of this file may be used under the terms of the Apache License,
+// Version 2.0.
+//
+//    (See accompanying file LICENSE-Apache or copy at
+//     http://www.apache.org/licenses/LICENSE-2.0)
+//
+// Alternatively, the contents of this file may be used under the terms of
+// the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE-Boost or copy at
+//     https://www.boost.org/LICENSE_1_0.txt)
+//
+// Unless required by applicable law or agreed to in writing, this software
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
+#ifndef RYU_COMMON_H
+#define RYU_COMMON_H
+
+#include <assert.h>
+#include <stdint.h>
+
+#if defined(_M_IX86) || defined(_M_ARM)
+#define RYU_32_BIT_PLATFORM
+#endif
+
+// Returns e == 0 ? 1 : ceil(log_2(5^e)).
+static inline uint32_t pow5bits(const int32_t e) {
+  // This approximation works up to the point that the multiplication overflows at e = 3529.
+  // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater
+  // than 2^9297.
+  assert(e >= 0);
+  assert(e <= 3528);
+  return ((((uint32_t) e) * 1217359) >> 19) + 1;
+}
+
+// Returns floor(log_10(2^e)).
+static inline int32_t log10Pow2(const int32_t e) {
+  // The first value this approximation fails for is 2^1651 which is just greater than 10^297.
+  assert(e >= 0);
+  assert(e <= 1650);
+  return (int32_t) ((((uint32_t) e) * 78913) >> 18);
+}
+
+// Returns floor(log_10(5^e)).
+static inline int32_t log10Pow5(const int32_t e) {
+  // The first value this approximation fails for is 5^2621 which is just greater than 10^1832.
+  assert(e >= 0);
+  assert(e <= 2620);
+  return (int32_t) ((((uint32_t) e) * 732923) >> 20);
+}
+
+static inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa) {
+  if (mantissa) {
+    memcpy(result, "NaN", 3);
+    return 3;
+  }
+  if (sign) {
+    result[0] = '-';
+  }
+  if (exponent) {
+    memcpy(result + sign, "Infinity", 8);
+    return sign + 8;
+  }
+  memcpy(result + sign, "0E0", 3);
+  return sign + 3;
+}
+
+static inline uint32_t float_to_bits(const float f) {
+  uint32_t bits = 0;
+  memcpy(&bits, &f, sizeof(float));
+  return bits;
+}
+
+static inline uint64_t double_to_bits(const double d) {
+  uint64_t bits = 0;
+  memcpy(&bits, &d, sizeof(double));
+  return bits;
+}
+
+#endif // RYU_COMMON_H
diff --git a/src/include/common/ryu.h b/src/include/common/ryu.h
new file mode 100644
index 0000000000..3378086c4b
--- /dev/null
+++ b/src/include/common/ryu.h
@@ -0,0 +1,36 @@
+// Copyright 2018 Ulf Adams
+//
+// The contents of this file may be used under the terms of the Apache License,
+// Version 2.0.
+//
+//    (See accompanying file LICENSE-Apache or copy at
+//     http://www.apache.org/licenses/LICENSE-2.0)
+//
+// Alternatively, the contents of this file may be used under the terms of
+// the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE-Boost or copy at
+//     https://www.boost.org/LICENSE_1_0.txt)
+//
+// Unless required by applicable law or agreed to in writing, this software
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
+#ifndef RYU_H
+#define RYU_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int ryu_d2s_buffered_n(double f, char* result);
+void ryu_d2s_buffered(double f, char* result);
+char* ryu_d2s(double f);
+
+int ryu_f2s_buffered_n(float f, char* result);
+void ryu_f2s_buffered(float f, char* result);
+char* ryu_f2s(float f);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // RYU_H

Re: Performance improvements for src/port/snprintf.c

Reply via email to