https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105101
--- Comment #9 from Michael_S <already5chosen at yahoo dot com> ---
(In reply to Michael_S from comment #4)
> If you want a quick fix for immediate shipment then you can take this:
> 
> #include <math.h>
> #include <quadmath.h>
> 
> __float128 quick_and_dirty_sqrtq(__float128 x)
> {
>   if (isnanq(x))
>     return x;
> 
>   if (x==0)
>     return x;
> 
>   if (x < 0)
>     return nanq("");
> 
>   if (isinfq(x))
>     return x;
> 
>   int xExp;
>   x = frexpq(x, &xExp);
>   if (xExp & 1)
>     x *= 2.0; // x in [0.5:2.0)
> 
>   __float128 r = (__float128)(1.0/sqrt((double)x)); // r=rsqrt(x) estimate (53 bits)
>   r *= 1.5 - r*r*x*0.5; // NR iteration improves precision of r to 105.4 bits
> 
>   __float128 y = x*r; // y=sqrt(x) estimate (105.4 bits)
> 
>   // extended-precision NR iteration
>   __float128 yH = (double)y;
>   __float128 yL = y - yH;
> 
>   __float128 deltaX = x - yH*yH;
>   deltaX -= yH*yL*2;
>   deltaX -= yL*yL;
>   y += deltaX*r*0.5; // improve precision of y to ~210.2 bits. Not enough for perfect rounding, but not too bad
> 
>   return ldexpq(y, xExp >> 1);
> }
> 
> It is very slow, even slower than what you have now, which by itself is quite astonishingly slow.
> It is also not sufficiently precise for correct rounding in all cases.
> But, at least, the worst error is something like (0.5+2**-98) ULP, so you are unlikely to ever be caught by black-box testing.
> Its biggest advantage is extreme portability.
> It should run on all platforms where double == IEEE binary64 and __float128 == IEEE binary128.
> 
> Maybe a few days later I'll have a better variant for "good" 64-bit platforms, i.e. those where we have __int128.
> It would be 15-25 times faster than the variant above, and the rounding would be mathematically correct rather than just "impossible to catch" like above.
> But it would not run everywhere.
> Also, I want to give it away under an MIT or BSD license, rather than under the GPL.
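To illustrate what I mean by black-box testing: a harness like the sketch below feeds random bit patterns into both routines and compares the results bit for bit. (Only a sketch: it assumes quick_and_dirty_sqrtq above is compiled in the same program, uses libquadmath's sqrtq as the reference, and assumes the usual little-endian layout of __float128.) With a worst-case error of about (0.5+2**-98) ULP it is very unlikely to ever report a mismatch.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <quadmath.h>

__float128 quick_and_dirty_sqrtq(__float128 x); // the routine quoted above

int main(void)
{
  unsigned long nDiff = 0;
  for (long i = 0; i < 10000000; ++i) {
    // build a random positive, finite __float128 from two random 64-bit halves
    uint64_t w[2]; // w[1] holds sign+exponent on little-endian targets
    w[0] = ((uint64_t)rand() << 40) ^ ((uint64_t)rand() << 20) ^ (uint64_t)rand();
    w[1] = ((uint64_t)rand() << 40) ^ ((uint64_t)rand() << 20) ^ (uint64_t)rand();
    w[1] &= ~((uint64_t)1 << 63);            // clear the sign bit
    if (((w[1] >> 48) & 0x7FFF) == 0x7FFF)   // skip Inf/NaN encodings
      continue;
    __float128 x;
    memcpy(&x, w, sizeof(x));

    __float128 ref = sqrtq(x);               // libquadmath reference
    __float128 res = quick_and_dirty_sqrtq(x);
    if (memcmp(&ref, &res, sizeof(ref)) != 0) {
      char bx[64], br[64], bq[64];
      quadmath_snprintf(bx, sizeof(bx), "%.36Qe", x);
      quadmath_snprintf(br, sizeof(br), "%.36Qe", ref);
      quadmath_snprintf(bq, sizeof(bq), "%.36Qe", res);
      printf("mismatch: x=%s ref=%s got=%s\n", bx, br, bq);
      nDiff++;
    }
  }
  printf("%lu mismatch(es) in 10M random inputs\n", nDiff);
  return 0;
}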
Here it is.

#include <stdint.h>
#include <string.h>
#include <quadmath.h>

static __inline uint64_t umulx64(uint64_t mult1, uint64_t mult2, int rshift)
{
  return (uint64_t)(((unsigned __int128)mult1 * mult2) >> rshift);
}

static __inline uint64_t umulh64(uint64_t mult1, uint64_t mult2)
{
  return umulx64(mult1, mult2, 64);
}

static __inline __float128 ret__float128(uint64_t wordH, uint64_t wordL)
{
  unsigned __int128 res_u128 = ((unsigned __int128)wordH << 64) | wordL;
  __float128 result;
  memcpy(&result, &res_u128, sizeof(result)); // hopefully __int128 and __float128 have the same endianness
  return result;
}

__float128 slow_and_clean_sqrtq(__float128 src)
{
  typedef unsigned __int128 __uint128;
  const uint64_t SIGN_BIT   = (uint64_t)1 << 63;
  const uint64_t MANT_H_MSK = (uint64_t)-1 >> 16;
  const uint64_t nNAN_MSW   = (uint64_t)0xFFFF8 << 44; // -NaN

  __uint128 src_u128;
  memcpy(&src_u128, &src, sizeof(src_u128));
  uint64_t mshalf = (uint64_t)(src_u128 >> 64); // MS half
  uint64_t mantL  = (uint64_t)(src_u128);       // LS half

  // parse MS half of source
  unsigned exp   = mshalf >> 48;
  uint64_t mantH = mshalf & MANT_H_MSK;
  unsigned expx  = exp + 0x3FFF;
  if (exp - 1 >= 0x7FFF-1) { // special cases: exp==0, exp==max_exp or sign==1
    if (exp > 0x7fff) { // negative
      if (exp==0xFFFF) { // NaN or -Inf
        if ((mantH | mantL)==0)              // -Inf
          mshalf = nNAN_MSW;
        return ret__float128(mshalf, mantL); // in case of NaN leave the number intact
      }
      if (mshalf==SIGN_BIT && mantL==0)      // -0
        return ret__float128(mshalf, mantL); // in case of -0 leave the number intact
      // negative normal or sub-normal
      mantL  = 0;
      mshalf = nNAN_MSW;
      return ret__float128(mshalf, mantL);
    }
    // non-negative
    if (exp == 0x7fff)                       // NaN or +Inf
      return ret__float128(mshalf, mantL);   // in case of positive NaN or Inf leave the number intact

    // exp==0 : zero or subnormal
    if ((mantH | mantL)==0)                  // +0
      return ret__float128(mshalf, mantL);   // in case of +0 leave the number intact

    // positive subnormal: normalize
    if (mantH == 0) {
      expx -= 48;
      int lz = __builtin_clzll(mantL);
      expx -= lz;
      mantL <<= lz + 1;
      mantH = mantL >> 16;
      mantL = mantL << 48;
    } else {
      int lz = __builtin_clzll(mantH) - 16;
      expx -= lz;
      mantH = (mantH << (lz+1)) | mantL >> (63-lz);
      mantL = mantL << (lz+1);
    }
    mantH &= MANT_H_MSK;
  }

  // Normal case
  int e = expx & 1; // e = LS bit of exponent
  const uint64_t BIT_30 = (uint64_t)1 << 30;
  static const uint8_t rsqrt_tab[64] = { // floor(512/sqrt(1+(i+1)/64)) - 256
    252, 248, 244, 240, 237, 233, 230, 226,
    223, 220, 216, 213, 210, 207, 204, 201,
    199, 196, 193, 190, 188, 185, 183, 180,
    178, 175, 173, 171, 168, 166, 164, 162,
    159, 157, 155, 153, 151, 149, 147, 145,
    143, 141, 139, 138, 136, 134, 132, 131,
    129, 127, 125, 124, 122, 121, 119, 117,
    116, 114, 113, 111, 110, 108, 107, 106,
  };
  // Look up an approximate value of rsqrt(x)=1/sqrt(x)
  // where x = 1:mantH:mantL
  unsigned r0 = rsqrt_tab[mantH >> (48-6)] + 256; // scale = 2**9
  // r0 is a low estimate, r0*sqrt(x) is in range [0.99119507..0.99991213]
  // Est. precision: ~6.82 bits
  //
  // Improve precision of the estimate by a 2nd-order NR iteration with a custom polynomial
  // r1 = r0 * (P2*(r0*r0*x)**2 + P1*(r0*r0*x) + P0)
  // where
  // P2 =  0.38345164828933775
  // P1 = -1.26684607889193623
  // P0 =  1.88339443966540210
  // Calculations are implemented in a way that minimizes latency
  // and minimizes the precision required of the coefficients, as
  // ((rrx0_n + A)**2 + B)*(r0*F[e])
  // where rrx0_n = 1 - r0*r0*x
  //
  // The last multiplication serves the additional purpose of adjustment by exponent (multiplication by sqrt(0.5)).
  // Only the upper 30 bits of the mantissa are used in the evaluation. Use of a minimal number of input bits
  // simplifies validation by exhaustive testing, with minimal negative effect on worst-case precision.
  //
  uint32_t rrx0 = ((r0*r0 * ((mantH >> 18)+BIT_30+1)) >> 16)+1; // scale = 2**32
  // rrx0 is calculated with x1_u=x/2**34 + 1 instead of x1=x/2**34, in order to assure that
  // for any possible values of the 34 LS bits of x, the estimate r1 is lower than the exact value of r
  const uint32_t A = 2799880908; // scale = 2**32
  const uint32_t B = 2343892134; // scale = 2**30
  static const uint32_t f_tab[2] = { 3293824581, 2329085697 }; // F/sqrt(2**i), scale = 2**33
  uint32_t rrx0_n = 0 - rrx0;                         // scale = 2**32, nbits=27
  uint32_t termA  = rrx0_n + A;                       // scale = 2**32, nbits=32
  uint64_t termAA = ((uint64_t)termA * termA) >> 34;  // scale = 2**30, nbits=30
  uint64_t termAB = termAA + B;                       // scale = 2**30, nbits=32
  uint64_t termRF = (r0*(uint64_t)f_tab[e]) >> 9;     // scale = 2**33, nbits=32
  uint64_t r1     = (termAB*termRF) >> 33;            // scale = 2**30, nbits=30
  // r1 is a low estimate
  // r1 is in range [0x1ffffffe .. 0x3fffffe1]
  // r1*sqrt(x) is in range [0.999_999_891_641_538_03..0.999_999_999_782_706_57], i.e.
  // est. precision: ~23.13 bits
  //
  // Further improve precision of the estimate by a canonical 2nd-order NR iteration
  // r2 = r1 * (P2*(r1*r1*x)**2 + P1*(r1*r1*x) + P0)
  // where
  // P2 =  3/8
  // P1 = -5/4
  // P0 = 15/8
  // At this point precision is of high importance, so evaluation is done in a way
  // that minimizes the impact of truncation while still doing the majority of the work with
  // efficient 64-bit arithmetic.
  // The transformed equation is r2 = r1 + r1*rrx1_n*(rrx1_n*3 + 4)/8
  // where rrx1_n = 1 - r1*r1*x
  //
  uint64_t mant64 = (mantH << 16) | (mantL >> 48);         // upper 64 bits of mantissa
  uint64_t r1r1   = (r1*r1) << e;                          // scale = 2**60, nbits=60
  __uint128 rrx1_x = (__uint128)r1r1*mant64 + r1r1;        // r1*r1*(x+1), scale = 2**124
  uint64_t rrx1 = (uint64_t)(rrx1_x >> 53)+(r1r1 << 11)+1; // r1*r1*x,     scale = 2**71, nbits=64
  // rrx1 is calculated with x2_u=x+1 instead of x2=x, in order to assure that
  // for any possible values of the 48 LS bits of x, the estimates r2 and y2 are lower than the exact values of r and y
  uint64_t rrx1_n = 0 - rrx1;                       // rrx1_n=1-r1^2*x,     scale = 2**71, nbits=49
  uint64_t term1  = rrx1_n;                         // rrx1_n*(1/2),        scale = 2**72, nbits=49
  uint64_t term2  = umulh64(rrx1_n*3, rrx1_n) >> 9; // rrx1_n*rrx1_n*(3/8), scale = 2**72, nbits=27
  uint64_t deltaR = umulh64(r1 << 26, term1+term2); // (rrx1_n*(1/2)+rrx1_n*rrx1_n*(3/8))*r1, scale = 2**64, nbits=41
  uint64_t r2     = (r1 << 34)+deltaR;              // scale = 2**64, nbits=64
  // r2 is a low estimate,
  // r2 is in range [0x8000000000000000..0xffffffffffffffff]
  // r2*sqrt(x) is in range [0.999_999_999_999_999_999_888_676 .. 0.999_999_999_999_999_999_999_999_989]
  // Est. precision: ~62.96 bits
  // Worst-case errors are caused by unlucky truncation of cases where the exact result scaled by 2**64 has the form N+eps

  // Calculate y = estimate of sqrt(1+x)-1 = x*rsqrt(x)+rsqrt(x)
  uint64_t y2 = (umulh64(mant64, r2) + r2) << e;    // scale = 2**64, nbits=64
  if (r2 >= (uint64_t)(-2)) y2 = 0;
  //
  // Improve precision of y by a 1st-order NR iteration.
  // It is a canonical NR iteration y = (x + y*y)/(2*y) used
  // in the form y = y + (x - y*y)*r/2
  //
  // Calculate bits[-60:-123] of dX = (x+1)*2**e - (y+1)*(y+1)
  // All upper bits of the difference are known to be 0
  __uint128 y2y2 = (__uint128)y2 * y2;              // y*y,     scale = 2**128
  uint64_t yy2 = (uint64_t)(y2y2 >> 5)+(y2 << 60);  // y*y+2*y, scale = 2**123
  uint64_t dx  = ((mantL << e) << 11) - yy2;        // scale = 2**123
  dx -= (dx != 0); // subtract 1 in order to guarantee that the estimate is lower than the exact value
  uint64_t deltaY = umulh64(dx, r2);                // (x - y*y)*r/2, scale = 2**124

  // Round deltaY to scale = 2**112
  const uint64_t GUARD_MSK = (1 << 12)-1; // scale = 2**124
  const uint64_t GUARD_BIT = 1 << 11;     // scale = 2**124
  const uint64_t MAX_ERR   = 6;           // scale = 2**124
  uint64_t round_incr = GUARD_BIT;
  uint64_t guard_bits = deltaY & GUARD_MSK;
  if (guard_bits - (GUARD_BIT-MAX_ERR) < MAX_ERR) {
    // Guard bits are very close to the half-bit (from below).
    // The direction of the rounding should be found by additional calculations
    __uint128 midY = (((__uint128)y2 << 49)+(deltaY >> 11)) | 1 | ((__uint128)1 << 113); // scale = 2**113
    // midY resides between two representable output points
    __uint128 rndDir = midY*midY;                // scale = 2**226
    uint64_t rndDirH = (uint64_t)(rndDir >> 64); // scale = 2**162
    if ((rndDirH >> (162-113+e)) & 1) // when the guard bit of the square = 1, the square of midY < x, so we need to round up
      round_incr = GUARD_MSK;
  }
  deltaY = (deltaY + round_incr) >> 12;          // scale = 2**112
  __uint128 y = ((__uint128)y2 << 48) + deltaY;  // scale = 2**112

  // format output
  mantL = (uint64_t)(y);
  mantH = (uint64_t)(y >> 64) & MANT_H_MSK;
  uint64_t resExp = expx / 2;
  mshalf = (resExp << 48) | mantH;
  return ret__float128(mshalf, mantL);
}

Hopefully, this one is perfectly precise.
On Intel (Ivy Bridge, Haswell, Skylake) it is quite fast; on AMD (Zen3) less so. I haven't figured out yet what causes the slowdown, but even on AMD it is slow only relative to the same code on Intel. Relative to the current library implementation it is 7.4 times faster.

Also, I hope that this post can be considered as giving the code away to any takers under a BSD-like license. If it is not, then tell me how to do it right.
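For anyone who wants to reproduce the comparison, a harness along the lines of the sketch below is sufficient (only a sketch: it assumes the function above is linked in, uses libquadmath's sqrtq both as the bit-exactness reference and as the speed baseline, and feeds random mantissas with a fixed exponent; the numbers will of course vary with compiler and machine).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <time.h>
#include <quadmath.h>

__float128 slow_and_clean_sqrtq(__float128 src); // the function above

static double now_sec(void)
{
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return ts.tv_sec + ts.tv_nsec*1e-9;
}

int main(void)
{
  enum { N = 1000000 };
  static __float128 inp[N], res[N], ref[N];
  for (int i = 0; i < N; ++i) {
    uint64_t w[2]; // w[1] holds sign+exponent on little-endian targets
    w[0] = ((uint64_t)rand() << 40) ^ ((uint64_t)rand() << 20) ^ (uint64_t)rand();
    w[1] = ((uint64_t)rand() << 40) ^ ((uint64_t)rand() << 20) ^ (uint64_t)rand();
    // random mantissa, exponent fixed at 0 => x in [1,2), positive normal
    w[1] = (w[1] & (((uint64_t)1 << 48) - 1)) | ((uint64_t)0x3FFF << 48);
    memcpy(&inp[i], w, sizeof(inp[i]));
  }

  double t0 = now_sec();
  for (int i = 0; i < N; ++i) res[i] = slow_and_clean_sqrtq(inp[i]);
  double t1 = now_sec();
  for (int i = 0; i < N; ++i) ref[i] = sqrtq(inp[i]);
  double t2 = now_sec();

  int nDiff = 0;
  for (int i = 0; i < N; ++i)
    nDiff += memcmp(&res[i], &ref[i], sizeof(res[i])) != 0;

  printf("slow_and_clean_sqrtq: %.1f ns/call\n", (t1-t0)*1e9/N);
  printf("sqrtq (libquadmath) : %.1f ns/call\n", (t2-t1)*1e9/N);
  printf("mismatches          : %d\n", nDiff);
  return 0;
}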
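And in case somebody wants to regenerate or verify rsqrt_tab: the entries are a low estimate of 1/sqrt() sampled at the upper end of each 1/64-wide mantissa bin, so a few-line generator like the sketch below (standalone, not part of the routine) reproduces all 64 values.

#include <stdio.h>
#include <math.h>

int main(void)
{
  // rsqrt_tab[i] = floor(512/sqrt(1 + (i+1)/64)) - 256
  // Sampling at the upper end of each bin keeps the table value below the exact 1/sqrt().
  for (int i = 0; i < 64; i++) {
    int v = (int)floor(512.0 / sqrt(1.0 + (i + 1)/64.0)) - 256;
    printf("%3d,%s", v, (i % 8 == 7) ? "\n" : " ");
  }
  return 0;
}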