https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105101
--- Comment #9 from Michael_S <already5chosen at yahoo dot com> ---
(In reply to Michael_S from comment #4)
> If you want a quick fix for immediate shipment then you can take this:
> 
> #include <math.h>
> #include <quadmath.h>
> 
> __float128 quick_and_dirty_sqrtq(__float128 x)
> {
>   if (isnanq(x))
>     return x;
> 
>   if (x==0)
>     return x;
> 
>   if (x < 0)
>     return nanq("");
> 
>   if (isinfq(x))
>     return x;
> 
>   int xExp;
>   x = frexpq(x, &xExp);
>   if (xExp & 1)
>     x *= 2.0; // x in [0.5:2.0)
> 
>   __float128 r = (__float128)(1.0/sqrt((double)x)); // r=rsqrt(x) estimate (53 bits)
>   r *= 1.5 - r*r*x*0.5; // NR iteration improves precision of r to 105.4 bits
> 
>   __float128 y = x*r; // y=sqrt(x) estimate (105.4 bits)
> 
>   // extended-precision NR iteration
>   __float128 yH = (double)y;
>   __float128 yL = y - yH;
> 
>   __float128 deltaX = x - yH*yH;
>   deltaX -= yH*yL*2;
>   deltaX -= yL*yL;
>   y += deltaX*r*0.5; // improve precision of y to ~210.2 bits. Not enough for perfect rounding, but not too bad
> 
>   return ldexpq(y, xExp >> 1);
> }
> 
> It is very slow, even slower than what you have now, which by itself is quite astonishingly slow.
> It is also not sufficiently precise for correct rounding in all cases.
> But, at least, the worst error is something like (0.5+2**-98) ULP, so you are unlikely to ever be caught by black-box testing.
> Its biggest advantage is extreme portability.
> It should run on all platforms where double == IEEE binary64 and __float128 == IEEE binary128.
> 
> Maybe a few days later I'll have a better variant for "good" 64-bit platforms, i.e. those where we have __int128.
> It would be 15-25 times faster than the variant above, and the rounding would be mathematically correct rather than just "impossible to catch" like above.
> But it would not run everywhere.
> Also, I want to give it away under an MIT or BSD license, rather than under the GPL.
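To illustrate what I mean by black-box testing: a harness like the sketch below feeds random bit patterns into both routines and compares the results bit for bit. (Only a sketch: it assumes quick_and_dirty_sqrtq above is compiled in the same program, uses libquadmath's sqrtq as the reference, and assumes the usual little-endian layout of __float128.) With a worst-case error of about (0.5+2**-98) ULP it is very unlikely to ever report a mismatch.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <quadmath.h>

__float128 quick_and_dirty_sqrtq(__float128 x); // the routine quoted above

int main(void)
{
  unsigned long nDiff = 0;
  for (long i = 0; i < 10000000; ++i) {
    // build a random positive, finite __float128 from two random 64-bit halves
    uint64_t w[2]; // w[1] holds sign+exponent on little-endian targets
    w[0] = ((uint64_t)rand() << 40) ^ ((uint64_t)rand() << 20) ^ (uint64_t)rand();
    w[1] = ((uint64_t)rand() << 40) ^ ((uint64_t)rand() << 20) ^ (uint64_t)rand();
    w[1] &= ~((uint64_t)1 << 63);            // clear the sign bit
    if (((w[1] >> 48) & 0x7FFF) == 0x7FFF)   // skip Inf/NaN encodings
      continue;
    __float128 x;
    memcpy(&x, w, sizeof(x));

    __float128 ref = sqrtq(x);               // libquadmath reference
    __float128 res = quick_and_dirty_sqrtq(x);
    if (memcmp(&ref, &res, sizeof(ref)) != 0) {
      char bx[64], br[64], bq[64];
      quadmath_snprintf(bx, sizeof(bx), "%.36Qe", x);
      quadmath_snprintf(br, sizeof(br), "%.36Qe", ref);
      quadmath_snprintf(bq, sizeof(bq), "%.36Qe", res);
      printf("mismatch: x=%s ref=%s got=%s\n", bx, br, bq);
      nDiff++;
    }
  }
  printf("%lu mismatch(es) in 10M random inputs\n", nDiff);
  return 0;
}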
Here it is.

#include <stdint.h>
#include <string.h>
#include <quadmath.h>

static __inline uint64_t umulx64(uint64_t mult1, uint64_t mult2, int rshift)
{
  return (uint64_t)(((unsigned __int128)mult1 * mult2) >> rshift);
}

static __inline uint64_t umulh64(uint64_t mult1, uint64_t mult2)
{
  return umulx64(mult1, mult2, 64);
}

static __inline __float128 ret__float128(uint64_t wordH, uint64_t wordL)
{
  unsigned __int128 res_u128 = ((unsigned __int128)wordH << 64) | wordL;
  __float128 result;
  memcpy(&result, &res_u128, sizeof(result)); // hopefully __int128 and __float128 have the same endianness
  return result;
}

__float128 slow_and_clean_sqrtq(__float128 src)
{
  typedef unsigned __int128 __uint128;
  const uint64_t SIGN_BIT   = (uint64_t)1 << 63;
  const uint64_t MANT_H_MSK = (uint64_t)-1 >> 16;
  const uint64_t nNAN_MSW   = (uint64_t)0xFFFF8 << 44; // -NaN

  __uint128 src_u128;
  memcpy(&src_u128, &src, sizeof(src_u128));
  uint64_t mshalf = (uint64_t)(src_u128 >> 64); // MS half
  uint64_t mantL  = (uint64_t)(src_u128);       // LS half

  // parse MS half of source
  unsigned exp   = mshalf >> 48;
  uint64_t mantH = mshalf & MANT_H_MSK;
  unsigned expx  = exp + 0x3FFF;
  if (exp - 1 >= 0x7FFF-1) { // special cases: exp==0, exp==max_exp or sign==1
    if (exp > 0x7fff) { // negative
      if (exp==0xFFFF) { // NaN or -Inf
        if ((mantH | mantL)==0)              // -Inf
          mshalf = nNAN_MSW;
        return ret__float128(mshalf, mantL); // in case of NaN leave the number intact
      }
      if (mshalf==SIGN_BIT && mantL==0)      // -0
        return ret__float128(mshalf, mantL); // in case of -0 leave the number intact
      // negative normal or sub-normal
      mantL  = 0;
      mshalf = nNAN_MSW;
      return ret__float128(mshalf, mantL);
    }
    // non-negative
    if (exp == 0x7fff)                       // NaN or +Inf
      return ret__float128(mshalf, mantL);   // in case of positive NaN or Inf leave the number intact

    // exp==0 : zero or subnormal
    if ((mantH | mantL)==0)                  // +0
      return ret__float128(mshalf, mantL);   // in case of +0 leave the number intact

    // positive subnormal: normalize
    if (mantH == 0) {
      expx -= 48;
      int lz = __builtin_clzll(mantL);
      expx -= lz;
      mantL <<= lz + 1;
      mantH = mantL >> 16;
      mantL = mantL << 48;
    } else {
      int lz = __builtin_clzll(mantH) - 16;
      expx -= lz;
      mantH = (mantH << (lz+1)) | mantL >> (63-lz);
      mantL = mantL << (lz+1);
    }
    mantH &= MANT_H_MSK;
  }

  // Normal case
  int e = expx & 1; // e = LS bit of exponent
  const uint64_t BIT_30 = (uint64_t)1 << 30;
  static const uint8_t rsqrt_tab[64] = { // floor(512/sqrt(1+(i+1)/64)) - 256
    252, 248, 244, 240, 237, 233, 230, 226,
    223, 220, 216, 213, 210, 207, 204, 201,
    199, 196, 193, 190, 188, 185, 183, 180,
    178, 175, 173, 171, 168, 166, 164, 162,
    159, 157, 155, 153, 151, 149, 147, 145,
    143, 141, 139, 138, 136, 134, 132, 131,
    129, 127, 125, 124, 122, 121, 119, 117,
    116, 114, 113, 111, 110, 108, 107, 106,
  };
  // Look up an approximate value of rsqrt(x)=1/sqrt(x)
  // where x = 1:mantH:mantL
  unsigned r0 = rsqrt_tab[mantH >> (48-6)] + 256; // scale = 2**9
  // r0 is a low estimate, r0*sqrt(x) is in range [0.99119507..0.99991213]
  // Est. precision: ~6.82 bits
  //
  // Improve precision of the estimate by a 2nd-order NR iteration with a custom polynomial
  // r1 = r0 * (P2*(r0*r0*x)**2 + P1*(r0*r0*x) + P0)
  // where
  // P2 =  0.38345164828933775
  // P1 = -1.26684607889193623
  // P0 =  1.88339443966540210
  // Calculations are implemented in a way that minimizes latency
  // and minimizes the precision required of the coefficients, as
  // ((rrx0_n + A)**2 + B)*(r0*F[e])
  // where rrx0_n = 1 - r0*r0*x
  //
  // The last multiplication serves the additional purpose of adjustment by exponent (multiplication by sqrt(0.5)).
  // Only the upper 30 bits of the mantissa are used in the evaluation. Use of a minimal number of input bits
  // simplifies validation by exhaustive testing, with minimal negative effect on worst-case precision.
  //
  uint32_t rrx0 = ((r0*r0 * ((mantH >> 18)+BIT_30+1)) >> 16)+1; // scale = 2**32
  // rrx0 is calculated with x1_u=x/2**34 + 1 instead of x1=x/2**34, in order to assure that
  // for any possible values of the 34 LS bits of x, the estimate r1 is lower than the exact value of r
  const uint32_t A = 2799880908; // scale = 2**32
  const uint32_t B = 2343892134; // scale = 2**30
  static const uint32_t f_tab[2] = { 3293824581, 2329085697 }; // F/sqrt(2**i), scale = 2**33
  uint32_t rrx0_n = 0 - rrx0;                         // scale = 2**32, nbits=27
  uint32_t termA  = rrx0_n + A;                       // scale = 2**32, nbits=32
  uint64_t termAA = ((uint64_t)termA * termA) >> 34;  // scale = 2**30, nbits=30
  uint64_t termAB = termAA + B;                       // scale = 2**30, nbits=32
  uint64_t termRF = (r0*(uint64_t)f_tab[e]) >> 9;     // scale = 2**33, nbits=32
  uint64_t r1     = (termAB*termRF) >> 33;            // scale = 2**30, nbits=30
  // r1 is a low estimate
  // r1 is in range [0x1ffffffe .. 0x3fffffe1]
  // r1*sqrt(x) is in range [0.999_999_891_641_538_03..0.999_999_999_782_706_57], i.e.
  // est. precision: ~23.13 bits
  //
  // Further improve precision of the estimate by a canonical 2nd-order NR iteration
  // r2 = r1 * (P2*(r1*r1*x)**2 + P1*(r1*r1*x) + P0)
  // where
  // P2 =  3/8
  // P1 = -5/4
  // P0 = 15/8
  // At this point precision is of high importance, so evaluation is done in a way
  // that minimizes the impact of truncation while still doing the majority of the work with
  // efficient 64-bit arithmetic.
  // The transformed equation is r2 = r1 + r1*rrx1_n*(rrx1_n*3 + 4)/8
  // where rrx1_n = 1 - r1*r1*x
  //
  uint64_t mant64 = (mantH << 16) | (mantL >> 48);         // upper 64 bits of mantissa
  uint64_t r1r1   = (r1*r1) << e;                          // scale = 2**60, nbits=60
  __uint128 rrx1_x = (__uint128)r1r1*mant64 + r1r1;        // r1*r1*(x+1), scale = 2**124
  uint64_t rrx1 = (uint64_t)(rrx1_x >> 53)+(r1r1 << 11)+1; // r1*r1*x,     scale = 2**71, nbits=64
  // rrx1 is calculated with x2_u=x+1 instead of x2=x, in order to assure that
  // for any possible values of the 48 LS bits of x, the estimates r2 and y2 are lower than the exact values of r and y
  uint64_t rrx1_n = 0 - rrx1;                       // rrx1_n=1-r1^2*x,     scale = 2**71, nbits=49
  uint64_t term1  = rrx1_n;                         // rrx1_n*(1/2),        scale = 2**72, nbits=49
  uint64_t term2  = umulh64(rrx1_n*3, rrx1_n) >> 9; // rrx1_n*rrx1_n*(3/8), scale = 2**72, nbits=27
  uint64_t deltaR = umulh64(r1 << 26, term1+term2); // (rrx1_n*(1/2)+rrx1_n*rrx1_n*(3/8))*r1, scale = 2**64, nbits=41
  uint64_t r2     = (r1 << 34)+deltaR;              // scale = 2**64, nbits=64
  // r2 is a low estimate,
  // r2 is in range [0x8000000000000000..0xffffffffffffffff]
  // r2*sqrt(x) is in range [0.999_999_999_999_999_999_888_676 .. 0.999_999_999_999_999_999_999_999_989]
  // Est. precision: ~62.96 bits
  // Worst-case errors are caused by unlucky truncation of cases where the exact result scaled by 2**64 has the form N+eps

  // Calculate y = estimate of sqrt(1+x)-1 = x*rsqrt(x)+rsqrt(x)
  uint64_t y2 = (umulh64(mant64, r2) + r2) << e;    // scale = 2**64, nbits=64
  if (r2 >= (uint64_t)(-2)) y2 = 0;
  //
  // Improve precision of y by a 1st-order NR iteration.
  // It is a canonical NR iteration y = (x + y*y)/(2*y) used
  // in the form y = y + (x - y*y)*r/2
  //
  // Calculate bits[-60:-123] of dX = (x+1)*2**e - (y+1)*(y+1)
  // All upper bits of the difference are known to be 0
  __uint128 y2y2 = (__uint128)y2 * y2;              // y*y,     scale = 2**128
  uint64_t yy2 = (uint64_t)(y2y2 >> 5)+(y2 << 60);  // y*y+2*y, scale = 2**123
  uint64_t dx  = ((mantL << e) << 11) - yy2;        // scale = 2**123
  dx -= (dx != 0); // subtract 1 in order to guarantee that the estimate is lower than the exact value
  uint64_t deltaY = umulh64(dx, r2);                // (x - y*y)*r/2, scale = 2**124

  // Round deltaY to scale = 2**112
  const uint64_t GUARD_MSK = (1 << 12)-1; // scale = 2**124
  const uint64_t GUARD_BIT = 1 << 11;     // scale = 2**124
  const uint64_t MAX_ERR   = 6;           // scale = 2**124
  uint64_t round_incr = GUARD_BIT;
  uint64_t guard_bits = deltaY & GUARD_MSK;
  if (guard_bits - (GUARD_BIT-MAX_ERR) < MAX_ERR) {
    // Guard bits are very close to the half-bit (from below).
    // The direction of the rounding should be found by additional calculations
    __uint128 midY = (((__uint128)y2 << 49)+(deltaY >> 11)) | 1 | ((__uint128)1 << 113); // scale = 2**113
    // midY resides between two representable output points
    __uint128 rndDir = midY*midY;                // scale = 2**226
    uint64_t rndDirH = (uint64_t)(rndDir >> 64); // scale = 2**162
    if ((rndDirH >> (162-113+e)) & 1) // when the guard bit of the square = 1, the square of midY < x, so we need to round up
      round_incr = GUARD_MSK;
  }
  deltaY = (deltaY + round_incr) >> 12;          // scale = 2**112
  __uint128 y = ((__uint128)y2 << 48) + deltaY;  // scale = 2**112

  // format output
  mantL = (uint64_t)(y);
  mantH = (uint64_t)(y >> 64) & MANT_H_MSK;
  uint64_t resExp = expx / 2;
  mshalf = (resExp << 48) | mantH;
  return ret__float128(mshalf, mantL);
}

Hopefully, this one is perfectly precise.
On Intel (Ivy Bridge, Haswell, Skylake) it is quite fast; on AMD (Zen3) less so. I haven't figured out yet what causes the slowdown, but even on AMD it is slow only relative to the same code on Intel. Relative to the current library implementation it is 7.4 times faster.

Also, I hope that this post can be considered as giving the code away to any takers under a BSD-like license. If it is not, then tell me how to do it right.
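For anyone who wants to reproduce the comparison, a harness along the lines of the sketch below is sufficient (only a sketch: it assumes the function above is linked in, uses libquadmath's sqrtq both as the bit-exactness reference and as the speed baseline, and feeds random mantissas with a fixed exponent; the numbers will of course vary with compiler and machine).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <time.h>
#include <quadmath.h>

__float128 slow_and_clean_sqrtq(__float128 src); // the function above

static double now_sec(void)
{
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return ts.tv_sec + ts.tv_nsec*1e-9;
}

int main(void)
{
  enum { N = 1000000 };
  static __float128 inp[N], res[N], ref[N];
  for (int i = 0; i < N; ++i) {
    uint64_t w[2]; // w[1] holds sign+exponent on little-endian targets
    w[0] = ((uint64_t)rand() << 40) ^ ((uint64_t)rand() << 20) ^ (uint64_t)rand();
    w[1] = ((uint64_t)rand() << 40) ^ ((uint64_t)rand() << 20) ^ (uint64_t)rand();
    // random mantissa, exponent fixed at 0 => x in [1,2), positive normal
    w[1] = (w[1] & (((uint64_t)1 << 48) - 1)) | ((uint64_t)0x3FFF << 48);
    memcpy(&inp[i], w, sizeof(inp[i]));
  }

  double t0 = now_sec();
  for (int i = 0; i < N; ++i) res[i] = slow_and_clean_sqrtq(inp[i]);
  double t1 = now_sec();
  for (int i = 0; i < N; ++i) ref[i] = sqrtq(inp[i]);
  double t2 = now_sec();

  int nDiff = 0;
  for (int i = 0; i < N; ++i)
    nDiff += memcmp(&res[i], &ref[i], sizeof(res[i])) != 0;

  printf("slow_and_clean_sqrtq: %.1f ns/call\n", (t1-t0)*1e9/N);
  printf("sqrtq (libquadmath) : %.1f ns/call\n", (t2-t1)*1e9/N);
  printf("mismatches          : %d\n", nDiff);
  return 0;
}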
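And in case somebody wants to regenerate or verify rsqrt_tab: the entries are a low estimate of 1/sqrt() sampled at the upper end of each 1/64-wide mantissa bin, so a few-line generator like the sketch below (standalone, not part of the routine) reproduces all 64 values.

#include <stdio.h>
#include <math.h>

int main(void)
{
  // rsqrt_tab[i] = floor(512/sqrt(1 + (i+1)/64)) - 256
  // Sampling at the upper end of each bin keeps the table value below the exact 1/sqrt().
  for (int i = 0; i < 64; i++) {
    int v = (int)floor(512.0 / sqrt(1.0 + (i + 1)/64.0)) - 256;
    printf("%3d,%s", v, (i % 8 == 7) ? "\n" : " ");
  }
  return 0;
}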