https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105101

--- Comment #13 from Michael_S <already5chosen at yahoo dot com> ---
It turned out that on all micro-architectures that I care about (and the majority of those that I don't) double-precision floating-point division is quite fast.
It's so fast that it easily beats my clever reciprocal estimator.
Which means that this simple code is faster than the complicated code above.
Not by much, but faster. And much simpler.

#include <stdint.h>
#include <string.h>
#include <quadmath.h>

static __inline uint64_t umulx64(uint64_t mult1, uint64_t mult2, int rshift)
{
  return (uint64_t)(((unsigned __int128)mult1 * mult2)>>rshift);
}

static __inline uint64_t umulh64(uint64_t mult1, uint64_t mult2)
{
  return umulx64(mult1, mult2, 64);
}

static __inline void set__float128(__float128* dst, uint64_t wordH, uint64_t wordL) {
  unsigned __int128 res_u128 = ((unsigned __int128)wordH << 64) | wordL;
  memcpy(dst, &res_u128, sizeof(*dst)); // hopefully __int128 and __float128 have the same endianness
}

// return number of leading zeros
static __inline int normalize_float128(uint64_t* pMantH, uint64_t* pMantL) {
  const uint64_t MANT_H_MSK   = (uint64_t)-1 >> 16;
  uint64_t mantH = *pMantH, mantL = *pMantL;
  int lz;
  if (mantH == 0) {
    lz = __builtin_clzll(mantL);
    mantL <<= lz + 1; // shift out all leading zeroes and first leading 1
    mantH = mantL >> 16;
    mantL = mantL << 48;
    lz += 48;
  } else {
    lz = __builtin_clzll(mantH)-16;
    mantH = (mantH << (lz+1)) | mantL >> (63-lz);
    mantL = mantL << (lz + 1);
  }
  mantH &= MANT_H_MSK;
  *pMantH = mantH;
  *pMantL = mantL;
  return lz;
}

static __inline uint64_t d2u(double x) {
  uint64_t y;
  memcpy(&y, &x, sizeof(y));
  return y;
}

static __inline double u2d(uint64_t x) {
  double y;
  memcpy(&y, &x, sizeof(y));
  return y;
}

// calc_reciprocal
// calculate lower estimate for r = floor(2**64/(1 + mantHi*2**(-48) + mantLo*2**(-112)))
static __inline uint64_t calc_reciprocal(uint64_t mantHi, uint64_t mantLo)
{
  const uint64_t DBL_1PLUS = ((uint64_t)0x3FF << 52) + 18;
  uint64_t mant_u = (mantHi << 4) + DBL_1PLUS;
  uint64_t r1 = d2u(2.0/u2d(mant_u)) << 11; // scale = 2**64, nbits=64
  // r1 is low estimate
  // r1 is in range   [0x7fffffffffffe000..0xfffffffffffee000 ]
  // r1*x is in range [0.99999999999999595..0.999999999999999889]
  // Est. Precisions: ~47.81 bits

  //
  // Improve precision of the estimate by canonical 1st-order NR iteration
  // r = r1*(2-r1*x)
  // At this point precision is of high importance, so evaluation is done in a way
  // that minimizes impact of truncation.
  // The transformed equation is r = r1 + r1*(1-r1*x)
  //
  uint64_t mant64 = (mantHi << 16)|(mantLo >> 48); // Upper 64 bits of mantissa, not including leading 1
  uint64_t rx1    = umulh64(r1,mant64) + r1 + 2;   // r1*(2**64+x+1), scale = 2**64; nbits=64
  // rx1 calculated with x2_u=x+1 instead of x2=x, in order to assure that
  // for any possible values of 48 LS bits of x, the estimate r2 is lower than exact values of r
  uint64_t r = r1 + umulh64(0-rx1, r1);            // scale = 2**64, nbits=64
  // r is low estimate
  // r is in range [0x7fffffffffffffff..0xfffffffffffffffd]
  // r*x is in range [0.999_999_999_999_999_999_783..0.999_999_999_999_999_999_999_957]
  // Est. Precisions: ~62.00069 bits
  // max(r*x-1) ~= 3.998079 * 2**-64
  return r;
}

void my__divtf3(__float128* result, const __float128* num, const __float128* den)
{
  typedef unsigned __int128 __uint128;
  __uint128 u_num, u_den;
  memcpy(&u_num, num, sizeof(u_num));
  memcpy(&u_den, den, sizeof(u_den));

  const uint64_t BIT_48       = (uint64_t)1 << 48;
  const uint64_t BIT_63       = (uint64_t)1 << 63;

  const uint64_t SIGN_BIT     = BIT_63;
  const uint64_t SIGN_OFF_MSK = ~SIGN_BIT;
  const uint64_t MANT_H_MSK   = (uint64_t)-1 >> 16;
  const uint64_t EXP_MSK      = (uint64_t)0x7FFF  << 48;
  const uint64_t pNAN_MSW     = (uint64_t)0x7FFF8 << 44;    // +NaN
  const uint64_t pINF_MSW     = EXP_MSK;                    // +Inf
  // const uint64_t nNAN_MSW     = (uint64_t)0xFFFF8 << 44; // -NaN

  // parse num and handle special cases
  uint64_t numHi = (uint64_t)(u_num >> 64);
  uint64_t numLo = (uint64_t)u_num;
  uint64_t denHi = (uint64_t)(u_den >> 64);
  uint64_t denLo = (uint64_t)u_den;
  unsigned numSexp = numHi >> 48;
  unsigned numExp  = numSexp & 0x7FFF;
  int      inumExp = numExp;
  uint64_t numMantHi = numHi & MANT_H_MSK;
  uint64_t resSign = (numHi ^ denHi) & SIGN_BIT;
  if (numExp - 1 >= 0x7FFF - 1) { // special cases
    if (numExp == 0x7FFF) { // inf or Nan
       if ((numMantHi | numLo)==0) { // inf
         numHi = resSign | pINF_MSW; // Inf with proper sign
         if ((denLo == 0) && ((denHi & SIGN_OFF_MSK)==pINF_MSW)) { // den inf
           numHi = pNAN_MSW; // Inf/Inf => NaN
         }
       }
       set__float128(result, numHi, numLo);
       return;
    }

    // numExp == 0 - zero or sub-normal
    if (((numHi & SIGN_OFF_MSK) | numLo)==0) { // zero
      if ((denHi & EXP_MSK)==pINF_MSW) { // den = inf or Nan
        if ((denLo == 0) && ((denHi & SIGN_OFF_MSK)==pINF_MSW)) { // den= inf
          numHi = resSign;  // return zero with proper sign
        } else { // den= Nan
          // return den
          numLo = denLo;
          numHi = denHi;
        }
      }
      if (((denHi & SIGN_OFF_MSK) | denLo)==0) { // den is zero
        numHi = pNAN_MSW; // 0/0 => NaN
      }
      set__float128(result, numHi, numLo);
      return;
    }

    // num is sub-normal: normalize
    inumExp -= normalize_float128(&numMantHi, &numLo);
  }

  //
  // num is normal or normalized
  //

  // parse den and handle special cases
  unsigned denSexp   = denHi >> 48;
  unsigned denExp    = denSexp & 0x7FFF;
  int      idenExp   = denExp;
  uint64_t denMantHi = denHi & MANT_H_MSK;
  if (denExp - 1 >= 0x7FFF - 1) { // special cases
    if (denExp == 0x7FFF) { // inf or Nan
       if ((denMantHi | denLo)==0) { // den is inf
         // return zero with proper sign
         denHi = resSign;
         denLo = 0;
       }
       // if den==Nan then return den
       set__float128(result, denHi, denLo);
       return;
    }

    // denExp == 0 - zero or sub-normal
    if (((denHi & SIGN_OFF_MSK) | denLo)==0) { // den is zero
      // return Inf with proper sign
      numHi = resSign | pINF_MSW;
      numLo = 0;
      set__float128(result, numHi, numLo);
      return;
    }

    // den is sub-normal: normalize
    idenExp -= normalize_float128(&denMantHi, &denLo);
  }

  //
  // both num and den are normal or normalized
  //

  // calculate exponent of result
  int expDecr = (denMantHi > numMantHi) | ((denMantHi == numMantHi) & (denLo > numLo)); // mant(den) > mant(num)
  int iResExp = inumExp - idenExp - expDecr + 0x3FFF;
  if (iResExp >= 0x7FFF) { // Overflow
    // return Inf with proper sign
    set__float128(result, resSign | pINF_MSW, 0);
    return;
  }
  if (iResExp < -112) { // underflow
    // return Zero with proper sign
    set__float128(result, resSign, 0);
    return;
  }

  // calculate 64-bit approximation from below for 1/mantissa of den
  uint64_t r = calc_reciprocal(denMantHi, denLo);
  // r is low estimate
  // r is in range [0x7fffffffffffffff..0xfffffffffffffffd]
  // r*x is in range [0.999_999_999_999_999_999_783..0.999_999_999_999_999_999_999_957]
  // Est. Precisions: ~62.00069 bits
  // max(r*x-1) ~= 3.998079 * 2**-64

  uint64_t numMsw = ((numMantHi << 15) | (numLo >> (64-15))) + BIT_63; // scale = 2**63
  uint64_t res1 = umulh64(numMsw, r) << expDecr;                       // scale = 2**63

  __uint128 denx = ((__uint128)(denMantHi | BIT_48) << 64) | denLo;    // scale = 2**112
  uint64_t prod1 = (uint64_t)((res1*(denx << 11)) >> 64)+1;            // scale = 2**122
  uint64_t num122Lo = numLo << (10+expDecr);                           // scale = 2**122
  uint64_t deltaProd= num122Lo - prod1;                                // scale = 2**122
  uint64_t deltaRes = umulh64(deltaProd, r);                           // scale = 2**122

  // Round deltaRes to scale = 2**112
  const unsigned N_GUARD_BITS = 10;
  const unsigned MAX_ERR      = 3;                     // scale 2**122
  __uint128 res2x;
  if (iResExp > 0) { // normal result
    const unsigned GUARD_BIT  = 1 << (N_GUARD_BITS-1); // scale 2**122
    const unsigned GUARD_MSK  = GUARD_BIT*2-1;         // scale 2**122
    uint64_t round_incr = GUARD_BIT;
    unsigned guard_bits = deltaRes & GUARD_MSK;
    if (guard_bits - (GUARD_BIT-MAX_ERR) < MAX_ERR) {
      // Guard bits are very close to the half-ULP (from below)
      // Direction of the rounding should be found by additional calculations
      __uint128 midRes = (((__uint128)res1 << 50)+(deltaRes >> 9)) | 1; // scale 2**113
      // midRes resides between two representable output points
      __uint128 midProduct = midRes*denx;                  // scale 2**225
      uint64_t midProductH = (uint64_t)(midProduct >> 64); // scale 2**161
      if ((midProductH >> (161-113+expDecr)) & 1)
        round_incr = GUARD_MSK; // when guard bit of the product = 1, it means that the product midRes*den < num, so we need to round up
    }
    deltaRes = (deltaRes + round_incr) >> 10;       // scale = 2**112
    res2x    = ((__uint128)res1 << 49) + deltaRes;  // scale = 2**112
  } else { // sub-normal result
    int shift = 1 - iResExp; // range [1:113]
    const __uint128 GUARD_BIT = (__uint128)1 << (shift+N_GUARD_BITS-1); // scale = 2**122
    const __uint128 GUARD_MSK = GUARD_BIT*2 - 1;                        // scale = 2**122
    __uint128 resx = ((__uint128)res1 << 59) + deltaRes;                // scale = 2**122
    __uint128 guard_bits = resx & GUARD_MSK;
    resx >>= (shift+N_GUARD_BITS-2);                                    // scale = 2**(114-shift)
    unsigned round_incr = 2;
    if (guard_bits - (GUARD_BIT-MAX_ERR) < MAX_ERR) {
      // Guard bits are very close to the half-ULP (from below)
      // Direction of the rounding should be found by additional calculations
      __uint128 midRes = resx ^ 3;             // scale 2**(114-shift), 2 LS bits changed from '01' to '10'
      // midRes resides between two representable output points
      __uint128 midProduct = midRes*denx;      // scale 2**(226-shift)
      midProduct -= ((unsigned)resx >> 2) & 1; // Decrement the product when LS bit of the non-rounded result == '1'
                                               // That's a dirty trick that breaks ties toward nearest even.
                                               // Only necessary in the subnormal case; for normal results a tie is impossible.
      int guardPos = 226-113+expDecr-shift;
      uint64_t midProductGuardBit = (uint64_t)(midProduct >> guardPos) & 1;
      // When a guard bit of the product = 1, it means that the product midRes*den < num
      // Which means that we have to round up
      round_incr += midProductGuardBit;
    }
    res2x = (resx + round_incr) >> 2;
    iResExp = 0;
  }

  numMantHi = (res2x >> 64) & MANT_H_MSK;
  numLo     = res2x;
  numHi     = ((uint64_t)(iResExp) << 48) | numMantHi | resSign;

  set__float128(result, numHi, numLo);
}
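
A quick way to sanity-check the claim above: append a small driver like the sketch below to the listing (rand_f128 and the iteration count are just for illustration), compile with gcc -O2 -lquadmath, and compare each quotient bit-for-bit against what the compiler produces through libgcc's __divtf3. Exponents are kept near the bias so quotients stay in the normal range; the subnormal and special-case paths would need separate tests.

#include <stdio.h>
#include <stdlib.h>
// stdint.h, string.h and quadmath.h are already included at the top of the listing

// Illustrative only: build a random finite __float128 with an exponent near
// the bias, so that a/b never overflows or goes subnormal in this sketch.
static __float128 rand_f128(void)
{
  uint64_t hi = ((uint64_t)rand() << 33) ^ ((uint64_t)rand() << 11) ^ (uint64_t)rand();
  uint64_t lo = ((uint64_t)rand() << 33) ^ ((uint64_t)rand() << 11) ^ (uint64_t)rand();
  uint64_t sign = (uint64_t)(rand() & 1) << 63;
  uint64_t exp  = (uint64_t)(0x3FFF - 100 + rand() % 200) << 48;
  hi = sign | exp | (hi & ((uint64_t)-1 >> 16));
  unsigned __int128 u = ((unsigned __int128)hi << 64) | lo;
  __float128 x;
  memcpy(&x, &u, sizeof(x));
  return x;
}

int main(void)
{
  for (long i = 0; i < 10000000; ++i) {
    __float128 a = rand_f128(), b = rand_f128();
    __float128 ref = a / b;          // compiler emits a call to libgcc __divtf3
    __float128 res;
    my__divtf3(&res, &a, &b);
    if (memcmp(&res, &ref, sizeof(res)) != 0) {
      char sa[64], sb[64];
      quadmath_snprintf(sa, sizeof(sa), "%.28Qe", a);
      quadmath_snprintf(sb, sizeof(sb), "%.28Qe", b);
      printf("mismatch: %s / %s\n", sa, sb);
      return 1;
    }
  }
  printf("all quotients match\n");
  return 0;
}

Dropping the memcmp and timing each of the two divisions in its own loop gives a rough throughput comparison between my__divtf3 and the library routine.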
