https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105101
--- Comment #12 from Michael_S <already5chosen at yahoo dot com> ---
(In reply to Michael_S from comment #11)
> (In reply to Michael_S from comment #10)
> > BTW, the same ideas as in the code above could improve speed of division
> > operation (on modern 64-bit HW) by factor of 3 (on Intel) or 2 (on AMD).
>
> Did it.
> On Intel it's even better than anticipated. 5x speedup on Haswell and
> Skylake, 4.5x on Ivy Bridge.
> Unfortunately, right now I have no connection to my Zen3 test system, so
> can't measure on it with final variant. But according to preliminary tests
> the speedup should be slightly better than 2x.

O.k. It seems I have now fixed all details of the rounding, including the
cases of sub-normal results. Even ties now appear to be broken to even, as
prescribed. It could take a bit more polish, especially the comments, but even
in this form it is much better than what you have now. A minimal self-check
sketch against the compiler's built-in __float128 division is included after
the code.

#include <stdint.h>
#include <string.h>
#include <quadmath.h>

static __inline uint64_t umulx64(uint64_t mult1, uint64_t mult2, int rshift)
{
  return (uint64_t)(((unsigned __int128)mult1 * mult2) >> rshift);
}

static __inline uint64_t umulh64(uint64_t mult1, uint64_t mult2)
{
  return umulx64(mult1, mult2, 64);
}

static __inline void set__float128(__float128* dst, uint64_t wordH, uint64_t wordL)
{
  unsigned __int128 res_u128 = ((unsigned __int128)wordH << 64) | wordL;
  memcpy(dst, &res_u128, sizeof(*dst)); // hopefully __int128 and __float128 have the same endianness
}

// normalize a sub-normal mantissa; return the number of leading zeros
static __inline int normalize_float128(uint64_t* pMantH, uint64_t* pMantL)
{
  const uint64_t MANT_H_MSK = (uint64_t)-1 >> 16;
  uint64_t mantH = *pMantH, mantL = *pMantL;
  int lz;
  if (mantH == 0) {
    lz = __builtin_clzll(mantL);
    mantL <<= lz + 1; // shift out all leading zeroes and the first leading 1
    mantH = mantL >> 16;
    mantL = mantL << 48;
    lz += 48;
  } else {
    lz = __builtin_clzll(mantH) - 16;
    mantH = (mantH << (lz+1)) | mantL >> (63-lz);
    mantL = mantL << (lz + 1);
  }
  mantH &= MANT_H_MSK;
  *pMantH = mantH;
  *pMantL = mantL;
  return lz;
}

void my__divtf3(__float128* result, const __float128* num, const __float128* den)
{
  typedef unsigned __int128 __uint128;
  __uint128 u_num, u_den;
  memcpy(&u_num, num, sizeof(u_num));
  memcpy(&u_den, den, sizeof(u_den));

  const uint64_t BIT_30 = (uint64_t)1 << 30;
  const uint64_t BIT_48 = (uint64_t)1 << 48;
  const uint64_t BIT_63 = (uint64_t)1 << 63;
  const uint64_t SIGN_BIT     = BIT_63;
  const uint64_t SIGN_OFF_MSK = ~SIGN_BIT;
  const uint64_t MANT_H_MSK   = (uint64_t)-1 >> 16;
  const uint64_t EXP_MSK  = (uint64_t)0x7FFF << 48;
  const uint64_t pNAN_MSW = (uint64_t)0x7FFF8 << 44; // +NaN
  const uint64_t pINF_MSW = EXP_MSK;                 // +Inf

  // parse num and handle special cases
  uint64_t numHi = (uint64_t)(u_num >> 64);
  uint64_t numLo = (uint64_t)u_num;
  uint64_t denHi = (uint64_t)(u_den >> 64);
  uint64_t denLo = (uint64_t)u_den;
  unsigned numSexp = numHi >> 48;
  unsigned numExp  = numSexp & 0x7FFF;
  int inumExp = numExp;
  uint64_t numMantHi = numHi & MANT_H_MSK;
  uint64_t resSign = (numHi ^ denHi) & SIGN_BIT;
  if (numExp - 1 >= 0x7FFF - 1) { // special cases: numExp==0 or numExp==0x7FFF (relies on unsigned wrap-around)
    if (numExp == 0x7FFF) { // inf or NaN
      if ((numMantHi | numLo)==0) { // inf
        numHi = resSign | pINF_MSW; // Inf with proper sign
        if ((denLo == 0) && ((denHi & SIGN_OFF_MSK)==pINF_MSW)) { // den is inf
          numHi = pNAN_MSW; // Inf/Inf => NaN
        }
      }
      set__float128(result, numHi, numLo);
      return;
    }
    // numExp == 0 -> zero or sub-normal
    if (((numHi & SIGN_OFF_MSK) | numLo)==0) { // zero
      if ((denHi & EXP_MSK)==pINF_MSW) { // den = inf or NaN
        if ((denLo == 0) && ((denHi & SIGN_OFF_MSK)==pINF_MSW)) { // den is inf
          numHi = resSign; // return zero with proper sign
        } else { // den is NaN
          // return den
          numLo = denLo;
          numHi = denHi;
        }
      }
      if (((denHi & SIGN_OFF_MSK) | denLo)==0) { // den is zero
        numHi = pNAN_MSW; // 0/0 => NaN
      }
      set__float128(result, numHi, numLo);
      return;
    }
    // num is sub-normal: normalize
    inumExp -= normalize_float128(&numMantHi, &numLo);
  }
  //
  // num is normal or normalized
  //

  // parse den and handle special cases
  unsigned denSexp = denHi >> 48;
  unsigned denExp  = denSexp & 0x7FFF;
  int idenExp = denExp;
  uint64_t denMantHi = denHi & MANT_H_MSK;
  if (denExp - 1 >= 0x7FFF - 1) { // special cases: denExp==0 or denExp==0x7FFF
    if (denExp == 0x7FFF) { // inf or NaN
      if ((denMantHi | denLo)==0) { // den is inf
        // return zero with proper sign
        denHi = resSign;
        denLo = 0;
      }
      // if den==NaN then return den
      set__float128(result, denHi, denLo);
      return;
    }
    // denExp == 0 -> zero or sub-normal
    if (((denHi & SIGN_OFF_MSK) | denLo)==0) { // den is zero
      // return Inf with proper sign
      numHi = resSign | pINF_MSW;
      numLo = 0;
      set__float128(result, numHi, numLo);
      return;
    }
    // den is sub-normal: normalize
    idenExp -= normalize_float128(&denMantHi, &denLo);
  }
  //
  // both num and den are normal or normalized
  //

  // calculate exponent of result
  int expDecr = (denMantHi > numMantHi) | ((denMantHi == numMantHi) & (denLo > numLo)); // mant(den) >= mant(num)
  int iResExp = inumExp - idenExp - expDecr + 0x3FFF;
  if (iResExp >= 0x7FFF) { // overflow
    // return Inf with proper sign
    set__float128(result, resSign | pINF_MSW, 0);
    return;
  }
  if (iResExp < -112) { // underflow
    // return zero with proper sign
    set__float128(result, resSign, 0);
    return;
  }

  // calculate 64-bit approximation from below for 1/mantissa of den
  static const uint8_t recip_tab[64] = { // floor(512/(1+(i+1)/64)) - 256
    248, 240, 233, 225, 218, 212, 205, 199,
    192, 186, 180, 175, 169, 164, 158, 153,
    148, 143, 138, 134, 129, 125, 120, 116,
    112, 108, 104, 100,  96,  92,  88,  85,
     81,  78,  74,  71,  68,  65,  62,  59,
     56,  53,  50,  47,  44,  41,  39,  36,
     33,  31,  28,  26,  24,  21,  19,  17,
     14,  12,  10,   8,   6,   4,   2,   0,
  };
  // Look up an approximate value of recip(x) = 1/x
  // where x = 1:denMantHi:denMantLo
  unsigned r0 = recip_tab[denMantHi >> (48-6)] + 256; // scale = 2**9
  // r0 is a low estimate, r0*x is in range [0.98348999..1.0]
  // Est. precision: ~5.92 bits

  // Improve precision of r by two NR iterations
  uint64_t denMant30 = (denMantHi >> 18) + BIT_30 + 1; // scale = 2**30, up to 32 bits
  // Only the 30 upper bits of the mantissa are used in the evaluation. Using a minimal
  // number of input bits simplifies validation by exhaustive testing, with minimal
  // negative effect on worst-case precision.
  // The LSB of denMant30 is incremented by 1 in order to assure that,
  // for any possible values of the LS bits of denMantHi:Lo,
  // the estimate r1 is lower than the exact value of r.
  const uint64_t TWO_SCALE60 = ((uint64_t)2 << 60) - 1; // scale = 2**60, 61 bits
  uint64_t r1 = (uint64_t)r0 << 21;                     // scale = 2**30, 30 bits
  r1 = (((TWO_SCALE60 - r1 * denMant30) >> 29)*r1) >> 31;
  r1 = (((TWO_SCALE60 - r1 * denMant30) >> 29)*r1) << 3; // scale = 2**64
  // r1 is a low estimate
  // r1   is in range [0x7ffffffeffffffe8..0xfffffefbffc00000]
  // r1*x is in range [0.9999999243512687..0.9999999999999999954]
  // Est. precision: ~23.65 bits
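  // The two updates above are first-order Newton-Raphson steps for 1/x,
  //   r_new = r*(2 - r*x),
  // evaluated in fixed point: TWO_SCALE60 - r1*denMant30 is (2 - r1*x) at scale 2**60.
  // Each step roughly doubles the number of correct bits, taking the ~5.92-bit
  // table estimate to the ~23.65 bits estimated above.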
  //
  // Further improve precision of the estimate by a canonical 2nd-order NR iteration
  // r2 = r1 * (P2*(r1*x)**2 + P1*(r1*x) + P0)
  // where
  //  P2 = +1
  //  P1 = -3
  //  P0 = +3
  // At this point precision is of high importance, so the evaluation is done in a way
  // that minimizes the impact of truncation while still doing the majority of the work
  // with efficient 64-bit arithmetic.
  // The transformed equation is r2 = r1 + r1*rx1_n*(rx1_n + 1)
  // where rx1_n = 1 - r1*x
  //
  uint64_t denMant64 = (denMantHi << 16) | (denLo >> 48); // upper 64 bits of mantissa, not including the leading 1
  uint64_t rx1 = umulh64(r1, denMant64) + r1 + 2;         // r1*(2**64+x+1), scale = 2**64; nbits=64
  // rx1 is calculated with x2_u=x+1 instead of x2=x, in order to assure that,
  // for any possible values of the 48 LS bits of x, the estimate r2 is lower than the exact value of r
  uint64_t rx1_n   = 0 - rx1;                           // rx1_n = 1-r1*x    , scale = 2**64, nbits=42
  uint64_t termRB2 = umulh64(rx1_n, r1);                // rx1_n*r1          , scale = 2**64, nbits=42
  uint64_t deltaR1 = umulh64(termRB2, rx1_n) + termRB2; // termRB2*(rx1_n+1) , scale = 2**64, nbits=43
  uint64_t r2 = r1 + deltaR1;                           // scale = 2**64, nbits=64
  // r2 is a low estimate,
  // r2   is in range [0x7fffffffffffffff..0xfffffffffffffffe]
  // r2*x is in range [0.999_999_999_999_999_999_674_994 .. 0.999_999_999_999_999_999_999_726]
  // Est. precision: ~61.41 bits
  // max(r2*x-1) ~= 5.995 * 2**-64

  uint64_t numMsw = ((numMantHi << 15) | (numLo >> (64-15))) + BIT_63; // scale = 2**63
  uint64_t res1 = umulh64(numMsw, r2) << expDecr;                      // scale = 2**63
  __uint128 denx = ((__uint128)(denMantHi | BIT_48) << 64) | denLo;    // scale = 2**112
  uint64_t prod1 = (uint64_t)((res1*(denx << 11)) >> 64) + 1;          // scale = 2**122
  uint64_t num122Lo = numLo << (10+expDecr);                           // scale = 2**122
  uint64_t deltaProd = num122Lo - prod1;                               // scale = 2**122
  uint64_t deltaRes  = umulh64(deltaProd, r2);                         // scale = 2**122

  // Round deltaRes to scale = 2**112
  const unsigned N_GUARD_BITS = 10;
  const unsigned MAX_ERR = 3; // scale 2**122
  __uint128 res2x;
  if (iResExp > 0) { // normal result
    const unsigned GUARD_BIT = 1 << (N_GUARD_BITS-1); // scale 2**122
    const unsigned GUARD_MSK = GUARD_BIT*2 - 1;       // scale 2**122
    uint64_t round_incr = GUARD_BIT;
    unsigned guard_bits = deltaRes & GUARD_MSK;
    if (guard_bits - (GUARD_BIT-MAX_ERR) < MAX_ERR) {
      // Guard bits are very close to the half-ULP (from below).
      // Direction of the rounding should be found by additional calculations
      __uint128 midRes = (((__uint128)res1 << 50) + (deltaRes >> 9)) | 1; // scale 2**113
      // midRes resides between two representable output points
      __uint128 midProduct = midRes*denx;                  // scale 2**225
      uint64_t midProductH = (uint64_t)(midProduct >> 64); // scale 2**161
      if ((midProductH >> (161-113+expDecr)) & 1)
        round_incr = GUARD_MSK; // when the guard bit of the product is 1, the product midRes*den < num, so we need to round up
    }
    deltaRes = (deltaRes + round_incr) >> 10;   // scale = 2**112
    res2x = ((__uint128)res1 << 49) + deltaRes; // scale = 2**112
  } else { // sub-normal result
    int shift = 1 - iResExp; // range [1:113]
    const __uint128 GUARD_BIT = (__uint128)1 << (shift+N_GUARD_BITS-1); // scale = 2**122
    const __uint128 GUARD_MSK = GUARD_BIT*2 - 1;                        // scale = 2**122
    __uint128 resx = ((__uint128)res1 << 59) + deltaRes;                // scale = 2**122
    __uint128 guard_bits = resx & GUARD_MSK;
    resx >>= (shift+N_GUARD_BITS-2);                                    // scale = 2**(114-shift)
    unsigned round_incr = 2;
    if (guard_bits - (GUARD_BIT-MAX_ERR) < MAX_ERR) {
      // Guard bits are very close to the half-ULP (from below).
      // Direction of the rounding should be found by additional calculations
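      // Same idea as in the normal path above: build the candidate that lies exactly
      // between the two nearest representable outputs, multiply it back by den, and
      // let the relevant bit of that product tell on which side of the midpoint the
      // exact quotient lies, i.e. in which direction to round.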
      __uint128 midRes = resx ^ 3; // scale = 2**(114-shift); 2 LS bits changed from '01' to '10'
      // midRes resides between two representable output points
      __uint128 midProduct = midRes*denx; // scale 2**(226-shift)
      midProduct -= ((unsigned)resx >> 2) & 1;
      // Decrement the product when the LS bit of the non-rounded result is '1'.
      // That's a dirty trick that breaks ties toward nearest even.
      // It is only necessary in the sub-normal case; for normal results a tie is impossible.
      int guardPos = 226-113+expDecr-shift;
      uint64_t midProductGuardBit = (uint64_t)(midProduct >> guardPos) & 1;
      // When the guard bit of the product is 1, it means that the product midRes*den < num,
      // which means that we have to round up
      round_incr += midProductGuardBit;
    }
    res2x = (resx + round_incr) >> 2;
    iResExp = 0;
  }

  numMantHi = (res2x >> 64) & MANT_H_MSK;
  numLo = res2x;
  numHi = ((uint64_t)(iResExp) << 48) | numMantHi | resSign;
  set__float128(result, numHi, numLo);
}
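Below is a minimal sketch of the kind of self-check mentioned above (an
illustration only, not the harness that was actually used for testing): append
it to the file above and it compares my__divtf3 bit-for-bit against the
compiler's built-in __float128 division on pseudo-random bit patterns. The
iteration count, the seed and the crude rand()-based operand generator are
arbitrary choices; NaN payloads may legitimately differ between the two
implementations, so any mismatches there would need manual inspection.

#include <stdio.h>
#include <stdlib.h>

static __float128 rand_f128(void)
{
  // crude generator: stitch a few rand() calls into 2x64 bits;
  // random patterns also exercise NaN/Inf/sub-normal encodings
  uint64_t w[2];
  for (int i = 0; i < 2; ++i)
    w[i] = ((uint64_t)rand() << 48) ^ ((uint64_t)rand() << 24) ^ (uint64_t)rand();
  __float128 x;
  memcpy(&x, w, sizeof(x));
  return x;
}

int main(void)
{
  srand(1);
  long mismatches = 0;
  for (long i = 0; i < 1000000; ++i) {
    __float128 a = rand_f128(), b = rand_f128();
    __float128 q_ref = a / b; // reference: the compiler's/libgcc's __float128 division
    __float128 q_my;
    my__divtf3(&q_my, &a, &b);
    if (memcmp(&q_ref, &q_my, sizeof(q_ref)) != 0)
      mismatches++;
  }
  printf("%ld mismatches\n", mismatches);
  return mismatches != 0;
}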