https://github.com/frasercrmck updated https://github.com/llvm/llvm-project/pull/133226
>From 497781f9fe1d1750d37a36948be285cdb6c14b04 Mon Sep 17 00:00:00 2001 From: Fraser Cormack <fra...@codeplay.com> Date: Tue, 25 Mar 2025 18:19:38 +0000 Subject: [PATCH 1/2] [libclc] Move atan2/atan2pi to the CLC library --- libclc/clc/include/clc/math/clc_atan2.h | 20 ++ libclc/clc/include/clc/math/clc_atan2pi.h | 20 ++ libclc/clc/include/clc/math/tables.h | 3 +- libclc/clc/lib/generic/SOURCES | 2 + libclc/clc/lib/generic/math/clc_atan2.cl | 26 ++ libclc/clc/lib/generic/math/clc_atan2.inc | 243 +++++++++++++++++++ libclc/clc/lib/generic/math/clc_atan2pi.cl | 26 ++ libclc/clc/lib/generic/math/clc_atan2pi.inc | 227 ++++++++++++++++++ libclc/clc/lib/generic/math/clc_tables.cl | 177 ++++++++++++++ libclc/generic/lib/math/atan2.cl | 222 +---------------- libclc/generic/lib/math/atan2pi.cl | 206 +--------------- libclc/generic/lib/math/tables.cl | 253 -------------------- 12 files changed, 751 insertions(+), 674 deletions(-) create mode 100644 libclc/clc/include/clc/math/clc_atan2.h create mode 100644 libclc/clc/include/clc/math/clc_atan2pi.h create mode 100644 libclc/clc/lib/generic/math/clc_atan2.cl create mode 100644 libclc/clc/lib/generic/math/clc_atan2.inc create mode 100644 libclc/clc/lib/generic/math/clc_atan2pi.cl create mode 100644 libclc/clc/lib/generic/math/clc_atan2pi.inc diff --git a/libclc/clc/include/clc/math/clc_atan2.h b/libclc/clc/include/clc/math/clc_atan2.h new file mode 100644 index 0000000000000..85b99d0279ee7 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_atan2.h @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_ATAN2_H__ +#define __CLC_MATH_CLC_ATAN2_H__ + +#define __CLC_BODY <clc/shared/binary_decl.inc> +#define __CLC_FUNCTION __clc_atan2 + +#include <clc/math/gentype.inc> + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_ATAN2_H__ diff --git a/libclc/clc/include/clc/math/clc_atan2pi.h b/libclc/clc/include/clc/math/clc_atan2pi.h new file mode 100644 index 0000000000000..af41165f7dcf2 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_atan2pi.h @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_ATAN2PI_H__ +#define __CLC_MATH_CLC_ATAN2PI_H__ + +#define __CLC_BODY <clc/shared/binary_decl.inc> +#define __CLC_FUNCTION __clc_atan2pi + +#include <clc/math/gentype.inc> + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_ATAN2PI_H__ diff --git a/libclc/clc/include/clc/math/tables.h b/libclc/clc/include/clc/math/tables.h index b801ba65bb945..6a0cd80f9c8cb 100644 --- a/libclc/clc/include/clc/math/tables.h +++ b/libclc/clc/include/clc/math/tables.h @@ -79,7 +79,8 @@ CLC_TABLE_FUNCTION_DECL(float, log_inv_tbl); TABLE_FUNCTION_DECL(double2, ln_tbl); CLC_TABLE_FUNCTION_DECL(double, ln_tbl_lo); CLC_TABLE_FUNCTION_DECL(double, ln_tbl_hi); -TABLE_FUNCTION_DECL(double2, atan_jby256_tbl); +CLC_TABLE_FUNCTION_DECL(double, atan_jby256_tbl_head); +CLC_TABLE_FUNCTION_DECL(double, atan_jby256_tbl_tail); TABLE_FUNCTION_DECL(double2, two_to_jby64_ep_tbl); TABLE_FUNCTION_DECL(double2, sinh_tbl); TABLE_FUNCTION_DECL(double2, 
cosh_tbl); diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index dd94d97303944..d851065bb2e23 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -24,6 +24,8 @@ math/clc_asin.cl math/clc_asinh.cl math/clc_asinpi.cl math/clc_atan.cl +math/clc_atan2.cl +math/clc_atan2pi.cl math/clc_atanh.cl math/clc_atanpi.cl math/clc_ceil.cl diff --git a/libclc/clc/lib/generic/math/clc_atan2.cl b/libclc/clc/lib/generic/math/clc_atan2.cl new file mode 100644 index 0000000000000..b10bf32333a32 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_atan2.cl @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <clc/clc_convert.h> +#include <clc/float/definitions.h> +#include <clc/internal/clc.h> +#include <clc/math/clc_copysign.h> +#include <clc/math/clc_fabs.h> +#include <clc/math/clc_fma.h> +#include <clc/math/clc_ldexp.h> +#include <clc/math/clc_mad.h> +#include <clc/math/math.h> +#include <clc/math/tables.h> +#include <clc/relational/clc_isinf.h> +#include <clc/relational/clc_isnan.h> +#include <clc/relational/clc_select.h> +#include <clc/shared/clc_max.h> +#include <clc/shared/clc_min.h> + +#define __CLC_BODY <clc_atan2.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/clc/lib/generic/math/clc_atan2.inc b/libclc/clc/lib/generic/math/clc_atan2.inc new file mode 100644 index 0000000000000..0917f3adf2d90 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_atan2.inc @@ -0,0 +1,243 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if __CLC_FPSIZE == 32 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE y, __CLC_GENTYPE x) { + const __CLC_GENTYPE pi = 0x1.921fb6p+1f; + const __CLC_GENTYPE piby2 = 0x1.921fb6p+0f; + const __CLC_GENTYPE piby4 = 0x1.921fb6p-1f; + const __CLC_GENTYPE threepiby4 = 0x1.2d97c8p+1f; + + __CLC_GENTYPE ax = __clc_fabs(x); + __CLC_GENTYPE ay = __clc_fabs(y); + __CLC_GENTYPE v = __clc_min(ax, ay); + __CLC_GENTYPE u = __clc_max(ax, ay); + + // Scale since u could be large, as in "regular" divide + __CLC_GENTYPE s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f; + __CLC_GENTYPE vbyu = s * MATH_DIVIDE(v, s * u); + + __CLC_GENTYPE vbyu2 = vbyu * vbyu; + +#define USE_2_2_APPROXIMATION +#if defined USE_2_2_APPROXIMATION + __CLC_GENTYPE p = + __clc_mad(vbyu2, __clc_mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), + -0x1.5554d0p-2f) * + vbyu2 * vbyu; + __CLC_GENTYPE q = + __clc_mad(vbyu2, __clc_mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f); +#else + __CLC_GENTYPE p = + __clc_mad(vbyu2, __clc_mad(vbyu2, -0x1.55cd22p-5f, -0x1.26cf76p-2f), + -0x1.55554ep-2f) * + vbyu2 * vbyu; + __CLC_GENTYPE q = __clc_mad( + vbyu2, + __clc_mad(vbyu2, __clc_mad(vbyu2, 0x1.9f1304p-5f, 0x1.2656fap-1f), + 0x1.76b4b8p+0f), + 1.0f); +#endif + + // Octant 0 result: vbyu plus the rational correction p/q + __CLC_GENTYPE a = __clc_mad(p, MATH_RECIP(q), vbyu); + + // Fix up 3 other octants: reflect about pi/2 if |y| > |x|, then about pi if x < 0 + __CLC_GENTYPE at = piby2 - a; + a = ay > ax ? at : a; + at = pi - a; + a = x < 0.0F ? at : a; + + // y == 0 => 0 for x >= 0, pi for x < 0 (integer sign-bit test, so x == -0.0 gives pi) + at = __CLC_AS_INTN(x) < 0 ? pi : 0.0f; + a = y == 0.0f ? at : a; + + // x and y are both +- Inf: pi/4 for x > 0, 3*pi/4 otherwise + at = x > 0.0f ? 
piby4 : threepiby4; + a = __clc_select(a, at, __clc_isinf(x) && __clc_isinf(y)); + + // x or y is NaN + a = __clc_select(a, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isnan(y)); + + // Fixup sign (copy the sign of y onto the result) and return + return __clc_copysign(a, y); +} + +#elif __CLC_FPSIZE == 64 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE y, __CLC_GENTYPE x) { + const __CLC_GENTYPE pi = 3.1415926535897932e+00; /* 0x400921fb54442d18 */ + const __CLC_GENTYPE piby2 = 1.5707963267948966e+00; /* 0x3ff921fb54442d18 */ + const __CLC_GENTYPE piby4 = 7.8539816339744831e-01; /* 0x3fe921fb54442d18 */ + // 0x4002d97c7f3321d2 + const __CLC_GENTYPE three_piby4 = 2.3561944901923449e+00; + const __CLC_GENTYPE pi_head = 3.1415926218032836e+00; /* 0x400921fb50000000 */ + const __CLC_GENTYPE pi_tail = 3.1786509547056392e-08; /* 0x3e6110b4611a6263 */ + // 0x3ff921fb54442d18 + const __CLC_GENTYPE piby2_head = 1.5707963267948965e+00; + // 0x3c91a62633145c07 + const __CLC_GENTYPE piby2_tail = 6.1232339957367660e-17; + + __CLC_GENTYPE x2 = x; + // Important to capture -0.0 in xneg and yneg, so comparison done as integer + __CLC_LONGN xneg = __CLC_AS_LONGN(x) < 0; + __CLC_INTN xexp = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(x) >> EXPSHIFTBITS_DP64) & 0x7ff; + + __CLC_GENTYPE y2 = y; + __CLC_LONGN yneg = __CLC_AS_LONGN(y) < 0; + __CLC_INTN yexp = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(y) >> EXPSHIFTBITS_DP64) & 0x7ff; + + __CLC_LONGN cond2 = __CLC_CONVERT_LONGN(xexp < 1021 && yexp < 1021); + __CLC_LONGN diffexp = __CLC_CONVERT_LONGN(yexp - xexp); + + // Scale up both x and y if they are both below 1/4 + __CLC_GENTYPE x1 = __clc_ldexp(x, 1024); + __CLC_INTN xexp1 = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(x1) >> EXPSHIFTBITS_DP64) & 0x7ff; + __CLC_GENTYPE y1 = __clc_ldexp(y, 1024); + __CLC_INTN yexp1 = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(y1) >> EXPSHIFTBITS_DP64) & 0x7ff; + __CLC_LONGN diffexp1 = __CLC_CONVERT_LONGN(yexp1 - xexp1); + + diffexp = __clc_select(diffexp, diffexp1, cond2); + x = cond2 ? 
x1 : x; + y = cond2 ? y1 : y; + + // General case: take absolute values of arguments + __CLC_GENTYPE u = __clc_fabs(x); + __CLC_GENTYPE v = __clc_fabs(y); + + // Swap u and v if necessary to obtain 0 < v < u. Compute v/u. + __CLC_LONGN swap_vu = u < v; + __CLC_GENTYPE uu = u; + u = swap_vu ? v : u; + v = swap_vu ? uu : v; + + __CLC_GENTYPE vbyu = v / u; + __CLC_GENTYPE q1, q2; + + // General values of v/u. Use a look-up table and series expansion. + + { + __CLC_GENTYPE val = vbyu > 0.0625 ? vbyu : 0.063; + __CLC_INTN index = __CLC_CONVERT_INTN(__clc_fma(256.0, val, 0.5)); + q1 = USE_TABLE(atan_jby256_tbl_head, index - 16); + q2 = USE_TABLE(atan_jby256_tbl_tail, index - 16); + __CLC_GENTYPE c = __CLC_CONVERT_GENTYPE(index) * 0x1.0p-8; + + // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1 + // u_exponent could be EMAX so we have to do it in 2 steps + __CLC_INTN m = + -(__CLC_CONVERT_INTN(__CLC_AS_ULONGN(u) >> EXPSHIFTBITS_DP64) - + EXPBIAS_DP64); + __CLC_GENTYPE um = __clc_ldexp(u, m); + __CLC_GENTYPE vm = __clc_ldexp(v, m); + + // 26 leading bits of u + __CLC_GENTYPE u1 = __CLC_AS_GENTYPE(__CLC_AS_ULONGN(um) & 0xfffffffff8000000UL); + __CLC_GENTYPE u2 = um - u1; + + __CLC_GENTYPE r = MATH_DIVIDE(__clc_fma(-c, u2, __clc_fma(-c, u1, vm)), + __clc_fma(c, vm, um)); + + // Polynomial approximation to atan(r) + __CLC_GENTYPE s = r * r; + q2 = q2 + __clc_fma((s * __clc_fma(-s, 0.19999918038989143496, + 0.33333333333224095522)), + -r, r); + } + + __CLC_GENTYPE q3, q4; + { + q3 = 0.0; + q4 = vbyu; + } + + __CLC_GENTYPE q5, q6; + { + __CLC_GENTYPE u1 = __CLC_AS_GENTYPE(__CLC_AS_ULONGN(u) & 0xffffffff00000000UL); + __CLC_GENTYPE u2 = u - u1; + __CLC_GENTYPE vu1 = __CLC_AS_GENTYPE(__CLC_AS_ULONGN(vbyu) & 0xffffffff00000000UL); + __CLC_GENTYPE vu2 = vbyu - vu1; + + q5 = 0.0; + __CLC_GENTYPE s = vbyu * vbyu; + q6 = vbyu + + __clc_fma( + -vbyu * s, + __clc_fma( + -s, + __clc_fma(-s, + __clc_fma(-s, + __clc_fma(-s, 0.90029810285449784439E-01, + 
0.11110736283514525407), + 0.14285713561807169030), + 0.19999999999393223405), + 0.33333333333333170500), + MATH_DIVIDE(__clc_fma(-u, vu2, + __clc_fma(-u2, vu1, __clc_fma(-u1, vu1, v))), + u)); + } + + q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5; + q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? q4 : q6; + + q1 = vbyu > 0.0625 ? q1 : q3; + q2 = vbyu > 0.0625 ? q2 : q4; + + // Tidy-up according to which quadrant the arguments lie in + __CLC_GENTYPE res1, res2, res3, res4; + q1 = swap_vu ? piby2_head - q1 : q1; + q2 = swap_vu ? piby2_tail - q2 : q2; + q1 = xneg ? pi_head - q1 : q1; + q2 = xneg ? pi_tail - q2 : q2; + q1 = q1 + q2; + res4 = yneg ? -q1 : q1; + + res1 = yneg ? -three_piby4 : three_piby4; + res2 = yneg ? -piby4 : piby4; + res3 = xneg ? res1 : res2; + + res3 = __clc_select(res4, res3, + __CLC_CONVERT_LONGN(__clc_isinf(x2) && __clc_isinf(y2))); + res1 = yneg ? -pi : pi; + + // abs(x)/abs(y) > 2^56 and x < 0 + res3 = (diffexp < -56 && xneg) ? res1 : res3; + + res4 = MATH_DIVIDE(y, x); + // x positive and dominant over y by a factor of 2^28 + res3 = diffexp < -28 && xneg == 0 ? res4 : res3; + + // abs(y)/abs(x) > 2^56 + res4 = yneg ? -piby2 : piby2; // atan(y/x) is insignificant compared to piby2 + res3 = diffexp > 56 ? res4 : res3; + + res3 = x2 == 0.0 ? res4 : res3; // Zero x gives +- pi/2 depending on sign of y + res4 = xneg ? res1 : y2; + + // Zero y gives +-0 for positive x and +-pi for negative x + res3 = y2 == 0.0 ? res4 : res3; + res3 = __clc_isnan(y2) ? y2 : res3; + res3 = __clc_isnan(x2) ? 
x2 : res3; + + return res3; +} + +#elif __CLC_FPSIZE == 16 // half: evaluate in float and convert the result back + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE y, + __CLC_GENTYPE x) { + return __CLC_CONVERT_GENTYPE( + __clc_atan2(__CLC_CONVERT_FLOATN(y), __CLC_CONVERT_FLOATN(x))); +} + +#endif diff --git a/libclc/clc/lib/generic/math/clc_atan2pi.cl b/libclc/clc/lib/generic/math/clc_atan2pi.cl new file mode 100644 index 0000000000000..7f75c11d15d7b --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_atan2pi.cl @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <clc/clc_convert.h> +#include <clc/float/definitions.h> +#include <clc/internal/clc.h> +#include <clc/math/clc_copysign.h> +#include <clc/math/clc_fabs.h> +#include <clc/math/clc_fma.h> +#include <clc/math/clc_ldexp.h> +#include <clc/math/clc_mad.h> +#include <clc/math/math.h> +#include <clc/math/tables.h> +#include <clc/relational/clc_isinf.h> +#include <clc/relational/clc_isnan.h> +#include <clc/relational/clc_select.h> +#include <clc/shared/clc_max.h> +#include <clc/shared/clc_min.h> + +#define __CLC_BODY <clc_atan2pi.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/clc/lib/generic/math/clc_atan2pi.inc b/libclc/clc/lib/generic/math/clc_atan2pi.inc new file mode 100644 index 0000000000000..79b2551e077cc --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_atan2pi.inc @@ -0,0 +1,227 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if __CLC_FPSIZE == 32 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2pi(__CLC_GENTYPE y, + __CLC_GENTYPE x) { + const __CLC_GENTYPE pi = 0x1.921fb6p+1f; + + __CLC_GENTYPE ax = __clc_fabs(x); + __CLC_GENTYPE ay = __clc_fabs(y); + __CLC_GENTYPE v = __clc_min(ax, ay); + __CLC_GENTYPE u = __clc_max(ax, ay); + + // Scale since u could be large, as in "regular" divide + __CLC_GENTYPE s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f; + __CLC_GENTYPE vbyu = s * MATH_DIVIDE(v, s * u); + + __CLC_GENTYPE vbyu2 = vbyu * vbyu; + + __CLC_GENTYPE p = + __clc_mad(vbyu2, __clc_mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), + -0x1.5554d0p-2f) * + vbyu2 * vbyu; + __CLC_GENTYPE q = + __clc_mad(vbyu2, __clc_mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f); + + // Octant 0 result, divided by pi + __CLC_GENTYPE a = MATH_DIVIDE(__clc_mad(p, MATH_RECIP(q), vbyu), pi); + + // Fix up 3 other octants (in units of pi: reflect about 0.5, then about 1) + __CLC_GENTYPE at = 0.5f - a; + a = ay > ax ? at : a; + at = 1.0f - a; + a = x < 0.0F ? at : a; + + // y == 0 => 0 for x >= 0, 1 for x < 0 (integer sign-bit test, so x == -0.0 gives 1) + at = __CLC_AS_INTN(x) < 0 ? 1.0f : 0.0f; + a = y == 0.0f ? at : a; + + // x and y are both +- Inf: 0.25 for x > 0, 0.75 otherwise + at = x > 0.0f ? 
0.25f : 0.75f; + a = __clc_select(a, at, __clc_isinf(x) && __clc_isinf(y)); + + // x or y is NaN + a = __clc_select(a, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isnan(y)); + + // Fixup sign (copy the sign of y onto the result) and return + return __clc_copysign(a, y); +} + +#elif __CLC_FPSIZE == 64 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2pi(__CLC_GENTYPE y, + __CLC_GENTYPE x) { + const __CLC_GENTYPE pi = 3.1415926535897932e+00; /* 0x400921fb54442d18 */ + const __CLC_GENTYPE pi_head = 3.1415926218032836e+00; /* 0x400921fb50000000 */ + const __CLC_GENTYPE pi_tail = 3.1786509547056392e-08; /* 0x3e6110b4611a6263 */ + // 0x3ff921fb54442d18 + const __CLC_GENTYPE piby2_head = 1.5707963267948965e+00; + // 0x3c91a62633145c07 + const __CLC_GENTYPE piby2_tail = 6.1232339957367660e-17; + + __CLC_GENTYPE x2 = x; + __CLC_LONGN xneg = __CLC_AS_LONGN(x) < 0; + __CLC_INTN xexp = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(x) >> EXPSHIFTBITS_DP64) & 0x7ff; + + __CLC_GENTYPE y2 = y; + __CLC_LONGN yneg = __CLC_AS_LONGN(y) < 0; + __CLC_INTN yexp = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(y) >> EXPSHIFTBITS_DP64) & 0x7ff; + + __CLC_LONGN cond2 = __CLC_CONVERT_LONGN(xexp < 1021 & yexp < 1021); + __CLC_LONGN diffexp = __CLC_CONVERT_LONGN(yexp - xexp); + + // Scale up both x and y if they are both below 1/4 + __CLC_GENTYPE x1 = __clc_ldexp(x, 1024); + __CLC_INTN xexp1 = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(x1) >> EXPSHIFTBITS_DP64) & 0x7ff; + __CLC_GENTYPE y1 = __clc_ldexp(y, 1024); + __CLC_INTN yexp1 = + __CLC_CONVERT_INTN(__CLC_AS_ULONGN(y1) >> EXPSHIFTBITS_DP64) & 0x7ff; + __CLC_LONGN diffexp1 = __CLC_CONVERT_LONGN(yexp1 - xexp1); + + diffexp = __clc_select(diffexp, diffexp1, cond2); + x = cond2 ? x1 : x; + y = cond2 ? y1 : y; + + // General case: take absolute values of arguments + __CLC_GENTYPE u = __clc_fabs(x); + __CLC_GENTYPE v = __clc_fabs(y); + + // Swap u and v if necessary to obtain 0 < v < u. Compute v/u. + __CLC_LONGN swap_vu = u < v; + __CLC_GENTYPE uu = u; + u = swap_vu ? v : u; + v = swap_vu ? 
uu : v; + + __CLC_GENTYPE vbyu = v / u; + __CLC_GENTYPE q1, q2; + + // General values of v/u. Use a look-up table and series expansion. + + { + __CLC_GENTYPE val = vbyu > 0.0625 ? vbyu : 0.063; + __CLC_INTN index = __CLC_CONVERT_INTN(__clc_fma(256.0, val, 0.5)); + q1 = USE_TABLE(atan_jby256_tbl_head, (index - 16)); + q2 = USE_TABLE(atan_jby256_tbl_tail, (index - 16)); + __CLC_GENTYPE c = __CLC_CONVERT_GENTYPE(index) * 0x1.0p-8; + + // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1 + // u_exponent could be EMAX so we have to do it in 2 steps + __CLC_INTN m = + -(__CLC_CONVERT_INTN(__CLC_AS_ULONGN(u) >> EXPSHIFTBITS_DP64) - + EXPBIAS_DP64); + __CLC_GENTYPE um = __clc_ldexp(u, m); + __CLC_GENTYPE vm = __clc_ldexp(v, m); + + // 26 leading bits of u + __CLC_GENTYPE u1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(um) & 0xfffffffff8000000UL); + __CLC_GENTYPE u2 = um - u1; + + __CLC_GENTYPE r = MATH_DIVIDE(__clc_fma(-c, u2, __clc_fma(-c, u1, vm)), + __clc_fma(c, vm, um)); + + // Polynomial approximation to atan(r) + __CLC_GENTYPE s = r * r; + q2 = q2 + __clc_fma((s * __clc_fma(-s, 0.19999918038989143496, + 0.33333333333224095522)), + -r, r); + } + + __CLC_GENTYPE q3, q4; + { + q3 = 0.0; + q4 = vbyu; + } + + __CLC_GENTYPE q5, q6; + { + __CLC_GENTYPE u1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(u) & 0xffffffff00000000UL); + __CLC_GENTYPE u2 = u - u1; + __CLC_GENTYPE vu1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(vbyu) & 0xffffffff00000000UL); + __CLC_GENTYPE vu2 = vbyu - vu1; + + q5 = 0.0; + __CLC_GENTYPE s = vbyu * vbyu; + q6 = vbyu + + __clc_fma( + -vbyu * s, + __clc_fma( + -s, + __clc_fma(-s, + __clc_fma(-s, + __clc_fma(-s, 0.90029810285449784439E-01, + 0.11110736283514525407), + 0.14285713561807169030), + 0.19999999999393223405), + 0.33333333333333170500), + MATH_DIVIDE(__clc_fma(-u, vu2, + __clc_fma(-u2, vu1, __clc_fma(-u1, vu1, v))), + u)); + } + + q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5; + q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? 
q4 : q6; + + q1 = vbyu > 0.0625 ? q1 : q3; + q2 = vbyu > 0.0625 ? q2 : q4; + + // Tidy-up according to which quadrant the arguments lie in + __CLC_GENTYPE res1, res2, res3, res4; + q1 = swap_vu ? piby2_head - q1 : q1; + q2 = swap_vu ? piby2_tail - q2 : q2; + q1 = xneg ? pi_head - q1 : q1; + q2 = xneg ? pi_tail - q2 : q2; + q1 = MATH_DIVIDE(q1 + q2, pi); + res4 = yneg ? -q1 : q1; + + res1 = yneg ? -0.75 : 0.75; + res2 = yneg ? -0.25 : 0.25; + res3 = xneg ? res1 : res2; + + res3 = __clc_select(res4, res3, + __CLC_CONVERT_LONGN(__clc_isinf(y2) & __clc_isinf(x2))); + res1 = yneg ? -1.0 : 1.0; + + // abs(x)/abs(y) > 2^56 and x < 0 + res3 = diffexp < -56 && xneg ? res1 : res3; + + res4 = MATH_DIVIDE(MATH_DIVIDE(y, x), pi); + // x positive and dominant over y by a factor of 2^28 + res3 = diffexp < -28 && xneg == 0 ? res4 : res3; + + // abs(y)/abs(x) > 2^56 + res4 = yneg ? -0.5 : 0.5; // atan(y/x)/pi is insignificant compared to 0.5 + res3 = diffexp > 56 ? res4 : res3; + + res3 = x2 == 0.0 ? res4 : res3; // Zero x gives +-0.5 depending on sign of y + res4 = xneg ? res1 : y2; + + // Zero y gives +-0 for positive x and +-1 for negative x + res3 = y2 == 0.0 ? res4 : res3; + res3 = __clc_isnan(y2) ? y2 : res3; + res3 = __clc_isnan(x2) ? 
x2 : res3; + + return res3; +} + +#elif __CLC_FPSIZE == 16 // half: evaluate in float and convert the result back + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2pi(__CLC_GENTYPE y, + __CLC_GENTYPE x) { + return __CLC_CONVERT_GENTYPE( + __clc_atan2pi(__CLC_CONVERT_FLOATN(y), __CLC_CONVERT_FLOATN(x))); +} + +#endif diff --git a/libclc/clc/lib/generic/math/clc_tables.cl b/libclc/clc/lib/generic/math/clc_tables.cl index 7ad005f91b575..2d6d280f7ea06 100644 --- a/libclc/clc/lib/generic/math/clc_tables.cl +++ b/libclc/clc/lib/generic/math/clc_tables.cl @@ -180,4 +180,181 @@ DECLARE_TABLE(double, LN_TBL_HI, 65) = { CLC_TABLE_FUNCTION(double, LN_TBL_HI, ln_tbl_hi); +// Tables ATAN_JBY256_TBL_HEAD and ATAN_JBY256_TBL_TAIL contain leading and +// trailing parts respectively of precomputed values of atan(j/256), for j = 16, +// 17, ..., 256. ATAN_JBY256_TBL_HEAD contains the first 21 bits of precision, +// and ATAN_JBY256_TBL_TAIL contains a further 53 bits of precision. + +DECLARE_TABLE(double, ATAN_JBY256_TBL_HEAD, 241) = { + 0x1.ff55b00000000p-5, 0x1.0f99e00000000p-4, 0x1.1f86d00000000p-4, + 0x1.2f71900000000p-4, 0x1.3f59f00000000p-4, 0x1.4f3fd00000000p-4, + 0x1.5f23200000000p-4, 0x1.6f03b00000000p-4, 0x1.7ee1800000000p-4, + 0x1.8ebc500000000p-4, 0x1.9e94100000000p-4, 0x1.ae68a00000000p-4, + 0x1.be39e00000000p-4, 0x1.ce07c00000000p-4, 0x1.ddd2100000000p-4, + 0x1.ed98c00000000p-4, 0x1.fd5ba00000000p-4, 0x1.068d500000000p-3, + 0x1.0e6ad00000000p-3, 0x1.1646500000000p-3, 0x1.1e1fa00000000p-3, + 0x1.25f6e00000000p-3, 0x1.2dcbd00000000p-3, 0x1.359e800000000p-3, + 0x1.3d6ee00000000p-3, 0x1.453ce00000000p-3, 0x1.4d08700000000p-3, + 0x1.54d1800000000p-3, 0x1.5c98100000000p-3, 0x1.645bf00000000p-3, + 0x1.6c1d400000000p-3, 0x1.73dbd00000000p-3, 0x1.7b97b00000000p-3, + 0x1.8350b00000000p-3, 0x1.8b06e00000000p-3, 0x1.92ba300000000p-3, + 0x1.9a6a800000000p-3, 0x1.a217e00000000p-3, 0x1.a9c2300000000p-3, + 0x1.b169600000000p-3, 0x1.b90d700000000p-3, 0x1.c0ae500000000p-3, + 
0x1.c84bf00000000p-3, 0x1.cfe6500000000p-3, 0x1.d77d500000000p-3, + 
0x1.df11000000000p-3, 0x1.e6a1400000000p-3, 0x1.ee2e100000000p-3, + 0x1.f5b7500000000p-3, 0x1.fd3d100000000p-3, 0x1.025fa00000000p-2, + 0x1.061ee00000000p-2, 0x1.09dc500000000p-2, 0x1.0d97e00000000p-2, + 0x1.1151a00000000p-2, 0x1.1509700000000p-2, 0x1.18bf500000000p-2, + 0x1.1c73500000000p-2, 0x1.2025500000000p-2, 0x1.23d5600000000p-2, + 0x1.2783700000000p-2, 0x1.2b2f700000000p-2, 0x1.2ed9800000000p-2, + 0x1.3281800000000p-2, 0x1.3627700000000p-2, 0x1.39cb400000000p-2, + 0x1.3d6d100000000p-2, 0x1.410cb00000000p-2, 0x1.44aa400000000p-2, + 0x1.4845a00000000p-2, 0x1.4bdee00000000p-2, 0x1.4f75f00000000p-2, + 0x1.530ad00000000p-2, 0x1.569d800000000p-2, 0x1.5a2e000000000p-2, + 0x1.5dbc300000000p-2, 0x1.6148400000000p-2, 0x1.64d1f00000000p-2, + 0x1.6859700000000p-2, 0x1.6bdea00000000p-2, 0x1.6f61900000000p-2, + 0x1.72e2200000000p-2, 0x1.7660700000000p-2, 0x1.79dc600000000p-2, + 0x1.7d56000000000p-2, 0x1.80cd400000000p-2, 0x1.8442200000000p-2, + 0x1.87b4b00000000p-2, 0x1.8b24d00000000p-2, 0x1.8e92900000000p-2, + 0x1.91fde00000000p-2, 0x1.9566d00000000p-2, 0x1.98cd500000000p-2, + 0x1.9c31600000000p-2, 0x1.9f93000000000p-2, 0x1.a2f2300000000p-2, + 0x1.a64ee00000000p-2, 0x1.a9a9200000000p-2, 0x1.ad00f00000000p-2, + 0x1.b056400000000p-2, 0x1.b3a9100000000p-2, 0x1.b6f9600000000p-2, + 0x1.ba47300000000p-2, 0x1.bd92800000000p-2, 0x1.c0db400000000p-2, + 0x1.c421900000000p-2, 0x1.c765500000000p-2, 0x1.caa6800000000p-2, + 0x1.cde5300000000p-2, 0x1.d121500000000p-2, 0x1.d45ae00000000p-2, + 0x1.d791f00000000p-2, 0x1.dac6700000000p-2, 0x1.ddf8500000000p-2, + 0x1.e127b00000000p-2, 0x1.e454800000000p-2, 0x1.e77eb00000000p-2, + 0x1.eaa6500000000p-2, 0x1.edcb600000000p-2, 0x1.f0ede00000000p-2, + 0x1.f40dd00000000p-2, 0x1.f72b200000000p-2, 0x1.fa45d00000000p-2, + 0x1.fd5e000000000p-2, 0x1.0039c00000000p-1, 0x1.01c3400000000p-1, + 0x1.034b700000000p-1, 0x1.04d2500000000p-1, 0x1.0657e00000000p-1, + 0x1.07dc300000000p-1, 0x1.095f300000000p-1, 0x1.0ae0e00000000p-1, + 0x1.0c61400000000p-1, 
0x1.0de0500000000p-1, 0x1.0f5e200000000p-1, + 0x1.10daa00000000p-1, 0x1.1255d00000000p-1, 0x1.13cfb00000000p-1, + 0x1.1548500000000p-1, 0x1.16bfa00000000p-1, 0x1.1835a00000000p-1, + 0x1.19aa500000000p-1, 0x1.1b1dc00000000p-1, 0x1.1c8fe00000000p-1, + 0x1.1e00b00000000p-1, 0x1.1f70400000000p-1, 0x1.20de800000000p-1, + 0x1.224b700000000p-1, 0x1.23b7100000000p-1, 0x1.2521700000000p-1, + 0x1.268a900000000p-1, 0x1.27f2600000000p-1, 0x1.2958e00000000p-1, + 0x1.2abe200000000p-1, 0x1.2c22100000000p-1, 0x1.2d84c00000000p-1, + 0x1.2ee6200000000p-1, 0x1.3046400000000p-1, 0x1.31a5200000000p-1, + 0x1.3302b00000000p-1, 0x1.345f000000000p-1, 0x1.35ba000000000p-1, + 0x1.3713d00000000p-1, 0x1.386c500000000p-1, 0x1.39c3900000000p-1, + 0x1.3b19800000000p-1, 0x1.3c6e400000000p-1, 0x1.3dc1c00000000p-1, + 0x1.3f13f00000000p-1, 0x1.4064f00000000p-1, 0x1.41b4a00000000p-1, + 0x1.4303200000000p-1, 0x1.4450600000000p-1, 0x1.459c600000000p-1, + 0x1.46e7200000000p-1, 0x1.4830a00000000p-1, 0x1.4978f00000000p-1, + 0x1.4ac0000000000p-1, 0x1.4c05e00000000p-1, 0x1.4d4a800000000p-1, + 0x1.4e8de00000000p-1, 0x1.4fd0100000000p-1, 0x1.5111000000000p-1, + 0x1.5250c00000000p-1, 0x1.538f500000000p-1, 0x1.54cca00000000p-1, + 0x1.5608d00000000p-1, 0x1.5743c00000000p-1, 0x1.587d800000000p-1, + 0x1.59b6000000000p-1, 0x1.5aed600000000p-1, 0x1.5c23900000000p-1, + 0x1.5d58900000000p-1, 0x1.5e8c600000000p-1, 0x1.5fbf000000000p-1, + 0x1.60f0800000000p-1, 0x1.6220d00000000p-1, 0x1.634ff00000000p-1, + 0x1.647de00000000p-1, 0x1.65aab00000000p-1, 0x1.66d6600000000p-1, + 0x1.6800e00000000p-1, 0x1.692a400000000p-1, 0x1.6a52700000000p-1, + 0x1.6b79800000000p-1, 0x1.6c9f700000000p-1, 0x1.6dc4400000000p-1, + 0x1.6ee7f00000000p-1, 0x1.700a700000000p-1, 0x1.712be00000000p-1, + 0x1.724c300000000p-1, 0x1.736b600000000p-1, 0x1.7489700000000p-1, + 0x1.75a6700000000p-1, 0x1.76c2400000000p-1, 0x1.77dd100000000p-1, + 0x1.78f6b00000000p-1, 0x1.7a0f400000000p-1, 0x1.7b26c00000000p-1, + 0x1.7c3d300000000p-1, 0x1.7d52800000000p-1, 
0x1.7e66c00000000p-1, + 0x1.7f79e00000000p-1, 0x1.808c000000000p-1, 0x1.819d000000000p-1, + 0x1.82ad000000000p-1, 0x1.83bbe00000000p-1, 0x1.84c9c00000000p-1, + 0x1.85d6900000000p-1, 0x1.86e2500000000p-1, 0x1.87ed000000000p-1, + 0x1.88f6b00000000p-1, 0x1.89ff500000000p-1, 0x1.8b06f00000000p-1, + 0x1.8c0d900000000p-1, 0x1.8d13200000000p-1, 0x1.8e17a00000000p-1, + 0x1.8f1b300000000p-1, 0x1.901db00000000p-1, 0x1.911f300000000p-1, + 0x1.921fb00000000p-1, +}; + +CLC_TABLE_FUNCTION(double, ATAN_JBY256_TBL_HEAD, atan_jby256_tbl_head); + +DECLARE_TABLE(double, ATAN_JBY256_TBL_TAIL, 241) = { + 0x1.6e59fbd38db2cp-26, 0x1.4e3aa54dedf96p-25, 0x1.7e105ab1bda88p-25, + 0x1.8c5254d013fd0p-27, 0x1.cf8ab3ad62670p-29, 0x1.9dca4bec80468p-26, + 0x1.3f4b5ec98a8dap-26, 0x1.b9d49619d81fep-25, 0x1.3017887460934p-27, + 0x1.11e3eca0b9944p-26, 0x1.4f3f73c5a332ep-26, 0x1.c71c8ae0e00a6p-26, + 0x1.7cde0f86fbdc7p-25, 0x1.70f328c889c72p-26, 0x1.c07ae9b994efep-26, + 0x1.0c8021d7b1698p-27, 0x1.35585edb8cb22p-25, 0x1.0842567b30e96p-24, + 0x1.99e811031472ep-24, 0x1.041821416bceep-25, 0x1.f6086e4dc96f4p-24, + 0x1.71a535c5f1b58p-27, 0x1.65f743fe63ca1p-24, 0x1.dbd733472d014p-24, + 0x1.d18cc4d8b0d1dp-24, 0x1.8c12553c8fb29p-24, 0x1.53b49e2e8f991p-24, + 0x1.7422ae148c141p-24, 0x1.e3ec269df56a8p-27, 0x1.ff6754e7e0ac9p-24, + 0x1.131267b1b5aadp-24, 0x1.d14fa403a94bcp-24, 0x1.2f396c089a3d8p-25, + 0x1.c731d78fa95bbp-24, 0x1.c50f385177399p-24, 0x1.f41409c6f2c20p-25, + 0x1.d2d90c4c39ec0p-24, 0x1.80420696f2106p-25, 0x1.b40327943a2e8p-27, + 0x1.5d35e02f3d2a2p-25, 0x1.4a498288117b0p-25, 0x1.35da119afb324p-25, + 0x1.14e85cdb9a908p-24, 0x1.38754e5547b9ap-25, 0x1.be40ae6ce3246p-24, + 0x1.0c993b3bea7e7p-24, 0x1.1d2dd89ac3359p-24, 0x1.1476603332c46p-25, + 0x1.f25901bac55b7p-24, 0x1.f881b7c826e28p-24, 0x1.441996d698d20p-24, + 0x1.407ac521ea089p-23, 0x1.2fb0c6c4b1723p-23, 0x1.ca135966a3e18p-23, + 0x1.b1218e4d646e4p-25, 0x1.d4e72a350d288p-25, 0x1.4617e2f04c329p-23, + 0x1.096ec41e82650p-25, 0x1.9f91f25773e6ep-24, 
0x1.59c0820f1d674p-25, + 0x1.02bf7a2df1064p-25, 0x1.fb36bfc40508fp-23, 0x1.ea08f3f8dc892p-24, + 0x1.3ed6254656a0ep-24, 0x1.b83f5e5e69c58p-25, 0x1.d6ec2af768592p-23, + 0x1.493889a226f94p-25, 0x1.5ad8fa65279bap-23, 0x1.b615784d45434p-25, + 0x1.09a184368f145p-23, 0x1.61a2439b0d91cp-24, 0x1.ce1a65e39a978p-24, + 0x1.32a39a93b6a66p-23, 0x1.1c3699af804e7p-23, 0x1.75e0f4e44ede8p-26, + 0x1.f77ced1a7a83bp-23, 0x1.84e7f0cb1b500p-29, 0x1.ec6b838b02dfep-23, + 0x1.3ebf4dfbeda87p-23, 0x1.9397aed9cb475p-23, 0x1.07937bc239c54p-24, + 0x1.aa754553131b6p-23, 0x1.4a05d407c45dcp-24, 0x1.132231a206dd0p-23, + 0x1.2d8ecfdd69c88p-24, 0x1.a852c74218606p-24, 0x1.71bf2baeebb50p-23, + 0x1.83d7db7491820p-27, 0x1.ca50d92b6da14p-25, 0x1.6f5cde8530298p-26, + 0x1.f343198910740p-24, 0x1.0e8d241ccd80ap-24, 0x1.1535ac619e6c8p-24, + 0x1.7316041c36cd2p-24, 0x1.985a000637d8ep-24, 0x1.f2f29858c0a68p-25, + 0x1.879847f96d909p-23, 0x1.ab3d319e12e42p-23, 0x1.5088162dfc4c2p-24, + 0x1.05749a1cd9d8cp-25, 0x1.da65c6c6b8618p-26, 0x1.739bf7df1ad64p-25, + 0x1.bc31252aa3340p-25, 0x1.e528191ad3aa8p-26, 0x1.929d93df19f18p-23, + 0x1.ff11eb693a080p-26, 0x1.55ae3f145a3a0p-27, 0x1.cbcd8c6c0ca82p-24, + 0x1.0cb04d425d304p-24, 0x1.9adfcab5be678p-24, 0x1.93d90c5662508p-23, + 0x1.68489bd35ff40p-24, 0x1.586ed3da2b7e0p-28, 0x1.7604d2e850eeep-23, + 0x1.ac1d12bfb53d8p-24, 0x1.9b3d468274740p-28, 0x1.fc5d68d10e53cp-24, + 0x1.8f9e51884becbp-23, 0x1.a87f0869c06d1p-23, 0x1.31e7279f685fap-23, + 0x1.6a8282f9719b0p-27, 0x1.0d2724a8a44e0p-25, 0x1.a60524b11ad4ep-23, + 0x1.75fdf832750f0p-26, 0x1.cf06902e4cd36p-23, 0x1.e82422d4f6d10p-25, + 0x1.24a091063e6c0p-26, 0x1.8a1a172dc6f38p-24, 0x1.29b6619f8a92dp-22, + 0x1.9274d9c1b70c8p-24, 0x1.0c34b1fbb7930p-26, 0x1.639866c20eb50p-25, + 0x1.6d6d0f6832e9ep-23, 0x1.af54def99f25ep-22, 0x1.16cfc52a00262p-22, + 0x1.dcc1e83569c32p-23, 0x1.37f7a551ed425p-22, 0x1.f6360adc98887p-22, + 0x1.2c6ec8d35a2c1p-22, 0x1.bd44df84cb036p-23, 0x1.117cf826e310ep-22, + 0x1.ca533f332cfc9p-22, 0x1.0f208509dbc2ep-22, 
0x1.cd07d93c945dep-23, + 0x1.57bdfd67e6d72p-22, 0x1.aab89c516c658p-24, 0x1.3e823b1a1b8a0p-25, + 0x1.307464a9d6d3cp-23, 0x1.c5993cd438843p-22, 0x1.ba2fca02ab554p-22, + 0x1.01a5b6983a268p-23, 0x1.273d1b350efc8p-25, 0x1.64c238c37b0c6p-23, + 0x1.aded07370a300p-25, 0x1.78091197eb47ep-23, 0x1.4b0f245e0dabcp-24, + 0x1.080d9794e2eafp-22, 0x1.d4ec242b60c76p-23, 0x1.221d2f940caa0p-27, + 0x1.cdbc42b2bba5cp-24, 0x1.cce37bb440840p-25, 0x1.6c1d999cf1dd0p-22, + 0x1.bed8a07eb0870p-26, 0x1.69ed88f490e3cp-24, 0x1.cd41719b73ef0p-25, + 0x1.cbc4ac95b41b7p-22, 0x1.238f1b890f5d7p-22, 0x1.50c4282259cc4p-24, + 0x1.713d2de87b3e2p-22, 0x1.1d5a7d2255276p-23, 0x1.c0dfd48227ac1p-22, + 0x1.1c964dab76753p-22, 0x1.6de56d5704496p-23, 0x1.4aeb71fd19968p-23, + 0x1.fbf91c57b1918p-23, 0x1.d6bef7fbe5d9ap-22, 0x1.464d3dc249066p-22, + 0x1.638e2ec4d9073p-22, 0x1.16f4a7247ea7cp-24, 0x1.1a0a740f1d440p-28, + 0x1.6edbb0114a33cp-23, 0x1.dbee8bf1d513cp-24, 0x1.5b8bdb0248f73p-22, + 0x1.7de3d3f5eac64p-22, 0x1.ee24187ae448ap-23, 0x1.e06c591ec5192p-22, + 0x1.4e3861a332738p-24, 0x1.a9599dcc2bfe4p-24, 0x1.f732fbad43468p-25, + 0x1.eb9f573b727d9p-22, 0x1.8b212a2eb9897p-22, 0x1.384884c167215p-22, + 0x1.0e2d363020051p-22, 0x1.2820879fbd022p-22, 0x1.a1ab9893e4b30p-22, + 0x1.2d1b817a24478p-23, 0x1.15d7b8ded4878p-25, 0x1.8968f9db3a5e4p-24, + 0x1.71c4171fe135fp-22, 0x1.6d80f605d0d8cp-22, 0x1.c91f043691590p-24, + 0x1.39f8a15fce2b2p-23, 0x1.55beda9d94b80p-27, 0x1.b12c15d60949ap-23, + 0x1.24167b312bfe3p-22, 0x1.0ab8633070277p-22, 0x1.54554ebbc80eep-23, + 0x1.0204aef5a4bb8p-25, 0x1.8af08c679cf2cp-22, 0x1.0852a330ae6c8p-22, + 0x1.6d3eb9ec32916p-23, 0x1.685cb7fcbbafep-23, 0x1.1f751c1e0bd95p-22, + 0x1.705b1b0f72560p-26, 0x1.b98d8d808ca92p-22, 0x1.2ea22c75cc980p-25, + 0x1.7aba62bca0350p-22, 0x1.d73833442278cp-22, 0x1.5a5ca1fb18bf9p-22, + 0x1.1a6092b6ecf28p-25, 0x1.44fd049aac104p-24, 0x1.c114fd8df5180p-29, + 0x1.5972f130feae5p-22, 0x1.ca034a55fe198p-24, 0x1.6e2b149990227p-22, + 0x1.b00000294592cp-24, 0x1.8b9bdc442620ep-22, 
0x1.d94fdfabf3e4ep-23, + 0x1.5db30b145ad9ap-23, 0x1.e3e1eb95022b0p-23, 0x1.d5b8b45442bd6p-22, + 0x1.7a046231ecd2ep-22, 0x1.feafe3ef55232p-22, 0x1.839e7bfd78267p-22, + 0x1.45cf49d6fa900p-25, 0x1.be3132b27f380p-27, 0x1.533980bb84f9fp-22, + 0x1.889e2ce3ba390p-26, 0x1.f7778c3ad0cc8p-24, 0x1.46660cec4eba2p-23, + 0x1.5110b4611a626p-23, +}; + +CLC_TABLE_FUNCTION(double, ATAN_JBY256_TBL_TAIL, atan_jby256_tbl_tail); + #endif // cl_khr_fp64 diff --git a/libclc/generic/lib/math/atan2.cl b/libclc/generic/lib/math/atan2.cl index 635f1cdfaca7e..20651c1ec1bf3 100644 --- a/libclc/generic/lib/math/atan2.cl +++ b/libclc/generic/lib/math/atan2.cl @@ -8,223 +8,9 @@ #include <clc/clc.h> #include <clc/clcmacro.h> -#include <clc/math/math.h> -#include <clc/math/tables.h> +#include <clc/math/clc_atan2.h> -_CLC_OVERLOAD _CLC_DEF float atan2(float y, float x) -{ - const float pi = 0x1.921fb6p+1f; - const float piby2 = 0x1.921fb6p+0f; - const float piby4 = 0x1.921fb6p-1f; - const float threepiby4 = 0x1.2d97c8p+1f; +#define FUNCTION atan2 +#define __CLC_BODY <clc/shared/binary_def.inc> - float ax = fabs(x); - float ay = fabs(y); - float v = min(ax, ay); - float u = max(ax, ay); - - // Scale since u could be large, as in "regular" divide - float s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f; - float vbyu = s * MATH_DIVIDE(v, s*u); - - float vbyu2 = vbyu * vbyu; - -#define USE_2_2_APPROXIMATION -#if defined USE_2_2_APPROXIMATION - float p = mad(vbyu2, mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * vbyu2 * vbyu; - float q = mad(vbyu2, mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f); -#else - float p = mad(vbyu2, mad(vbyu2, -0x1.55cd22p-5f, -0x1.26cf76p-2f), -0x1.55554ep-2f) * vbyu2 * vbyu; - float q = mad(vbyu2, mad(vbyu2, mad(vbyu2, 0x1.9f1304p-5f, 0x1.2656fap-1f), 0x1.76b4b8p+0f), 1.0f); -#endif - - // Octant 0 result - float a = mad(p, MATH_RECIP(q), vbyu); - - // Fix up 3 other octants - float at = piby2 - a; - a = ay > ax ? at : a; - at = pi - a; - a = x < 0.0F ? 
at : a; - - // y == 0 => 0 for x >= 0, pi for x < 0 - at = as_int(x) < 0 ? pi : 0.0f; - a = y == 0.0f ? at : a; - - // if (!FINITE_ONLY()) { - // x and y are +- Inf - at = x > 0.0f ? piby4 : threepiby4; - a = ax == INFINITY & ay == INFINITY ? at : a; - - // x or y is NaN - a = isnan(x) | isnan(y) ? as_float(QNANBITPATT_SP32) : a; - // } - - // Fixup sign and return - return copysign(a, y); -} - -_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, atan2, float, float); - -#ifdef cl_khr_fp64 - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_OVERLOAD _CLC_DEF double atan2(double y, double x) -{ - const double pi = 3.1415926535897932e+00; /* 0x400921fb54442d18 */ - const double piby2 = 1.5707963267948966e+00; /* 0x3ff921fb54442d18 */ - const double piby4 = 7.8539816339744831e-01; /* 0x3fe921fb54442d18 */ - const double three_piby4 = 2.3561944901923449e+00; /* 0x4002d97c7f3321d2 */ - const double pi_head = 3.1415926218032836e+00; /* 0x400921fb50000000 */ - const double pi_tail = 3.1786509547056392e-08; /* 0x3e6110b4611a6263 */ - const double piby2_head = 1.5707963267948965e+00; /* 0x3ff921fb54442d18 */ - const double piby2_tail = 6.1232339957367660e-17; /* 0x3c91a62633145c07 */ - - double x2 = x; - int xneg = as_int2(x).hi < 0; - int xexp = (as_int2(x).hi >> 20) & 0x7ff; - - double y2 = y; - int yneg = as_int2(y).hi < 0; - int yexp = (as_int2(y).hi >> 20) & 0x7ff; - - int cond2 = (xexp < 1021) & (yexp < 1021); - int diffexp = yexp - xexp; - - // Scale up both x and y if they are both below 1/4 - double x1 = ldexp(x, 1024); - int xexp1 = (as_int2(x1).hi >> 20) & 0x7ff; - double y1 = ldexp(y, 1024); - int yexp1 = (as_int2(y1).hi >> 20) & 0x7ff; - int diffexp1 = yexp1 - xexp1; - - diffexp = cond2 ? diffexp1 : diffexp; - x = cond2 ? x1 : x; - y = cond2 ? y1 : y; - - // General case: take absolute values of arguments - double u = fabs(x); - double v = fabs(y); - - // Swap u and v if necessary to obtain 0 < v < u. Compute v/u. 
- int swap_vu = u < v; - double uu = u; - u = swap_vu ? v : u; - v = swap_vu ? uu : v; - - double vbyu = v / u; - double q1, q2; - - // General values of v/u. Use a look-up table and series expansion. - - { - double val = vbyu > 0.0625 ? vbyu : 0.063; - int index = convert_int(fma(256.0, val, 0.5)); - double2 tv = USE_TABLE(atan_jby256_tbl, index - 16); - q1 = tv.s0; - q2 = tv.s1; - double c = (double)index * 0x1.0p-8; - - // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1 - // u_exponent could be EMAX so we have to do it in 2 steps - int m = -((int)(as_ulong(u) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64); - //double um = __amdil_ldexp_f64(u, m); - //double vm = __amdil_ldexp_f64(v, m); - double um = ldexp(u, m); - double vm = ldexp(v, m); - - // 26 leading bits of u - double u1 = as_double(as_ulong(um) & 0xfffffffff8000000UL); - double u2 = um - u1; - - double r = MATH_DIVIDE(fma(-c, u2, fma(-c, u1, vm)), fma(c, vm, um)); - - // Polynomial approximation to atan(r) - double s = r * r; - q2 = q2 + fma((s * fma(-s, 0.19999918038989143496, 0.33333333333224095522)), -r, r); - } - - - double q3, q4; - { - q3 = 0.0; - q4 = vbyu; - } - - double q5, q6; - { - double u1 = as_double(as_ulong(u) & 0xffffffff00000000UL); - double u2 = u - u1; - double vu1 = as_double(as_ulong(vbyu) & 0xffffffff00000000UL); - double vu2 = vbyu - vu1; - - q5 = 0.0; - double s = vbyu * vbyu; - q6 = vbyu + fma(-vbyu * s, - fma(-s, - fma(-s, - fma(-s, - fma(-s, 0.90029810285449784439E-01, - 0.11110736283514525407), - 0.14285713561807169030), - 0.19999999999393223405), - 0.33333333333333170500), - MATH_DIVIDE(fma(-u, vu2, fma(-u2, vu1, fma(-u1, vu1, v))), u)); - } - - - q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5; - q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? q4 : q6; - - q1 = vbyu > 0.0625 ? q1 : q3; - q2 = vbyu > 0.0625 ? q2 : q4; - - // Tidy-up according to which quadrant the arguments lie in - double res1, res2, res3, res4; - q1 = swap_vu ? piby2_head - q1 : q1; - q2 = swap_vu ? 
piby2_tail - q2 : q2; - q1 = xneg ? pi_head - q1 : q1; - q2 = xneg ? pi_tail - q2 : q2; - q1 = q1 + q2; - res4 = yneg ? -q1 : q1; - - res1 = yneg ? -three_piby4 : three_piby4; - res2 = yneg ? -piby4 : piby4; - res3 = xneg ? res1 : res2; - - res3 = isinf(x2) & isinf(y2) ? res3 : res4; - res1 = yneg ? -pi : pi; - - // abs(x)/abs(y) > 2^56 and x < 0 - res3 = (diffexp < -56 && xneg) ? res1 : res3; - - res4 = MATH_DIVIDE(y, x); - // x positive and dominant over y by a factor of 2^28 - res3 = diffexp < -28 & xneg == 0 ? res4 : res3; - - // abs(y)/abs(x) > 2^56 - res4 = yneg ? -piby2 : piby2; // atan(y/x) is insignificant compared to piby2 - res3 = diffexp > 56 ? res4 : res3; - - res3 = x2 == 0.0 ? res4 : res3; // Zero x gives +- pi/2 depending on sign of y - res4 = xneg ? res1 : y2; - - res3 = y2 == 0.0 ? res4 : res3; // Zero y gives +-0 for positive x and +-pi for negative x - res3 = isnan(y2) ? y2 : res3; - res3 = isnan(x2) ? x2 : res3; - - return res3; -} - -_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atan2, double, double); - -#endif - -#ifdef cl_khr_fp16 - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_BINARY_BUILTIN_FP16(atan2) - -#endif +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/atan2pi.cl b/libclc/generic/lib/math/atan2pi.cl index 667e4519d8043..316db1d6c9c48 100644 --- a/libclc/generic/lib/math/atan2pi.cl +++ b/libclc/generic/lib/math/atan2pi.cl @@ -8,207 +8,9 @@ #include <clc/clc.h> #include <clc/clcmacro.h> -#include <clc/math/math.h> -#include <clc/math/tables.h> +#include <clc/math/clc_atan2pi.h> -_CLC_OVERLOAD _CLC_DEF float atan2pi(float y, float x) { - const float pi = 0x1.921fb6p+1f; +#define FUNCTION atan2pi +#define __CLC_BODY <clc/shared/binary_def.inc> - float ax = fabs(x); - float ay = fabs(y); - float v = min(ax, ay); - float u = max(ax, ay); - - // Scale since u could be large, as in "regular" divide - float s = u > 0x1.0p+96f ? 
0x1.0p-32f : 1.0f; - float vbyu = s * MATH_DIVIDE(v, s*u); - - float vbyu2 = vbyu * vbyu; - - float p = mad(vbyu2, mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * vbyu2 * vbyu; - float q = mad(vbyu2, mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f); - - // Octant 0 result - float a = MATH_DIVIDE(mad(p, MATH_RECIP(q), vbyu), pi); - - // Fix up 3 other octants - float at = 0.5f - a; - a = ay > ax ? at : a; - at = 1.0f - a; - a = x < 0.0F ? at : a; - - // y == 0 => 0 for x >= 0, pi for x < 0 - at = as_int(x) < 0 ? 1.0f : 0.0f; - a = y == 0.0f ? at : a; - - // if (!FINITE_ONLY()) { - // x and y are +- Inf - at = x > 0.0f ? 0.25f : 0.75f; - a = ax == INFINITY & ay == INFINITY ? at : a; - - // x or y is NaN - a = isnan(x) | isnan(y) ? as_float(QNANBITPATT_SP32) : a; - // } - - // Fixup sign and return - return copysign(a, y); -} - -_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, atan2pi, float, float) - -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_OVERLOAD _CLC_DEF double atan2pi(double y, double x) { - const double pi = 3.1415926535897932e+00; /* 0x400921fb54442d18 */ - const double pi_head = 3.1415926218032836e+00; /* 0x400921fb50000000 */ - const double pi_tail = 3.1786509547056392e-08; /* 0x3e6110b4611a6263 */ - const double piby2_head = 1.5707963267948965e+00; /* 0x3ff921fb54442d18 */ - const double piby2_tail = 6.1232339957367660e-17; /* 0x3c91a62633145c07 */ - - double x2 = x; - int xneg = as_int2(x).hi < 0; - int xexp = (as_int2(x).hi >> 20) & 0x7ff; - - double y2 = y; - int yneg = as_int2(y).hi < 0; - int yexp = (as_int2(y).hi >> 20) & 0x7ff; - - int cond2 = (xexp < 1021) & (yexp < 1021); - int diffexp = yexp - xexp; - - // Scale up both x and y if they are both below 1/4 - double x1 = ldexp(x, 1024); - int xexp1 = (as_int2(x1).hi >> 20) & 0x7ff; - double y1 = ldexp(y, 1024); - int yexp1 = (as_int2(y1).hi >> 20) & 0x7ff; - int diffexp1 = yexp1 - xexp1; - - diffexp = cond2 ? diffexp1 : diffexp; - x = cond2 ? 
x1 : x; - y = cond2 ? y1 : y; - - // General case: take absolute values of arguments - double u = fabs(x); - double v = fabs(y); - - // Swap u and v if necessary to obtain 0 < v < u. Compute v/u. - int swap_vu = u < v; - double uu = u; - u = swap_vu ? v : u; - v = swap_vu ? uu : v; - - double vbyu = v / u; - double q1, q2; - - // General values of v/u. Use a look-up table and series expansion. - - { - double val = vbyu > 0.0625 ? vbyu : 0.063; - int index = convert_int(fma(256.0, val, 0.5)); - double2 tv = USE_TABLE(atan_jby256_tbl, (index - 16)); - q1 = tv.s0; - q2 = tv.s1; - double c = (double)index * 0x1.0p-8; - - // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1 - // u_exponent could be EMAX so we have to do it in 2 steps - int m = -((int)(as_ulong(u) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64); - double um = ldexp(u, m); - double vm = ldexp(v, m); - - // 26 leading bits of u - double u1 = as_double(as_ulong(um) & 0xfffffffff8000000UL); - double u2 = um - u1; - - double r = MATH_DIVIDE(fma(-c, u2, fma(-c, u1, vm)), fma(c, vm, um)); - - // Polynomial approximation to atan(r) - double s = r * r; - q2 = q2 + fma((s * fma(-s, 0.19999918038989143496, 0.33333333333224095522)), -r, r); - } - - - double q3, q4; - { - q3 = 0.0; - q4 = vbyu; - } - - double q5, q6; - { - double u1 = as_double(as_ulong(u) & 0xffffffff00000000UL); - double u2 = u - u1; - double vu1 = as_double(as_ulong(vbyu) & 0xffffffff00000000UL); - double vu2 = vbyu - vu1; - - q5 = 0.0; - double s = vbyu * vbyu; - q6 = vbyu + fma(-vbyu * s, - fma(-s, - fma(-s, - fma(-s, - fma(-s, 0.90029810285449784439E-01, - 0.11110736283514525407), - 0.14285713561807169030), - 0.19999999999393223405), - 0.33333333333333170500), - MATH_DIVIDE(fma(-u, vu2, fma(-u2, vu1, fma(-u1, vu1, v))), u)); - } - - - q3 = vbyu < 0x1.d12ed0af1a27fp-27 ? q3 : q5; - q4 = vbyu < 0x1.d12ed0af1a27fp-27 ? q4 : q6; - - q1 = vbyu > 0.0625 ? q1 : q3; - q2 = vbyu > 0.0625 ? 
q2 : q4; - - // Tidy-up according to which quadrant the arguments lie in - double res1, res2, res3, res4; - q1 = swap_vu ? piby2_head - q1 : q1; - q2 = swap_vu ? piby2_tail - q2 : q2; - q1 = xneg ? pi_head - q1 : q1; - q2 = xneg ? pi_tail - q2 : q2; - q1 = MATH_DIVIDE(q1 + q2, pi); - res4 = yneg ? -q1 : q1; - - res1 = yneg ? -0.75 : 0.75; - res2 = yneg ? -0.25 : 0.25; - res3 = xneg ? res1 : res2; - - res3 = isinf(y2) & isinf(x2) ? res3 : res4; - res1 = yneg ? -1.0 : 1.0; - - // abs(x)/abs(y) > 2^56 and x < 0 - res3 = (diffexp < -56 && xneg) ? res1 : res3; - - res4 = MATH_DIVIDE(MATH_DIVIDE(y, x), pi); - // x positive and dominant over y by a factor of 2^28 - res3 = diffexp < -28 & xneg == 0 ? res4 : res3; - - // abs(y)/abs(x) > 2^56 - res4 = yneg ? -0.5 : 0.5; // atan(y/x) is insignificant compared to piby2 - res3 = diffexp > 56 ? res4 : res3; - - res3 = x2 == 0.0 ? res4 : res3; // Zero x gives +- pi/2 depending on sign of y - res4 = xneg ? res1 : y2; - - res3 = y2 == 0.0 ? res4 : res3; // Zero y gives +-0 for positive x and +-pi for negative x - res3 = isnan(y2) ? y2 : res3; - res3 = isnan(x2) ? x2 : res3; - - return res3; -} - - -_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, atan2pi, double, double) - -#endif - -#ifdef cl_khr_fp16 - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_BINARY_BUILTIN_FP16(atan2pi) - -#endif +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/tables.cl b/libclc/generic/lib/math/tables.cl index d7d3ba4aafec9..c03b4d1a3c18a 100644 --- a/libclc/generic/lib/math/tables.cl +++ b/libclc/generic/lib/math/tables.cl @@ -745,258 +745,6 @@ TABLE_FUNCTION(float2, EXP_TBL_EP, exp_tbl_ep); #ifdef cl_khr_fp64 - - -// Arrays atan_jby256_lead and atan_jby256_tail contain -// leading and trailing parts respectively of precomputed -// values of atan(j/256), for j = 16, 17, ..., 256. -// atan_jby256_lead contains the first 21 bits of precision, -// and atan_jby256_tail contains a further 53 bits precision. 
- -DECLARE_TABLE(double2, ATAN_JBY256_TBL, 241) = { - (double2)(0x1.ff55b00000000p-5, 0x1.6e59fbd38db2cp-26), - (double2)(0x1.0f99e00000000p-4, 0x1.4e3aa54dedf96p-25), - (double2)(0x1.1f86d00000000p-4, 0x1.7e105ab1bda88p-25), - (double2)(0x1.2f71900000000p-4, 0x1.8c5254d013fd0p-27), - (double2)(0x1.3f59f00000000p-4, 0x1.cf8ab3ad62670p-29), - (double2)(0x1.4f3fd00000000p-4, 0x1.9dca4bec80468p-26), - (double2)(0x1.5f23200000000p-4, 0x1.3f4b5ec98a8dap-26), - (double2)(0x1.6f03b00000000p-4, 0x1.b9d49619d81fep-25), - (double2)(0x1.7ee1800000000p-4, 0x1.3017887460934p-27), - (double2)(0x1.8ebc500000000p-4, 0x1.11e3eca0b9944p-26), - (double2)(0x1.9e94100000000p-4, 0x1.4f3f73c5a332ep-26), - (double2)(0x1.ae68a00000000p-4, 0x1.c71c8ae0e00a6p-26), - (double2)(0x1.be39e00000000p-4, 0x1.7cde0f86fbdc7p-25), - (double2)(0x1.ce07c00000000p-4, 0x1.70f328c889c72p-26), - (double2)(0x1.ddd2100000000p-4, 0x1.c07ae9b994efep-26), - (double2)(0x1.ed98c00000000p-4, 0x1.0c8021d7b1698p-27), - (double2)(0x1.fd5ba00000000p-4, 0x1.35585edb8cb22p-25), - (double2)(0x1.068d500000000p-3, 0x1.0842567b30e96p-24), - (double2)(0x1.0e6ad00000000p-3, 0x1.99e811031472ep-24), - (double2)(0x1.1646500000000p-3, 0x1.041821416bceep-25), - (double2)(0x1.1e1fa00000000p-3, 0x1.f6086e4dc96f4p-24), - (double2)(0x1.25f6e00000000p-3, 0x1.71a535c5f1b58p-27), - (double2)(0x1.2dcbd00000000p-3, 0x1.65f743fe63ca1p-24), - (double2)(0x1.359e800000000p-3, 0x1.dbd733472d014p-24), - (double2)(0x1.3d6ee00000000p-3, 0x1.d18cc4d8b0d1dp-24), - (double2)(0x1.453ce00000000p-3, 0x1.8c12553c8fb29p-24), - (double2)(0x1.4d08700000000p-3, 0x1.53b49e2e8f991p-24), - (double2)(0x1.54d1800000000p-3, 0x1.7422ae148c141p-24), - (double2)(0x1.5c98100000000p-3, 0x1.e3ec269df56a8p-27), - (double2)(0x1.645bf00000000p-3, 0x1.ff6754e7e0ac9p-24), - (double2)(0x1.6c1d400000000p-3, 0x1.131267b1b5aadp-24), - (double2)(0x1.73dbd00000000p-3, 0x1.d14fa403a94bcp-24), - (double2)(0x1.7b97b00000000p-3, 0x1.2f396c089a3d8p-25), - (double2)(0x1.8350b00000000p-3, 
0x1.c731d78fa95bbp-24), - (double2)(0x1.8b06e00000000p-3, 0x1.c50f385177399p-24), - (double2)(0x1.92ba300000000p-3, 0x1.f41409c6f2c20p-25), - (double2)(0x1.9a6a800000000p-3, 0x1.d2d90c4c39ec0p-24), - (double2)(0x1.a217e00000000p-3, 0x1.80420696f2106p-25), - (double2)(0x1.a9c2300000000p-3, 0x1.b40327943a2e8p-27), - (double2)(0x1.b169600000000p-3, 0x1.5d35e02f3d2a2p-25), - (double2)(0x1.b90d700000000p-3, 0x1.4a498288117b0p-25), - (double2)(0x1.c0ae500000000p-3, 0x1.35da119afb324p-25), - (double2)(0x1.c84bf00000000p-3, 0x1.14e85cdb9a908p-24), - (double2)(0x1.cfe6500000000p-3, 0x1.38754e5547b9ap-25), - (double2)(0x1.d77d500000000p-3, 0x1.be40ae6ce3246p-24), - (double2)(0x1.df11000000000p-3, 0x1.0c993b3bea7e7p-24), - (double2)(0x1.e6a1400000000p-3, 0x1.1d2dd89ac3359p-24), - (double2)(0x1.ee2e100000000p-3, 0x1.1476603332c46p-25), - (double2)(0x1.f5b7500000000p-3, 0x1.f25901bac55b7p-24), - (double2)(0x1.fd3d100000000p-3, 0x1.f881b7c826e28p-24), - (double2)(0x1.025fa00000000p-2, 0x1.441996d698d20p-24), - (double2)(0x1.061ee00000000p-2, 0x1.407ac521ea089p-23), - (double2)(0x1.09dc500000000p-2, 0x1.2fb0c6c4b1723p-23), - (double2)(0x1.0d97e00000000p-2, 0x1.ca135966a3e18p-23), - (double2)(0x1.1151a00000000p-2, 0x1.b1218e4d646e4p-25), - (double2)(0x1.1509700000000p-2, 0x1.d4e72a350d288p-25), - (double2)(0x1.18bf500000000p-2, 0x1.4617e2f04c329p-23), - (double2)(0x1.1c73500000000p-2, 0x1.096ec41e82650p-25), - (double2)(0x1.2025500000000p-2, 0x1.9f91f25773e6ep-24), - (double2)(0x1.23d5600000000p-2, 0x1.59c0820f1d674p-25), - (double2)(0x1.2783700000000p-2, 0x1.02bf7a2df1064p-25), - (double2)(0x1.2b2f700000000p-2, 0x1.fb36bfc40508fp-23), - (double2)(0x1.2ed9800000000p-2, 0x1.ea08f3f8dc892p-24), - (double2)(0x1.3281800000000p-2, 0x1.3ed6254656a0ep-24), - (double2)(0x1.3627700000000p-2, 0x1.b83f5e5e69c58p-25), - (double2)(0x1.39cb400000000p-2, 0x1.d6ec2af768592p-23), - (double2)(0x1.3d6d100000000p-2, 0x1.493889a226f94p-25), - (double2)(0x1.410cb00000000p-2, 0x1.5ad8fa65279bap-23), - 
(double2)(0x1.44aa400000000p-2, 0x1.b615784d45434p-25), - (double2)(0x1.4845a00000000p-2, 0x1.09a184368f145p-23), - (double2)(0x1.4bdee00000000p-2, 0x1.61a2439b0d91cp-24), - (double2)(0x1.4f75f00000000p-2, 0x1.ce1a65e39a978p-24), - (double2)(0x1.530ad00000000p-2, 0x1.32a39a93b6a66p-23), - (double2)(0x1.569d800000000p-2, 0x1.1c3699af804e7p-23), - (double2)(0x1.5a2e000000000p-2, 0x1.75e0f4e44ede8p-26), - (double2)(0x1.5dbc300000000p-2, 0x1.f77ced1a7a83bp-23), - (double2)(0x1.6148400000000p-2, 0x1.84e7f0cb1b500p-29), - (double2)(0x1.64d1f00000000p-2, 0x1.ec6b838b02dfep-23), - (double2)(0x1.6859700000000p-2, 0x1.3ebf4dfbeda87p-23), - (double2)(0x1.6bdea00000000p-2, 0x1.9397aed9cb475p-23), - (double2)(0x1.6f61900000000p-2, 0x1.07937bc239c54p-24), - (double2)(0x1.72e2200000000p-2, 0x1.aa754553131b6p-23), - (double2)(0x1.7660700000000p-2, 0x1.4a05d407c45dcp-24), - (double2)(0x1.79dc600000000p-2, 0x1.132231a206dd0p-23), - (double2)(0x1.7d56000000000p-2, 0x1.2d8ecfdd69c88p-24), - (double2)(0x1.80cd400000000p-2, 0x1.a852c74218606p-24), - (double2)(0x1.8442200000000p-2, 0x1.71bf2baeebb50p-23), - (double2)(0x1.87b4b00000000p-2, 0x1.83d7db7491820p-27), - (double2)(0x1.8b24d00000000p-2, 0x1.ca50d92b6da14p-25), - (double2)(0x1.8e92900000000p-2, 0x1.6f5cde8530298p-26), - (double2)(0x1.91fde00000000p-2, 0x1.f343198910740p-24), - (double2)(0x1.9566d00000000p-2, 0x1.0e8d241ccd80ap-24), - (double2)(0x1.98cd500000000p-2, 0x1.1535ac619e6c8p-24), - (double2)(0x1.9c31600000000p-2, 0x1.7316041c36cd2p-24), - (double2)(0x1.9f93000000000p-2, 0x1.985a000637d8ep-24), - (double2)(0x1.a2f2300000000p-2, 0x1.f2f29858c0a68p-25), - (double2)(0x1.a64ee00000000p-2, 0x1.879847f96d909p-23), - (double2)(0x1.a9a9200000000p-2, 0x1.ab3d319e12e42p-23), - (double2)(0x1.ad00f00000000p-2, 0x1.5088162dfc4c2p-24), - (double2)(0x1.b056400000000p-2, 0x1.05749a1cd9d8cp-25), - (double2)(0x1.b3a9100000000p-2, 0x1.da65c6c6b8618p-26), - (double2)(0x1.b6f9600000000p-2, 0x1.739bf7df1ad64p-25), - 
(double2)(0x1.ba47300000000p-2, 0x1.bc31252aa3340p-25), - (double2)(0x1.bd92800000000p-2, 0x1.e528191ad3aa8p-26), - (double2)(0x1.c0db400000000p-2, 0x1.929d93df19f18p-23), - (double2)(0x1.c421900000000p-2, 0x1.ff11eb693a080p-26), - (double2)(0x1.c765500000000p-2, 0x1.55ae3f145a3a0p-27), - (double2)(0x1.caa6800000000p-2, 0x1.cbcd8c6c0ca82p-24), - (double2)(0x1.cde5300000000p-2, 0x1.0cb04d425d304p-24), - (double2)(0x1.d121500000000p-2, 0x1.9adfcab5be678p-24), - (double2)(0x1.d45ae00000000p-2, 0x1.93d90c5662508p-23), - (double2)(0x1.d791f00000000p-2, 0x1.68489bd35ff40p-24), - (double2)(0x1.dac6700000000p-2, 0x1.586ed3da2b7e0p-28), - (double2)(0x1.ddf8500000000p-2, 0x1.7604d2e850eeep-23), - (double2)(0x1.e127b00000000p-2, 0x1.ac1d12bfb53d8p-24), - (double2)(0x1.e454800000000p-2, 0x1.9b3d468274740p-28), - (double2)(0x1.e77eb00000000p-2, 0x1.fc5d68d10e53cp-24), - (double2)(0x1.eaa6500000000p-2, 0x1.8f9e51884becbp-23), - (double2)(0x1.edcb600000000p-2, 0x1.a87f0869c06d1p-23), - (double2)(0x1.f0ede00000000p-2, 0x1.31e7279f685fap-23), - (double2)(0x1.f40dd00000000p-2, 0x1.6a8282f9719b0p-27), - (double2)(0x1.f72b200000000p-2, 0x1.0d2724a8a44e0p-25), - (double2)(0x1.fa45d00000000p-2, 0x1.a60524b11ad4ep-23), - (double2)(0x1.fd5e000000000p-2, 0x1.75fdf832750f0p-26), - (double2)(0x1.0039c00000000p-1, 0x1.cf06902e4cd36p-23), - (double2)(0x1.01c3400000000p-1, 0x1.e82422d4f6d10p-25), - (double2)(0x1.034b700000000p-1, 0x1.24a091063e6c0p-26), - (double2)(0x1.04d2500000000p-1, 0x1.8a1a172dc6f38p-24), - (double2)(0x1.0657e00000000p-1, 0x1.29b6619f8a92dp-22), - (double2)(0x1.07dc300000000p-1, 0x1.9274d9c1b70c8p-24), - (double2)(0x1.095f300000000p-1, 0x1.0c34b1fbb7930p-26), - (double2)(0x1.0ae0e00000000p-1, 0x1.639866c20eb50p-25), - (double2)(0x1.0c61400000000p-1, 0x1.6d6d0f6832e9ep-23), - (double2)(0x1.0de0500000000p-1, 0x1.af54def99f25ep-22), - (double2)(0x1.0f5e200000000p-1, 0x1.16cfc52a00262p-22), - (double2)(0x1.10daa00000000p-1, 0x1.dcc1e83569c32p-23), - 
(double2)(0x1.1255d00000000p-1, 0x1.37f7a551ed425p-22), - (double2)(0x1.13cfb00000000p-1, 0x1.f6360adc98887p-22), - (double2)(0x1.1548500000000p-1, 0x1.2c6ec8d35a2c1p-22), - (double2)(0x1.16bfa00000000p-1, 0x1.bd44df84cb036p-23), - (double2)(0x1.1835a00000000p-1, 0x1.117cf826e310ep-22), - (double2)(0x1.19aa500000000p-1, 0x1.ca533f332cfc9p-22), - (double2)(0x1.1b1dc00000000p-1, 0x1.0f208509dbc2ep-22), - (double2)(0x1.1c8fe00000000p-1, 0x1.cd07d93c945dep-23), - (double2)(0x1.1e00b00000000p-1, 0x1.57bdfd67e6d72p-22), - (double2)(0x1.1f70400000000p-1, 0x1.aab89c516c658p-24), - (double2)(0x1.20de800000000p-1, 0x1.3e823b1a1b8a0p-25), - (double2)(0x1.224b700000000p-1, 0x1.307464a9d6d3cp-23), - (double2)(0x1.23b7100000000p-1, 0x1.c5993cd438843p-22), - (double2)(0x1.2521700000000p-1, 0x1.ba2fca02ab554p-22), - (double2)(0x1.268a900000000p-1, 0x1.01a5b6983a268p-23), - (double2)(0x1.27f2600000000p-1, 0x1.273d1b350efc8p-25), - (double2)(0x1.2958e00000000p-1, 0x1.64c238c37b0c6p-23), - (double2)(0x1.2abe200000000p-1, 0x1.aded07370a300p-25), - (double2)(0x1.2c22100000000p-1, 0x1.78091197eb47ep-23), - (double2)(0x1.2d84c00000000p-1, 0x1.4b0f245e0dabcp-24), - (double2)(0x1.2ee6200000000p-1, 0x1.080d9794e2eafp-22), - (double2)(0x1.3046400000000p-1, 0x1.d4ec242b60c76p-23), - (double2)(0x1.31a5200000000p-1, 0x1.221d2f940caa0p-27), - (double2)(0x1.3302b00000000p-1, 0x1.cdbc42b2bba5cp-24), - (double2)(0x1.345f000000000p-1, 0x1.cce37bb440840p-25), - (double2)(0x1.35ba000000000p-1, 0x1.6c1d999cf1dd0p-22), - (double2)(0x1.3713d00000000p-1, 0x1.bed8a07eb0870p-26), - (double2)(0x1.386c500000000p-1, 0x1.69ed88f490e3cp-24), - (double2)(0x1.39c3900000000p-1, 0x1.cd41719b73ef0p-25), - (double2)(0x1.3b19800000000p-1, 0x1.cbc4ac95b41b7p-22), - (double2)(0x1.3c6e400000000p-1, 0x1.238f1b890f5d7p-22), - (double2)(0x1.3dc1c00000000p-1, 0x1.50c4282259cc4p-24), - (double2)(0x1.3f13f00000000p-1, 0x1.713d2de87b3e2p-22), - (double2)(0x1.4064f00000000p-1, 0x1.1d5a7d2255276p-23), - 
(double2)(0x1.41b4a00000000p-1, 0x1.c0dfd48227ac1p-22), - (double2)(0x1.4303200000000p-1, 0x1.1c964dab76753p-22), - (double2)(0x1.4450600000000p-1, 0x1.6de56d5704496p-23), - (double2)(0x1.459c600000000p-1, 0x1.4aeb71fd19968p-23), - (double2)(0x1.46e7200000000p-1, 0x1.fbf91c57b1918p-23), - (double2)(0x1.4830a00000000p-1, 0x1.d6bef7fbe5d9ap-22), - (double2)(0x1.4978f00000000p-1, 0x1.464d3dc249066p-22), - (double2)(0x1.4ac0000000000p-1, 0x1.638e2ec4d9073p-22), - (double2)(0x1.4c05e00000000p-1, 0x1.16f4a7247ea7cp-24), - (double2)(0x1.4d4a800000000p-1, 0x1.1a0a740f1d440p-28), - (double2)(0x1.4e8de00000000p-1, 0x1.6edbb0114a33cp-23), - (double2)(0x1.4fd0100000000p-1, 0x1.dbee8bf1d513cp-24), - (double2)(0x1.5111000000000p-1, 0x1.5b8bdb0248f73p-22), - (double2)(0x1.5250c00000000p-1, 0x1.7de3d3f5eac64p-22), - (double2)(0x1.538f500000000p-1, 0x1.ee24187ae448ap-23), - (double2)(0x1.54cca00000000p-1, 0x1.e06c591ec5192p-22), - (double2)(0x1.5608d00000000p-1, 0x1.4e3861a332738p-24), - (double2)(0x1.5743c00000000p-1, 0x1.a9599dcc2bfe4p-24), - (double2)(0x1.587d800000000p-1, 0x1.f732fbad43468p-25), - (double2)(0x1.59b6000000000p-1, 0x1.eb9f573b727d9p-22), - (double2)(0x1.5aed600000000p-1, 0x1.8b212a2eb9897p-22), - (double2)(0x1.5c23900000000p-1, 0x1.384884c167215p-22), - (double2)(0x1.5d58900000000p-1, 0x1.0e2d363020051p-22), - (double2)(0x1.5e8c600000000p-1, 0x1.2820879fbd022p-22), - (double2)(0x1.5fbf000000000p-1, 0x1.a1ab9893e4b30p-22), - (double2)(0x1.60f0800000000p-1, 0x1.2d1b817a24478p-23), - (double2)(0x1.6220d00000000p-1, 0x1.15d7b8ded4878p-25), - (double2)(0x1.634ff00000000p-1, 0x1.8968f9db3a5e4p-24), - (double2)(0x1.647de00000000p-1, 0x1.71c4171fe135fp-22), - (double2)(0x1.65aab00000000p-1, 0x1.6d80f605d0d8cp-22), - (double2)(0x1.66d6600000000p-1, 0x1.c91f043691590p-24), - (double2)(0x1.6800e00000000p-1, 0x1.39f8a15fce2b2p-23), - (double2)(0x1.692a400000000p-1, 0x1.55beda9d94b80p-27), - (double2)(0x1.6a52700000000p-1, 0x1.b12c15d60949ap-23), - 
(double2)(0x1.6b79800000000p-1, 0x1.24167b312bfe3p-22), - (double2)(0x1.6c9f700000000p-1, 0x1.0ab8633070277p-22), - (double2)(0x1.6dc4400000000p-1, 0x1.54554ebbc80eep-23), - (double2)(0x1.6ee7f00000000p-1, 0x1.0204aef5a4bb8p-25), - (double2)(0x1.700a700000000p-1, 0x1.8af08c679cf2cp-22), - (double2)(0x1.712be00000000p-1, 0x1.0852a330ae6c8p-22), - (double2)(0x1.724c300000000p-1, 0x1.6d3eb9ec32916p-23), - (double2)(0x1.736b600000000p-1, 0x1.685cb7fcbbafep-23), - (double2)(0x1.7489700000000p-1, 0x1.1f751c1e0bd95p-22), - (double2)(0x1.75a6700000000p-1, 0x1.705b1b0f72560p-26), - (double2)(0x1.76c2400000000p-1, 0x1.b98d8d808ca92p-22), - (double2)(0x1.77dd100000000p-1, 0x1.2ea22c75cc980p-25), - (double2)(0x1.78f6b00000000p-1, 0x1.7aba62bca0350p-22), - (double2)(0x1.7a0f400000000p-1, 0x1.d73833442278cp-22), - (double2)(0x1.7b26c00000000p-1, 0x1.5a5ca1fb18bf9p-22), - (double2)(0x1.7c3d300000000p-1, 0x1.1a6092b6ecf28p-25), - (double2)(0x1.7d52800000000p-1, 0x1.44fd049aac104p-24), - (double2)(0x1.7e66c00000000p-1, 0x1.c114fd8df5180p-29), - (double2)(0x1.7f79e00000000p-1, 0x1.5972f130feae5p-22), - (double2)(0x1.808c000000000p-1, 0x1.ca034a55fe198p-24), - (double2)(0x1.819d000000000p-1, 0x1.6e2b149990227p-22), - (double2)(0x1.82ad000000000p-1, 0x1.b00000294592cp-24), - (double2)(0x1.83bbe00000000p-1, 0x1.8b9bdc442620ep-22), - (double2)(0x1.84c9c00000000p-1, 0x1.d94fdfabf3e4ep-23), - (double2)(0x1.85d6900000000p-1, 0x1.5db30b145ad9ap-23), - (double2)(0x1.86e2500000000p-1, 0x1.e3e1eb95022b0p-23), - (double2)(0x1.87ed000000000p-1, 0x1.d5b8b45442bd6p-22), - (double2)(0x1.88f6b00000000p-1, 0x1.7a046231ecd2ep-22), - (double2)(0x1.89ff500000000p-1, 0x1.feafe3ef55232p-22), - (double2)(0x1.8b06f00000000p-1, 0x1.839e7bfd78267p-22), - (double2)(0x1.8c0d900000000p-1, 0x1.45cf49d6fa900p-25), - (double2)(0x1.8d13200000000p-1, 0x1.be3132b27f380p-27), - (double2)(0x1.8e17a00000000p-1, 0x1.533980bb84f9fp-22), - (double2)(0x1.8f1b300000000p-1, 0x1.889e2ce3ba390p-26), - 
(double2)(0x1.901db00000000p-1, 0x1.f7778c3ad0cc8p-24), - (double2)(0x1.911f300000000p-1, 0x1.46660cec4eba2p-23), - (double2)(0x1.921fb00000000p-1, 0x1.5110b4611a626p-23), -}; - DECLARE_TABLE(double2, TWO_TO_JBY64_EP, 64) = { (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0), (double2)(0x1.02c9a30000000p+0, 0x1.cef00c1dcdef9p-25), @@ -2197,7 +1945,6 @@ DECLARE_TABLE(double2, LOG_F_INV_TBL, 258) = { (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0), }; -TABLE_FUNCTION(double2, ATAN_JBY256_TBL, atan_jby256_tbl); TABLE_FUNCTION(double2, TWO_TO_JBY64_EP, two_to_jby64_ep_tbl); TABLE_FUNCTION(double2, SINH_TBL, sinh_tbl); TABLE_FUNCTION(double2, COSH_TBL, cosh_tbl); >From 50d1baf2e2d85f7104ce1ed954431a86f380ac14 Mon Sep 17 00:00:00 2001 From: Fraser Cormack <fra...@codeplay.com> Date: Thu, 27 Mar 2025 10:30:16 +0000 Subject: [PATCH 2/2] fix formatting --- libclc/clc/lib/generic/math/clc_atan2.inc | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/libclc/clc/lib/generic/math/clc_atan2.inc b/libclc/clc/lib/generic/math/clc_atan2.inc index 0917f3adf2d90..61ffeebbc5d11 100644 --- a/libclc/clc/lib/generic/math/clc_atan2.inc +++ b/libclc/clc/lib/generic/math/clc_atan2.inc @@ -8,7 +8,8 @@ #if __CLC_FPSIZE == 32 -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE y, __CLC_GENTYPE x) { +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE y, + __CLC_GENTYPE x) { const __CLC_GENTYPE pi = 0x1.921fb6p+1f; const __CLC_GENTYPE piby2 = 0x1.921fb6p+0f; const __CLC_GENTYPE piby4 = 0x1.921fb6p-1f; @@ -71,7 +72,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE y, __CLC_GENTYPE #elif __CLC_FPSIZE == 64 -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE y, __CLC_GENTYPE x) { +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE y, + __CLC_GENTYPE x) { const __CLC_GENTYPE pi = 3.1415926535897932e+00; /* 0x400921fb54442d18 */ const __CLC_GENTYPE piby2 = 1.5707963267948966e+00; /* 
0x3ff921fb54442d18 */ const __CLC_GENTYPE piby4 = 7.8539816339744831e-01; /* 0x3fe921fb54442d18 */ @@ -142,7 +144,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE y, __CLC_GENTYPE __CLC_GENTYPE vm = __clc_ldexp(v, m); // 26 leading bits of u - __CLC_GENTYPE u1 = __CLC_AS_GENTYPE(__CLC_AS_ULONGN(um) & 0xfffffffff8000000UL); + __CLC_GENTYPE u1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(um) & 0xfffffffff8000000UL); __CLC_GENTYPE u2 = um - u1; __CLC_GENTYPE r = MATH_DIVIDE(__clc_fma(-c, u2, __clc_fma(-c, u1, vm)), @@ -163,9 +166,11 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE y, __CLC_GENTYPE __CLC_GENTYPE q5, q6; { - __CLC_GENTYPE u1 = __CLC_AS_GENTYPE(__CLC_AS_ULONGN(u) & 0xffffffff00000000UL); + __CLC_GENTYPE u1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(u) & 0xffffffff00000000UL); __CLC_GENTYPE u2 = u - u1; - __CLC_GENTYPE vu1 = __CLC_AS_GENTYPE(__CLC_AS_ULONGN(vbyu) & 0xffffffff00000000UL); + __CLC_GENTYPE vu1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(vbyu) & 0xffffffff00000000UL); __CLC_GENTYPE vu2 = vbyu - vu1; q5 = 0.0; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits