[libclc] [libclc] Move fp32 sincos helpers to CLC library (PR #132753)

Fraser Cormack via cfe-commits Mon, 24 Mar 2025 08:00:48 -0700

https://github.com/frasercrmck created 
https://github.com/llvm/llvm-project/pull/132753


This commit moves most of the sincos helper functions to the CLC library. It 
simultaneously vectorizes them with the aim to increase performance for vector 
types by avoiding scalarization.

Some helpers for double types remain as they use various features not yet 
ready, like 'fract' which in turn relies on 'fmin'; neither of these are in the 
CLC library. They also use table lookups and type punning which don't translate 
well to vector versions.

As a proof of concept, float and half versions of the sin and cos builtins are 
now vectorized and use the CLC helpers to do so. They remain in the OpenCL 
layer but will be simpler to move to the CLC library when the double versions 
are ready.

>From c0a37ec1a7fe39c291be036aef917b75f892e48d Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fra...@codeplay.com>
Date: Thu, 20 Mar 2025 16:41:48 +0000
Subject: [PATCH 1/2] [libclc Move fp32 sincos helpers to CLC library

This commit moves most of the sincos helper functions to the CLC
library. It simultaneously vectorizes them with the aim to increase
performance for vector types by avoiding scalarization.

Some helpers for double types remain as they use various features not
yet ready, like 'fract' which in turn relies on 'fmin'; neither of these
are in the CLC library. They also use table lookups and type punning
which don't translate well to vector versions.

As a proof of concept, float and half versions of the sin and cos
builtins are now vectorized and use the CLC helpers to do so. They
remain in the OpenCL layer but will be simpler to move to the CLC
library when the double versions are ready.
---
 .../clc/include/clc/math/clc_sincos_helpers.h |  20 ++
 .../include/clc/math/clc_sincos_helpers.inc   |  16 +
 libclc/clc/include/clc/math/gentype.inc       |   8 +
 libclc/clc/lib/generic/SOURCES                |   1 +
 .../lib/generic/math/clc_sincos_helpers.cl    |  32 ++
 .../lib/generic/math/clc_sincos_helpers.inc   | 304 ++++++++++++++++++
 libclc/generic/lib/math/clc_tan.cl            |   1 +
 libclc/generic/lib/math/cos.cl                |  34 +-
 libclc/generic/lib/math/cos.inc               |  37 +++
 libclc/generic/lib/math/sin.cl                |  37 +--
 libclc/generic/lib/math/sin.inc               |  40 +++
 libclc/generic/lib/math/sincos_helpers.cl     | 288 -----------------
 libclc/generic/lib/math/sincos_helpers.h      |   3 -
 13 files changed, 469 insertions(+), 352 deletions(-)
 create mode 100644 libclc/clc/include/clc/math/clc_sincos_helpers.h
 create mode 100644 libclc/clc/include/clc/math/clc_sincos_helpers.inc
 create mode 100644 libclc/clc/lib/generic/math/clc_sincos_helpers.cl
 create mode 100644 libclc/clc/lib/generic/math/clc_sincos_helpers.inc
 create mode 100644 libclc/generic/lib/math/cos.inc
 create mode 100644 libclc/generic/lib/math/sin.inc

diff --git a/libclc/clc/include/clc/math/clc_sincos_helpers.h 
b/libclc/clc/include/clc/math/clc_sincos_helpers.h
new file mode 100644
index 0000000000000..25ed60b3520e0
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_sincos_helpers.h
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_SINCOS_HELPERS_H__
+#define __CLC_MATH_CLC_SINCOS_HELPERS_H__
+
+#define __FLOAT_ONLY
+#define __CLC_BODY <clc/math/clc_sincos_helpers.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __FLOAT_ONLY
+
+#endif // __CLC_MATH_CLC_SINCOS_HELPERS_H__
diff --git a/libclc/clc/include/clc/math/clc_sincos_helpers.inc 
b/libclc/clc/include/clc/math/clc_sincos_helpers.inc
new file mode 100644
index 0000000000000..c891dd91dfd2b
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_sincos_helpers.inc
@@ -0,0 +1,16 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+_CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_sinf_piby4(__CLC_FLOATN x,
+                                                      __CLC_FLOATN y);
+_CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_cosf_piby4(__CLC_FLOATN x,
+                                                      __CLC_FLOATN y);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_INTN __clc_argReductionS(private __CLC_FLOATN *r,
+                                                       private __CLC_FLOATN 
*rr,
+                                                       __CLC_FLOATN x);
diff --git a/libclc/clc/include/clc/math/gentype.inc 
b/libclc/clc/include/clc/math/gentype.inc
index 01eaa5b37ac08..9692a4d37011b 100644
--- a/libclc/clc/include/clc/math/gentype.inc
+++ b/libclc/clc/include/clc/math/gentype.inc
@@ -29,6 +29,10 @@
 #define __CLC_UINTN __CLC_XCONCAT(uint, __CLC_VECSIZE)
 #define __CLC_ULONGN __CLC_XCONCAT(ulong, __CLC_VECSIZE)
 
+#define __CLC_AS_HALFN __CLC_XCONCAT(__clc_as_, __CLC_HALFN)
+#define __CLC_AS_FLOATN __CLC_XCONCAT(__clc_as_, __CLC_FLOATN)
+#define __CLC_AS_DOUBLEN __CLC_XCONCAT(__clc_as_, __CLC_DOUBLEN)
+
 #define __CLC_AS_CHARN __CLC_XCONCAT(__clc_as_, __CLC_CHARN)
 #define __CLC_AS_SHORTN __CLC_XCONCAT(__clc_as_, __CLC_SHORTN)
 #define __CLC_AS_INTN __CLC_XCONCAT(__clc_as_, __CLC_INTN)
@@ -278,6 +282,10 @@
 #undef __CLC_AS_INTN
 #undef __CLC_AS_LONGN
 
+#undef __CLC_AS_HALFN
+#undef __CLC_AS_FLOATN
+#undef __CLC_AS_DOUBLEN
+
 #undef __CLC_AS_UCHARN
 #undef __CLC_AS_USHORTN
 #undef __CLC_AS_UINTN
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index f7688d0442253..490ce5c364465 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -35,6 +35,7 @@ math/clc_nextafter.cl
 math/clc_rint.cl
 math/clc_round.cl
 math/clc_rsqrt.cl
+math/clc_sincos_helpers.cl
 math/clc_sqrt.cl
 math/clc_sw_fma.cl
 math/clc_trunc.cl
diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl 
b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
new file mode 100644
index 0000000000000..d8a7b10d8e868
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
@@ -0,0 +1,32 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/clc_convert.h>
+#include <clc/integer/clc_clz.h>
+#include <clc/integer/clc_mul_hi.h>
+#include <clc/internal/clc.h>
+#include <clc/math/clc_fma.h>
+#include <clc/math/clc_mad.h>
+#include <clc/math/clc_trunc.h>
+#include <clc/math/math.h>
+
+#define bitalign(hi, lo, shift) ((hi) << (32 - (shift))) | ((lo) >> (shift));
+
+#define FULL_MUL(A, B, HI, LO)                                                 
\
+  LO = A * B;                                                                  
\
+  HI = __clc_mul_hi(A, B)
+
+#define FULL_MAD(A, B, C, HI, LO)                                              
\
+  LO = ((A) * (B) + (C));                                                      
\
+  HI = __clc_mul_hi(A, B);                                                     
\
+  HI += LO < C ? 1U : 0U;
+
+#define __FLOAT_ONLY
+#define __CLC_BODY <clc_sincos_helpers.inc>
+
+#include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers.inc 
b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc
new file mode 100644
index 0000000000000..d9d2b81226b72
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc
@@ -0,0 +1,304 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+_CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_sinf_piby4(__CLC_FLOATN x,
+                                                     __CLC_FLOATN y) {
+  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+  // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+  // = x * f(w)
+  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+  // We use a minimax approximation of (f(w) - 1) / w
+  // because this produces an expansion in even powers of x.
+
+  const __CLC_FLOATN c1 = -0.1666666666e0f;
+  const __CLC_FLOATN c2 = 0.8333331876e-2f;
+  const __CLC_FLOATN c3 = -0.198400874e-3f;
+  const __CLC_FLOATN c4 = 0.272500015e-5f;
+  const __CLC_FLOATN c5 = -2.5050759689e-08f; // 0xb2d72f34
+  const __CLC_FLOATN c6 = 1.5896910177e-10f;  // 0x2f2ec9d3
+
+  __CLC_FLOATN z = x * x;
+  __CLC_FLOATN v = z * x;
+  __CLC_FLOATN r = __clc_mad(
+      z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), c3), c2);
+  __CLC_FLOATN ret =
+      x - __clc_mad(v, -c1, __clc_mad(z, __clc_mad(y, 0.5f, -v * r), -y));
+
+  return ret;
+}
+
+_CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_cosf_piby4(__CLC_FLOATN x,
+                                                     __CLC_FLOATN y) {
+  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+  // = f(w)
+  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+  // because this produces an expansion in even powers of x.
+
+  const __CLC_FLOATN c1 = 0.416666666e-1f;
+  const __CLC_FLOATN c2 = -0.138888876e-2f;
+  const __CLC_FLOATN c3 = 0.248006008e-4f;
+  const __CLC_FLOATN c4 = -0.2730101334e-6f;
+  const __CLC_FLOATN c5 = 2.0875723372e-09f;  // 0x310f74f6
+  const __CLC_FLOATN c6 = -1.1359647598e-11f; // 0xad47d74e
+
+  __CLC_FLOATN z = x * x;
+  __CLC_FLOATN r =
+      z *
+      __clc_mad(
+          z,
+          __clc_mad(z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), 
c3),
+                    c2),
+          c1);
+
+  // if |x| < 0.3
+  __CLC_FLOATN qx = 0.0f;
+
+  __CLC_INTN ix = __CLC_AS_INTN(x) & EXSIGNBIT_SP32;
+
+  //  0.78125 > |x| >= 0.3
+  __CLC_FLOATN xby4 = __CLC_AS_FLOATN(ix - 0x01000000);
+  qx = ((ix >= 0x3e99999a) & (ix <= 0x3f480000)) ? xby4 : qx;
+
+  // x > 0.78125
+  qx = ix > 0x3f480000 ? 0.28125f : qx;
+
+  __CLC_FLOATN hz = __clc_mad(z, 0.5f, -qx);
+  __CLC_FLOATN a = 1.0f - qx;
+  __CLC_FLOATN ret = a - (hz - __clc_mad(z, r, -x * y));
+  return ret;
+}
+
+_CLC_DEF _CLC_OVERLOAD void __clc_fullMulS(private __CLC_FLOATN *hi,
+                                           private __CLC_FLOATN *lo,
+                                           __CLC_FLOATN a, __CLC_FLOATN b,
+                                           __CLC_FLOATN bh, __CLC_FLOATN bt) {
+  if (__CLC_HAVE_HW_FMA32()) {
+    __CLC_FLOATN ph = a * b;
+    *hi = ph;
+    *lo = __clc_fma(a, b, -ph);
+  } else {
+    __CLC_FLOATN ah = __CLC_AS_FLOATN(__CLC_AS_UINTN(a) & 0xfffff000U);
+    __CLC_FLOATN at = a - ah;
+    __CLC_FLOATN ph = a * b;
+    __CLC_FLOATN pt = __clc_mad(
+        at, bt, __clc_mad(at, bh, __clc_mad(ah, bt, __clc_mad(ah, bh, -ph))));
+    *hi = ph;
+    *lo = pt;
+  }
+}
+
+_CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_removePi2S(private __CLC_FLOATN *hi,
+                                                     private __CLC_FLOATN *lo,
+                                                     __CLC_FLOATN x) {
+  // 72 bits of pi/2
+  const __CLC_FLOATN fpiby2_1 = (__CLC_FLOATN)0xC90FDA / 0x1.0p+23f;
+  const __CLC_FLOATN fpiby2_1_h = (__CLC_FLOATN)0xC90 / 0x1.0p+11f;
+  const __CLC_FLOATN fpiby2_1_t = (__CLC_FLOATN)0xFDA / 0x1.0p+23f;
+
+  const __CLC_FLOATN fpiby2_2 = (__CLC_FLOATN)0xA22168 / 0x1.0p+47f;
+  const __CLC_FLOATN fpiby2_2_h = (__CLC_FLOATN)0xA22 / 0x1.0p+35f;
+  const __CLC_FLOATN fpiby2_2_t = (__CLC_FLOATN)0x168 / 0x1.0p+47f;
+
+  const __CLC_FLOATN fpiby2_3 = (__CLC_FLOATN)0xC234C4 / 0x1.0p+71f;
+  const __CLC_FLOATN fpiby2_3_h = (__CLC_FLOATN)0xC23 / 0x1.0p+59f;
+  const __CLC_FLOATN fpiby2_3_t = (__CLC_FLOATN)0x4C4 / 0x1.0p+71f;
+
+  const __CLC_FLOATN twobypi = 0x1.45f306p-1f;
+
+  __CLC_FLOATN fnpi2 = __clc_trunc(__clc_mad(x, twobypi, 0.5f));
+
+  // subtract n * pi/2 from x
+  __CLC_FLOATN rhead, rtail;
+  __clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t);
+  __CLC_FLOATN v = x - rhead;
+  __CLC_FLOATN rem = v + (((x - v) - rhead) - rtail);
+
+  __CLC_FLOATN rhead2, rtail2;
+  __clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t);
+  v = rem - rhead2;
+  rem = v + (((rem - v) - rhead2) - rtail2);
+
+  __CLC_FLOATN rhead3, rtail3;
+  __clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t);
+  v = rem - rhead3;
+
+  *hi = v + ((rem - v) - rhead3);
+  *lo = -rtail3;
+  return fnpi2;
+}
+
+_CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionSmallS(
+    private __CLC_FLOATN *r, private __CLC_FLOATN *rr, __CLC_FLOATN x) {
+  __CLC_FLOATN fnpi2 = __clc_removePi2S(r, rr, x);
+  return __CLC_CONVERT_INTN(fnpi2) & 0x3;
+}
+
+_CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS(
+    private __CLC_FLOATN *r, private __CLC_FLOATN *rr, __CLC_FLOATN x) {
+  __CLC_INTN xe = __CLC_AS_INTN((__CLC_AS_UINTN(x) >> 23) - 127);
+  __CLC_UINTN xm = 0x00800000U | (__CLC_AS_UINTN(x) & 0x7fffffU);
+
+  // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041
+  // FE5163AB
+  const __CLC_UINTN b6 = 0xA2F9836EU;
+  const __CLC_UINTN b5 = 0x4E441529U;
+  const __CLC_UINTN b4 = 0xFC2757D1U;
+  const __CLC_UINTN b3 = 0xF534DDC0U;
+  const __CLC_UINTN b2 = 0xDB629599U;
+  const __CLC_UINTN b1 = 0x3C439041U;
+  const __CLC_UINTN b0 = 0xFE5163ABU;
+
+  __CLC_UINTN p0, p1, p2, p3, p4, p5, p6, p7, c0, c1;
+
+  FULL_MUL(xm, b0, c0, p0);
+  FULL_MAD(xm, b1, c0, c1, p1);
+  FULL_MAD(xm, b2, c1, c0, p2);
+  FULL_MAD(xm, b3, c0, c1, p3);
+  FULL_MAD(xm, b4, c1, c0, p4);
+  FULL_MAD(xm, b5, c0, c1, p5);
+  FULL_MAD(xm, b6, c1, p7, p6);
+
+  __CLC_UINTN fbits = (__CLC_UINTN)224 + (__CLC_UINTN)23 - __CLC_AS_UINTN(xe);
+
+  // shift amount to get 2 lsb of integer part at top 2 bits
+  //   min: 25 (xe=18) max: 134 (xe=127)
+  __CLC_UINTN shift = 256U - 2 - fbits;
+
+  // Shift by up to 134/32 = 4 words
+  __CLC_INTN c = shift > 31;
+  p7 = c ? p6 : p7;
+  p6 = c ? p5 : p6;
+  p5 = c ? p4 : p5;
+  p4 = c ? p3 : p4;
+  p3 = c ? p2 : p3;
+  p2 = c ? p1 : p2;
+  p1 = c ? p0 : p1;
+  shift -= (c ? 32U : 0U);
+
+  c = shift > 31;
+  p7 = c ? p6 : p7;
+  p6 = c ? p5 : p6;
+  p5 = c ? p4 : p5;
+  p4 = c ? p3 : p4;
+  p3 = c ? p2 : p3;
+  p2 = c ? p1 : p2;
+  shift -= (c ? 32U : 0U);
+
+  c = shift > 31;
+  p7 = c ? p6 : p7;
+  p6 = c ? p5 : p6;
+  p5 = c ? p4 : p5;
+  p4 = c ? p3 : p4;
+  p3 = c ? p2 : p3;
+  shift -= (c ? 32U : 0U);
+
+  c = shift > 31;
+  p7 = c ? p6 : p7;
+  p6 = c ? p5 : p6;
+  p5 = c ? p4 : p5;
+  p4 = c ? p3 : p4;
+  shift -= (c ? 32U : 0U);
+
+  // bitalign cannot handle a shift of 32
+  c = shift > 0;
+  shift = 32 - shift;
+  __CLC_UINTN t7 = bitalign(p7, p6, shift);
+  __CLC_UINTN t6 = bitalign(p6, p5, shift);
+  __CLC_UINTN t5 = bitalign(p5, p4, shift);
+  p7 = c ? t7 : p7;
+  p6 = c ? t6 : p6;
+  p5 = c ? t5 : p5;
+
+  // Get 2 lsb of int part and msb of fraction
+  __CLC_INTN i = __CLC_AS_INTN(p7 >> 29U);
+
+  // Scoot up 2 more bits so only fraction remains
+  p7 = bitalign(p7, p6, 30);
+  p6 = bitalign(p6, p5, 30);
+  p5 = bitalign(p5, p4, 30);
+
+  // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5
+  __CLC_UINTN flip = (i & 1) != 0 ? 0xFFFFFFFFU : 0U;
+  __CLC_UINTN sign = (i & 1) != 0 ? 0x80000000U : 0U;
+  p7 = p7 ^ flip;
+  p6 = p6 ^ flip;
+  p5 = p5 ^ flip;
+
+  // Find exponent and shift away leading zeroes and hidden bit
+  xe = __CLC_AS_INTN(__clc_clz(p7)) + 1;
+  shift = 32 - __CLC_AS_UINTN(xe);
+  p7 = bitalign(p7, p6, shift);
+  p6 = bitalign(p6, p5, shift);
+
+  // Most significant part of fraction
+  __CLC_FLOATN q1 =
+      __CLC_AS_FLOATN(sign | ((127U - __CLC_AS_UINTN(xe)) << 23U) | p7 >> 9);
+
+  // Shift out bits we captured on q1
+  p7 = bitalign(p7, p6, 32 - 23);
+
+  // Get 24 more bits of fraction in another float, there are not long strings
+  // of zeroes here
+  __CLC_INTN xxe = __CLC_AS_INTN(__clc_clz(p7)) + 1;
+  p7 = bitalign(p7, p6, 32 - xxe);
+  __CLC_FLOATN q0 = __CLC_AS_FLOATN(
+      sign | ((127U - __CLC_AS_UINTN(xe + 23 + xxe)) << 23U) | p7 >> 9);
+
+  // At this point, the fraction q1 + q0 is correct to at least 48 bits
+  // Now we need to multiply the fraction by pi/2
+  // This loses us about 4 bits
+  // pi/2 = C90 FDA A22 168 C23 4C4
+
+  const __CLC_FLOATN pio2h = (__CLC_FLOATN)0xc90fda / 0x1.0p+23f;
+  const __CLC_FLOATN pio2hh = (__CLC_FLOATN)0xc90 / 0x1.0p+11f;
+  const __CLC_FLOATN pio2ht = (__CLC_FLOATN)0xfda / 0x1.0p+23f;
+  const __CLC_FLOATN pio2t = (__CLC_FLOATN)0xa22168 / 0x1.0p+47f;
+
+  __CLC_FLOATN rh, rt;
+
+  if (__CLC_HAVE_HW_FMA32()) {
+    rh = q1 * pio2h;
+    rt = __clc_fma(q0, pio2h, __clc_fma(q1, pio2t, __clc_fma(q1, pio2h, -rh)));
+  } else {
+    __CLC_FLOATN q1h = __CLC_AS_FLOATN(__CLC_AS_UINTN(q1) & 0xfffff000);
+    __CLC_FLOATN q1t = q1 - q1h;
+    rh = q1 * pio2h;
+    rt = __clc_mad(
+        q1t, pio2ht,
+        __clc_mad(q1t, pio2hh,
+                  __clc_mad(q1h, pio2ht, __clc_mad(q1h, pio2hh, -rh))));
+    rt = __clc_mad(q0, pio2h, __clc_mad(q1, pio2t, rt));
+  }
+
+  __CLC_FLOATN t = rh + rt;
+  rt = rt - (t - rh);
+
+  *r = t;
+  *rr = rt;
+  return ((i >> 1) + (i & 1)) & 0x3;
+}
+
+_CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionS(private __CLC_FLOATN *r,
+                                                      private __CLC_FLOATN *rr,
+                                                      __CLC_FLOATN x) {
+  __CLC_INTN is_small = x < (__CLC_FLOATN)0x1.0p+23f;
+#ifdef __CLC_SCALAR
+  if (is_small)
+    return __clc_argReductionSmallS(r, rr, x);
+  else
+    return __clc_argReductionLargeS(r, rr, x);
+#else
+  __CLC_FLOATN r1, rr1, r2, rr2;
+  __CLC_INTN ret1 = __clc_argReductionSmallS(&r1, &rr1, x);
+  __CLC_INTN ret2 = __clc_argReductionLargeS(&r2, &rr2, x);
+  *r = is_small ? r1 : r2;
+  *rr = is_small ? rr1 : rr2;
+  return is_small ? ret1 : ret2;
+#endif
+}
diff --git a/libclc/generic/lib/math/clc_tan.cl 
b/libclc/generic/lib/math/clc_tan.cl
index 5e2541a46b16d..eb02879339307 100644
--- a/libclc/generic/lib/math/clc_tan.cl
+++ b/libclc/generic/lib/math/clc_tan.cl
@@ -10,6 +10,7 @@
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
 #include <clc/math/clc_fabs.h>
+#include <clc/math/clc_sincos_helpers.h>
 #include <clc/math/math.h>
 #include <clc/math/tables.h>
 #include <clc/relational/clc_isinf.h>
diff --git a/libclc/generic/lib/math/cos.cl b/libclc/generic/lib/math/cos.cl
index 83f18d17f3fb8..dab8af6f01fa5 100644
--- a/libclc/generic/lib/math/cos.cl
+++ b/libclc/generic/lib/math/cos.cl
@@ -8,30 +8,14 @@
 
 #include "sincos_helpers.h"
 #include <clc/clc.h>
+#include <clc/clc_convert.h>
 #include <clc/clcmacro.h>
+#include <clc/math/clc_sincos_helpers.h>
 #include <clc/math/math.h>
 
-_CLC_OVERLOAD _CLC_DEF float cos(float x)
-{
-    int ix = as_int(x);
-    int ax = ix & 0x7fffffff;
-    float dx = as_float(ax);
-
-    float r0, r1;
-    int regn = __clc_argReductionS(&r0, &r1, dx);
-
-    float ss = -__clc_sinf_piby4(r0, r1);
-    float cc =  __clc_cosf_piby4(r0, r1);
-
-    float c =  (regn & 1) != 0 ? ss : cc;
-    c = as_float(as_int(c) ^ ((regn > 1) << 31));
-
-    c = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : c;
-
-    return c;
-}
-
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cos, float);
+// FP32 and FP16 versions.
+#define __CLC_BODY <cos.inc>
+#include <clc/math/gentype.inc>
 
 #ifdef cl_khr_fp64
 
@@ -60,11 +44,3 @@ _CLC_OVERLOAD _CLC_DEF double cos(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cos, double);
 
 #endif
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-_CLC_DEFINE_UNARY_BUILTIN_FP16(cos)
-
-#endif
diff --git a/libclc/generic/lib/math/cos.inc b/libclc/generic/lib/math/cos.inc
new file mode 100644
index 0000000000000..be499ba435691
--- /dev/null
+++ b/libclc/generic/lib/math/cos.inc
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if __CLC_FPSIZE == 32
+
+_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN cos(__CLC_FLOATN x) {
+  __CLC_INTN ix = __CLC_AS_INTN(x);
+  __CLC_INTN ax = ix & 0x7fffffff;
+  __CLC_FLOATN dx = __CLC_AS_FLOATN(ax);
+
+  __CLC_FLOATN r0, r1;
+  __CLC_INTN regn = __clc_argReductionS(&r0, &r1, dx);
+
+  __CLC_FLOATN ss = -__clc_sinf_piby4(r0, r1);
+  __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
+
+  __CLC_FLOATN c = (regn & 1) != 0 ? ss : cc;
+  c = __CLC_AS_FLOATN(__CLC_AS_INTN(c) ^ ((regn > 1) << 31));
+
+  c = ax >= PINFBITPATT_SP32 ? __CLC_AS_FLOATN((__CLC_UINTN)QNANBITPATT_SP32)
+                             : c;
+
+  return c;
+}
+
+#elif __CLC_FPSIZE == 16
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE cos(__CLC_GENTYPE x) {
+  return __CLC_CONVERT_GENTYPE(cos(__CLC_CONVERT_FLOATN(x)));
+}
+
+#endif
diff --git a/libclc/generic/lib/math/sin.cl b/libclc/generic/lib/math/sin.cl
index 894ed072dffa4..bfb095dcea095 100644
--- a/libclc/generic/lib/math/sin.cl
+++ b/libclc/generic/lib/math/sin.cl
@@ -8,33 +8,14 @@
 
 #include "sincos_helpers.h"
 #include <clc/clc.h>
+#include <clc/clc_convert.h>
 #include <clc/clcmacro.h>
+#include <clc/math/clc_sincos_helpers.h>
 #include <clc/math/math.h>
 
-_CLC_OVERLOAD _CLC_DEF float sin(float x)
-{
-    int ix = as_int(x);
-    int ax = ix & 0x7fffffff;
-    float dx = as_float(ax);
-
-    float r0, r1;
-    int regn = __clc_argReductionS(&r0, &r1, dx);
-
-    float ss = __clc_sinf_piby4(r0, r1);
-    float cc = __clc_cosf_piby4(r0, r1);
-
-    float s = (regn & 1) != 0 ? cc : ss;
-    s = as_float(as_int(s) ^ ((regn > 1) << 31) ^ (ix ^ ax));
-
-    s = ax >= PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : s;
-
-    //Subnormals
-    s = x == 0.0f ? x : s;
-
-    return s;
-}
-
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, sin, float);
+// FP32 and FP16 versions.
+#define __CLC_BODY <sin.inc>
+#include <clc/math/gentype.inc>
 
 #ifdef cl_khr_fp64
 
@@ -62,11 +43,3 @@ _CLC_OVERLOAD _CLC_DEF double sin(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sin, double);
 
 #endif
-
-#ifdef cl_khr_fp16
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-_CLC_DEFINE_UNARY_BUILTIN_FP16(sin)
-
-#endif
diff --git a/libclc/generic/lib/math/sin.inc b/libclc/generic/lib/math/sin.inc
new file mode 100644
index 0000000000000..c2212f90c9a25
--- /dev/null
+++ b/libclc/generic/lib/math/sin.inc
@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if __CLC_FPSIZE == 32
+
+_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN sin(__CLC_FLOATN x) {
+  __CLC_INTN ix = __CLC_AS_INTN(x);
+  __CLC_INTN ax = ix & 0x7fffffff;
+  __CLC_FLOATN dx = __CLC_AS_FLOATN(ax);
+
+  __CLC_FLOATN r0, r1;
+  __CLC_INTN regn = __clc_argReductionS(&r0, &r1, dx);
+
+  __CLC_FLOATN ss = __clc_sinf_piby4(r0, r1);
+  __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
+
+  __CLC_FLOATN s = (regn & 1) != 0 ? cc : ss;
+  s = __CLC_AS_FLOATN(__CLC_AS_INTN(s) ^ ((regn > 1) << 31) ^ (ix ^ ax));
+
+  s = ax >= PINFBITPATT_SP32 ? __CLC_AS_FLOATN((__CLC_UINTN)QNANBITPATT_SP32)
+                             : s;
+
+  // Subnormals
+  s = x == 0.0f ? x : s;
+
+  return s;
+}
+
+#elif __CLC_FPSIZE == 16
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE sin(__CLC_GENTYPE x) {
+  return __CLC_CONVERT_GENTYPE(sin(__CLC_CONVERT_FLOATN(x)));
+}
+
+#endif
diff --git a/libclc/generic/lib/math/sincos_helpers.cl 
b/libclc/generic/lib/math/sincos_helpers.cl
index 77834468a1def..32ab5af4ca90c 100644
--- a/libclc/generic/lib/math/sincos_helpers.cl
+++ b/libclc/generic/lib/math/sincos_helpers.cl
@@ -17,77 +17,9 @@
 #include <clc/math/tables.h>
 #include <clc/shared/clc_max.h>
 
-#define bitalign(hi, lo, shift) ((hi) << (32 - (shift))) | ((lo) >> (shift));
-
 #define bytealign(src0, src1, src2)                                            
\
   ((uint)(((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3) * 8)))
 
-_CLC_DEF float __clc_sinf_piby4(float x, float y) {
-  // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
-  // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
-  // = x * f(w)
-  // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
-  // We use a minimax approximation of (f(w) - 1) / w
-  // because this produces an expansion in even powers of x.
-
-  const float c1 = -0.1666666666e0f;
-  const float c2 = 0.8333331876e-2f;
-  const float c3 = -0.198400874e-3f;
-  const float c4 = 0.272500015e-5f;
-  const float c5 = -2.5050759689e-08f; // 0xb2d72f34
-  const float c6 = 1.5896910177e-10f;  // 0x2f2ec9d3
-
-  float z = x * x;
-  float v = z * x;
-  float r = __clc_mad(
-      z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), c3), c2);
-  float ret =
-      x - __clc_mad(v, -c1, __clc_mad(z, __clc_mad(y, 0.5f, -v * r), -y));
-
-  return ret;
-}
-
-_CLC_DEF float __clc_cosf_piby4(float x, float y) {
-  // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
-  // = f(w)
-  // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
-  // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
-  // because this produces an expansion in even powers of x.
-
-  const float c1 = 0.416666666e-1f;
-  const float c2 = -0.138888876e-2f;
-  const float c3 = 0.248006008e-4f;
-  const float c4 = -0.2730101334e-6f;
-  const float c5 = 2.0875723372e-09f;  // 0x310f74f6
-  const float c6 = -1.1359647598e-11f; // 0xad47d74e
-
-  float z = x * x;
-  float r =
-      z *
-      __clc_mad(
-          z,
-          __clc_mad(z, __clc_mad(z, __clc_mad(z, __clc_mad(z, c6, c5), c4), 
c3),
-                    c2),
-          c1);
-
-  // if |x| < 0.3
-  float qx = 0.0f;
-
-  int ix = as_int(x) & EXSIGNBIT_SP32;
-
-  //  0.78125 > |x| >= 0.3
-  float xby4 = as_float(ix - 0x01000000);
-  qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx;
-
-  // x > 0.78125
-  qx = ix > 0x3f480000 ? 0.28125f : qx;
-
-  float hz = __clc_mad(z, 0.5f, -qx);
-  float a = 1.0f - qx;
-  float ret = a - (hz - __clc_mad(z, r, -x * y));
-  return ret;
-}
-
 _CLC_DEF float __clc_tanf_piby4(float x, int regn) {
   // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4].
   float r = x * x;
@@ -106,226 +38,6 @@ _CLC_DEF float __clc_tanf_piby4(float x, int regn) {
   return regn & 1 ? tr : t;
 }
 
-_CLC_DEF void __clc_fullMulS(private float *hi, private float *lo, float a,
-                             float b, float bh, float bt) {
-  if (__CLC_HAVE_HW_FMA32()) {
-    float ph = a * b;
-    *hi = ph;
-    *lo = __clc_fma(a, b, -ph);
-  } else {
-    float ah = as_float(as_uint(a) & 0xfffff000U);
-    float at = a - ah;
-    float ph = a * b;
-    float pt = __clc_mad(
-        at, bt, __clc_mad(at, bh, __clc_mad(ah, bt, __clc_mad(ah, bh, -ph))));
-    *hi = ph;
-    *lo = pt;
-  }
-}
-
-_CLC_DEF float __clc_removePi2S(private float *hi, private float *lo, float x) 
{
-  // 72 bits of pi/2
-  const float fpiby2_1 = (float)0xC90FDA / 0x1.0p+23f;
-  const float fpiby2_1_h = (float)0xC90 / 0x1.0p+11f;
-  const float fpiby2_1_t = (float)0xFDA / 0x1.0p+23f;
-
-  const float fpiby2_2 = (float)0xA22168 / 0x1.0p+47f;
-  const float fpiby2_2_h = (float)0xA22 / 0x1.0p+35f;
-  const float fpiby2_2_t = (float)0x168 / 0x1.0p+47f;
-
-  const float fpiby2_3 = (float)0xC234C4 / 0x1.0p+71f;
-  const float fpiby2_3_h = (float)0xC23 / 0x1.0p+59f;
-  const float fpiby2_3_t = (float)0x4C4 / 0x1.0p+71f;
-
-  const float twobypi = 0x1.45f306p-1f;
-
-  float fnpi2 = __clc_trunc(__clc_mad(x, twobypi, 0.5f));
-
-  // subtract n * pi/2 from x
-  float rhead, rtail;
-  __clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t);
-  float v = x - rhead;
-  float rem = v + (((x - v) - rhead) - rtail);
-
-  float rhead2, rtail2;
-  __clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t);
-  v = rem - rhead2;
-  rem = v + (((rem - v) - rhead2) - rtail2);
-
-  float rhead3, rtail3;
-  __clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t);
-  v = rem - rhead3;
-
-  *hi = v + ((rem - v) - rhead3);
-  *lo = -rtail3;
-  return fnpi2;
-}
-
-_CLC_DEF int __clc_argReductionSmallS(private float *r, private float *rr,
-                                      float x) {
-  float fnpi2 = __clc_removePi2S(r, rr, x);
-  return (int)fnpi2 & 0x3;
-}
-
-#define FULL_MUL(A, B, HI, LO)                                                 
\
-  LO = A * B;                                                                  
\
-  HI = __clc_mul_hi(A, B)
-
-#define FULL_MAD(A, B, C, HI, LO)                                              
\
-  LO = ((A) * (B) + (C));                                                      
\
-  HI = __clc_mul_hi(A, B);                                                     
\
-  HI += LO < C
-
-_CLC_DEF int __clc_argReductionLargeS(private float *r, private float *rr,
-                                      float x) {
-  int xe = (int)(as_uint(x) >> 23) - 127;
-  uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU);
-
-  // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041
-  // FE5163AB
-  const uint b6 = 0xA2F9836EU;
-  const uint b5 = 0x4E441529U;
-  const uint b4 = 0xFC2757D1U;
-  const uint b3 = 0xF534DDC0U;
-  const uint b2 = 0xDB629599U;
-  const uint b1 = 0x3C439041U;
-  const uint b0 = 0xFE5163ABU;
-
-  uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1;
-
-  FULL_MUL(xm, b0, c0, p0);
-  FULL_MAD(xm, b1, c0, c1, p1);
-  FULL_MAD(xm, b2, c1, c0, p2);
-  FULL_MAD(xm, b3, c0, c1, p3);
-  FULL_MAD(xm, b4, c1, c0, p4);
-  FULL_MAD(xm, b5, c0, c1, p5);
-  FULL_MAD(xm, b6, c1, p7, p6);
-
-  uint fbits = 224 + 23 - xe;
-
-  // shift amount to get 2 lsb of integer part at top 2 bits
-  //   min: 25 (xe=18) max: 134 (xe=127)
-  uint shift = 256U - 2 - fbits;
-
-  // Shift by up to 134/32 = 4 words
-  int c = shift > 31;
-  p7 = c ? p6 : p7;
-  p6 = c ? p5 : p6;
-  p5 = c ? p4 : p5;
-  p4 = c ? p3 : p4;
-  p3 = c ? p2 : p3;
-  p2 = c ? p1 : p2;
-  p1 = c ? p0 : p1;
-  shift -= (-c) & 32;
-
-  c = shift > 31;
-  p7 = c ? p6 : p7;
-  p6 = c ? p5 : p6;
-  p5 = c ? p4 : p5;
-  p4 = c ? p3 : p4;
-  p3 = c ? p2 : p3;
-  p2 = c ? p1 : p2;
-  shift -= (-c) & 32;
-
-  c = shift > 31;
-  p7 = c ? p6 : p7;
-  p6 = c ? p5 : p6;
-  p5 = c ? p4 : p5;
-  p4 = c ? p3 : p4;
-  p3 = c ? p2 : p3;
-  shift -= (-c) & 32;
-
-  c = shift > 31;
-  p7 = c ? p6 : p7;
-  p6 = c ? p5 : p6;
-  p5 = c ? p4 : p5;
-  p4 = c ? p3 : p4;
-  shift -= (-c) & 32;
-
-  // bitalign cannot handle a shift of 32
-  c = shift > 0;
-  shift = 32 - shift;
-  uint t7 = bitalign(p7, p6, shift);
-  uint t6 = bitalign(p6, p5, shift);
-  uint t5 = bitalign(p5, p4, shift);
-  p7 = c ? t7 : p7;
-  p6 = c ? t6 : p6;
-  p5 = c ? t5 : p5;
-
-  // Get 2 lsb of int part and msb of fraction
-  int i = p7 >> 29;
-
-  // Scoot up 2 more bits so only fraction remains
-  p7 = bitalign(p7, p6, 30);
-  p6 = bitalign(p6, p5, 30);
-  p5 = bitalign(p5, p4, 30);
-
-  // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5
-  uint flip = i & 1 ? 0xffffffffU : 0U;
-  uint sign = i & 1 ? 0x80000000U : 0U;
-  p7 = p7 ^ flip;
-  p6 = p6 ^ flip;
-  p5 = p5 ^ flip;
-
-  // Find exponent and shift away leading zeroes and hidden bit
-  xe = __clc_clz(p7) + 1;
-  shift = 32 - xe;
-  p7 = bitalign(p7, p6, shift);
-  p6 = bitalign(p6, p5, shift);
-
-  // Most significant part of fraction
-  float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9));
-
-  // Shift out bits we captured on q1
-  p7 = bitalign(p7, p6, 32 - 23);
-
-  // Get 24 more bits of fraction in another float, there are not long strings
-  // of zeroes here
-  int xxe = __clc_clz(p7) + 1;
-  p7 = bitalign(p7, p6, 32 - xxe);
-  float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9));
-
-  // At this point, the fraction q1 + q0 is correct to at least 48 bits
-  // Now we need to multiply the fraction by pi/2
-  // This loses us about 4 bits
-  // pi/2 = C90 FDA A22 168 C23 4C4
-
-  const float pio2h = (float)0xc90fda / 0x1.0p+23f;
-  const float pio2hh = (float)0xc90 / 0x1.0p+11f;
-  const float pio2ht = (float)0xfda / 0x1.0p+23f;
-  const float pio2t = (float)0xa22168 / 0x1.0p+47f;
-
-  float rh, rt;
-
-  if (__CLC_HAVE_HW_FMA32()) {
-    rh = q1 * pio2h;
-    rt = __clc_fma(q0, pio2h, __clc_fma(q1, pio2t, __clc_fma(q1, pio2h, -rh)));
-  } else {
-    float q1h = as_float(as_uint(q1) & 0xfffff000);
-    float q1t = q1 - q1h;
-    rh = q1 * pio2h;
-    rt = __clc_mad(
-        q1t, pio2ht,
-        __clc_mad(q1t, pio2hh,
-                  __clc_mad(q1h, pio2ht, __clc_mad(q1h, pio2hh, -rh))));
-    rt = __clc_mad(q0, pio2h, __clc_mad(q1, pio2t, rt));
-  }
-
-  float t = rh + rt;
-  rt = rt - (t - rh);
-
-  *r = t;
-  *rr = rt;
-  return ((i >> 1) + (i & 1)) & 0x3;
-}
-
-_CLC_DEF int __clc_argReductionS(private float *r, private float *rr, float x) 
{
-  if (x < 0x1.0p+23f)
-    return __clc_argReductionSmallS(r, rr, x);
-  else
-    return __clc_argReductionLargeS(r, rr, x);
-}
-
 #ifdef cl_khr_fp64
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
diff --git a/libclc/generic/lib/math/sincos_helpers.h 
b/libclc/generic/lib/math/sincos_helpers.h
index 2c09dc1b79691..c94784081cd64 100644
--- a/libclc/generic/lib/math/sincos_helpers.h
+++ b/libclc/generic/lib/math/sincos_helpers.h
@@ -9,10 +9,7 @@
 #include <clc/clcfunc.h>
 #include <clc/clctypes.h>
 
-_CLC_DECL float __clc_sinf_piby4(float x, float y);
-_CLC_DECL float __clc_cosf_piby4(float x, float y);
 _CLC_DECL float __clc_tanf_piby4(float x, int y);
-_CLC_DECL int __clc_argReductionS(private float *r, private float *rr, float 
x);
 
 #ifdef cl_khr_fp64
 

>From 0df7ea2a0e2ce21042ca872ce3e0739ecbbaf23d Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fra...@codeplay.com>
Date: Mon, 24 Mar 2025 13:06:20 +0000
Subject: [PATCH 2/2] Reduce bithacking

---
 libclc/clc/include/clc/math/gentype.inc |  6 ++++++
 libclc/generic/lib/math/cos.cl          |  4 ++++
 libclc/generic/lib/math/cos.inc         |  9 +++------
 libclc/generic/lib/math/sin.cl          |  4 ++++
 libclc/generic/lib/math/sin.inc         | 12 +++++-------
 5 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/libclc/clc/include/clc/math/gentype.inc 
b/libclc/clc/include/clc/math/gentype.inc
index 9692a4d37011b..31aa16401e647 100644
--- a/libclc/clc/include/clc/math/gentype.inc
+++ b/libclc/clc/include/clc/math/gentype.inc
@@ -71,6 +71,7 @@
 #define __CLC_SCALAR_GENTYPE float
 #define __CLC_FPSIZE 32
 #define __CLC_FP_LIT(x) x##F
+#define __CLC_GENTYPE_NAN FLT_NAN
 
 #define __CLC_S_GENTYPE __CLC_XCONCAT(int, __CLC_VECSIZE)
 #define __CLC_U_GENTYPE __CLC_XCONCAT(uint, __CLC_VECSIZE)
@@ -127,6 +128,7 @@
 
 #undef __CLC_U_GENTYPE
 #undef __CLC_S_GENTYPE
+#undef __CLC_GENTYPE_NAN
 #undef __CLC_FP_LIT
 #undef __CLC_FPSIZE
 #undef __CLC_SCALAR_GENTYPE
@@ -138,6 +140,7 @@
 #define __CLC_SCALAR_GENTYPE double
 #define __CLC_FPSIZE 64
 #define __CLC_FP_LIT(x) (x)
+#define __CLC_GENTYPE_NAN DBL_NAN
 
 #define __CLC_S_GENTYPE __CLC_XCONCAT(long, __CLC_VECSIZE)
 #define __CLC_U_GENTYPE __CLC_XCONCAT(ulong, __CLC_VECSIZE)
@@ -194,6 +197,7 @@
 
 #undef __CLC_U_GENTYPE
 #undef __CLC_S_GENTYPE
+#undef __CLC_GENTYPE_NAN
 #undef __CLC_FP_LIT
 #undef __CLC_FPSIZE
 #undef __CLC_SCALAR_GENTYPE
@@ -207,6 +211,7 @@
 #define __CLC_SCALAR_GENTYPE half
 #define __CLC_FPSIZE 16
 #define __CLC_FP_LIT(x) x##H
+#define __CLC_GENTYPE_NAN HALF_NAN
 
 #define __CLC_S_GENTYPE __CLC_XCONCAT(short, __CLC_VECSIZE)
 #define __CLC_U_GENTYPE __CLC_XCONCAT(ushort, __CLC_VECSIZE)
@@ -263,6 +268,7 @@
 
 #undef __CLC_U_GENTYPE
 #undef __CLC_S_GENTYPE
+#undef __CLC_GENTYPE_NAN
 #undef __CLC_FP_LIT
 #undef __CLC_FPSIZE
 #undef __CLC_SCALAR_GENTYPE
diff --git a/libclc/generic/lib/math/cos.cl b/libclc/generic/lib/math/cos.cl
index dab8af6f01fa5..00ffa371ad8fc 100644
--- a/libclc/generic/lib/math/cos.cl
+++ b/libclc/generic/lib/math/cos.cl
@@ -10,8 +10,12 @@
 #include <clc/clc.h>
 #include <clc/clc_convert.h>
 #include <clc/clcmacro.h>
+#include <clc/math/clc_fabs.h>
 #include <clc/math/clc_sincos_helpers.h>
 #include <clc/math/math.h>
+#include <clc/relational/clc_isinf.h>
+#include <clc/relational/clc_isnan.h>
+#include <clc/relational/clc_select.h>
 
 // FP32 and FP16 versions.
 #define __CLC_BODY <cos.inc>
diff --git a/libclc/generic/lib/math/cos.inc b/libclc/generic/lib/math/cos.inc
index be499ba435691..1db96710457dd 100644
--- a/libclc/generic/lib/math/cos.inc
+++ b/libclc/generic/lib/math/cos.inc
@@ -9,12 +9,10 @@
 #if __CLC_FPSIZE == 32
 
 _CLC_OVERLOAD _CLC_DEF __CLC_FLOATN cos(__CLC_FLOATN x) {
-  __CLC_INTN ix = __CLC_AS_INTN(x);
-  __CLC_INTN ax = ix & 0x7fffffff;
-  __CLC_FLOATN dx = __CLC_AS_FLOATN(ax);
+  __CLC_FLOATN absx = __clc_fabs(x);
 
   __CLC_FLOATN r0, r1;
-  __CLC_INTN regn = __clc_argReductionS(&r0, &r1, dx);
+  __CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx);
 
   __CLC_FLOATN ss = -__clc_sinf_piby4(r0, r1);
   __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
@@ -22,8 +20,7 @@ _CLC_OVERLOAD _CLC_DEF __CLC_FLOATN cos(__CLC_FLOATN x) {
   __CLC_FLOATN c = (regn & 1) != 0 ? ss : cc;
   c = __CLC_AS_FLOATN(__CLC_AS_INTN(c) ^ ((regn > 1) << 31));
 
-  c = ax >= PINFBITPATT_SP32 ? __CLC_AS_FLOATN((__CLC_UINTN)QNANBITPATT_SP32)
-                             : c;
+  c = __clc_select(c, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x));
 
   return c;
 }
diff --git a/libclc/generic/lib/math/sin.cl b/libclc/generic/lib/math/sin.cl
index bfb095dcea095..f776805ad1cdb 100644
--- a/libclc/generic/lib/math/sin.cl
+++ b/libclc/generic/lib/math/sin.cl
@@ -10,8 +10,12 @@
 #include <clc/clc.h>
 #include <clc/clc_convert.h>
 #include <clc/clcmacro.h>
+#include <clc/math/clc_fabs.h>
 #include <clc/math/clc_sincos_helpers.h>
 #include <clc/math/math.h>
+#include <clc/relational/clc_isinf.h>
+#include <clc/relational/clc_isnan.h>
+#include <clc/relational/clc_select.h>
 
 // FP32 and FP16 versions.
 #define __CLC_BODY <sin.inc>
diff --git a/libclc/generic/lib/math/sin.inc b/libclc/generic/lib/math/sin.inc
index c2212f90c9a25..dbc99116dfa7d 100644
--- a/libclc/generic/lib/math/sin.inc
+++ b/libclc/generic/lib/math/sin.inc
@@ -9,21 +9,19 @@
 #if __CLC_FPSIZE == 32
 
 _CLC_OVERLOAD _CLC_DEF __CLC_FLOATN sin(__CLC_FLOATN x) {
-  __CLC_INTN ix = __CLC_AS_INTN(x);
-  __CLC_INTN ax = ix & 0x7fffffff;
-  __CLC_FLOATN dx = __CLC_AS_FLOATN(ax);
+  __CLC_FLOATN absx = __clc_fabs(x);
 
   __CLC_FLOATN r0, r1;
-  __CLC_INTN regn = __clc_argReductionS(&r0, &r1, dx);
+  __CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx);
 
   __CLC_FLOATN ss = __clc_sinf_piby4(r0, r1);
   __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
 
   __CLC_FLOATN s = (regn & 1) != 0 ? cc : ss;
-  s = __CLC_AS_FLOATN(__CLC_AS_INTN(s) ^ ((regn > 1) << 31) ^ (ix ^ ax));
+  s = __CLC_AS_FLOATN(__CLC_AS_INTN(s) ^ ((regn > 1) << 31) ^
+                      (__CLC_AS_INTN(x) ^ __CLC_AS_INTN(absx)));
 
-  s = ax >= PINFBITPATT_SP32 ? __CLC_AS_FLOATN((__CLC_UINTN)QNANBITPATT_SP32)
-                             : s;
+  s = __clc_select(s, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x));
 
   // Subnormals
   s = x == 0.0f ? x : s;

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[libclc] [libclc] Move fp32 sincos helpers to CLC library (PR #132753)

Reply via email to