Author: Wenju He Date: 2025-05-16T10:20:32+01:00 New Revision: 299a278db16fa0944472af79bfec31dd678c5b37
URL: https://github.com/llvm/llvm-project/commit/299a278db16fa0944472af79bfec31dd678c5b37 DIFF: https://github.com/llvm/llvm-project/commit/299a278db16fa0944472af79bfec31dd678c5b37.diff LOG: [libclc] Improving vector code generated from scalar code (#140008) The previous method splits vector data into two halves. shuffle_vector concatenates the two results into a vector data of original size. This PR eliminates the use of shuffle_vector. Added: Modified: libclc/clc/include/clc/clcmacro.h libclc/clc/lib/generic/math/clc_lgamma_r.cl Removed: ################################################################################ diff --git a/libclc/clc/include/clc/clcmacro.h b/libclc/clc/include/clc/clcmacro.h index d8772ce38792a..c9f70d2998d37 100644 --- a/libclc/clc/include/clc/clcmacro.h +++ b/libclc/clc/include/clc/clcmacro.h @@ -14,100 +14,140 @@ #define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \ - return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y)); \ + return (RET_TYPE##2)(FUNCTION(x.s0), FUNCTION(x.s1)); \ } \ \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) { \ - return (RET_TYPE##3)(FUNCTION(x.x), FUNCTION(x.y), FUNCTION(x.z)); \ + return (RET_TYPE##3)(FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2)); \ } \ \ DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x) { \ - return (RET_TYPE##4)(FUNCTION(x.lo), FUNCTION(x.hi)); \ + return (RET_TYPE##4)(FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), \ + FUNCTION(x.s3)); \ } \ \ DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x) { \ - return (RET_TYPE##8)(FUNCTION(x.lo), FUNCTION(x.hi)); \ + return (RET_TYPE##8)(FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), \ + FUNCTION(x.s3), FUNCTION(x.s4), FUNCTION(x.s5), \ + FUNCTION(x.s6), FUNCTION(x.s7)); \ } \ \ DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x) { \ - return (RET_TYPE##16)(FUNCTION(x.lo), FUNCTION(x.hi)); \ + return (RET_TYPE##16)( \ + FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3), \ + FUNCTION(x.s4), FUNCTION(x.s5), FUNCTION(x.s6), FUNCTION(x.s7), \ + FUNCTION(x.s8), FUNCTION(x.s9), FUNCTION(x.sa), FUNCTION(x.sb), \ + FUNCTION(x.sc), FUNCTION(x.sd), FUNCTION(x.se), FUNCTION(x.sf)); \ } #define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) { \ - return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y)); \ + return (RET_TYPE##2)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1)); \ } \ \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y) { \ - return (RET_TYPE##3)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y), \ - FUNCTION(x.z, y.z)); \ + return (RET_TYPE##3)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \ + FUNCTION(x.s2, y.s2)); \ } \ \ DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y) { \ - return (RET_TYPE##4)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \ + return (RET_TYPE##4)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \ + FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3)); \ } \ \ DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y) { \ - return (RET_TYPE##8)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \ + return (RET_TYPE##8)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \ + FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3), \ + FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), \ + FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7)); \ } \ \ DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y) { \ - return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \ + return (RET_TYPE##16)( \ + FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2), \ + FUNCTION(x.s3, y.s3), FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), \ + FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7), FUNCTION(x.s8, y.s8), \ + FUNCTION(x.s9, y.s9), FUNCTION(x.sa, y.sa), FUNCTION(x.sb, y.sb), \ + FUNCTION(x.sc, y.sc), FUNCTION(x.sd, y.sd), FUNCTION(x.se, y.se), \ + FUNCTION(x.sf, y.sf)); \ } #define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) { \ - return (RET_TYPE##2)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \ + return (RET_TYPE##2)(FUNCTION(x, y.s0), FUNCTION(x, y.s1)); \ } \ \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE x, ARG2_TYPE##3 y) { \ - return (RET_TYPE##3)(FUNCTION(x, y.x), FUNCTION(x, y.y), \ - FUNCTION(x, y.z)); \ + return (RET_TYPE##3)(FUNCTION(x, y.s0), FUNCTION(x, y.s1), \ + FUNCTION(x, y.s2)); \ } \ \ DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE x, ARG2_TYPE##4 y) { \ - return (RET_TYPE##4)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \ + return (RET_TYPE##4)(FUNCTION(x, y.s0), FUNCTION(x, y.s1), \ + FUNCTION(x, y.s2), FUNCTION(x, y.s3)); \ } \ \ DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE x, ARG2_TYPE##8 y) { \ - return (RET_TYPE##8)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \ + return (RET_TYPE##8)(FUNCTION(x, y.s0), FUNCTION(x, y.s1), \ + FUNCTION(x, y.s2), FUNCTION(x, y.s3), \ + FUNCTION(x, y.s4), FUNCTION(x, y.s5), \ + FUNCTION(x, y.s6), FUNCTION(x, y.s7)); \ } \ \ DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE x, ARG2_TYPE##16 y) { \ - return (RET_TYPE##16)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \ + return (RET_TYPE##16)( \ + FUNCTION(x, y.s0), FUNCTION(x, y.s1), FUNCTION(x, y.s2), \ + FUNCTION(x, y.s3), FUNCTION(x, y.s4), FUNCTION(x, y.s5), \ + FUNCTION(x, y.s6), FUNCTION(x, y.s7), FUNCTION(x, y.s8), \ + FUNCTION(x, y.s9), FUNCTION(x, y.sa), FUNCTION(x, y.sb), \ + FUNCTION(x, y.sc), FUNCTION(x, y.sd), FUNCTION(x, y.se), \ + FUNCTION(x, y.sf)); \ } #define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE, ARG3_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, \ ARG3_TYPE##2 z) { \ - return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \ + return (RET_TYPE##2)(FUNCTION(x.s0, y.s0, z.s0), \ + FUNCTION(x.s1, y.s1, z.s1)); \ } \ \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, \ ARG3_TYPE##3 z) { \ - return (RET_TYPE##3)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y), \ - FUNCTION(x.z, y.z, z.z)); \ + return (RET_TYPE##3)(FUNCTION(x.s0, y.s0, z.s0), \ + FUNCTION(x.s1, y.s1, z.s1), \ + FUNCTION(x.s2, y.s2, z.s2)); \ } \ \ DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y, \ ARG3_TYPE##4 z) { \ - return (RET_TYPE##4)(FUNCTION(x.lo, y.lo, z.lo), \ - FUNCTION(x.hi, y.hi, z.hi)); \ + return (RET_TYPE##4)( \ + FUNCTION(x.s0, y.s0, z.s0), FUNCTION(x.s1, y.s1, z.s1), \ + FUNCTION(x.s2, y.s2, z.s2), FUNCTION(x.s3, y.s3, z.s3)); \ } \ \ DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y, \ ARG3_TYPE##8 z) { \ - return (RET_TYPE##8)(FUNCTION(x.lo, y.lo, z.lo), \ - FUNCTION(x.hi, y.hi, z.hi)); \ + return (RET_TYPE##8)( \ + FUNCTION(x.s0, y.s0, z.s0), FUNCTION(x.s1, y.s1, z.s1), \ + FUNCTION(x.s2, y.s2, z.s2), FUNCTION(x.s3, y.s3, z.s3), \ + FUNCTION(x.s4, y.s4, z.s4), FUNCTION(x.s5, y.s5, z.s5), \ + FUNCTION(x.s6, y.s6, z.s6), FUNCTION(x.s7, y.s7, z.s7)); \ } \ \ DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y, \ ARG3_TYPE##16 z) { \ - return (RET_TYPE##16)(FUNCTION(x.lo, y.lo, z.lo), \ - FUNCTION(x.hi, y.hi, z.hi)); \ + return (RET_TYPE##16)( \ + FUNCTION(x.s0, y.s0, z.s0), FUNCTION(x.s1, y.s1, z.s1), \ + FUNCTION(x.s2, y.s2, z.s2), FUNCTION(x.s3, y.s3, z.s3), \ + FUNCTION(x.s4, y.s4, z.s4), FUNCTION(x.s5, y.s5, z.s5), \ + FUNCTION(x.s6, y.s6, z.s6), FUNCTION(x.s7, y.s7, z.s7), \ + FUNCTION(x.s8, y.s8, z.s8), FUNCTION(x.s9, y.s9, z.s9), \ + FUNCTION(x.sa, y.sa, z.sa), FUNCTION(x.sb, y.sb, z.sb), \ + FUNCTION(x.sc, y.sc, z.sc), FUNCTION(x.sd, y.sd, z.sd), \ + FUNCTION(x.se, y.se, z.se), FUNCTION(x.sf, y.sf, z.sf)); \ } #define _CLC_V_V_VP_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ @@ -115,48 +155,53 @@ DECLSPEC __CLC_XCONCAT(RET_TYPE, 2) \ FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 2) x, \ ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 2) * y) { \ - return (__CLC_XCONCAT(RET_TYPE, 2))( \ - FUNCTION(x.x, (ADDR_SPACE ARG2_TYPE *)y), \ - FUNCTION(x.y, \ - (ADDR_SPACE ARG2_TYPE *)((ADDR_SPACE ARG2_TYPE *)y + 1))); \ + ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \ + return (__CLC_XCONCAT(RET_TYPE, 2))(FUNCTION(x.s0, ptr), \ + FUNCTION(x.s1, ptr + 1)); \ } \ \ DECLSPEC __CLC_XCONCAT(RET_TYPE, 3) \ FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 3) x, \ ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 3) * y) { \ - return (__CLC_XCONCAT(RET_TYPE, 3))( \ - FUNCTION(x.x, (ADDR_SPACE ARG2_TYPE *)y), \ - FUNCTION(x.y, \ - (ADDR_SPACE ARG2_TYPE *)((ADDR_SPACE ARG2_TYPE *)y + 1)), \ - FUNCTION(x.z, \ - (ADDR_SPACE ARG2_TYPE *)((ADDR_SPACE ARG2_TYPE *)y + 2))); \ + ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \ + return (__CLC_XCONCAT(RET_TYPE, 3))(FUNCTION(x.s0, ptr), \ + FUNCTION(x.s1, ptr + 1), \ + FUNCTION(x.s2, ptr + 2)); \ } \ \ DECLSPEC __CLC_XCONCAT(RET_TYPE, 4) \ FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 4) x, \ ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 4) * y) { \ + ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \ return (__CLC_XCONCAT(RET_TYPE, 4))( \ - FUNCTION(x.lo, (ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 2) *)y), \ - FUNCTION(x.hi, (ADDR_SPACE __CLC_XCONCAT( \ - ARG2_TYPE, 2) *)((ADDR_SPACE ARG2_TYPE *)y + 2))); \ + FUNCTION(x.s0, ptr), FUNCTION(x.s1, ptr + 1), FUNCTION(x.s2, ptr + 2), \ + FUNCTION(x.s3, ptr + 3)); \ } \ \ DECLSPEC __CLC_XCONCAT(RET_TYPE, 8) \ FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 8) x, \ ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 8) * y) { \ + ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \ return (__CLC_XCONCAT(RET_TYPE, 8))( \ - FUNCTION(x.lo, (ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 4) *)y), \ - FUNCTION(x.hi, (ADDR_SPACE __CLC_XCONCAT( \ - ARG2_TYPE, 4) *)((ADDR_SPACE ARG2_TYPE *)y + 4))); \ + FUNCTION(x.s0, ptr), FUNCTION(x.s1, ptr + 1), FUNCTION(x.s2, ptr + 2), \ + FUNCTION(x.s3, ptr + 3), FUNCTION(x.s4, ptr + 4), \ + FUNCTION(x.s5, ptr + 5), FUNCTION(x.s6, ptr + 6), \ + FUNCTION(x.s7, ptr + 7)); \ } \ \ DECLSPEC __CLC_XCONCAT(RET_TYPE, 16) \ FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 16) x, \ ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 16) * y) { \ + ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \ return (__CLC_XCONCAT(RET_TYPE, 16))( \ - FUNCTION(x.lo, (ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 8) *)y), \ - FUNCTION(x.hi, (ADDR_SPACE __CLC_XCONCAT( \ - ARG2_TYPE, 8) *)((ADDR_SPACE ARG2_TYPE *)y + 8))); \ + FUNCTION(x.s0, ptr), FUNCTION(x.s1, ptr + 1), FUNCTION(x.s2, ptr + 2), \ + FUNCTION(x.s3, ptr + 3), FUNCTION(x.s4, ptr + 4), \ + FUNCTION(x.s5, ptr + 5), FUNCTION(x.s6, ptr + 6), \ + FUNCTION(x.s7, ptr + 7), FUNCTION(x.s8, ptr + 8), \ + FUNCTION(x.s9, ptr + 9), FUNCTION(x.sa, ptr + 10), \ + FUNCTION(x.sb, ptr + 11), FUNCTION(x.sc, ptr + 12), \ + FUNCTION(x.sd, ptr + 13), FUNCTION(x.se, ptr + 14), \ + FUNCTION(x.sf, ptr + 15)); \ } #define _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, \ diff --git a/libclc/clc/lib/generic/math/clc_lgamma_r.cl b/libclc/clc/lib/generic/math/clc_lgamma_r.cl index ad3d63b734eca..96a42bbb6e158 100644 --- a/libclc/clc/lib/generic/math/clc_lgamma_r.cl +++ b/libclc/clc/lib/generic/math/clc_lgamma_r.cl @@ -406,13 +406,13 @@ _CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_lgamma_r, float, #define v4 1.04222645593369134254e-01 /* 0x3FBAAE55, 0xD6537C88 */ #define v5 3.21709242282423911810e-03 /* 0x3F6A5ABB, 0x57D0CF61 */ -#define s0 -7.72156649015328655494e-02 /* 0xBFB3C467, 0xE37DB0C8 */ -#define s1 2.14982415960608852501e-01 /* 0x3FCB848B, 0x36E20878 */ -#define s2 3.25778796408930981787e-01 /* 0x3FD4D98F, 0x4F139F59 */ -#define s3 1.46350472652464452805e-01 /* 0x3FC2BB9C, 0xBEE5F2F7 */ -#define s4 2.66422703033638609560e-02 /* 0x3F9B481C, 0x7E939961 */ -#define s5 1.84028451407337715652e-03 /* 0x3F5E26B6, 0x7368F239 */ -#define s6 3.19475326584100867617e-05 /* 0x3F00BFEC, 0xDD17E945 */ +#define s0_d -7.72156649015328655494e-02 /* 0xBFB3C467, 0xE37DB0C8 */ +#define s1_d 2.14982415960608852501e-01 /* 0x3FCB848B, 0x36E20878 */ +#define s2_d 3.25778796408930981787e-01 /* 0x3FD4D98F, 0x4F139F59 */ +#define s3_d 1.46350472652464452805e-01 /* 0x3FC2BB9C, 0xBEE5F2F7 */ +#define s4_d 2.66422703033638609560e-02 /* 0x3F9B481C, 0x7E939961 */ +#define s5_d 1.84028451407337715652e-03 /* 0x3F5E26B6, 0x7368F239 */ +#define s6_d 3.19475326584100867617e-05 /* 0x3F00BFEC, 0xDD17E945 */ #define r1 1.39200533467621045958e+00 /* 0x3FF645A7, 0x62C4AB74 */ #define r2 7.21935547567138069525e-01 /* 0x3FE71A18, 0x93D3DCDC */ @@ -530,10 +530,12 @@ _CLC_OVERLOAD _CLC_DEF double __clc_lgamma_r(double x, private int *ip) { __clc_fma( y, __clc_fma( - y, __clc_fma(y, __clc_fma(y, __clc_fma(y, s6, s5), s4), s3), - s2), - s1), - s0); + y, + __clc_fma(y, __clc_fma(y, __clc_fma(y, s6_d, s5_d), s4_d), + s3_d), + s2_d), + s1_d), + s0_d); double q = __clc_fma( y, __clc_fma( _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits