tra created this revision. tra added reviewers: jlebar, jingyue. tra added a subscriber: cfe-commits.
This provides a substantial performance boost on some benchmarks (~25% on SHOC's FFT) due to vectorized loads/stores. Unfortunately, existing CUDA headers and user code occasionally take pointers to vector fields, which Clang does not allow, so we can't use vector types by default. While vectorized types help in some cases, they may lower performance when the user reads or writes only part of a vector, because Clang currently generates code that always loads/stores the complete vector. They may also create data races if user code assumes that parts of the same vector can be safely changed from different threads. For now, control this feature via -DCUDA_VECTOR_TYPES and let the user choose whether to use Clang's vectorized types or CUDA's non-vectorized ones. http://reviews.llvm.org/D18051 Files: lib/Headers/CMakeLists.txt lib/Headers/__clang_cuda_runtime_wrapper.h lib/Headers/__clang_cuda_vector_types.h lib/Headers/cuda_builtin_vars.h
Index: lib/Headers/cuda_builtin_vars.h =================================================================== --- lib/Headers/cuda_builtin_vars.h +++ lib/Headers/cuda_builtin_vars.h @@ -24,9 +24,11 @@ #ifndef __CUDA_BUILTIN_VARS_H #define __CUDA_BUILTIN_VARS_H +#if !defined(CUDA_VECTOR_TYPES) // Forward declares from vector_types.h. struct uint3; struct dim3; +#endif // The file implements built-in CUDA variables using __declspec(property). // https://msdn.microsoft.com/en-us/library/yhfk0thd.aspx Index: lib/Headers/__clang_cuda_vector_types.h =================================================================== --- /dev/null +++ lib/Headers/__clang_cuda_vector_types.h @@ -0,0 +1,87 @@ +/*===---- __clang_cuda_vector_types.h - CUDA vector types -----------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_CUDA_VECTOR_TYPES_H__ +#define __CLANG_CUDA_VECTOR_TYPES_H__ + +typedef char char1 __attribute__((ext_vector_type(1))); +typedef char char2 __attribute__((ext_vector_type(2))); +typedef char char3 __attribute__((ext_vector_type(3))); +typedef char char4 __attribute__((ext_vector_type(4))); +typedef unsigned char uchar1 __attribute__((ext_vector_type(1))); +typedef unsigned char uchar2 __attribute__((ext_vector_type(2))); +typedef unsigned char uchar3 __attribute__((ext_vector_type(3))); +typedef unsigned char uchar4 __attribute__((ext_vector_type(4))); +typedef short short1 __attribute__((ext_vector_type(1))); +typedef short short2 __attribute__((ext_vector_type(2))); +typedef short short3 __attribute__((ext_vector_type(3))); +typedef short short4 __attribute__((ext_vector_type(4))); +typedef unsigned short ushort1 __attribute__((ext_vector_type(1))); +typedef unsigned short ushort2 __attribute__((ext_vector_type(2))); +typedef unsigned short ushort3 __attribute__((ext_vector_type(3))); +typedef unsigned short ushort4 __attribute__((ext_vector_type(4))); +typedef int int1 __attribute__((ext_vector_type(1))); +typedef int int2 __attribute__((ext_vector_type(2))); +typedef int int3 __attribute__((ext_vector_type(3))); +typedef int int4 __attribute__((ext_vector_type(4))); +typedef unsigned int uint1 __attribute__((ext_vector_type(1))); +typedef unsigned int uint2 __attribute__((ext_vector_type(2))); +typedef unsigned int uint3 __attribute__((ext_vector_type(3))); +typedef unsigned int uint4 __attribute__((ext_vector_type(4))); +typedef long long1 __attribute__((ext_vector_type(1))); +typedef long long2 __attribute__((ext_vector_type(2))); +typedef long long3 __attribute__((ext_vector_type(3))); +typedef long long4 __attribute__((ext_vector_type(4))); +typedef unsigned long ulong1 __attribute__((ext_vector_type(1))); +typedef unsigned long ulong2 
__attribute__((ext_vector_type(2))); +typedef unsigned long ulong3 __attribute__((ext_vector_type(3))); +typedef unsigned long ulong4 __attribute__((ext_vector_type(4))); +typedef long long longlong1 __attribute__((ext_vector_type(1))); +typedef long long longlong2 __attribute__((ext_vector_type(2))); +typedef long long longlong3 __attribute__((ext_vector_type(3))); +typedef long long longlong4 __attribute__((ext_vector_type(4))); +typedef unsigned long long ulonglong1 __attribute__((ext_vector_type(1))); +typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2))); +typedef unsigned long long ulonglong3 __attribute__((ext_vector_type(3))); +typedef unsigned long long ulonglong4 __attribute__((ext_vector_type(4))); +typedef float float1 __attribute__((ext_vector_type(1))); +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef float float4 __attribute__((ext_vector_type(4))); +typedef double double1 __attribute__((ext_vector_type(1))); +typedef double double2 __attribute__((ext_vector_type(2))); +typedef double double3 __attribute__((ext_vector_type(3))); +typedef double double4 __attribute__((ext_vector_type(4))); + +__attribute__((host,device)) +struct dim3 { + uint x, y, z; + __attribute__((host, device)) + dim3(unsigned __x = 1, unsigned __y = 1, unsigned __z = 1) + : x(__x), y(__y), z(__z) {} + __attribute__((host, device)) explicit dim3(uint3 __a) + : x(__a.x), y(__a.y), z(__a.z) {} + __attribute__((host, device)) operator uint3(void) { return {x, y, z}; } +}; + +#endif Index: lib/Headers/__clang_cuda_runtime_wrapper.h =================================================================== --- lib/Headers/__clang_cuda_runtime_wrapper.h +++ lib/Headers/__clang_cuda_runtime_wrapper.h @@ -69,6 +69,13 @@ #define __CUDA_ARCH__ 350 #endif +#if defined(CUDA_VECTOR_TYPES) +// Prevent inclusion of CUDA's vector_types.h +#define __VECTOR_TYPES_H__ +// .. 
and include clang's own types for them instead. +#include "__clang_cuda_vector_types.h" +#endif + #include "cuda_builtin_vars.h" // No need for device_launch_parameters.h as cuda_builtin_vars.h above @@ -190,7 +197,9 @@ #include "math_functions.hpp" #pragma pop_macro("__host__") +#if !defined(CUDA_VECTOR_TYPES) #include "texture_indirect_functions.h" +#endif // Restore state of __CUDA_ARCH__ and __THROW we had on entry. #pragma pop_macro("__CUDA_ARCH__") Index: lib/Headers/CMakeLists.txt =================================================================== --- lib/Headers/CMakeLists.txt +++ lib/Headers/CMakeLists.txt @@ -20,6 +20,7 @@ bmiintrin.h __clang_cuda_cmath.h __clang_cuda_runtime_wrapper.h + __clang_cuda_vector_types.h cpuid.h cuda_builtin_vars.h emmintrin.h
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits