arsenm created this revision.
arsenm added reviewers: Anastasia, yaxunl, rjmccall.
Herald added subscribers: tpr, nhaehnle, wdng.
The way address space declarations for builtins currently work is nearly useless. The code assumes the address space used for a builtin is the confusingly named "target address space", i.e. that user code using __attribute__((address_space(N))) matches the builtin declaration. There is no way to use this to declare a builtin that returns a language-specific address space. The terminology is highly confusing, since it has nothing to do with the address space the target selects for a given language address space.

This feature is essentially unused as-is. AMDGPU and NVPTX are the only in-tree targets attempting to use it. The AMDGPU builtins certainly do not behave as intended (i.e. none of the builtins returning pointers can ever compile, because the numbered address space never matches the expected named address space). Some of the NVPTX builtins are missing tests, and the others seem to rely on an implicit addrspacecast.

Change the address space used for builtins based on a target hook, so that a language address space can be used in a builtin declaration. This allows the same builtin declaration to be used for multiple languages with similarly purposed address spaces (e.g. the same AMDGPU builtin can be used in OpenCL and CUDA even though the constant address spaces are arbitrarily different).

This breaks the possibility of using arbitrary numbered address spaces alongside the named address spaces for builtins. If this turns out to be an issue, we probably need to introduce another builtin declaration character to distinguish language address spaces from so-called "target address spaces".

https://reviews.llvm.org/D47154

Files:
  include/clang/AST/ASTContext.h
  include/clang/Basic/BuiltinsAMDGPU.def
  include/clang/Basic/TargetInfo.h
  lib/AST/ASTContext.cpp
  lib/Basic/Targets/AMDGPU.h
  lib/CodeGen/CGBuiltin.cpp
  lib/Sema/SemaExpr.cpp
  test/CodeGenCUDA/builtins-amdgcn.cu
  test/CodeGenOpenCL/builtins-amdgcn-vi.cl
  test/CodeGenOpenCL/builtins-amdgcn.cl
  test/CodeGenOpenCL/numbered-address-space.cl
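To make the change concrete before the diff, here is a minimal sketch (illustration only, not part of the patch) of how the "v*4" signature used for __builtin_amdgcn_dispatch_ptr below is now interpreted, assuming the AMDGPU hooks added in this patch:

  // OpenCL: the '4' in "v*4" goes through getOpenCLBuiltinAddressSpace(4),
  // which maps it to LangAS::opencl_constant, so the builtin is typed as
  // returning __constant void * and this assignment type-checks directly:
  __constant void *p = __builtin_amdgcn_dispatch_ptr();

  // CUDA: getCUDABuiltinAddressSpace returns LangAS::Default, so the call
  // is typed with a generic pointer result and CodeGen inserts the
  // addrspacecast from the intrinsic's addrspace(4) return value:
  const void *q = __builtin_amdgcn_dispatch_ptr();

Under the old "Uc*4" declaration, only a pointer spelled with __attribute__((address_space(4))) could match, which no OpenCL or CUDA language pointer ever carries.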
Index: test/CodeGenOpenCL/numbered-address-space.cl
===================================================================
--- /dev/null
+++ test/CodeGenOpenCL/numbered-address-space.cl
@@ -0,0 +1,47 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tonga -S -emit-llvm -O0 -o - %s | FileCheck %s
+
+// Make sure using numbered address spaces doesn't trigger crashes when a
+// builtin has an address space parameter.
+
+// CHECK-LABEL: @test_numbered_as_to_generic(
+// CHECK: addrspacecast i32 addrspace(42)* %0 to i32*
+void test_numbered_as_to_generic(__attribute__((address_space(42))) int *arbitary_numbered_ptr) {
+  generic int* generic_ptr = arbitary_numbered_ptr;
+  *generic_ptr = 4;
+}
+
+// CHECK-LABEL: @test_numbered_as_to_builtin(
+// CHECK: addrspacecast i32 addrspace(42)* %0 to float addrspace(3)*
+void test_numbered_as_to_builtin(__attribute__((address_space(42))) int *arbitary_numbered_ptr, float src) {
+  volatile float result = __builtin_amdgcn_ds_fmax(arbitary_numbered_ptr, src, 0, 0, false);
+}
+
+// CHECK-LABEL: @test_generic_as_to_builtin_parameter_explicit_cast(
+// CHECK: addrspacecast i32 addrspace(3)* %0 to i32*
+void test_generic_as_to_builtin_parameter_explicit_cast(__local int *local_ptr, float src) {
+  generic int* generic_ptr = local_ptr;
+  volatile float result = __builtin_amdgcn_ds_fmax((__local float*) generic_ptr, src, 0, 0, false);
+}
+
+// CHECK-LABEL: @test_generic_as_to_builtin_parameter_implicit_cast(
+// CHECK: addrspacecast i32* %2 to float addrspace(3)*
+void test_generic_as_to_builtin_parameter_implicit_cast(__local int *local_ptr, float src) {
+  generic int* generic_ptr = local_ptr;
+
+  volatile float result = __builtin_amdgcn_ds_fmax(generic_ptr, src, 0, 0, false);
+}
+
+#if 0
+// XXX: Should this compile?
+void test_generic_as_to_builtin_parameter_explicit_cast_numeric(__attribute__((address_space(3))) int *local_ptr, float src) {
+  generic int* generic_ptr = local_ptr;
+  volatile float result = __builtin_amdgcn_ds_fmax((__attribute__((address_space(3))) float*) generic_ptr, src, 0, 0, false);
+}
+
+// XXX: Should this compile?
+void test_generic_as_to_builtin_parameterimplicit_cast_numeric(__attribute__((address_space(3))) int *local_ptr, float src) {
+  generic int* generic_ptr = local_ptr;
+  volatile float result = __builtin_amdgcn_ds_fmax(generic_ptr, src, 0, 0, false);
+}
+#endif

Index: test/CodeGenOpenCL/builtins-amdgcn.cl
===================================================================
--- test/CodeGenOpenCL/builtins-amdgcn.cl
+++ test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -1,6 +1,6 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown-opencl -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown-opencl -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
@@ -20,19 +20,42 @@
   *flagout = flag;
 }
 
-// CHECK-LABEL: @test_div_scale_f32
+// CHECK-LABEL: @test_div_scale_f32(
 // CHECK: call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
 // CHECK-DAG: [[FLAG:%.+]] = extractvalue { float, i1 } %{{.+}}, 1
 // CHECK-DAG: [[VAL:%.+]] = extractvalue { float, i1 } %{{.+}}, 0
-// CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i32
-// CHECK: store i32 [[FLAGEXT]]
-void test_div_scale_f32(global float* out, global int* flagout, float a, float b)
+// CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i8
+// CHECK: store i8 [[FLAGEXT]]
+void test_div_scale_f32(global float* out, global bool* flagout, float a, float b)
 {
   bool flag;
   *out = __builtin_amdgcn_div_scalef(a, b, true, &flag);
   *flagout = flag;
 }
 
+// CHECK-LABEL: @test_div_scale_f32_global_ptr(
+// CHECK: call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
+// CHECK-DAG: [[FLAG:%.+]] = extractvalue { float, i1 } %{{.+}}, 1
+// CHECK-DAG: [[VAL:%.+]] = extractvalue { float, i1 } %{{.+}}, 0
+// CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i8
+// CHECK: store i8 [[FLAGEXT]]
+void test_div_scale_f32_global_ptr(global float* out, global int* flagout, float a, float b, global bool* flag)
+{
+  *out = __builtin_amdgcn_div_scalef(a, b, true, flag);
+}
+
+// CHECK-LABEL: @test_div_scale_f32_generic_ptr(
+// CHECK: call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
+// CHECK-DAG: [[FLAG:%.+]] = extractvalue { float, i1 } %{{.+}}, 1
+// CHECK-DAG: [[VAL:%.+]] = extractvalue { float, i1 } %{{.+}}, 0
+// CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i8
+// CHECK: store i8 [[FLAGEXT]]
+void test_div_scale_f32_generic_ptr(global float* out, global int* flagout, float a, float b, global bool* flag_arg)
+{
+  generic bool* flag = flag_arg;
+  *out = __builtin_amdgcn_div_scalef(a, b, true, flag);
+}
+
 // CHECK-LABEL: @test_div_fmas_f32
 // CHECK: call float @llvm.amdgcn.div.fmas.f32
 void test_div_fmas_f32(global float* out, float a, float b, float c, int d)
@@ -414,42 +437,42 @@
 }
 
 // CHECK-LABEL: @test_read_exec(
-// CHECK: call i64 @llvm.read_register.i64(metadata ![[EXEC:[0-9]+]]) #[[READ_EXEC_ATTRS:[0-9]+]]
+// CHECK: call i64 @llvm.read_register.i64(metadata ![[$EXEC:[0-9]+]]) #[[$READ_EXEC_ATTRS:[0-9]+]]
 void test_read_exec(global ulong* out) {
   *out = __builtin_amdgcn_read_exec();
 }
 
-// CHECK: declare i64 @llvm.read_register.i64(metadata) #[[NOUNWIND_READONLY:[0-9]+]]
+// CHECK: declare i64 @llvm.read_register.i64(metadata) #[[$NOUNWIND_READONLY:[0-9]+]]
 
 // CHECK-LABEL: @test_read_exec_lo(
-// CHECK: call i32 @llvm.read_register.i32(metadata ![[EXEC_LO:[0-9]+]]) #[[READ_EXEC_ATTRS]]
+// CHECK: call i32 @llvm.read_register.i32(metadata ![[$EXEC_LO:[0-9]+]]) #[[$READ_EXEC_ATTRS]]
 void test_read_exec_lo(global uint* out) {
   *out = __builtin_amdgcn_read_exec_lo();
}
 
 // CHECK-LABEL: @test_read_exec_hi(
-// CHECK: call i32 @llvm.read_register.i32(metadata ![[EXEC_HI:[0-9]+]]) #[[READ_EXEC_ATTRS]]
+// CHECK: call i32 @llvm.read_register.i32(metadata ![[$EXEC_HI:[0-9]+]]) #[[$READ_EXEC_ATTRS]]
 void test_read_exec_hi(global uint* out) {
   *out = __builtin_amdgcn_read_exec_hi();
 }
 
 // CHECK-LABEL: @test_dispatch_ptr
 // CHECK: call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-void test_dispatch_ptr(__attribute__((address_space(4))) unsigned char ** out)
+void test_dispatch_ptr(__constant unsigned char ** out)
 {
   *out = __builtin_amdgcn_dispatch_ptr();
 }
 
 // CHECK-LABEL: @test_kernarg_segment_ptr
 // CHECK: call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-void test_kernarg_segment_ptr(__attribute__((address_space(4))) unsigned char ** out)
+void test_kernarg_segment_ptr(__constant unsigned char ** out)
 {
   *out = __builtin_amdgcn_kernarg_segment_ptr();
 }
 
 // CHECK-LABEL: @test_implicitarg_ptr
 // CHECK: call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-void test_implicitarg_ptr(__attribute__((address_space(4))) unsigned char ** out)
+void test_implicitarg_ptr(__constant unsigned char ** out)
 {
   *out = __builtin_amdgcn_implicitarg_ptr();
 }
 
@@ -480,9 +503,9 @@
 }
 
 // CHECK-LABEL: @test_get_local_id(
-// CHECK: tail call i32 @llvm.amdgcn.workitem.id.x(), !range [[WI_RANGE:![0-9]*]]
-// CHECK: tail call i32 @llvm.amdgcn.workitem.id.y(), !range [[WI_RANGE]]
-// CHECK: tail call i32 @llvm.amdgcn.workitem.id.z(), !range [[WI_RANGE]]
+// CHECK: tail call i32 @llvm.amdgcn.workitem.id.x(), !range [[$WI_RANGE:![0-9]*]]
+// CHECK: tail call i32 @llvm.amdgcn.workitem.id.y(), !range [[$WI_RANGE]]
+// CHECK: tail call i32 @llvm.amdgcn.workitem.id.z(), !range [[$WI_RANGE]]
 void test_get_local_id(int d, global int *out)
 {
   switch (d) {
@@ -507,9 +530,9 @@
   *out = __builtin_amdgcn_s_getpc();
 }
 
-// CHECK-DAG: [[WI_RANGE]] = !{i32 0, i32 1024}
-// CHECK-DAG: attributes #[[NOUNWIND_READONLY:[0-9]+]] = { nounwind readonly }
-// CHECK-DAG: attributes #[[READ_EXEC_ATTRS]] = { convergent }
-// CHECK-DAG: ![[EXEC]] = !{!"exec"}
-// CHECK-DAG: ![[EXEC_LO]] = !{!"exec_lo"}
-// CHECK-DAG: ![[EXEC_HI]] = !{!"exec_hi"}
+// CHECK-DAG: [[$WI_RANGE]] = !{i32 0, i32 1024}
+// CHECK-DAG: attributes #[[$NOUNWIND_READONLY:[0-9]+]] = { nounwind readonly }
+// CHECK-DAG: attributes #[[$READ_EXEC_ATTRS]] = { convergent }
+// CHECK-DAG: ![[$EXEC]] = !{!"exec"}
+// CHECK-DAG: ![[$EXEC_LO]] = !{!"exec_lo"}
+// CHECK-DAG: ![[$EXEC_HI]] = !{!"exec_hi"}

Index: test/CodeGenOpenCL/builtins-amdgcn-vi.cl
===================================================================
--- test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -91,18 +91,18 @@
 
 // CHECK-LABEL: @test_ds_fadd
 // CHECK: call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
-void test_ds_fadd(__attribute__((address_space(3))) float *out, float src) {
+void test_ds_fadd(__local float *out, float src) {
   *out = __builtin_amdgcn_ds_fadd(out, src, 0, 0, false);
 }
 
 // CHECK-LABEL: @test_ds_fmin
 // CHECK: call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
-void test_ds_fmin(__attribute__((address_space(3))) float *out, float src) {
+void test_ds_fmin(__local float *out, float src) {
   *out = __builtin_amdgcn_ds_fmin(out, src, 0, 0, false);
 }
 
 // CHECK-LABEL: @test_ds_fmax
 // CHECK: call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false)
-void test_ds_fmax(__attribute__((address_space(3))) float *out, float src) {
+void test_ds_fmax(__local float *out, float src) {
   *out = __builtin_amdgcn_ds_fmax(out, src, 0, 0, false);
 }

Index: test/CodeGenCUDA/builtins-amdgcn.cu
===================================================================
--- /dev/null
+++ test/CodeGenCUDA/builtins-amdgcn.cu
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device -emit-llvm %s -o - | FileCheck %s
+#include "Inputs/cuda.h"
+
+// CHECK-LABEL: @_Z16use_dispatch_ptrPi(
+// CHECK: %2 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+// CHECK: %3 = addrspacecast i8 addrspace(4)* %2 to i8 addrspace(4)**
+__global__ void use_dispatch_ptr(int* out) {
+  const int* dispatch_ptr = (const int*)__builtin_amdgcn_dispatch_ptr();
+  *out = *dispatch_ptr;
+}
+
+// CHECK-LABEL: @_Z12test_ds_fmaxf(
+// CHECK: call float @llvm.amdgcn.ds.fmax(float addrspace(3)* @_ZZ12test_ds_fmaxfE6shared, float %2, i32 0, i32 0, i1 false)
+__global__
+void test_ds_fmax(float src) {
+  __shared__ float shared;
+  volatile float x = __builtin_amdgcn_ds_fmax(&shared, src, 0, 0, false);
+}

Index: lib/Sema/SemaExpr.cpp
===================================================================
--- lib/Sema/SemaExpr.cpp
+++ lib/Sema/SemaExpr.cpp
@@ -5086,10 +5086,13 @@
       continue;
     }
 
+    QualType PointeeType = ParamType->getPointeeType();
+    if (PointeeType.getQualifiers().hasAddressSpace())
+      continue;
+
     NeedsNewDecl = true;
     LangAS AS = ArgType->getPointeeType().getAddressSpace();
 
-    QualType PointeeType = ParamType->getPointeeType();
     PointeeType = Context.getAddrSpaceQualType(PointeeType, AS);
     OverloadParams.push_back(Context.getPointerType(PointeeType));
   }

Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -3495,6 +3495,16 @@
       // we need to do a bit cast.
       llvm::Type *PTy = FTy->getParamType(i);
       if (PTy != ArgValue->getType()) {
+        // XXX - vector of pointers?
+        if (auto *PtrTy = dyn_cast<llvm::PointerType>(PTy)) {
+          if (PtrTy->getAddressSpace() !=
+              ArgValue->getType()->getPointerAddressSpace()) {
+            ArgValue = Builder.CreateAddrSpaceCast(
+              ArgValue,
+              ArgValue->getType()->getPointerTo(PtrTy->getAddressSpace()));
+          }
+        }
+
         assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) &&
                "Must be able to losslessly bit cast to param");
         ArgValue = Builder.CreateBitCast(ArgValue, PTy);
@@ -3511,6 +3521,14 @@
     RetTy = ConvertType(BuiltinRetType);
 
   if (RetTy != V->getType()) {
+    // XXX - vector of pointers?
+    if (auto *PtrTy = dyn_cast<llvm::PointerType>(RetTy)) {
+      if (PtrTy->getAddressSpace() != V->getType()->getPointerAddressSpace()) {
+        V = Builder.CreateAddrSpaceCast(
+          V, V->getType()->getPointerTo(PtrTy->getAddressSpace()));
+      }
+    }
+
     assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
            "Must be able to losslessly bit cast result type");
     V = Builder.CreateBitCast(V, RetTy);

Index: lib/Basic/Targets/AMDGPU.h
===================================================================
--- lib/Basic/Targets/AMDGPU.h
+++ lib/Basic/Targets/AMDGPU.h
@@ -378,6 +378,27 @@
     }
   }
 
+  LangAS getOpenCLBuiltinAddressSpace(unsigned AS) const override {
+    switch (AS) {
+    case 0:
+      return LangAS::opencl_generic;
+    case 1:
+      return LangAS::opencl_global;
+    case 3:
+      return LangAS::opencl_local;
+    case 4:
+      return LangAS::opencl_constant;
+    case 5:
+      return LangAS::opencl_private;
+    default:
+      return getLangASFromTargetAS(AS);
+    }
+  }
+
+  LangAS getCUDABuiltinAddressSpace(unsigned AS) const override {
+    return LangAS::Default;
+  }
+
   llvm::Optional<LangAS> getConstantAddressSpace() const override {
     return getLangASFromTargetAS(Constant);
   }

Index: lib/AST/ASTContext.cpp
===================================================================
--- lib/AST/ASTContext.cpp
+++ lib/AST/ASTContext.cpp
@@ -9090,9 +9090,11 @@
     // qualified with an address space.
     char *End;
     unsigned AddrSpace = strtoul(Str, &End, 10);
-    if (End != Str && AddrSpace != 0) {
-      Type = Context.getAddrSpaceQualType(Type,
-                                          getLangASFromTargetAS(AddrSpace));
+    if (End != Str) {
+      // Note AddrSpace == 0 is not the same as an unspecified address space.
+      Type = Context.getAddrSpaceQualType(
+        Type,
+        Context.getLangASForBuiltinAddressSpace(AddrSpace));
       Str = End;
     }
     if (c == '*')
@@ -9969,6 +9971,16 @@
   return (*AddrSpaceMap)[(unsigned)AS];
 }
 
+LangAS ASTContext::getLangASForBuiltinAddressSpace(unsigned AS) const {
+  if (LangOpts.OpenCL)
+    return getTargetInfo().getOpenCLBuiltinAddressSpace(AS);
+
+  if (LangOpts.CUDA)
+    return getTargetInfo().getCUDABuiltinAddressSpace(AS);
+
+  return getLangASFromTargetAS(AS);
+}
+
 // Explicitly instantiate this in case a Redeclarable<T> is used from a TU that
 // doesn't include ASTContext.h
 template

Index: include/clang/Basic/TargetInfo.h
===================================================================
--- include/clang/Basic/TargetInfo.h
+++ include/clang/Basic/TargetInfo.h
@@ -1006,6 +1006,14 @@
   const LangASMap &getAddressSpaceMap() const { return *AddrSpaceMap; }
 
+  virtual LangAS getOpenCLBuiltinAddressSpace(unsigned AS) const {
+    return getLangASFromTargetAS(AS);
+  }
+
+  virtual LangAS getCUDABuiltinAddressSpace(unsigned AS) const {
+    return getLangASFromTargetAS(AS);
+  }
+
   /// Return an AST address space which can be used opportunistically
   /// for constant global memory. It must be possible to convert pointers into
   /// this address space to LangAS::Default. If no such address space exists,

Index: include/clang/Basic/BuiltinsAMDGPU.def
===================================================================
--- include/clang/Basic/BuiltinsAMDGPU.def
+++ include/clang/Basic/BuiltinsAMDGPU.def
@@ -21,9 +21,9 @@
 // SI+ only builtins.
 //===----------------------------------------------------------------------===//
 
-BUILTIN(__builtin_amdgcn_dispatch_ptr, "Uc*4", "nc")
-BUILTIN(__builtin_amdgcn_kernarg_segment_ptr, "Uc*4", "nc")
-BUILTIN(__builtin_amdgcn_implicitarg_ptr, "Uc*4", "nc")
+BUILTIN(__builtin_amdgcn_dispatch_ptr, "v*4", "nc")
+BUILTIN(__builtin_amdgcn_kernarg_segment_ptr, "v*4", "nc")
+BUILTIN(__builtin_amdgcn_implicitarg_ptr, "v*4", "nc")
 
 BUILTIN(__builtin_amdgcn_workgroup_id_x, "Ui", "nc")
 BUILTIN(__builtin_amdgcn_workgroup_id_y, "Ui", "nc")
@@ -45,6 +45,8 @@
 BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n")
 BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n")
 BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n")
+
+// FIXME: Need to disallow constant address space.
 BUILTIN(__builtin_amdgcn_div_scale, "dddbb*", "n")
 BUILTIN(__builtin_amdgcn_div_scalef, "fffbb*", "n")
 BUILTIN(__builtin_amdgcn_div_fmas, "ddddb", "nc")

Index: include/clang/AST/ASTContext.h
===================================================================
--- include/clang/AST/ASTContext.h
+++ include/clang/AST/ASTContext.h
@@ -2433,6 +2433,8 @@
 
   unsigned getTargetAddressSpace(LangAS AS) const;
 
+  LangAS getLangASForBuiltinAddressSpace(unsigned AS) const;
+
   /// Get target-dependent integer value for null pointer which is used for
   /// constant folding.
   uint64_t getTargetNullPointerValue(QualType QT) const;
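For reference, an out-of-tree target should be able to opt into this mechanism by overriding just the two new TargetInfo hooks. A minimal sketch under that assumption (MyTargetInfo and its address-space numbering are hypothetical, and the rest of the TargetInfo implementation is elided):

  // Hypothetical target: translate the numbers used in its Builtins*.def
  // entries into language address spaces, mirroring the AMDGPU override.
  class MyTargetInfo : public TargetInfo {
  public:
    LangAS getOpenCLBuiltinAddressSpace(unsigned AS) const override {
      switch (AS) {
      case 1: return LangAS::opencl_global;
      case 3: return LangAS::opencl_local;
      default: return getLangASFromTargetAS(AS);
      }
    }
    LangAS getCUDABuiltinAddressSpace(unsigned AS) const override {
      // Type builtins with the default (generic) address space and rely on
      // the implicit addrspacecasts CGBuiltin now inserts.
      return LangAS::Default;
    }
  };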