https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/122149
>From 3329b7ae7dc6044f6563f218c65f6af7498290f0 Mon Sep 17 00:00:00 2001 From: Joseph Huber <hube...@outlook.com> Date: Wed, 8 Jan 2025 12:19:53 -0600 Subject: [PATCH 1/2] [OpenMP] Allow GPUs to be targeted directly via `-fopenmp`. Summary: Currently we prevent the following from working. However, it is completely reasonable to be able to target the files individually. ``` $ clang --target=amdgcn-amd-amdhsa -fopenmp ``` This patch lifts this restriction, allowing individual files to be compiled as standalone OpenMP without the extra offloading overhead. The main motivation behind this is to update the build of the OpenMP DeviceRTL. Currently, we do `--offload-device-only -S -emit-llvm` which is just a hackier version of `-fopenmp -flto -c`. This patch allows the following to work. ``` $ clang omp.c -fopenmp --target=amdgcn-amd-amdhsa -flto -c $ clang offload.c -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xoffload-linker omp.o $ ./a.out ``` --- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 3 - clang/lib/CodeGen/CGStmtOpenMP.cpp | 3 +- clang/lib/CodeGen/CodeGenModule.cpp | 2 - clang/lib/Frontend/CompilerInvocation.cpp | 13 -- clang/test/OpenMP/gpu_target.cpp | 220 ++++++++++++++++++++++ clang/test/OpenMP/target_messages.cpp | 3 - 6 files changed, 222 insertions(+), 22 deletions(-) create mode 100644 clang/test/OpenMP/gpu_target.cpp diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 756f0482b8ea72..1ad4b4b0e8a7fc 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -870,9 +870,6 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false); OMPBuilder.setConfig(Config); - if (!CGM.getLangOpts().OpenMPIsTargetDevice) - llvm_unreachable("OpenMP can only handle device code."); - if (CGM.getLangOpts().OpenMPCUDAMode) CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA; diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 6cb37b20b7aeee..950ed173aecf3a 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -6801,7 +6801,8 @@ static void emitCommonOMPTargetDirective(CodeGenFunction &CGF, CodeGenModule &CGM = CGF.CGM; // On device emit this construct as inlined code. - if (CGM.getLangOpts().OpenMPIsTargetDevice) { + if (CGM.getLangOpts().OpenMPIsTargetDevice || + CGM.getOpenMPRuntime().isGPU()) { OMPLexicalScope Scope(CGF, S, OMPD_target); CGM.getOpenMPRuntime().emitInlinedDirective( CGF, OMPD_target, [&S](CodeGenFunction &CGF, PrePostActionTy &) { diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 5f15f0f48c54e4..26abd9a60632ae 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -483,8 +483,6 @@ void CodeGenModule::createOpenMPRuntime() { case llvm::Triple::nvptx: case llvm::Triple::nvptx64: case llvm::Triple::amdgcn: - assert(getLangOpts().OpenMPIsTargetDevice && - "OpenMP AMDGPU/NVPTX is only prepared to deal with device code."); OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this)); break; default: diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index d711df02ce9503..d2df51593ff62b 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4210,19 +4210,6 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Args, OPT_fopenmp_version_EQ, (IsSimdSpecified || IsTargetSpecified) ? 51 : Opts.OpenMP, Diags)) Opts.OpenMP = Version; - // Provide diagnostic when a given target is not expected to be an OpenMP - // device or host. - if (!Opts.OpenMPIsTargetDevice) { - switch (T.getArch()) { - default: - break; - // Add unsupported host targets here: - case llvm::Triple::nvptx: - case llvm::Triple::nvptx64: - Diags.Report(diag::err_drv_omp_host_target_not_supported) << T.str(); - break; - } - } } // Set the flag to prevent the implementation from emitting device exception diff --git a/clang/test/OpenMP/gpu_target.cpp b/clang/test/OpenMP/gpu_target.cpp new file mode 100644 index 00000000000000..3d5a47d7050436 --- /dev/null +++ b/clang/test/OpenMP/gpu_target.cpp @@ -0,0 +1,220 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --version 5 +// expected-no-diagnostics + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s --check-prefix=AMDGCN +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s --check-prefix=NVPTX + +typedef enum omp_allocator_handle_t { + omp_null_allocator = 0, + omp_default_mem_alloc = 1, + omp_large_cap_mem_alloc = 2, + omp_const_mem_alloc = 3, + omp_high_bw_mem_alloc = 4, + omp_low_lat_mem_alloc = 5, + omp_cgroup_mem_alloc = 6, + omp_pteam_mem_alloc = 7, + omp_thread_mem_alloc = 8, + KMP_ALLOCATOR_MAX_HANDLE = ~(0LU) +} omp_allocator_handle_t; + +int d = 0; +#pragma omp allocate(d) allocator(omp_default_mem_alloc) + +int g = 0; +#pragma omp allocate(g) allocator(omp_cgroup_mem_alloc) + +extern const int c = 0; +#pragma omp allocate(c) allocator(omp_const_mem_alloc) + + +int foo() { + int t = 0; +#pragma omp allocate(t) allocator(omp_thread_mem_alloc) + return t; +} + +void bar() { +#pragma omp target + ; +#pragma omp parallel + ; +} + +void baz(int *p) { +#pragma omp atomic + *p += 1; +} + +int qux() { +#if defined(__NVPTX__) + return 1; +#elif defined(__AMDGPU__) + return 2; +#endif +} +//. +// AMDGCN: @c = addrspace(4) constant i32 0, align 4 +// AMDGCN: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" +// AMDGCN: @[[GLOB1:[0-9]+]] = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +// AMDGCN: @d = global i32 0, align 4 +// AMDGCN: @g = global i32 0, align 4 +// AMDGCN: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 +//. +// NVPTX: @d = global i32 0, align 4 +// NVPTX: @g = global i32 0, align 4 +// NVPTX: @c = addrspace(4) constant i32 0, align 4 +// NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" +// NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +//. +// AMDGCN-LABEL: define dso_local noundef i32 @_Z3foov( +// AMDGCN-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[T:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN-NEXT: [[T_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[T]] to ptr +// AMDGCN-NEXT: store i32 0, ptr [[T_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_ASCAST]], align 4 +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +// +// AMDGCN-LABEL: define dso_local void @_Z3barv( +// AMDGCN-SAME: ) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) +// AMDGCN-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr +// AMDGCN-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @_Z3barv_omp_outlined, ptr @_Z3barv_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0) +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define internal void @_Z3barv_omp_outlined( +// AMDGCN-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr +// AMDGCN-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr +// AMDGCN-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define internal void @_Z3barv_omp_outlined_wrapper( +// AMDGCN-SAME: i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2, addrspace(5) +// AMDGCN-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// AMDGCN-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr +// AMDGCN-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr +// AMDGCN-NEXT: [[GLOBAL_ARGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr +// AMDGCN-NEXT: store i16 [[TMP0]], ptr [[DOTADDR_ASCAST]], align 2 +// AMDGCN-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4 +// AMDGCN-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4 +// AMDGCN-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_ASCAST]]) +// AMDGCN-NEXT: call void @_Z3barv_omp_outlined(ptr [[DOTADDR1_ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]]) #[[ATTR3:[0-9]+]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @_Z3bazPi( +// AMDGCN-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr +// AMDGCN-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[TMP0]], i32 1 monotonic, align 4 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local noundef i32 @_Z3quxv( +// AMDGCN-SAME: ) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN-NEXT: ret i32 2 +// +// +// NVPTX-LABEL: define dso_local noundef i32 @_Z3foov( +// NVPTX-SAME: ) #[[ATTR0:[0-9]+]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[T:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store i32 0, ptr [[T]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[T]], align 4 +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// +// NVPTX-LABEL: define dso_local void @_Z3barv( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @_Z3barv_omp_outlined, ptr @_Z3barv_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) +// NVPTX-NEXT: ret void +// +// +// NVPTX-LABEL: define internal void @_Z3barv_omp_outlined( +// NVPTX-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// NVPTX-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// NVPTX-NEXT: ret void +// +// +// NVPTX-LABEL: define internal void @_Z3barv_omp_outlined_wrapper( +// NVPTX-SAME: i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2 +// NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 +// NVPTX-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 +// NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) +// NVPTX-NEXT: call void @_Z3barv_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3:[0-9]+]] +// NVPTX-NEXT: ret void +// +// +// NVPTX-LABEL: define dso_local void @_Z3bazPi( +// NVPTX-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[TMP0]], i32 1 monotonic, align 4 +// NVPTX-NEXT: ret void +// +// +// NVPTX-LABEL: define dso_local noundef i32 @_Z3quxv( +// NVPTX-SAME: ) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: ret i32 1 +// +//. +// AMDGCN: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// AMDGCN: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// AMDGCN: attributes #[[ATTR2]] = { convergent noinline norecurse nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// AMDGCN: attributes #[[ATTR3]] = { nounwind } +// AMDGCN: attributes #[[ATTR4:[0-9]+]] = { alwaysinline } +//. +// NVPTX: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" } +// NVPTX: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" } +// NVPTX: attributes #[[ATTR2]] = { convergent noinline norecurse nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" } +// NVPTX: attributes #[[ATTR3]] = { nounwind } +// NVPTX: attributes #[[ATTR4:[0-9]+]] = { alwaysinline } +//. +// AMDGCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} +// AMDGCN: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 45} +// AMDGCN: [[META3:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// NVPTX: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// NVPTX: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 45} +// NVPTX: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. diff --git a/clang/test/OpenMP/target_messages.cpp b/clang/test/OpenMP/target_messages.cpp index 62ab5817ba28da..ba300858bf8f79 100644 --- a/clang/test/OpenMP/target_messages.cpp +++ b/clang/test/OpenMP/target_messages.cpp @@ -6,9 +6,6 @@ // RUN: %clang_cc1 -verify=expected,omp4 -fopenmp-simd -fopenmp-version=45 -std=c++11 -o - %s // RUN: %clang_cc1 -verify=expected,omp5 -fopenmp-simd -std=c++11 -o - %s // CHECK: error: OpenMP target is invalid: 'aaa-bbb-ccc-ddd' -// RUN: not %clang_cc1 -fopenmp -std=c++11 -triple nvptx64-nvidia-cuda -o - %s 2>&1 | FileCheck --check-prefix CHECK-UNSUPPORTED-HOST-TARGET %s -// RUN: not %clang_cc1 -fopenmp -std=c++11 -triple nvptx-nvidia-cuda -o - %s 2>&1 | FileCheck --check-prefix CHECK-UNSUPPORTED-HOST-TARGET %s -// CHECK-UNSUPPORTED-HOST-TARGET: error: target '{{nvptx64-nvidia-cuda|nvptx-nvidia-cuda}}' is not a supported OpenMP host target // RUN: not %clang_cc1 -fopenmp -std=c++11 -fopenmp-targets=hexagon-linux-gnu -o - %s 2>&1 | FileCheck --check-prefix CHECK-UNSUPPORTED-DEVICE-TARGET %s // CHECK-UNSUPPORTED-DEVICE-TARGET: OpenMP target is invalid: 'hexagon-linux-gnu' >From 6a7b5fe8a75f013e49f6a9ce08736d1ca5576797 Mon Sep 17 00:00:00 2001 From: Joseph Huber <hube...@outlook.com> Date: Thu, 9 Jan 2025 10:55:38 -0600 Subject: [PATCH 2/2] update to make omp target illegal --- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 13 ++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.h | 10 ++ clang/lib/CodeGen/CGStmtOpenMP.cpp | 3 +- clang/test/OpenMP/gpu_target.cpp | 209 +++++++++++++++++++++- clang/test/OpenMP/gpu_target_messages.cpp | 7 + 5 files changed, 238 insertions(+), 4 deletions(-) create mode 100644 clang/test/OpenMP/gpu_target_messages.cpp diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 1ad4b4b0e8a7fc..5587196e750539 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1309,6 +1309,19 @@ void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF, Args); } +void CGOpenMPRuntimeGPU::emitTargetCall( + CodeGenFunction &CGF, const OMPExecutableDirective &D, + llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, const Expr *IfCond, + llvm::PointerIntPair<const Expr *, 2, OpenMPDeviceClauseModifier> Device, + llvm::function_ref<llvm::Value *(CodeGenFunction &CGF, + const OMPLoopDirective &D)> + SizeEmitter) { + SmallString<256> Buffer; + llvm::raw_svector_ostream Out(Buffer); + Out << "Cannot emit a '#pragma omp target' on the GPU"; + CGM.Error(D.getBeginLoc(), Out.str()); +} + void CGOpenMPRuntimeGPU::emitCriticalRegion( CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h index b59f43a6915ddf..0d64ba11265522 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -248,6 +248,16 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime { OpenMPDirectiveKind Kind, bool EmitChecks = true, bool ForceSimpleCall = false) override; + /// Emit the target offloading code associated with \a D. This is not + /// supported by the GPU-side and simply returns an error. + virtual void emitTargetCall( + CodeGenFunction &CGF, const OMPExecutableDirective &D, + llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, const Expr *IfCond, + llvm::PointerIntPair<const Expr *, 2, OpenMPDeviceClauseModifier> Device, + llvm::function_ref<llvm::Value *(CodeGenFunction &CGF, + const OMPLoopDirective &D)> + SizeEmitter); + /// Emits a critical region. /// \param CriticalName Name of the critical region. /// \param CriticalOpGen Generator for the statement associated with the given diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 950ed173aecf3a..6cb37b20b7aeee 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -6801,8 +6801,7 @@ static void emitCommonOMPTargetDirective(CodeGenFunction &CGF, CodeGenModule &CGM = CGF.CGM; // On device emit this construct as inlined code. - if (CGM.getLangOpts().OpenMPIsTargetDevice || - CGM.getOpenMPRuntime().isGPU()) { + if (CGM.getLangOpts().OpenMPIsTargetDevice) { OMPLexicalScope Scope(CGF, S, OMPD_target); CGM.getOpenMPRuntime().emitInlinedDirective( CGF, OMPD_target, [&S](CodeGenFunction &CGF, PrePostActionTy &) { diff --git a/clang/test/OpenMP/gpu_target.cpp b/clang/test/OpenMP/gpu_target.cpp index 3d5a47d7050436..6ab95056561de6 100644 --- a/clang/test/OpenMP/gpu_target.cpp +++ b/clang/test/OpenMP/gpu_target.cpp @@ -34,10 +34,11 @@ int foo() { } void bar() { -#pragma omp target - ; #pragma omp parallel ; +#pragma omp parallel for + for (int i = 0; i < 1; ++i) + ; } void baz(int *p) { @@ -56,6 +57,7 @@ int qux() { // AMDGCN: @c = addrspace(4) constant i32 0, align 4 // AMDGCN: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" // AMDGCN: @[[GLOB1:[0-9]+]] = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +// AMDGCN: @[[GLOB2:[0-9]+]] = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 // AMDGCN: @d = global i32 0, align 4 // AMDGCN: @g = global i32 0, align 4 // AMDGCN: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 @@ -65,6 +67,7 @@ int qux() { // NVPTX: @c = addrspace(4) constant i32 0, align 4 // NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" // NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +// NVPTX: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 //. // AMDGCN-LABEL: define dso_local noundef i32 @_Z3foov( // AMDGCN-SAME: ) #[[ATTR0:[0-9]+]] { @@ -82,9 +85,12 @@ int qux() { // AMDGCN-SAME: ) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8, addrspace(5) +// AMDGCN-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8, addrspace(5) // AMDGCN-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) // AMDGCN-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr +// AMDGCN-NEXT: [[CAPTURED_VARS_ADDRS1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS1]] to ptr // AMDGCN-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @_Z3barv_omp_outlined, ptr @_Z3barv_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0) +// AMDGCN-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @_Z3barv_omp_outlined.1, ptr @_Z3barv_omp_outlined.1_wrapper, ptr [[CAPTURED_VARS_ADDRS1_ASCAST]], i64 0) // AMDGCN-NEXT: ret void // // @@ -119,6 +125,111 @@ int qux() { // AMDGCN-NEXT: ret void // // +// AMDGCN-LABEL: define internal void @_Z3barv_omp_outlined.1( +// AMDGCN-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[TMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr +// AMDGCN-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr +// AMDGCN-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// AMDGCN-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr +// AMDGCN-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// AMDGCN-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// AMDGCN-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr +// AMDGCN-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr +// AMDGCN-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// AMDGCN-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// AMDGCN-NEXT: store i32 0, ptr [[DOTOMP_UB_ASCAST]], align 4 +// AMDGCN-NEXT: store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4 +// AMDGCN-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 +// AMDGCN-NEXT: call void @__kmpc_for_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB2]] to ptr), i32 [[TMP1]], i32 33, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_LB_ASCAST]], ptr [[DOTOMP_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 1) +// AMDGCN-NEXT: br label %[[OMP_DISPATCH_COND:.*]] +// AMDGCN: [[OMP_DISPATCH_COND]]: +// AMDGCN-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// AMDGCN-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 0 +// AMDGCN-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// AMDGCN: [[COND_TRUE]]: +// AMDGCN-NEXT: br label %[[COND_END:.*]] +// AMDGCN: [[COND_FALSE]]: +// AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// AMDGCN-NEXT: br label %[[COND_END]] +// AMDGCN: [[COND_END]]: +// AMDGCN-NEXT: [[COND:%.*]] = phi i32 [ 0, %[[COND_TRUE]] ], [ [[TMP3]], %[[COND_FALSE]] ] +// AMDGCN-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// AMDGCN-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// AMDGCN-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] +// AMDGCN-NEXT: br i1 [[CMP1]], label %[[OMP_DISPATCH_BODY:.*]], label %[[OMP_DISPATCH_END:.*]] +// AMDGCN: [[OMP_DISPATCH_BODY]]: +// AMDGCN-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// AMDGCN: [[OMP_INNER_FOR_COND]]: +// AMDGCN-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// AMDGCN-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// AMDGCN-NEXT: br i1 [[CMP2]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] +// AMDGCN: [[OMP_INNER_FOR_BODY]]: +// AMDGCN-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// AMDGCN-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 +// AMDGCN-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// AMDGCN-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// AMDGCN-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] +// AMDGCN: [[OMP_BODY_CONTINUE]]: +// AMDGCN-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// AMDGCN: [[OMP_INNER_FOR_INC]]: +// AMDGCN-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// AMDGCN-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// AMDGCN-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// AMDGCN-NEXT: br label %[[OMP_INNER_FOR_COND]] +// AMDGCN: [[OMP_INNER_FOR_END]]: +// AMDGCN-NEXT: br label %[[OMP_DISPATCH_INC:.*]] +// AMDGCN: [[OMP_DISPATCH_INC]]: +// AMDGCN-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4 +// AMDGCN-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// AMDGCN-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_LB_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4 +// AMDGCN-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]] +// AMDGCN-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_UB_ASCAST]], align 4 +// AMDGCN-NEXT: br label %[[OMP_DISPATCH_COND]] +// AMDGCN: [[OMP_DISPATCH_END]]: +// AMDGCN-NEXT: call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB2]] to ptr), i32 [[TMP1]]) +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define internal void @_Z3barv_omp_outlined.1_wrapper( +// AMDGCN-SAME: i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2, addrspace(5) +// AMDGCN-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// AMDGCN-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr +// AMDGCN-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr +// AMDGCN-NEXT: [[GLOBAL_ARGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr +// AMDGCN-NEXT: store i16 [[TMP0]], ptr [[DOTADDR_ASCAST]], align 2 +// AMDGCN-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4 +// AMDGCN-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4 +// AMDGCN-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_ASCAST]]) +// AMDGCN-NEXT: call void @_Z3barv_omp_outlined.1(ptr [[DOTADDR1_ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]]) #[[ATTR3]] +// AMDGCN-NEXT: ret void +// +// // AMDGCN-LABEL: define dso_local void @_Z3bazPi( // AMDGCN-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -151,8 +262,10 @@ int qux() { // NVPTX-SAME: ) #[[ATTR0]] { // NVPTX-NEXT: [[ENTRY:.*:]] // NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 +// NVPTX-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x ptr], align 8 // NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) // NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @_Z3barv_omp_outlined, ptr @_Z3barv_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) +// NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @_Z3barv_omp_outlined1, ptr @_Z3barv_omp_outlined1_wrapper, ptr [[CAPTURED_VARS_ADDRS1]], i64 0) // NVPTX-NEXT: ret void // // @@ -181,6 +294,98 @@ int qux() { // NVPTX-NEXT: ret void // // +// NVPTX-LABEL: define internal void @_Z3barv_omp_outlined1( +// NVPTX-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[I:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// NVPTX-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// NVPTX-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// NVPTX-NEXT: store i32 0, ptr [[DOTOMP_UB]], align 4 +// NVPTX-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// NVPTX-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 +// NVPTX-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP1]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// NVPTX-NEXT: br label %[[OMP_DISPATCH_COND:.*]] +// NVPTX: [[OMP_DISPATCH_COND]]: +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// NVPTX-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 0 +// NVPTX-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// NVPTX: [[COND_TRUE]]: +// NVPTX-NEXT: br label %[[COND_END:.*]] +// NVPTX: [[COND_FALSE]]: +// NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// NVPTX-NEXT: br label %[[COND_END]] +// NVPTX: [[COND_END]]: +// NVPTX-NEXT: [[COND:%.*]] = phi i32 [ 0, %[[COND_TRUE]] ], [ [[TMP3]], %[[COND_FALSE]] ] +// NVPTX-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// NVPTX-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4 +// NVPTX-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// NVPTX-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// NVPTX-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] +// NVPTX-NEXT: br i1 [[CMP1]], label %[[OMP_DISPATCH_BODY:.*]], label %[[OMP_DISPATCH_END:.*]] +// NVPTX: [[OMP_DISPATCH_BODY]]: +// NVPTX-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// NVPTX: [[OMP_INNER_FOR_COND]]: +// NVPTX-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// NVPTX-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// NVPTX-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// NVPTX-NEXT: br i1 [[CMP2]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] +// NVPTX: [[OMP_INNER_FOR_BODY]]: +// NVPTX-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// NVPTX-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 +// NVPTX-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// NVPTX-NEXT: store i32 [[ADD]], ptr [[I]], align 4 +// NVPTX-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] +// NVPTX: [[OMP_BODY_CONTINUE]]: +// NVPTX-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// NVPTX: [[OMP_INNER_FOR_INC]]: +// NVPTX-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// NVPTX-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// NVPTX-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4 +// NVPTX-NEXT: br label %[[OMP_INNER_FOR_COND]] +// NVPTX: [[OMP_INNER_FOR_END]]: +// NVPTX-NEXT: br label %[[OMP_DISPATCH_INC:.*]] +// NVPTX: [[OMP_DISPATCH_INC]]: +// NVPTX-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// NVPTX-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4 +// NVPTX-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// NVPTX-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_LB]], align 4 +// NVPTX-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// NVPTX-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4 +// NVPTX-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]] +// NVPTX-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_UB]], align 4 +// NVPTX-NEXT: br label %[[OMP_DISPATCH_COND]] +// NVPTX: [[OMP_DISPATCH_END]]: +// NVPTX-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]]) +// NVPTX-NEXT: ret void +// +// +// NVPTX-LABEL: define internal void @_Z3barv_omp_outlined1_wrapper( +// NVPTX-SAME: i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2 +// NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4 +// NVPTX-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 +// NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) +// NVPTX-NEXT: call void @_Z3barv_omp_outlined1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]] +// NVPTX-NEXT: ret void +// +// // NVPTX-LABEL: define dso_local void @_Z3bazPi( // NVPTX-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] { // NVPTX-NEXT: [[ENTRY:.*:]] diff --git a/clang/test/OpenMP/gpu_target_messages.cpp b/clang/test/OpenMP/gpu_target_messages.cpp new file mode 100644 index 00000000000000..e4fe5b49b6d5b6 --- /dev/null +++ b/clang/test/OpenMP/gpu_target_messages.cpp @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple amdgcn-amd-amdhsa -emit-llvm %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-nvidia-cuda -emit-llvm %s + +void foo() { +#pragma omp target // expected-error {{Cannot emit a '#pragma omp target' on the GPU}} + ; +} _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits