jdoerfert created this revision. jdoerfert added reviewers: JonChesterfield, ABataev. Herald added subscribers: guansong, bollu, kosarev, yaxunl. Herald added a project: clang. jdoerfert requested review of this revision. Herald added a subscriber: sstefan1.
When we implement OpenMP GPU reductions we use type punning a lot during the shuffle and reduce operations. This is not always compatible with language rules on aliasing. So far we generated TBAA which later allowed to remove some of the reduce code as accesses and initialization were "known to not alias". With this patch we avoid TBAA in this step, hopefully for all accesses that we need to. Verified on the reproducer of PR46156 and QMCPack. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D86037 Files: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
Index: clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp =================================================================== --- /dev/null +++ clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp @@ -0,0 +1,40 @@ +// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s +// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple nvptx-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s +// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -fexceptions -fcxx-exceptions -aux-triple powerpc64le-unknown-unknown -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +#include <complex> + +// Verify we do not add tbaa metadata to type punned memory operations: + +// CHECK: call i64 @__kmpc_shuffle_int64( +// CHECK-NEXT: store i64 %{{.*}}, i64* %{{.*}}, align {{[0-9]+$}} + +// CHECK: call i64 @__kmpc_shuffle_int64( +// CHECK-NEXT: store i64 %{{.*}}, i64* %{{.*}}, align {{[0-9]+$}} + +template<typename T> +void complex_reduction() +{ + #pragma omp target teams distribute + for (int ib = 0; ib < 100; ib++) + { + std::complex<T> partial_sum; + const int istart = ib * 4; + const int iend = (ib + 1) * 4; + #pragma omp parallel for reduction(+: partial_sum) + for (int i = istart; i < iend; i++) + partial_sum += std::complex<T>(i, i); + } +} + +void test() +{ + complex_reduction<float>(); + complex_reduction<double>(); +} +#endif Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp =================================================================== --- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2845,8 +2845,12 @@ Address CastItem = CGF.CreateMemTemp(CastTy); Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace())); - CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy); - return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc); + CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy, + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()); + return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc, + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()); } /// This function creates calls to one of two shuffle functions to copy @@ -2933,9 +2937,14 @@ ThenBB, ExitBB); CGF.EmitBlock(ThenBB); llvm::Value *Res = createRuntimeShuffleFunction( - CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc), + CGF, + CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc, + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()), IntType, Offset, Loc); - CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType); + CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType, + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()); Address LocalPtr = Bld.CreateConstGEP(Ptr, 1); Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1); PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB); @@ -2944,9 +2953,14 @@ CGF.EmitBlock(ExitBB); } else { llvm::Value *Res = createRuntimeShuffleFunction( - CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc), + CGF, + CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc, + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()), IntType, Offset, Loc); - CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType); + CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType, + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()); Ptr = Bld.CreateConstGEP(Ptr, 1); ElemPtr = Bld.CreateConstGEP(ElemPtr, 1); } @@ -3100,12 +3114,14 @@ } else { switch (CGF.getEvaluationKind(Private->getType())) { case TEK_Scalar: { - llvm::Value *Elem = - CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false, - Private->getType(), Private->getExprLoc()); + llvm::Value *Elem = CGF.EmitLoadOfScalar( + SrcElementAddr, /*Volatile=*/false, Private->getType(), + Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()); // Store the source element value to the dest element address. - CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false, - Private->getType()); + CGF.EmitStoreOfScalar( + Elem, DestElementAddr, /*Volatile=*/false, Private->getType(), + LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); break; } case TEK_Complex: { @@ -3250,8 +3266,9 @@ Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); Address LocalReduceList( Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, - C.VoidPtrTy, Loc), + CGF.EmitLoadOfScalar( + AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc, + LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()), CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), CGF.getPointerAlign()); @@ -3329,10 +3346,13 @@ // elem = *elemptr //*MediumPtr = elem - llvm::Value *Elem = - CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc); + llvm::Value *Elem = CGF.EmitLoadOfScalar( + ElemPtr, /*Volatile=*/false, CType, Loc, + LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); // Store the source element value to the dest element address. - CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType); + CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType, + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()); Bld.CreateBr(MergeBB); @@ -3712,8 +3732,9 @@ GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment())); switch (CGF.getEvaluationKind(Private->getType())) { case TEK_Scalar: { - llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, - Private->getType(), Loc); + llvm::Value *V = CGF.EmitLoadOfScalar( + ElemPtr, /*Volatile=*/false, Private->getType(), Loc, + LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); CGF.EmitStoreOfScalar(V, GlobLVal); break; } @@ -3916,7 +3937,9 @@ switch (CGF.getEvaluationKind(Private->getType())) { case TEK_Scalar: { llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc); - CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType()); + CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(), + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()); break; } case TEK_Complex: {
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits