https://github.com/TSWorld1314 created https://github.com/llvm/llvm-project/pull/85795
None

>From c44d8c1c7986fad6370273aca55e2db99d47387a Mon Sep 17 00:00:00 2001
From: "Harrison,Hao" <tsworld1...@gmail.com>
Date: Tue, 19 Mar 2024 13:18:07 +0000
Subject: [PATCH] [Clang][OpenMP] Port clang codegen code for GPU

First project

---
 clang/include/clang/Basic/LangOptions.def     |   1 +
 clang/lib/CodeGen/CGBuilder.h                 |   9 +
 clang/lib/CodeGen/CGDecl.cpp                  | 218 +++++++++---------
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp      |  45 ++--
 .../include/llvm/Frontend/OpenMP/OMPKinds.def |   9 +
 5 files changed, 156 insertions(+), 126 deletions(-)

diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 8ef6700ecdc78ec..64b87ecdc97524c 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -260,6 +260,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de
 LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
 LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
 LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
+LANGOPT(OpenMPGlobalizeToGlobalSpace , 1, 0, "Globalize to global space for the globalized variables")
 LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
 LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
 LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")
diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h
index bf5ab171d720d9b..fe5beff05134ac6 100644
--- a/clang/lib/CodeGen/CGBuilder.h
+++ b/clang/lib/CodeGen/CGBuilder.h
@@ -152,6 +152,15 @@ class CGBuilderTy : public CGBuilderBaseTy {
                    Addr.isKnownNonNull());
   }
 
+  /// Cast the element type of the given address to a different type,
+  /// preserving information like the alignment and address space.
+  Address CreateElementBitCast(Address Addr, llvm::Type *Ty,
+                               const llvm::Twine &Name = "") {
+    auto *PtrTy = Ty->getPointerTo(Addr.getAddressSpace());
+    return Address(CreateBitCast(Addr.getPointer(), PtrTy, Name), Ty,
+                   Addr.getAlignment(), Addr.isKnownNonNull());
+  }
+
   using CGBuilderBaseTy::CreatePointerBitCastOrAddrSpaceCast;
   Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty,
                                               llvm::Type *ElementTy,
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index dc42faf8dbb9fda..691af33dc239d67 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -2531,48 +2531,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
         (IPD->getParameterKind() == ImplicitParamKind::ThreadPrivateVar);
   }
 
-  Address DeclPtr = Address::invalid();
-  Address AllocaPtr = Address::invalid();
-  bool DoStore = false;
-  bool IsScalar = hasScalarEvaluationKind(Ty);
-  bool UseIndirectDebugAddress = false;
-
-  // If we already have a pointer to the argument, reuse the input pointer.
-  if (Arg.isIndirect()) {
-    DeclPtr = Arg.getIndirectAddress();
-    DeclPtr = DeclPtr.withElementType(ConvertTypeForMem(Ty));
-    // Indirect argument is in alloca address space, which may be different
-    // from the default address space.
-    auto AllocaAS = CGM.getASTAllocaAddressSpace();
-    auto *V = DeclPtr.getPointer();
-    AllocaPtr = DeclPtr;
-
-    // For truly ABI indirect arguments -- those that are not `byval` -- store
-    // the address of the argument on the stack to preserve debug information.
-    ABIArgInfo ArgInfo = CurFnInfo->arguments()[ArgNo - 1].info;
-    if (ArgInfo.isIndirect())
-      UseIndirectDebugAddress = !ArgInfo.getIndirectByVal();
-    if (UseIndirectDebugAddress) {
-      auto PtrTy = getContext().getPointerType(Ty);
-      AllocaPtr = CreateMemTemp(PtrTy, getContext().getTypeAlignInChars(PtrTy),
-                                D.getName() + ".indirect_addr");
-      EmitStoreOfScalar(V, AllocaPtr, /* Volatile */ false, PtrTy);
-    }
-
-    auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
-    auto DestLangAS =
-        getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
-    if (SrcLangAS != DestLangAS) {
-      assert(getContext().getTargetAddressSpace(SrcLangAS) ==
-             CGM.getDataLayout().getAllocaAddrSpace());
-      auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
-      auto *T = llvm::PointerType::get(getLLVMContext(), DestAS);
-      DeclPtr =
-          DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
-                                  *this, V, SrcLangAS, DestLangAS, T, true),
-                              DeclPtr.isKnownNonNull());
-    }
-
+  auto PushCleanupIfNeeded = [this, Ty, &D](Address DeclPtr) {
     // Push a destructor cleanup for this parameter if the ABI requires it.
     // Don't push a cleanup in a thunk for a method that will also emit a
     // cleanup.
@@ -2588,87 +2547,126 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
           EHStack.stable_begin();
     }
   }
+  };
+
+  Address DeclPtr = Address::invalid();
+  Address AllocaPtr = Address::invalid();
+  Address OpenMPLocalAddr =
+      getLangOpts().OpenMP
+          ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
+          : Address::invalid();
+  bool DoStore = false;
+  bool IsScalar = hasScalarEvaluationKind(Ty);
+  bool UseIndirectDebugAddress = false;
+  if (OpenMPLocalAddr.isValid()) {
+    DeclPtr = OpenMPLocalAddr;
+    AllocaPtr = DeclPtr;
+    LValue Dst = MakeAddrLValue(DeclPtr, Ty);
+    if (Arg.isIndirect()) {
+      LValue Src = MakeAddrLValue(Arg.getIndirectAddress(), Ty);
+      callCStructCopyConstructor(Dst, Src);
+      PushCleanupIfNeeded(Arg.getIndirectAddress());
+    } else {
+      EmitStoreOfScalar(Arg.getDirectValue(), Dst, /* isInitialization */ true);
+    }
   } else {
-    // Check if the parameter address is controlled by OpenMP runtime.
-    Address OpenMPLocalAddr =
-        getLangOpts().OpenMP
-            ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
-            : Address::invalid();
-    if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
-      DeclPtr = OpenMPLocalAddr;
+    // If we already have a pointer to the argument, reuse the input pointer.
+    if (Arg.isIndirect()) {
+      // If we have a prettier pointer type at this point, bitcast to that.
+      DeclPtr = Arg.getIndirectAddress();
+      DeclPtr = Builder.CreateElementBitCast(DeclPtr, ConvertTypeForMem(Ty),
+                                             D.getName());
+      // Indirect argument is in alloca address space, which may be different
+      // from the default address space.
+      auto AllocaAS = CGM.getASTAllocaAddressSpace();
+      auto *V = DeclPtr.getPointer();
       AllocaPtr = DeclPtr;
+      auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
+      auto DestLangAS =
+          getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
+      if (SrcLangAS != DestLangAS) {
+        assert(getContext().getTargetAddressSpace(SrcLangAS) ==
+               CGM.getDataLayout().getAllocaAddrSpace());
+        auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
+        auto *T = DeclPtr.getElementType()->getPointerTo(DestAS);
+        DeclPtr =
+            DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
+                                    *this, V, SrcLangAS, DestLangAS, T, true),
+                                DeclPtr.isKnownNonNull());
+      }
+      PushCleanupIfNeeded(DeclPtr);
     } else {
-      // Otherwise, create a temporary to hold the value.
+      // Create a temporary to hold the value.
       DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
                               D.getName() + ".addr", &AllocaPtr);
+      DoStore = true;
     }
-    DoStore = true;
-  }
-
-  llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
-
-  LValue lv = MakeAddrLValue(DeclPtr, Ty);
-  if (IsScalar) {
-    Qualifiers qs = Ty.getQualifiers();
-    if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
-      // We honor __attribute__((ns_consumed)) for types with lifetime.
-      // For __strong, it's handled by just skipping the initial retain;
-      // otherwise we have to balance out the initial +1 with an extra
-      // cleanup to do the release at the end of the function.
-      bool isConsumed = D.hasAttr<NSConsumedAttr>();
-
-      // If a parameter is pseudo-strong then we can omit the implicit retain.
-      if (D.isARCPseudoStrong()) {
-        assert(lt == Qualifiers::OCL_Strong &&
-               "pseudo-strong variable isn't strong?");
-        assert(qs.hasConst() && "pseudo-strong variable should be const!");
-        lt = Qualifiers::OCL_ExplicitNone;
-      }
-      // Load objects passed indirectly.
-      if (Arg.isIndirect() && !ArgVal)
-        ArgVal = Builder.CreateLoad(DeclPtr);
-
-      if (lt == Qualifiers::OCL_Strong) {
-        if (!isConsumed) {
-          if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
-            // use objc_storeStrong(&dest, value) for retaining the
-            // object. But first, store a null into 'dest' because
-            // objc_storeStrong attempts to release its old value.
-            llvm::Value *Null = CGM.EmitNullConstant(D.getType());
-            EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
-            EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
-            DoStore = false;
-          }
-          else
-            // Don't use objc_retainBlock for block pointers, because we
-            // don't want to Block_copy something just because we got it
-            // as a parameter.
-            ArgVal = EmitARCRetainNonBlock(ArgVal);
-        }
-      } else {
-        // Push the cleanup for a consumed parameter.
-        if (isConsumed) {
-          ARCPreciseLifetime_t precise = (D.hasAttr<ObjCPreciseLifetimeAttr>()
-                                          ? ARCPreciseLifetime : ARCImpreciseLifetime);
-          EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(), ArgVal,
-                                                   precise);
+    llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
+
+    LValue lv = MakeAddrLValue(DeclPtr, Ty);
+    if (IsScalar) {
+      Qualifiers qs = Ty.getQualifiers();
+      if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
+        // We honor __attribute__((ns_consumed)) for types with lifetime.
+        // For __strong, it's handled by just skipping the initial retain;
+        // otherwise we have to balance out the initial +1 with an extra
+        // cleanup to do the release at the end of the function.
+        bool isConsumed = D.hasAttr<NSConsumedAttr>();
+
+        // If a parameter is pseudo-strong then we can omit the implicit retain.
+        if (D.isARCPseudoStrong()) {
+          assert(lt == Qualifiers::OCL_Strong &&
+                 "pseudo-strong variable isn't strong?");
+          assert(qs.hasConst() && "pseudo-strong variable should be const!");
+          lt = Qualifiers::OCL_ExplicitNone;
         }
-        if (lt == Qualifiers::OCL_Weak) {
-          EmitARCInitWeak(DeclPtr, ArgVal);
-          DoStore = false; // The weak init is a store, no need to do two.
+        // Load objects passed indirectly.
+        if (Arg.isIndirect() && !ArgVal)
+          ArgVal = Builder.CreateLoad(DeclPtr);
+
+        if (lt == Qualifiers::OCL_Strong) {
+          if (!isConsumed) {
+            if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
+              // use objc_storeStrong(&dest, value) for retaining the
+              // object. But first, store a null into 'dest' because
+              // objc_storeStrong attempts to release its old value.
+              llvm::Value *Null = CGM.EmitNullConstant(D.getType());
+              EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
+              EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
+              DoStore = false;
+            } else
+              // Don't use objc_retainBlock for block pointers, because we
+              // don't want to Block_copy something just because we got it
+              // as a parameter.
+              ArgVal = EmitARCRetainNonBlock(ArgVal);
+          }
+        } else {
+          // Push the cleanup for a consumed parameter.
+          if (isConsumed) {
+            ARCPreciseLifetime_t precise =
+                (D.hasAttr<ObjCPreciseLifetimeAttr>() ? ARCPreciseLifetime
+                                                      : ARCImpreciseLifetime);
+            EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(),
+                                                     ArgVal, precise);
+          }
+
+          if (lt == Qualifiers::OCL_Weak) {
+            EmitARCInitWeak(DeclPtr, ArgVal);
+            DoStore = false; // The weak init is a store, no need to do two.
+          }
         }
-      }
 
-      // Enter the cleanup scope.
-      EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
+        // Enter the cleanup scope.
+        EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
+      }
     }
-  }
 
-  // Store the initial value into the alloca.
-  if (DoStore)
-    EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
+    // Store the initial value into the alloca.
+    if (DoStore)
+      EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
+  }
 
   setAddrOfLocalVar(&D, DeclPtr);
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 299ee1460b3db0e..8f0c7caa2f3b4b0 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1083,10 +1083,12 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
 
     // Allocate space for the variable to be globalized
     llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
-    llvm::CallBase *VoidPtr =
-        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
-                            AllocArgs, VD->getName());
+    llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
+        OMPBuilder.getOrCreateRuntimeFunction(
+            CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
+                                 ? OMPRTL_malloc
+                                 : OMPRTL___kmpc_alloc_shared),
+        AllocArgs, VD->getName());
     // FIXME: We should use the variables actual alignment as an argument.
     VoidPtr->addRetAttr(llvm::Attribute::get(
         CGM.getLLVMContext(), llvm::Attribute::Alignment,
@@ -1149,10 +1151,12 @@ CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
 
   // Allocate space for this VLA object to be globalized.
   llvm::Value *AllocArgs[] = {Size};
-  llvm::CallBase *VoidPtr =
-      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                              CGM.getModule(), OMPRTL___kmpc_alloc_shared),
-                          AllocArgs, VD->getName());
+  llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
+      OMPBuilder.getOrCreateRuntimeFunction(
+          CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
+                               ? OMPRTL_malloc
+                               : OMPRTL___kmpc_alloc_shared),
+      AllocArgs, VD->getName());
   VoidPtr->addRetAttr(llvm::Attribute::get(
       CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
 
@@ -1178,20 +1182,29 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
     // globalized in the prolog (i.e. emitGenericVarsProlog).
     for (const auto &AddrSizePair :
          llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
-      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                              CGM.getModule(), OMPRTL___kmpc_free_shared),
-                          {AddrSizePair.first, AddrSizePair.second});
+      if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
+        CGF.EmitRuntimeCall(
+            OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
+            {AddrSizePair.first});
+      else
+        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                CGM.getModule(), OMPRTL___kmpc_free_shared),
+                            {AddrSizePair.first, AddrSizePair.second});
     }
     // Deallocate the memory for each globalized value
     for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
       const auto *VD = cast<VarDecl>(Rec.first);
       I->getSecond().MappedParams->restore(CGF);
 
-      llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
-                                 CGF.getTypeSize(VD->getType())};
-      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                              CGM.getModule(), OMPRTL___kmpc_free_shared),
-                          FreeArgs);
+      if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
+        CGF.EmitRuntimeCall(
+            OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
+            {Rec.second.GlobalizedVal});
+      else
+        CGF.EmitRuntimeCall(
+            OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+                                                  OMPRTL___kmpc_free_shared),
+            {Rec.second.GlobalizedVal, CGF.getTypeSize(VD->getType())});
     }
   }
 }
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index d22d2a8e948b00e..90d5d1973967493 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -227,7 +227,9 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
 __OMP_RTL(__kmpc_get_warp_size, false, Int32, )
 
 __OMP_RTL(omp_get_thread_num, false, Int32, )
+__OMP_RTL(omp_get_bulk_thread_num, false, Int32, )
 __OMP_RTL(omp_get_num_threads, false, Int32, )
+__OMP_RTL(omp_get_bulk_num_threads, false, Int32, )
 __OMP_RTL(omp_get_max_threads, false, Int32, )
 __OMP_RTL(omp_in_parallel, false, Int32, )
 __OMP_RTL(omp_get_dynamic, false, Int32, )
@@ -490,6 +492,8 @@ __OMP_RTL(__kmpc_reduction_get_fixed_buffer, false, VoidPtr, )
 
 __OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
 
+__OMP_RTL(malloc, false, VoidPtr, SizeTy)
+__OMP_RTL(free, false, Void, VoidPtr)
 __OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
 __OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)
 __OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
@@ -503,6 +507,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
 __OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
 
+__OMP_RTL(__kmpc_launch_parallel_51_kernel, false, Void, Int8Ptr, Int32, Int32,
+          Int32, VoidPtrPtr, Int64)
+
 __OMP_RTL(__last, false, Void, )
 
 #undef __OMP_RTL
@@ -710,6 +717,8 @@ __OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs())
 
 __OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs())
+__OMP_RTL_ATTRS(omp_get_bulk_thread_num, GetterAttrs, SExt, ParamAttrs())
+__OMP_RTL_ATTRS(omp_get_bulk_num_threads, GetterAttrs, SExt, ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs())
 __OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_dynamic, GetterAttrs, SExt, ParamAttrs())
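For readers skimming the patch: the effect of the new OpenMPGlobalizeToGlobalSpace language option is easiest to see on a small target region whose local variable escapes into a parallel region and is therefore globalized by clang's GPU codegen (emitGenericVarsProlog/Epilog above). The example below is a sketch, not part of the patch; the driver flag that sets the option does not appear in this diff, and the exact IR depends on the target and optimization level.

```cpp
// Hypothetical test case: 'x' is declared in the target region but read by
// every thread of the parallel region, so generic-mode codegen must move it
// out of private stack storage ("globalization").
#include <cstdio>

int main() {
  int sum = 0;
#pragma omp target map(tofrom : sum)
  {
    int x = 42; // Globalized storage for 'x':
                //   default:         %x = call ptr @__kmpc_alloc_shared(i64 4)
                //   with the option: %x = call ptr @malloc(i64 4)
#pragma omp parallel reduction(+ : sum)
    sum += x; // all threads read the team-shared 'x'
    // At region exit: @__kmpc_free_shared(ptr, i64 4) vs. @free(ptr)
  }
  std::printf("sum = %d\n", sum);
  return 0;
}
```

Note how the epilog changes mirror this: free takes only the pointer, so the size operand that __kmpc_free_shared requires is dropped on the malloc/free path.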