llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang-codegen Author: Akash Banerjee (TIFitis) <details> <summary>Changes</summary> This patch migrates the CGOpenMPRuntimeGPU::emitReduction and related functions to the OpenMPIRBUilder. In future patches MLIR OpenMP translation would be making use of these functions. Co-authored-by: Jan Leyonberg <jan.leyonberg@<!-- -->amd.com> --- Patch is 267.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/80343.diff 8 Files Affected: - (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp (+71-1303) - (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.h (-3) - (modified) clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp (+273-273) - (modified) clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp (+12-12) - (modified) clang/test/OpenMP/target_teams_generic_loop_codegen.cpp (+6-6) - (modified) llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h (+640-32) - (modified) llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp (+1420-66) - (modified) mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp (+18-4) ``````````diff diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 299ee1460b3db..7cddf73306f2d 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -501,31 +501,6 @@ class CheckVarsEscapingDeclContext final }; } // anonymous namespace -/// Get the id of the warp in the block. -/// We assume that the warp size is 32, which is always the case -/// on the NVPTX device, to generate more efficient code. -static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - unsigned LaneIDBits = - llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size); - auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); - return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id"); -} - -/// Get the id of the current lane in the Warp. -/// We assume that the warp size is 32, which is always the case -/// on the NVPTX device, to generate more efficient code. -static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - unsigned LaneIDBits = - llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size); - assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device."); - unsigned LaneIDMask = ~0u >> (32u - LaneIDBits); - auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); - return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask), - "nvptx_lane_id"); -} - CGOpenMPRuntimeGPU::ExecutionMode CGOpenMPRuntimeGPU::getExecutionMode() const { return CurrentExecutionMode; @@ -1429,1132 +1404,6 @@ static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val, TBAAAccessInfo()); } -/// This function creates calls to one of two shuffle functions to copy -/// variables between lanes in a warp. -static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF, - llvm::Value *Elem, - QualType ElemType, - llvm::Value *Offset, - SourceLocation Loc) { - CodeGenModule &CGM = CGF.CGM; - CGBuilderTy &Bld = CGF.Builder; - CGOpenMPRuntimeGPU &RT = - *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime())); - llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder(); - - CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType); - assert(Size.getQuantity() <= 8 && - "Unsupported bitwidth in shuffle instruction."); - - RuntimeFunction ShuffleFn = Size.getQuantity() <= 4 - ? OMPRTL___kmpc_shuffle_int32 - : OMPRTL___kmpc_shuffle_int64; - - // Cast all types to 32- or 64-bit values before calling shuffle routines. - QualType CastTy = CGF.getContext().getIntTypeForBitwidth( - Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1); - llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc); - llvm::Value *WarpSize = - Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true); - - llvm::Value *ShuffledVal = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn), - {ElemCast, Offset, WarpSize}); - - return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc); -} - -static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, - Address DestAddr, QualType ElemType, - llvm::Value *Offset, SourceLocation Loc) { - CGBuilderTy &Bld = CGF.Builder; - - CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType); - // Create the loop over the big sized data. - // ptr = (void*)Elem; - // ptrEnd = (void*) Elem + 1; - // Step = 8; - // while (ptr + Step < ptrEnd) - // shuffle((int64_t)*ptr); - // Step = 4; - // while (ptr + Step < ptrEnd) - // shuffle((int32_t)*ptr); - // ... - Address ElemPtr = DestAddr; - Address Ptr = SrcAddr; - Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast( - Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty); - for (int IntSize = 8; IntSize >= 1; IntSize /= 2) { - if (Size < CharUnits::fromQuantity(IntSize)) - continue; - QualType IntType = CGF.getContext().getIntTypeForBitwidth( - CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)), - /*Signed=*/1); - llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType); - Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(), - IntTy); - ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - ElemPtr, IntTy->getPointerTo(), IntTy); - if (Size.getQuantity() / IntSize > 1) { - llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond"); - llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then"); - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit"); - llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock(); - CGF.EmitBlock(PreCondBB); - llvm::PHINode *PhiSrc = - Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2); - PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB); - llvm::PHINode *PhiDest = - Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2); - PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB); - Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment()); - ElemPtr = - Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment()); - llvm::Value *PtrDiff = Bld.CreatePtrDiff( - CGF.Int8Ty, PtrEnd.getPointer(), - Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(), - CGF.VoidPtrTy)); - Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)), - ThenBB, ExitBB); - CGF.EmitBlock(ThenBB); - llvm::Value *Res = createRuntimeShuffleFunction( - CGF, - CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc, - LValueBaseInfo(AlignmentSource::Type), - TBAAAccessInfo()), - IntType, Offset, Loc); - CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType, - LValueBaseInfo(AlignmentSource::Type), - TBAAAccessInfo()); - Address LocalPtr = Bld.CreateConstGEP(Ptr, 1); - Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1); - PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB); - PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB); - CGF.EmitBranch(PreCondBB); - CGF.EmitBlock(ExitBB); - } else { - llvm::Value *Res = createRuntimeShuffleFunction( - CGF, - CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc, - LValueBaseInfo(AlignmentSource::Type), - TBAAAccessInfo()), - IntType, Offset, Loc); - CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType, - LValueBaseInfo(AlignmentSource::Type), - TBAAAccessInfo()); - Ptr = Bld.CreateConstGEP(Ptr, 1); - ElemPtr = Bld.CreateConstGEP(ElemPtr, 1); - } - Size = Size % IntSize; - } -} - -namespace { -enum CopyAction : unsigned { - // RemoteLaneToThread: Copy over a Reduce list from a remote lane in - // the warp using shuffle instructions. - RemoteLaneToThread, - // ThreadCopy: Make a copy of a Reduce list on the thread's stack. - ThreadCopy, -}; -} // namespace - -struct CopyOptionsTy { - llvm::Value *RemoteLaneOffset; - llvm::Value *ScratchpadIndex; - llvm::Value *ScratchpadWidth; -}; - -/// Emit instructions to copy a Reduce list, which contains partially -/// aggregated values, in the specified direction. -static void emitReductionListCopy( - CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, - ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase, - CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) { - - CodeGenModule &CGM = CGF.CGM; - ASTContext &C = CGM.getContext(); - CGBuilderTy &Bld = CGF.Builder; - - llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset; - - // Iterates, element-by-element, through the source Reduce list and - // make a copy. - unsigned Idx = 0; - for (const Expr *Private : Privates) { - Address SrcElementAddr = Address::invalid(); - Address DestElementAddr = Address::invalid(); - Address DestElementPtrAddr = Address::invalid(); - // Should we shuffle in an element from a remote lane? - bool ShuffleInElement = false; - // Set to true to update the pointer in the dest Reduce list to a - // newly created element. - bool UpdateDestListPtr = false; - QualType PrivatePtrType = C.getPointerType(Private->getType()); - llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType); - - switch (Action) { - case RemoteLaneToThread: { - // Step 1.1: Get the address for the src element in the Reduce list. - Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); - SrcElementAddr = CGF.EmitLoadOfPointer( - SrcElementPtrAddr.withElementType(PrivateLlvmPtrType), - PrivatePtrType->castAs<PointerType>()); - - // Step 1.2: Create a temporary to store the element in the destination - // Reduce list. - DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); - DestElementAddr = - CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element"); - ShuffleInElement = true; - UpdateDestListPtr = true; - break; - } - case ThreadCopy: { - // Step 1.1: Get the address for the src element in the Reduce list. - Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); - SrcElementAddr = CGF.EmitLoadOfPointer( - SrcElementPtrAddr.withElementType(PrivateLlvmPtrType), - PrivatePtrType->castAs<PointerType>()); - - // Step 1.2: Get the address for dest element. The destination - // element has already been created on the thread's stack. - DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); - DestElementAddr = CGF.EmitLoadOfPointer( - DestElementPtrAddr.withElementType(PrivateLlvmPtrType), - PrivatePtrType->castAs<PointerType>()); - break; - } - } - - // Regardless of src and dest of copy, we emit the load of src - // element as this is required in all directions - SrcElementAddr = SrcElementAddr.withElementType( - CGF.ConvertTypeForMem(Private->getType())); - DestElementAddr = - DestElementAddr.withElementType(SrcElementAddr.getElementType()); - - // Now that all active lanes have read the element in the - // Reduce list, shuffle over the value from the remote lane. - if (ShuffleInElement) { - shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(), - RemoteLaneOffset, Private->getExprLoc()); - } else { - switch (CGF.getEvaluationKind(Private->getType())) { - case TEK_Scalar: { - llvm::Value *Elem = CGF.EmitLoadOfScalar( - SrcElementAddr, /*Volatile=*/false, Private->getType(), - Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type), - TBAAAccessInfo()); - // Store the source element value to the dest element address. - CGF.EmitStoreOfScalar( - Elem, DestElementAddr, /*Volatile=*/false, Private->getType(), - LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); - break; - } - case TEK_Complex: { - CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex( - CGF.MakeAddrLValue(SrcElementAddr, Private->getType()), - Private->getExprLoc()); - CGF.EmitStoreOfComplex( - Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()), - /*isInit=*/false); - break; - } - case TEK_Aggregate: - CGF.EmitAggregateCopy( - CGF.MakeAddrLValue(DestElementAddr, Private->getType()), - CGF.MakeAddrLValue(SrcElementAddr, Private->getType()), - Private->getType(), AggValueSlot::DoesNotOverlap); - break; - } - } - - // Step 3.1: Modify reference in dest Reduce list as needed. - // Modifying the reference in Reduce list to point to the newly - // created element. The element is live in the current function - // scope and that of functions it invokes (i.e., reduce_function). - // RemoteReduceData[i] = (void*)&RemoteElem - if (UpdateDestListPtr) { - CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast( - DestElementAddr.getPointer(), CGF.VoidPtrTy), - DestElementPtrAddr, /*Volatile=*/false, - C.VoidPtrTy); - } - - ++Idx; - } -} - -/// This function emits a helper that gathers Reduce lists from the first -/// lane of every active warp to lanes in the first warp. -/// -/// void inter_warp_copy_func(void* reduce_data, num_warps) -/// shared smem[warp_size]; -/// For all data entries D in reduce_data: -/// sync -/// If (I am the first lane in each warp) -/// Copy my local D to smem[warp_id] -/// sync -/// if (I am the first warp) -/// Copy smem[thread_id] to my local D -static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, - ArrayRef<const Expr *> Privates, - QualType ReductionArrayTy, - SourceLocation Loc) { - ASTContext &C = CGM.getContext(); - llvm::Module &M = CGM.getModule(); - - // ReduceList: thread local Reduce list. - // At the stage of the computation when this function is called, partially - // aggregated values reside in the first lane of every active warp. - ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamKind::Other); - // NumWarps: number of warps active in the parallel region. This could - // be smaller than 32 (max warps in a CTA) for partial block reduction. - ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.getIntTypeForBitwidth(32, /* Signed */ true), - ImplicitParamKind::Other); - FunctionArgList Args; - Args.push_back(&ReduceListArg); - Args.push_back(&NumWarpsArg); - - const CGFunctionInfo &CGFI = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI), - llvm::GlobalValue::InternalLinkage, - "_omp_reduction_inter_warp_copy_func", &M); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); - Fn->setDoesNotRecurse(); - CodeGenFunction CGF(CGM); - CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); - - CGBuilderTy &Bld = CGF.Builder; - - // This array is used as a medium to transfer, one reduce element at a time, - // the data from the first lane of every warp to lanes in the first warp - // in order to perform the final step of a reduction in a parallel region - // (reduction across warps). The array is placed in NVPTX __shared__ memory - // for reduced latency, as well as to have a distinct copy for concurrently - // executing target regions. The array is declared with common linkage so - // as to be shared across compilation units. - StringRef TransferMediumName = - "__openmp_nvptx_data_transfer_temporary_storage"; - llvm::GlobalVariable *TransferMedium = - M.getGlobalVariable(TransferMediumName); - unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; - if (!TransferMedium) { - auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize); - unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); - TransferMedium = new llvm::GlobalVariable( - M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage, - llvm::UndefValue::get(Ty), TransferMediumName, - /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, - SharedAddressSpace); - CGM.addCompilerUsedGlobal(TransferMedium); - } - - auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); - // Get the CUDA thread id of the current OpenMP thread on the GPU. - llvm::Value *ThreadID = RT.getGPUThreadID(CGF); - // nvptx_lane_id = nvptx_id % warpsize - llvm::Value *LaneID = getNVPTXLaneID(CGF); - // nvptx_warp_id = nvptx_id / warpsize - llvm::Value *WarpID = getNVPTXWarpID(CGF); - - Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); - llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy); - Address LocalReduceList( - Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar( - AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc, - LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()), - ElemTy->getPointerTo()), - ElemTy, CGF.getPointerAlign()); - - unsigned Idx = 0; - for (const Expr *Private : Privates) { - // - // Warp master copies reduce element to transfer medium in __shared__ - // memory. - // - unsigned RealTySize = - C.getTypeSizeInChars(Private->getType()) - .alignTo(C.getTypeAlignInChars(Private->getType())) - .getQuantity(); - for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) { - unsigned NumIters = RealTySize / TySize; - if (NumIters == 0) - continue; - QualType CType = C.getIntTypeForBitwidth( - C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1); - llvm::Type *CopyType = CGF.ConvertTypeForMem(CType); - CharUnits Align = CharUnits::fromQuantity(TySize); - llvm::Value *Cnt = nullptr; - Address CntAddr = Address::invalid(); - llvm::BasicBlock *PrecondBB = nullptr; - llvm::BasicBlock *ExitBB = nullptr; - if (NumIters > 1) { - CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr"); - CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr, - /*Volatile=*/false, C.IntTy); - PrecondBB = CGF.createBasicBlock("precond"); - ExitBB = CGF.createBasicBlock("exit"); - llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body"); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(PrecondBB); - Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc); - llvm::Value *Cmp = - Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters)); - Bld.CreateCondBr(Cmp, BodyBB, ExitBB); - CGF.EmitBlock(BodyBB); - } - // kmpc_barrier. - CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, - /*EmitChecks=*/false, - /*ForceSimpleCall=*/true); - llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); - - // if (lane_id == 0) - llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master"); - Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB); - CGF.EmitBlock(ThenBB); - - // Reduce element = LocalReduceList[i] - Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); - llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( - ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); - // elemptr = ((CopyType*)(elemptrptr)) + I - Address ElemPtr(ElemPtrPtr, CopyType, Align); - if (NumIters > 1) - ElemPtr = Bld.CreateGEP(ElemPtr, Cnt); - - // Get pointer to location in transfer medium. - // MediumPtr = &medium[warp_id] - llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP( - TransferMedium->getValueType(), TransferM... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/80343 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits