https://github.com/jsjodin updated https://github.com/llvm/llvm-project/pull/133310
>From 50fefbb31f4de7352c241c48fe5382785daaef21 Mon Sep 17 00:00:00 2001 From: Jan Leyonberg <jan_sjo...@yahoo.com> Date: Sun, 23 Mar 2025 09:56:51 -0400 Subject: [PATCH 01/10] Initial modifications to support reductions in flang. --- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 3 +- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 3 -- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 30 +++++++++++++++---- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index feb2448297542..d30bef9e7f0ba 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1659,7 +1659,6 @@ void CGOpenMPRuntimeGPU::emitReduction( return; bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); - bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind); bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind); ASTContext &C = CGM.getContext(); @@ -1756,7 +1755,7 @@ void CGOpenMPRuntimeGPU::emitReduction( llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail(OMPBuilder.createReductionsGPU( OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction, - DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang, + llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang, CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc)); CGF.Builder.restoreIP(AfterIP); diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 28909cef4748d..9b67d0c050e46 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1905,8 +1905,6 @@ class OpenMPIRBuilder { /// nowait. /// \param IsTeamsReduction Optional flag set if it is a teams /// reduction. - /// \param HasDistribute Optional flag set if it is a - /// distribute reduction. /// \param GridValue Optional GPU grid value. /// \param ReductionBufNum Optional OpenMPCUDAReductionBufNumValue to be /// used for teams reduction. 
@@ -1915,7 +1913,6 @@ class OpenMPIRBuilder { const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait = false, bool IsTeamsReduction = false, - bool HasDistribute = false, ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024, Value *SrcLocInfo = nullptr); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 2e5ce5308eea5..0001626086d7c 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -3495,9 +3495,9 @@ checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos, - bool IsNoWait, bool IsTeamsReduction, bool HasDistribute, - ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue, - unsigned ReductionBufNum, Value *SrcLocInfo) { + bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind, + std::optional<omp::GV> GridValue, unsigned ReductionBufNum, + Value *SrcLocInfo) { if (!updateToLocation(Loc)) return InsertPointTy(); Builder.restoreIP(CodeGenIP); @@ -3514,6 +3514,16 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( if (ReductionInfos.size() == 0) return Builder.saveIP(); + BasicBlock *ContinuationBlock = nullptr; + if (ReductionGenCBKind != ReductionGenCBKind::Clang) { + // Copied code from createReductions + BasicBlock *InsertBlock = Loc.IP.getBlock(); + ContinuationBlock = + InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize"); + InsertBlock->getTerminator()->eraseFromParent(); + Builder.SetInsertPoint(InsertBlock, InsertBlock->end()); + } + Function *CurFunc = Builder.GetInsertBlock()->getParent(); AttributeList FuncAttrs; AttrBuilder AttrBldr(Ctx); @@ -3669,11 +3679,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( ReductionFunc; }); } else { - assert(false && "Unhandled ReductionGenCBKind"); + Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs"); + Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs"); + Value *Reduced; + InsertPointOrErrorTy AfterIP = + RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced); + if (!AfterIP) + return AfterIP.takeError(); + Builder.CreateStore(Reduced, LHS, false); } } emitBlock(ExitBB, CurFunc); - + if (ContinuationBlock) { + Builder.CreateBr(ContinuationBlock); + Builder.SetInsertPoint(ContinuationBlock); + } Config.setEmitLLVMUsed(); return Builder.saveIP(); >From fc3a9d0311abcc9e0fd0174c89214613c8afc14e Mon Sep 17 00:00:00 2001 From: Jan Leyonberg <jan_sjo...@yahoo.com> Date: Sun, 23 Mar 2025 10:11:39 -0400 Subject: [PATCH 02/10] Prepare for reduction support --- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 5 +- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 143 ++++++++++++------ 2 files changed, 98 insertions(+), 50 deletions(-) diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 9b67d0c050e46..a3a266e3f0a98 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1984,7 +1984,8 @@ class OpenMPIRBuilder { InsertPointTy AllocaIP, ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef, - 
bool IsNoWait = false); + bool IsNoWait = false, + bool IsTeamsReduction = false); ///} @@ -2268,6 +2269,8 @@ class OpenMPIRBuilder { int32_t MinTeams = 1; SmallVector<int32_t, 3> MaxThreads = {-1}; int32_t MinThreads = 1; + int32_t ReductionDataSize = 0; + int32_t ReductionBufferLength = 0; }; /// Container to pass LLVM IR runtime values or constants related to the diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 0001626086d7c..6eb3ae7b60eb9 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -3708,27 +3708,95 @@ static Function *getFreshReductionFunc(Module &M) { ".omp.reduction.func", &M); } -OpenMPIRBuilder::InsertPointOrErrorTy -OpenMPIRBuilder::createReductions(const LocationDescription &Loc, - InsertPointTy AllocaIP, - ArrayRef<ReductionInfo> ReductionInfos, - ArrayRef<bool> IsByRef, bool IsNoWait) { - assert(ReductionInfos.size() == IsByRef.size()); - for (const ReductionInfo &RI : ReductionInfos) { - (void)RI; - assert(RI.Variable && "expected non-null variable"); - assert(RI.PrivateVariable && "expected non-null private variable"); - assert(RI.ReductionGen && "expected non-null reduction generator callback"); - assert(RI.Variable->getType() == RI.PrivateVariable->getType() && - "expected variables and their private equivalents to have the same " - "type"); - assert(RI.Variable->getType()->isPointerTy() && - "expected variables to be pointers"); +static Error populateReductionFunction( + Function *ReductionFunc, + ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, + IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) { + Module *Module = ReductionFunc->getParent(); + BasicBlock *ReductionFuncBlock = + BasicBlock::Create(Module->getContext(), "", ReductionFunc); + Builder.SetInsertPoint(ReductionFuncBlock); + Value *LHSArrayPtr = nullptr; + Value *RHSArrayPtr = nullptr; + if (IsGPU) { + // Need to alloca memory here and deal with the pointers before getting + // LHS/RHS pointers out + // + Argument *Arg0 = ReductionFunc->getArg(0); + Argument *Arg1 = ReductionFunc->getArg(1); + Type *Arg0Type = Arg0->getType(); + Type *Arg1Type = Arg1->getType(); + + Value *LHSAlloca = + Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr"); + Value *RHSAlloca = + Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr"); + Value *LHSAddrCast = + Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type); + Value *RHSAddrCast = + Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type); + Builder.CreateStore(Arg0, LHSAddrCast); + Builder.CreateStore(Arg1, RHSAddrCast); + LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast); + RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast); + } else { + LHSArrayPtr = ReductionFunc->getArg(0); + RHSArrayPtr = ReductionFunc->getArg(1); + } + + unsigned NumReductions = ReductionInfos.size(); + Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions); + + for (auto En : enumerate(ReductionInfos)) { + const OpenMPIRBuilder::ReductionInfo &RI = En.value(); + Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64( + RedArrayTy, LHSArrayPtr, 0, En.index()); + Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr); + Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( + LHSI8Ptr, RI.Variable->getType()); + Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr); + Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64( + RedArrayTy, RHSArrayPtr, 0, En.index()); + Value 
*RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr); + Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( + RHSI8Ptr, RI.PrivateVariable->getType()); + Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr); + Value *Reduced; + OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = + RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced); + if (!AfterIP) + return AfterIP.takeError(); + + Builder.restoreIP(*AfterIP); + // TODO: Consider flagging an error. + if (!Builder.GetInsertBlock()) + return Error::success(); + + // store is inside of the reduction region when using by-ref + if (!IsByRef[En.index()]) + Builder.CreateStore(Reduced, LHSPtr); } + Builder.CreateRetVoid(); + return Error::success(); +} + +OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions( + const LocationDescription &Loc, InsertPointTy AllocaIP, + ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef, + bool IsNoWait, bool IsTeamsReduction) { + assert(ReductionInfos.size() == IsByRef.size()); + if (Config.isGPU()) + return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos, + IsNoWait, IsTeamsReduction); + + checkReductionInfos(ReductionInfos, /*IsGPU*/ false); if (!updateToLocation(Loc)) return InsertPointTy(); + if (ReductionInfos.size() == 0) + return Builder.saveIP(); + BasicBlock *InsertBlock = Loc.IP.getBlock(); BasicBlock *ContinuationBlock = InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize"); @@ -3852,38 +3920,13 @@ OpenMPIRBuilder::createReductions(const LocationDescription &Loc, // Populate the outlined reduction function using the elementwise reduction // function. Partial values are extracted from the type-erased array of // pointers to private variables. - BasicBlock *ReductionFuncBlock = - BasicBlock::Create(Module->getContext(), "", ReductionFunc); - Builder.SetInsertPoint(ReductionFuncBlock); - Value *LHSArrayPtr = ReductionFunc->getArg(0); - Value *RHSArrayPtr = ReductionFunc->getArg(1); + Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder, + IsByRef, false); + if (Err) + return Err; - for (auto En : enumerate(ReductionInfos)) { - const ReductionInfo &RI = En.value(); - Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64( - RedArrayTy, LHSArrayPtr, 0, En.index()); - Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr); - Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType()); - Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr); - Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64( - RedArrayTy, RHSArrayPtr, 0, En.index()); - Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr); - Value *RHSPtr = - Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType()); - Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr); - Value *Reduced; - InsertPointOrErrorTy AfterIP = - RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced); - if (!AfterIP) - return AfterIP.takeError(); - Builder.restoreIP(*AfterIP); - if (!Builder.GetInsertBlock()) - return InsertPointTy(); - // store is inside of the reduction region when using by-ref - if (!IsByRef[En.index()]) - Builder.CreateStore(Reduced, LHSPtr); - } - Builder.CreateRetVoid(); + if (!Builder.GetInsertBlock()) + return InsertPointTy(); Builder.SetInsertPoint(ContinuationBlock); return Builder.saveIP(); @@ -6259,8 +6302,10 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit( Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal); Constant *MinTeams = 
ConstantInt::getSigned(Int32, Attrs.MinTeams); Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front()); - Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0); - Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0); + Constant *ReductionDataSize = + ConstantInt::getSigned(Int32, Attrs.ReductionDataSize); + Constant *ReductionBufferLength = + ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength); Function *Fn = getOrCreateRuntimeFunctionPtr( omp::RuntimeFunction::OMPRTL___kmpc_target_init); >From 8497219d2a269cb5d346214c0e48180f84174605 Mon Sep 17 00:00:00 2001 From: Jan Leyonberg <jan_sjo...@yahoo.com> Date: Sun, 23 Mar 2025 10:53:02 -0400 Subject: [PATCH 03/10] Enable reductions --- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 208 ++++++++++++++++-- 1 file changed, 185 insertions(+), 23 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index d41489921bd13..155ea3f920617 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -265,7 +265,6 @@ static LogicalResult checkImplementationStatus(Operation &op) { .Case([&](omp::TeamsOp op) { checkAllocate(op, result); checkPrivate(op, result); - checkReduction(op, result); }) .Case([&](omp::TaskOp op) { checkAllocate(op, result); @@ -1018,19 +1017,37 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs, // variable allocated in the inlined region) llvm::Value *var = builder.CreateAlloca( moduleTranslation.convertType(reductionDecls[i].getType())); - deferredStores.emplace_back(phis[0], var); - - privateReductionVariables[i] = var; - moduleTranslation.mapValue(reductionArgs[i], phis[0]); - reductionVariableMap.try_emplace(loop.getReductionVars()[i], phis[0]); + // var->setName("private_redvar"); + + llvm::Type *ptrTy = llvm::PointerType::getUnqual(builder.getContext()); + llvm::Value *castVar = + builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy); + // TODO: I (Sergio) just guessed casting phis[0] like it's done for var is + // what's supposed to happen with this code coming from a merge from main, + // but I don't actually know. Someone more familiar with it needs to check + // this. 
+ llvm::Value *castPhi = + builder.CreatePointerBitCastOrAddrSpaceCast(phis[0], ptrTy); + + deferredStores.emplace_back(castPhi, castVar); + + privateReductionVariables[i] = castVar; + moduleTranslation.mapValue(reductionArgs[i], castPhi); + reductionVariableMap.try_emplace(loop.getReductionVars()[i], castPhi); } else { assert(allocRegion.empty() && "allocaction is implicit for by-val reduction"); llvm::Value *var = builder.CreateAlloca( moduleTranslation.convertType(reductionDecls[i].getType())); - moduleTranslation.mapValue(reductionArgs[i], var); - privateReductionVariables[i] = var; - reductionVariableMap.try_emplace(loop.getReductionVars()[i], var); + // var->setName("private_redvar"); + + llvm::Type *ptrTy = llvm::PointerType::getUnqual(builder.getContext()); + llvm::Value *castVar = + builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy); + + moduleTranslation.mapValue(reductionArgs[i], castVar); + privateReductionVariables[i] = castVar; + reductionVariableMap.try_emplace(loop.getReductionVars()[i], castVar); } } @@ -1250,18 +1267,20 @@ static LogicalResult createReductionsAndCleanup( LLVM::ModuleTranslation &moduleTranslation, llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls, - ArrayRef<llvm::Value *> privateReductionVariables, ArrayRef<bool> isByRef) { + ArrayRef<llvm::Value *> privateReductionVariables, ArrayRef<bool> isByRef, + bool isNowait = false, bool isTeamsReduction = false) { // Process the reductions if required. if (op.getNumReductionVars() == 0) return success(); + SmallVector<OwningReductionGen> owningReductionGens; + SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens; + SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos; + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); // Create the reduction generators. We need to own them here because // ReductionInfo only accepts references to the generators. - SmallVector<OwningReductionGen> owningReductionGens; - SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens; - SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos; collectReductionInfo(op, builder, moduleTranslation, reductionDecls, owningReductionGens, owningAtomicReductionGens, privateReductionVariables, reductionInfos); @@ -1273,7 +1292,7 @@ static LogicalResult createReductionsAndCleanup( builder.SetInsertPoint(tempTerminator); llvm::OpenMPIRBuilder::InsertPointOrErrorTy contInsertPoint = ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos, - isByRef, op.getNowait()); + isByRef, isNowait, isTeamsReduction); if (failed(handleError(contInsertPoint, *op))) return failure(); @@ -1666,9 +1685,9 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder, builder.restoreIP(*afterIP); // Process the reductions if required. - return createReductionsAndCleanup(sectionsOp, builder, moduleTranslation, - allocaIP, reductionDecls, - privateReductionVariables, isByRef); + return createReductionsAndCleanup( + sectionsOp, builder, moduleTranslation, allocaIP, reductionDecls, + privateReductionVariables, isByRef, sectionsOp.getNowait()); } /// Converts an OpenMP single construct into LLVM IR using OpenMPIRBuilder. 
@@ -1714,6 +1733,43 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder, return success(); } +static bool teamsReductionContainedInDistribute(omp::TeamsOp teamsOp) { + auto iface = + llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(teamsOp.getOperation()); + // Check that all uses of the reduction block arg has the same distribute op + // parent. + llvm::SmallVector<mlir::Operation *> debugUses; + Operation *distOp = nullptr; + for (auto ra : iface.getReductionBlockArgs()) + for (auto &use : ra.getUses()) { + auto *useOp = use.getOwner(); + // Ignore debug uses. + if (mlir::isa<LLVM::DbgDeclareOp>(useOp) || + mlir::isa<LLVM::DbgValueOp>(useOp)) { + debugUses.push_back(useOp); + continue; + } + + auto currentDistOp = useOp->getParentOfType<omp::DistributeOp>(); + // Use is not inside a distribute op - return false + if (!currentDistOp) + return false; + // Multiple distribute operations - return false + Operation *currentOp = currentDistOp.getOperation(); + if (distOp && (distOp != currentOp)) + return false; + + distOp = currentOp; + } + + // If we are going to use distribute reduction then remove any debug uses of + // the reduction parameters in teamsOp. Otherwise they will be left without + // any mapped value in moduleTranslation and will eventually error out. + for (auto use : debugUses) + use->erase(); + return true; +} + // Convert an OpenMP Teams construct to LLVM IR using OpenMPIRBuilder static LogicalResult convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder, @@ -1722,6 +1778,34 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder, if (failed(checkImplementationStatus(*op))) return failure(); + DenseMap<Value, llvm::Value *> reductionVariableMap; + unsigned numReductionVars = op.getNumReductionVars(); + SmallVector<omp::DeclareReductionOp> reductionDecls; + SmallVector<llvm::Value *> privateReductionVariables(numReductionVars); + llvm::ArrayRef<bool> isByRef; + llvm::OpenMPIRBuilder::InsertPointTy allocaIP = + findAllocaInsertPoint(builder, moduleTranslation); + + // Only do teams reduction if there is no distribute op that captures the + // reduction instead. + bool doTeamsReduction = !teamsReductionContainedInDistribute(op); + if (doTeamsReduction) { + isByRef = getIsByRef(op.getReductionByref()); + + assert(isByRef.size() == op.getNumReductionVars()); + + MutableArrayRef<BlockArgument> reductionArgs = + llvm::cast<omp::BlockArgOpenMPOpInterface>(*op).getReductionBlockArgs(); + + collectReductionDecls(op, reductionDecls); + + if (failed(allocAndInitializeReductionVars( + op, reductionArgs, builder, moduleTranslation, allocaIP, + reductionDecls, privateReductionVariables, reductionVariableMap, + isByRef))) + return failure(); + } + auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame( moduleTranslation, allocaIP); @@ -1756,6 +1840,13 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder, return failure(); builder.restoreIP(*afterIP); + if (doTeamsReduction) { + // Process the reductions if required. + return createReductionsAndCleanup( + op, builder, moduleTranslation, allocaIP, reductionDecls, + privateReductionVariables, isByRef, + /*isNoWait*/ false, /*isTeamsReduction*/ true); + } return success(); } @@ -2273,9 +2364,10 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, return failure(); // Process the reductions if required. 
- if (failed(createReductionsAndCleanup(wsloopOp, builder, moduleTranslation, - allocaIP, reductionDecls, - privateReductionVariables, isByRef))) + if (failed(createReductionsAndCleanup( + wsloopOp, builder, moduleTranslation, allocaIP, reductionDecls, + privateReductionVariables, isByRef, wsloopOp.getNowait(), + /*isTeamsReduction=*/false))) return failure(); return cleanupPrivateVars(builder, moduleTranslation, wsloopOp.getLoc(), @@ -2378,7 +2470,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::InsertPointOrErrorTy contInsertPoint = ompBuilder->createReductions(builder.saveIP(), allocaIP, - reductionInfos, isByRef, false); + reductionInfos, isByRef, false, false); if (!contInsertPoint) return contInsertPoint.takeError(); @@ -4161,6 +4253,37 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, if (failed(checkImplementationStatus(opInst))) return failure(); + /// Process teams op reduction in distribute if the reduction is contained in + /// the distribute op. + omp::TeamsOp teamsOp = opInst.getParentOfType<omp::TeamsOp>(); + bool doDistributeReduction = + teamsOp ? teamsReductionContainedInDistribute(teamsOp) : false; + + DenseMap<Value, llvm::Value *> reductionVariableMap; + unsigned numReductionVars = teamsOp ? teamsOp.getNumReductionVars() : 0; + SmallVector<omp::DeclareReductionOp> reductionDecls; + SmallVector<llvm::Value *> privateReductionVariables(numReductionVars); + llvm::ArrayRef<bool> isByRef; + + if (doDistributeReduction) { + isByRef = getIsByRef(teamsOp.getReductionByref()); + assert(isByRef.size() == teamsOp.getNumReductionVars()); + + collectReductionDecls(teamsOp, reductionDecls); + llvm::OpenMPIRBuilder::InsertPointTy allocaIP = + findAllocaInsertPoint(builder, moduleTranslation); + + MutableArrayRef<BlockArgument> reductionArgs = + llvm::cast<omp::BlockArgOpenMPOpInterface>(*teamsOp) + .getReductionBlockArgs(); + + if (failed(allocAndInitializeReductionVars( + teamsOp, reductionArgs, builder, moduleTranslation, allocaIP, + reductionDecls, privateReductionVariables, reductionVariableMap, + isByRef))) + return failure(); + } + using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) -> llvm::Error { @@ -4244,6 +4367,14 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, return failure(); builder.restoreIP(*afterIP); + + if (doDistributeReduction) { + // Process the reductions if required. 
+ return createReductionsAndCleanup( + teamsOp, builder, moduleTranslation, allocaIP, reductionDecls, + privateReductionVariables, isByRef, + /*isNoWait*/ false, /*isTeamsReduction*/ true); + } return success(); } @@ -4554,6 +4685,25 @@ static std::optional<int64_t> extractConstInteger(Value value) { return std::nullopt; } +static uint64_t getTypeByteSize(mlir::Type type, const DataLayout &dl) { + uint64_t sizeInBits = dl.getTypeSizeInBits(type); + uint64_t sizeInBytes = sizeInBits / 8; + return sizeInBytes; +} + +template <typename OpTy> +static uint64_t getReductionDataSize(OpTy &op) { + if (op.getNumReductionVars() > 0) { + assert(op.getNumReductionVars() && + "Only 1 reduction variable currently supported"); + mlir::Type reductionVarTy = op.getReductionVars()[0].getType(); + Operation *opp = op.getOperation(); + DataLayout dl = DataLayout(opp->getParentOfType<ModuleOp>()); + return getTypeByteSize(reductionVarTy, dl); + } + return 0; +} + /// Populate default `MinTeams`, `MaxTeams` and `MaxThreads` to their default /// values as stated by the corresponding clauses, if constant. /// @@ -4563,7 +4713,7 @@ static std::optional<int64_t> extractConstInteger(Value value) { static void initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp, llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &attrs, - bool isTargetDevice) { + bool isTargetDevice, bool isGPU) { // TODO: Handle constant 'if' clauses. Value numThreads, numTeamsLower, numTeamsUpper, threadLimit; @@ -4645,12 +4795,23 @@ initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp, (maxThreadsVal >= 0 && maxThreadsVal < combinedMaxThreadsVal)) combinedMaxThreadsVal = maxThreadsVal; + // Calculate reduction data size, limited to single reduction variable for + // now. + int32_t reductionDataSize = 0; + if (isGPU && capturedOp) { + if (auto teamsOp = castOrGetParentOfType<omp::TeamsOp>(capturedOp)) + reductionDataSize = getReductionDataSize(teamsOp); + } + // Update kernel bounds structure for the `OpenMPIRBuilder` to use. attrs.ExecFlags = targetOp.getKernelExecFlags(capturedOp); attrs.MinTeams = minTeamsVal; attrs.MaxTeams.front() = maxTeamsVal; attrs.MinThreads = 1; attrs.MaxThreads.front() = combinedMaxThreadsVal; + attrs.ReductionDataSize = reductionDataSize; + if (attrs.ReductionDataSize != 0) + attrs.ReductionBufferLength = 1024; } /// Gather LLVM runtime values for all clauses evaluated in the host that are @@ -4731,6 +4892,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); bool isTargetDevice = ompBuilder->Config.isTargetDevice(); + bool isGPU = ompBuilder->Config.isGPU(); auto parentFn = opInst.getParentOfType<LLVM::LLVMFuncOp>(); auto argIface = cast<omp::BlockArgOpenMPOpInterface>(opInst); @@ -4933,7 +5095,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs defaultAttrs; Operation *targetCapturedOp = targetOp.getInnermostCapturedOmpOp(); initTargetDefaultAttrs(targetOp, targetCapturedOp, defaultAttrs, - isTargetDevice); + isTargetDevice, isGPU); // Collect host-evaluated values needed to properly launch the kernel from the // host. >From 53216480793381000352b17bb675770374b5ed80 Mon Sep 17 00:00:00 2001 From: Jan Leyonberg <jan_sjo...@yahoo.com> Date: Thu, 27 Mar 2025 09:48:50 -0400 Subject: [PATCH 04/10] Remove todo test. 
--- mlir/test/Target/LLVMIR/openmp-todo.mlir | 28 ------------------------ 1 file changed, 28 deletions(-) diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir index af31f8bab73ac..7eafe396082e4 100644 --- a/mlir/test/Target/LLVMIR/openmp-todo.mlir +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -536,34 +536,6 @@ llvm.func @teams_private(%x : !llvm.ptr) { // ----- -omp.declare_reduction @add_f32 : f32 -init { -^bb0(%arg: f32): - %0 = llvm.mlir.constant(0.0 : f32) : f32 - omp.yield (%0 : f32) -} -combiner { -^bb1(%arg0: f32, %arg1: f32): - %1 = llvm.fadd %arg0, %arg1 : f32 - omp.yield (%1 : f32) -} -atomic { -^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): - %2 = llvm.load %arg3 : !llvm.ptr -> f32 - llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32 - omp.yield -} -llvm.func @teams_reduction(%x : !llvm.ptr) { - // expected-error@below {{not yet implemented: Unhandled clause reduction in omp.teams operation}} - // expected-error@below {{LLVM Translation failed for operation: omp.teams}} - omp.teams reduction(@add_f32 %x -> %prv : !llvm.ptr) { - omp.terminator - } - llvm.return -} - -// ----- - llvm.func @wsloop_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { // expected-error@below {{not yet implemented: Unhandled clause allocate in omp.wsloop operation}} // expected-error@below {{LLVM Translation failed for operation: omp.wsloop}} >From d2aadda98a4eb29aefeb4390fedea1f9610e5608 Mon Sep 17 00:00:00 2001 From: Jan Leyonberg <jan_sjo...@yahoo.com> Date: Thu, 27 Mar 2025 15:02:01 -0400 Subject: [PATCH 05/10] Add fix for tripcount. --- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 6eb3ae7b60eb9..578ac1326010a 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -4497,10 +4497,23 @@ getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, static void createTargetLoopWorkshareCall( OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, - Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) { - Type *TripCountTy = TripCount->getType(); + Type *ParallelTaskPtr, Value *TripCountOrig, Function &LoopBodyFn) { Module &M = OMPBuilder->M; IRBuilder<> &Builder = OMPBuilder->Builder; + Value *TripCount = TripCountOrig; + // FIXME(JAN): The trip count is 1 larger than it should be for GPU, this may + // not be the right way to fix it, but this works for now. + if (OMPBuilder->Config.isGPU()) { + Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())}); + LLVMContext &Ctx = M.getContext(); + Type *IVTy = TripCountOrig->getType(); + Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 + ? Type::getInt32Ty(Ctx) + : Type::getInt64Ty(Ctx); + Constant *One = ConstantInt::get(InternalIVTy, 1); + TripCount = Builder.CreateSub(TripCountOrig, One, "modified_trip_count"); + } + Type *TripCountTy = TripCount->getType(); FunctionCallee RTLFn = getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType); SmallVector<Value *, 8> RealArgs; >From fc7043ab2b7d292d8de15dd22ba7a298c26682dc Mon Sep 17 00:00:00 2001 From: Jan Leyonberg <jan_sjo...@yahoo.com> Date: Thu, 27 Mar 2025 15:24:19 -0400 Subject: [PATCH 06/10] Add offload runtime tests. 
--- .../basic-target-parallel-reduction.f90 | 27 +++++++++++++++++++ .../basic-target-teams-parallel-reduction.f90 | 27 +++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 offload/test/offloading/fortran/basic-target-parallel-reduction.f90 create mode 100644 offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90 diff --git a/offload/test/offloading/fortran/basic-target-parallel-reduction.f90 b/offload/test/offloading/fortran/basic-target-parallel-reduction.f90 new file mode 100644 index 0000000000000..ce2bb714c8d0f --- /dev/null +++ b/offload/test/offloading/fortran/basic-target-parallel-reduction.f90 @@ -0,0 +1,27 @@ +! Basic offloading test with a target region +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-generic +! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic +program main + use omp_lib + integer :: error = 0 + integer :: i + integer :: sum = 0 + + !$omp target parallel do reduction(+:sum) + do i = 1, 100 + sum = sum + i + end do + !$omp end target parallel do + + if (sum /= 5050) then + error = 1 + endif + + print *,"number of errors: ", error + +end program main + +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} +! CHECK: number of errors: 0 diff --git a/offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90 b/offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90 new file mode 100644 index 0000000000000..950887bf05f66 --- /dev/null +++ b/offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90 @@ -0,0 +1,27 @@ +! Basic offloading test with a target region +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-generic +! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic +program main + use omp_lib + integer :: error = 0 + integer :: i + integer :: sum = 0 + + !$omp target teams distribute parallel do reduction(+:sum) + do i = 1, 1000 + sum = sum + i + end do + !$omp end target teams distribute parallel do + + if (sum /= 500500) then + error = 1 + endif + + print *,"number of errors: ", error + +end program main + +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} +! CHECK: number of errors: 0 >From 37c6b7bcb96c66c1e5e918c537a1c79b814b0ce4 Mon Sep 17 00:00:00 2001 From: Jan Leyonberg <jan_sjo...@yahoo.com> Date: Thu, 27 Mar 2025 16:20:57 -0400 Subject: [PATCH 07/10] Fix tests, add new teams reduction test. 
--- .../LLVMIR/omptarget-parallel-wsloop.mlir | 2 +- .../LLVMIR/omptarget-wsloop-collapsed.mlir | 2 +- mlir/test/Target/LLVMIR/omptarget-wsloop.mlir | 4 +- .../Target/LLVMIR/openmp-teams-reduction.mlir | 71 +++++++++++++++++++ 4 files changed, 75 insertions(+), 4 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/openmp-teams-reduction.mlir diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir index 649210795ff5c..0e6e1c3b83bf1 100644 --- a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir @@ -36,7 +36,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK-SAME: ptr %[[ARG_PTR:.*]]) // CHECK-SAME: #[[ATTRS1:[0-9]+]] // CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB]] to ptr), -// CHECK-SAME: ptr @[[LOOP_BODY_FUNC:.*]], ptr %[[LOO_BODY_FUNC_ARG:.*]], i32 10, +// CHECK-SAME: ptr @[[LOOP_BODY_FUNC:.*]], ptr %[[LOO_BODY_FUNC_ARG:.*]], i32 9, // CHECK-SAME: i32 %[[THREAD_NUM:.*]], i32 0) // CHECK: define internal void @[[LOOP_BODY_FUNC]](i32 %[[CNT:.*]], ptr %[[LOOP_BODY_ARG_PTR:.*]]) #[[ATTRS2:[0-9]+]] { diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir index b7aecec308ef3..2213a5b7a4709 100644 --- a/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir @@ -24,7 +24,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: define void @[[FUNC_COLLAPSED_WSLOOP:.*]](ptr %[[ARG0:.*]]) // CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), -// CHECK-SAME: ptr @[[COLLAPSED_WSLOOP_BODY_FN:.*]], ptr %[[STRUCT_ARG:.*]], i32 10000, +// CHECK-SAME: ptr @[[COLLAPSED_WSLOOP_BODY_FN:.*]], ptr %[[STRUCT_ARG:.*]], i32 9999, // CHECK-SAME: i32 %[[NUM_THREADS:.*]], i32 0) // CHECK: define internal void @[[COLLAPSED_WSLOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]]) diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir index e2a8d88bd181a..845647593108f 100644 --- a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir @@ -37,7 +37,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: %[[GEP:.*]] = getelementptr { ptr }, ptr addrspace(5) %[[STRUCTARG]], i32 0, i32 0 // CHECK: store ptr %[[ARG0]], ptr addrspace(5) %[[GEP]], align 8 // CHECK: %[[NUM_THREADS:.*]] = call i32 @omp_get_num_threads() -// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), ptr @[[LOOP_BODY_FN:.*]], ptr %[[STRUCTARG_ASCAST]], i32 10, i32 %[[NUM_THREADS]], i32 0) +// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), ptr @[[LOOP_BODY_FN:.*]], ptr %[[STRUCTARG_ASCAST]], i32 9, i32 %[[NUM_THREADS]], i32 0) // CHECK: define internal void @[[LOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]]) // CHECK: %[[GEP2:.*]] = getelementptr { ptr }, ptr %[[LOOP_BODY_ARG]], i32 0, i32 0 @@ -46,6 +46,6 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: store i32 %[[VAL0:.*]], ptr %[[GEP3]], align 4 // CHECK: define void @[[FUNC_EMPTY_WSLOOP:.*]]() -// CHECK: call void 
@__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0) +// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 9, i32 %[[NUM_THREADS:.*]], i32 0) // CHECK: define internal void @[[LOOP_EMPTY_BODY_FN]](i32 %[[LOOP_CNT:.*]]) diff --git a/mlir/test/Target/LLVMIR/openmp-teams-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-teams-reduction.mlir new file mode 100644 index 0000000000000..854723050b035 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-teams-reduction.mlir @@ -0,0 +1,71 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +// Only check the overall shape of the code and the presence of relevant +// runtime calls. Actual IR checking is done at the OpenMPIRBuilder level. + +omp.private {type = private} @_QFsimple_teams_reductionEindex__private_i32 : i32 +omp.declare_reduction @add_reduction_i32 : i32 init { +^bb0(%arg0: i32): + %0 = llvm.mlir.constant(0 : i32) : i32 + omp.yield(%0 : i32) +} combiner { +^bb0(%arg0: i32, %arg1: i32): + %0 = llvm.add %arg0, %arg1 : i32 + omp.yield(%0 : i32) +} +llvm.func @simple_teams_reduction_() attributes {fir.internal_name = "_QPsimple_teams_reduction", frame_pointer = #llvm.framePointerKind<all>, target_cpu = "x86-64"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "sum"} : (i64) -> !llvm.ptr + %2 = llvm.mlir.constant(1 : i64) : i64 + %3 = llvm.alloca %2 x i32 {bindc_name = "index_"} : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(10000 : i32) : i32 + %5 = llvm.mlir.constant(1 : i32) : i32 + %6 = llvm.mlir.constant(0 : i32) : i32 + %7 = llvm.mlir.constant(1 : i64) : i64 + %8 = llvm.mlir.constant(1 : i64) : i64 + llvm.store %6, %1 : i32, !llvm.ptr + omp.teams reduction(@add_reduction_i32 %1 -> %arg0 : !llvm.ptr) { + omp.distribute private(@_QFsimple_teams_reductionEindex__private_i32 %3 -> %arg1 : !llvm.ptr) { + omp.loop_nest (%arg2) : i32 = (%5) to (%4) inclusive step (%5) { + llvm.store %arg2, %arg1 : i32, !llvm.ptr + %9 = llvm.load %arg0 : !llvm.ptr -> i32 + %10 = llvm.load %arg1 : !llvm.ptr -> i32 + %11 = llvm.add %9, %10 : i32 + llvm.store %11, %arg0 : i32, !llvm.ptr + omp.yield + } + } + omp.terminator + } + llvm.return +} +// Call to outlined function +// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams +// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Outlined function. +// CHECK: define internal void @[[OUTLINED]] + +// Private reduction variable and its initialization. +// CHECK: %[[PRIVATE:.+]] = alloca i32 +// CHECK: store i32 0, ptr %[[PRIVATE]] + +// Call to the reduction function. +// CHECK: call i32 @__kmpc_reduce +// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Atomic version not generated +// CHECK: unreachable + +// Non atomic version +// CHECK: call void @__kmpc_end_reduce + +// Finalize +// CHECK: br label %[[FINALIZE:.+]] + +// CHECK: [[FINALIZE]]: +// CHECK: call void @__kmpc_barrier + +// Reduction function. +// CHECK: define internal void @[[REDFUNC]] +// CHECK: add i32 >From c198b85dfce8c06a6514c092cfd2a74cb8292dcb Mon Sep 17 00:00:00 2001 From: Jan Leyonberg <jan_sjo...@yahoo.com> Date: Thu, 27 Mar 2025 16:23:38 -0400 Subject: [PATCH 08/10] Fix comment. 
--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 578ac1326010a..b5e55dbccf464 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -4501,8 +4501,9 @@ static void createTargetLoopWorkshareCall( Module &M = OMPBuilder->M; IRBuilder<> &Builder = OMPBuilder->Builder; Value *TripCount = TripCountOrig; - // FIXME(JAN): The trip count is 1 larger than it should be for GPU, this may - // not be the right way to fix it, but this works for now. + // The trip count is 1 larger than it should be for GPU, this is because + // of how the deviceRTL functions work with clang. TODO: make the trip + // count consistent between both so we don't have to subtract one here. if (OMPBuilder->Config.isGPU()) { Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())}); LLVMContext &Ctx = M.getContext(); >From 1be881a0ffb8542796df8b9a17de38ca519d5067 Mon Sep 17 00:00:00 2001 From: Jan Leyonberg <jan_sjo...@yahoo.com> Date: Thu, 27 Mar 2025 16:41:47 -0400 Subject: [PATCH 09/10] Fix comments --- .../LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 155ea3f920617..84c4ee15ee8b0 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1017,15 +1017,10 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs, // variable allocated in the inlined region) llvm::Value *var = builder.CreateAlloca( moduleTranslation.convertType(reductionDecls[i].getType())); - // var->setName("private_redvar"); llvm::Type *ptrTy = llvm::PointerType::getUnqual(builder.getContext()); llvm::Value *castVar = builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy); - // TODO: I (Sergio) just guessed casting phis[0] like it's done for var is - // what's supposed to happen with this code coming from a merge from main, - // but I don't actually know. Someone more familiar with it needs to check - // this. llvm::Value *castPhi = builder.CreatePointerBitCastOrAddrSpaceCast(phis[0], ptrTy); @@ -1039,7 +1034,6 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs, "allocaction is implicit for by-val reduction"); llvm::Value *var = builder.CreateAlloca( moduleTranslation.convertType(reductionDecls[i].getType())); - // var->setName("private_redvar"); llvm::Type *ptrTy = llvm::PointerType::getUnqual(builder.getContext()); llvm::Value *castVar = >From d1ecf5f77c2a7172d2fe412b6200906ff24a0fb0 Mon Sep 17 00:00:00 2001 From: Jan Leyonberg <jan_sjo...@yahoo.com> Date: Fri, 28 Mar 2025 07:54:54 -0400 Subject: [PATCH 10/10] Fix tests. 
--- llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp | 1 + .../offloading/fortran/basic-target-parallel-reduction.f90 | 4 ++-- .../fortran/basic-target-teams-parallel-reduction.f90 | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 27c0e0bf80255..2d3d318be7ff1 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -2354,6 +2354,7 @@ TEST_F(OpenMPIRBuilderTest, StaticWorkshareLoopTarget) { "256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"); OpenMPIRBuilder OMPBuilder(*M); OMPBuilder.Config.IsTargetDevice = true; + OMPBuilder.Config.setIsGPU(false); OMPBuilder.initialize(); IRBuilder<> Builder(BB); OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL}); diff --git a/offload/test/offloading/fortran/basic-target-parallel-reduction.f90 b/offload/test/offloading/fortran/basic-target-parallel-reduction.f90 index ce2bb714c8d0f..cb84bcd3462cf 100644 --- a/offload/test/offloading/fortran/basic-target-parallel-reduction.f90 +++ b/offload/test/offloading/fortran/basic-target-parallel-reduction.f90 @@ -18,8 +18,8 @@ program main if (sum /= 5050) then error = 1 endif - - print *,"number of errors: ", error + + print *,"number of errors: ", error end program main diff --git a/offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90 b/offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90 index 950887bf05f66..fab4950452478 100644 --- a/offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90 +++ b/offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90 @@ -18,8 +18,8 @@ program main if (sum /= 500500) then error = 1 endif - - print *,"number of errors: ", error + + print *,"number of errors: ", error end program main