[llvm-branch-commits] [clang] release/21.x: [Clang][CMake] Use IRPGO instead of FE PGO for Cmake Caches (#155957) (PR #156271)
boomanaiden154 wrote: This makes it easier to use the new options in the CI container, since we build from the release branch. This should be a relatively safe backport, given how small the change is and how infrequently these CMake caches change at this point. https://github.com/llvm/llvm-project/pull/156271 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] release/21.x: [Clang][CMake] Use IRPGO instead of FE PGO for Cmake Caches (#155957) (PR #156271)
https://github.com/mtrofin approved this pull request. lgtm, but you probably want the release branch owner to lgtm, too https://github.com/llvm/llvm-project/pull/156271 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64] Split large loop dependence masks (PR #153187)
@@ -5247,50 +5247,85 @@ SDValue AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - uint64_t EltSize = Op.getConstantOperandVal(2); - EVT VT = Op.getValueType(); - switch (EltSize) { - case 1: -if (VT != MVT::v16i8 && VT != MVT::nxv16i1) - return SDValue(); -break; - case 2: -if (VT != MVT::v8i8 && VT != MVT::nxv8i1) - return SDValue(); -break; - case 4: -if (VT != MVT::v4i16 && VT != MVT::nxv4i1) - return SDValue(); -break; - case 8: -if (VT != MVT::v2i32 && VT != MVT::nxv2i1) - return SDValue(); -break; - default: -// Other element sizes are incompatible with whilewr/rw, so expand instead -return SDValue(); - } + assert((Subtarget->hasSVE2() || + (Subtarget->hasSME() && Subtarget->isStreaming())) && + "Lowering loop_dependence_raw_mask or loop_dependence_war_mask " + "requires SVE or SME"); + + uint64_t EltSizeInBytes = Op.getConstantOperandVal(2); + // Other element sizes are incompatible with whilewr/rw, so expand instead + if (!is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes)) +return SDValue(); + + EVT FullVT = Op.getValueType(); + EVT ExtractVT = FullVT; + EVT EltVT = MVT::getIntegerVT(EltSizeInBytes * 8); + unsigned NumElements = FullVT.getVectorMinNumElements(); + unsigned PredElements = getPackedSVEVectorVT(EltVT).getVectorMinNumElements(); + bool Split = NumElements > PredElements; + + if (EltSizeInBytes * NumElements < 16) +// The element size and vector length combination must at least form a +// 128-bit vector. 
Shorter vector lengths can be widened then extracted +FullVT = FullVT.getDoubleNumVectorElementsVT(*DAG.getContext()); + + auto LowerToWhile = [&](EVT VT, unsigned AddrScale) { +SDValue PtrA = Op.getOperand(0); +SDValue PtrB = Op.getOperand(1); + +if (AddrScale > 0) { + unsigned Offset = + VT.getVectorMinNumElements() * EltSizeInBytes * AddrScale; + SDValue Addend; + + if (VT.isScalableVT()) +Addend = DAG.getVScale(DL, MVT::i64, APInt(64, Offset)); + else +Addend = DAG.getConstant(Offset, DL, MVT::i64); - SDValue PtrA = Op.getOperand(0); - SDValue PtrB = Op.getOperand(1); + PtrA = DAG.getNode(ISD::ADD, DL, MVT::i64, PtrA, Addend); +} - if (VT.isScalableVT()) -return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2)); +if (VT.isScalableVT()) + return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2)); - // We can use the SVE whilewr/whilerw instruction to lower this - // intrinsic by creating the appropriate sequence of scalable vector - // operations and then extracting a fixed-width subvector from the scalable - // vector. Scalable vector variants are already legal. - EVT ContainerVT = - EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - VT.getVectorNumElements(), true); - EVT WhileVT = ContainerVT.changeElementType(MVT::i1); +// We can use the SVE whilewr/whilerw instruction to lower this +// intrinsic by creating the appropriate sequence of scalable vector +// operations and then extracting a fixed-width subvector from the scalable +// vector. Scalable vector variants are already legal. 
+EVT ContainerVT = +EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorNumElements(), true); +EVT WhileVT = ContainerVT.changeElementType(MVT::i1); - SDValue Mask = - DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2)); - SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt, - DAG.getVectorIdxConstant(0, DL)); +SDValue Mask = +DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2)); +SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask); +return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt, + DAG.getVectorIdxConstant(0, DL)); + }; + + SDValue Result; + if (!Split) { +Result = LowerToWhile(FullVT, 0); SamTebbs33 wrote: It was being called to re-use the containerisation, not needed now with the new approach. https://github.com/llvm/llvm-project/pull/153187 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64] Split large loop dependence masks (PR #153187)
@@ -5247,50 +5247,85 @@ SDValue AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - uint64_t EltSize = Op.getConstantOperandVal(2); - EVT VT = Op.getValueType(); - switch (EltSize) { - case 1: -if (VT != MVT::v16i8 && VT != MVT::nxv16i1) - return SDValue(); -break; - case 2: -if (VT != MVT::v8i8 && VT != MVT::nxv8i1) - return SDValue(); -break; - case 4: -if (VT != MVT::v4i16 && VT != MVT::nxv4i1) - return SDValue(); -break; - case 8: -if (VT != MVT::v2i32 && VT != MVT::nxv2i1) - return SDValue(); -break; - default: -// Other element sizes are incompatible with whilewr/rw, so expand instead -return SDValue(); - } + assert((Subtarget->hasSVE2() || + (Subtarget->hasSME() && Subtarget->isStreaming())) && + "Lowering loop_dependence_raw_mask or loop_dependence_war_mask " + "requires SVE or SME"); + + uint64_t EltSizeInBytes = Op.getConstantOperandVal(2); + // Other element sizes are incompatible with whilewr/rw, so expand instead + if (!is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes)) +return SDValue(); + + EVT FullVT = Op.getValueType(); + EVT ExtractVT = FullVT; + EVT EltVT = MVT::getIntegerVT(EltSizeInBytes * 8); + unsigned NumElements = FullVT.getVectorMinNumElements(); + unsigned PredElements = getPackedSVEVectorVT(EltVT).getVectorMinNumElements(); + bool Split = NumElements > PredElements; + + if (EltSizeInBytes * NumElements < 16) +// The element size and vector length combination must at least form a +// 128-bit vector. Shorter vector lengths can be widened then extracted +FullVT = FullVT.getDoubleNumVectorElementsVT(*DAG.getContext()); + + auto LowerToWhile = [&](EVT VT, unsigned AddrScale) { SamTebbs33 wrote: Done. https://github.com/llvm/llvm-project/pull/153187 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64] Split large loop dependence masks (PR #153187)
@@ -5286,41 +5285,44 @@ AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op, PtrA = DAG.getNode(ISD::ADD, DL, MVT::i64, PtrA, Addend); } -if (VT.isScalableVT()) - return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2)); - -// We can use the SVE whilewr/whilerw instruction to lower this -// intrinsic by creating the appropriate sequence of scalable vector -// operations and then extracting a fixed-width subvector from the scalable -// vector. Scalable vector variants are already legal. -EVT ContainerVT = -EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - VT.getVectorNumElements(), true); -EVT WhileVT = ContainerVT.changeElementType(MVT::i1); - -SDValue Mask = -DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2)); -SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask); -return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt, - DAG.getVectorIdxConstant(0, DL)); +return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2)); }; SDValue Result; - if (!Split) { -Result = LowerToWhile(FullVT, 0); - } else { - + if (Split) { SamTebbs33 wrote: Yeah I separated the splitting and containerisation logic as per the request [here](https://github.com/llvm/llvm-project/pull/153187/commits/85b52942d61712aba884c12a60b98cbdaee2b233#r2304356700). I've experimented with adding another lambda that containerises without re-entering and that seems to have fixed the codegen. https://github.com/llvm/llvm-project/pull/153187 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang-tools-extra] [clangd] Use HeuristicResolver to try to resolve dependent 'auto' (PR #156283)
llvmbot wrote: @llvm/pr-subscribers-clangd Author: Nathan Ridge (HighCommander4) Changes Fixes https://github.com/clangd/clangd/issues/2431 --- Full diff: https://github.com/llvm/llvm-project/pull/156283.diff 9 Files Affected: - (modified) clang-tools-extra/clangd/AST.cpp (+17-4) - (modified) clang-tools-extra/clangd/AST.h (+3-1) - (modified) clang-tools-extra/clangd/Hover.cpp (+3-1) - (modified) clang-tools-extra/clangd/XRefs.cpp (+7-4) - (modified) clang-tools-extra/clangd/refactor/tweaks/ExpandDeducedType.cpp (+2-1) - (modified) clang-tools-extra/clangd/unittests/ASTTests.cpp (+2-1) - (modified) clang-tools-extra/clangd/unittests/HoverTests.cpp (+2-2) - (modified) clang-tools-extra/clangd/unittests/XRefsTests.cpp (+9-1) - (modified) clang-tools-extra/clangd/unittests/tweaks/ExpandDeducedTypeTests.cpp (+2-2) ``diff diff --git a/clang-tools-extra/clangd/AST.cpp b/clang-tools-extra/clangd/AST.cpp index 2f46ecc92576c..b96a84519e78c 100644 --- a/clang-tools-extra/clangd/AST.cpp +++ b/clang-tools-extra/clangd/AST.cpp @@ -29,6 +29,7 @@ #include "clang/Basic/SourceManager.h" #include "clang/Basic/Specifiers.h" #include "clang/Index/USRGeneration.h" +#include "clang/Sema/HeuristicResolver.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" @@ -479,10 +480,12 @@ namespace { /// a deduced type set. The AST should be improved to simplify this scenario. 
class DeducedTypeVisitor : public RecursiveASTVisitor { SourceLocation SearchedLocation; + const HeuristicResolver *Resolver; public: - DeducedTypeVisitor(SourceLocation SearchedLocation) - : SearchedLocation(SearchedLocation) {} + DeducedTypeVisitor(SourceLocation SearchedLocation, + const HeuristicResolver *Resolver) + : SearchedLocation(SearchedLocation), Resolver(Resolver) {} // Handle auto initializers: //- auto i = 1; @@ -499,6 +502,14 @@ class DeducedTypeVisitor : public RecursiveASTVisitor { return true; if (auto *AT = D->getType()->getContainedAutoType()) { + if (AT->isUndeducedAutoType()) { +if (const auto *VD = dyn_cast(D)) { + if (Resolver && VD->hasInit()) { +DeducedType = Resolver->resolveExprToType(VD->getInit()); +return true; + } +} + } DeducedType = AT->desugar(); } return true; @@ -608,10 +619,12 @@ class DeducedTypeVisitor : public RecursiveASTVisitor { }; } // namespace -std::optional getDeducedType(ASTContext &ASTCtx, SourceLocation Loc) { +std::optional getDeducedType(ASTContext &ASTCtx, + const HeuristicResolver *Resolver, + SourceLocation Loc) { if (!Loc.isValid()) return {}; - DeducedTypeVisitor V(Loc); + DeducedTypeVisitor V(Loc, Resolver); V.TraverseAST(ASTCtx); if (V.DeducedType.isNull()) return std::nullopt; diff --git a/clang-tools-extra/clangd/AST.h b/clang-tools-extra/clangd/AST.h index 1538d12172593..2b83595e5b8e9 100644 --- a/clang-tools-extra/clangd/AST.h +++ b/clang-tools-extra/clangd/AST.h @@ -31,6 +31,7 @@ namespace clang { class SourceManager; class Decl; class DynTypedNode; +class HeuristicResolver; namespace clangd { @@ -167,7 +168,8 @@ QualType declaredType(const TypeDecl *D); /// Retrieves the deduced type at a given location (auto, decltype). /// It will return the underlying type. /// If the type is an undeduced auto, returns the type itself. 
-std::optional getDeducedType(ASTContext &, SourceLocation Loc); +std::optional getDeducedType(ASTContext &, const HeuristicResolver *, + SourceLocation Loc); // Find the abbreviated-function-template `auto` within a type, or returns null. // Similar to getContainedAutoTypeLoc, but these `auto`s are diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index 9eec322fe5963..138544dea99a1 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -1309,7 +1309,9 @@ std::optional getHover(ParsedAST &AST, Position Pos, } } else if (Tok.kind() == tok::kw_auto || Tok.kind() == tok::kw_decltype) { HoverCountMetric.record(1, "keyword"); - if (auto Deduced = getDeducedType(AST.getASTContext(), Tok.location())) { + if (auto Deduced = + getDeducedType(AST.getASTContext(), AST.getHeuristicResolver(), + Tok.location())) { HI = getDeducedTypeHoverContents(*Deduced, Tok, AST.getASTContext(), PP, Index); HighlightRange = Tok.range(SM).toCharRange(SM); diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp index a253a630a48cc..18bc29426df29 100644 --- a/clang-tools-extra/clangd/XRefs.cpp +++ b/clang-tools-extra/clangd/XRefs.cpp @@ -806,7 +806,9 @@ std::vector locateSymbolAt(ParsedAST &AST, Po
[llvm-branch-commits] [clang-tools-extra] [clangd] Use HeuristicResolver to try to resolve dependent 'auto' (PR #156283)
HighCommander4 wrote: Depends on https://github.com/llvm/llvm-project/pull/156282 https://github.com/llvm/llvm-project/pull/156283 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [flang] [flang][OpenMP] Extend `do concurrent` mapping to device (PR #155987)
https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/155987 >From 0373863b919e59130dcf57593f4283ece0dff12a Mon Sep 17 00:00:00 2001 From: ergawy Date: Fri, 29 Aug 2025 02:04:49 -0500 Subject: [PATCH] [flang][OpenMP] Extend `do concurrent` mapping to device Upstreams further parts of `do concurrent` to OpenMP conversion pass from AMD's fork. This PR extends the pass by adding support for mapping to the device. --- flang/lib/Optimizer/OpenMP/CMakeLists.txt | 1 + .../OpenMP/DoConcurrentConversion.cpp | 400 +- .../Transforms/DoConcurrent/basic_device.f90 | 83 .../Transforms/DoConcurrent/basic_device.mlir | 10 +- 4 files changed, 476 insertions(+), 18 deletions(-) create mode 100644 flang/test/Transforms/DoConcurrent/basic_device.f90 diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt index e0aebd0714c8f..b85ee7e861a4f 100644 --- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt @@ -26,6 +26,7 @@ add_flang_library(FlangOpenMPTransforms FIRSupport FortranSupport HLFIRDialect + FortranUtils MLIR_DEPS ${dialect_libs} diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index c928b76065ade..e975b86a6ba0d 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -6,17 +6,22 @@ // //===--===// +#include "flang/Optimizer/Builder/DirectivesCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/HLFIRTools.h" #include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/OpenMP/Passes.h" #include "flang/Optimizer/OpenMP/Utils.h" #include "flang/Support/OpenMP-utils.h" +#include "flang/Utils/OpenMP.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include 
"mlir/IR/IRMapping.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/RegionUtils.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" namespace flangomp { #define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS @@ -107,6 +112,33 @@ struct InductionVariableInfo { using InductionVariableInfos = llvm::SmallVector; +/// Collect the list of values used inside the loop but defined outside of it. +void collectLoopLiveIns(fir::DoConcurrentLoopOp loop, +llvm::SmallVectorImpl &liveIns) { + llvm::SmallDenseSet seenValues; + llvm::SmallDenseSet seenOps; + + for (auto [lb, ub, st] : llvm::zip_equal( + loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) { +liveIns.push_back(lb); +liveIns.push_back(ub); +liveIns.push_back(st); + } + + mlir::visitUsedValuesDefinedAbove( + loop.getRegion(), [&](mlir::OpOperand *operand) { +if (!seenValues.insert(operand->get()).second) + return; + +mlir::Operation *definingOp = operand->get().getDefiningOp(); +// We want to collect ops corresponding to live-ins only once. +if (definingOp && !seenOps.insert(definingOp).second) + return; + +liveIns.push_back(operand->get()); + }); +} + /// Collects values that are local to a loop: "loop-local values". A loop-local /// value is one that is used exclusively inside the loop but allocated outside /// of it. 
This usually corresponds to temporary values that are used inside the @@ -182,10 +214,6 @@ class DoConcurrentConversion mlir::LogicalResult matchAndRewrite(fir::DoConcurrentOp doLoop, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const override { -if (mapToDevice) - return doLoop.emitError( - "not yet implemented: Mapping `do concurrent` loops to device"); - looputils::InductionVariableInfos ivInfos; auto loop = mlir::cast( doLoop.getRegion().back().getTerminator()); @@ -196,20 +224,72 @@ class DoConcurrentConversion for (mlir::Value indVar : *indVars) ivInfos.emplace_back(loop, indVar); +llvm::SmallVector loopNestLiveIns; +looputils::collectLoopLiveIns(loop, loopNestLiveIns); +assert(!loopNestLiveIns.empty()); + llvm::SetVector locals; looputils::collectLoopLocalValues(loop, locals); +// We do not want to map "loop-local" values to the device through +// `omp.map.info` ops. Therefore, we remove them from the list of live-ins. +loopNestLiveIns.erase(llvm::remove_if(loopNestLiveIns, + [&](mlir::Value liveIn) { +return locals.contains(liveIn); + }), + loopNestLiveIns.end()); + +mlir::omp::TargetOp targetOp;
[llvm-branch-commits] [flang] [flang][OpenMP] `do concurrent` to device mapping lit tests (PR #155992)
https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/155992 >From 3dd383be4ab7c3a03da321f1127e056f52eecff0 Mon Sep 17 00:00:00 2001 From: ergawy Date: Fri, 29 Aug 2025 03:53:51 -0500 Subject: [PATCH] [flang][OpenMP] `do concurrent` to device mapping lit tests Adds more lit tests for `do concurrent` device mapping. --- .../Transforms/DoConcurrent/allocatable.f90 | 29 + .../Transforms/DoConcurrent/host_eval.f90 | 63 +++ .../DoConcurrent/locally_destroyed_temp.f90 | 43 --- .../DoConcurrent/map_shape_info.f90 | 104 + .../multiple_iteration_ranges.f90 | 106 +++--- .../DoConcurrent/non_reference_to_device.f90 | 34 ++ .../DoConcurrent/not_perfectly_nested.f90 | 66 +++ .../DoConcurrent/runtime_sized_array.f90 | 42 +++ .../DoConcurrent/skip_all_nested_loops.f90| 68 +++ 9 files changed, 478 insertions(+), 77 deletions(-) create mode 100644 flang/test/Transforms/DoConcurrent/allocatable.f90 create mode 100644 flang/test/Transforms/DoConcurrent/host_eval.f90 create mode 100644 flang/test/Transforms/DoConcurrent/map_shape_info.f90 create mode 100644 flang/test/Transforms/DoConcurrent/non_reference_to_device.f90 create mode 100644 flang/test/Transforms/DoConcurrent/runtime_sized_array.f90 create mode 100644 flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90 diff --git a/flang/test/Transforms/DoConcurrent/allocatable.f90 b/flang/test/Transforms/DoConcurrent/allocatable.f90 new file mode 100644 index 0..03962f150eb95 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/allocatable.f90 @@ -0,0 +1,29 @@ +! Verifies that proper `omp.map.bounds` ops are emitted when an allocatable is +! implicitly mapped by a `do concurrent` loop. + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \ +! 
RUN: | FileCheck %s +program main + implicit none + + integer,parameter :: n = 100 + real, allocatable, dimension(:) :: y + integer :: i + + allocate(y(1:n)) + + do concurrent(i=1:n) + y(i) = 42 + end do + + deallocate(y) +end program main + +! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEy"} +! CHECK: %[[Y_VAL:.*]] = fir.load %[[Y_DECL]]#0 +! CHECK: %[[Y_DIM0:.*]]:3 = fir.box_dims %[[Y_VAL]], %{{c0_.*}} +! CHECK: %[[Y_LB:.*]] = arith.constant 0 : index +! CHECK: %[[Y_UB:.*]] = arith.subi %[[Y_DIM0]]#1, %{{c1_.*}} : index +! CHECK: %[[Y_BOUNDS:.*]] = omp.map.bounds lower_bound(%[[Y_LB]] : index) upper_bound(%[[Y_UB]] : index) extent(%[[Y_DIM0]]#1 : index) +! CHECK: %[[MEM_MAP:.*]] = omp.map.info {{.*}} bounds(%[[Y_BOUNDS]]) +! CHECK: omp.map.info var_ptr(%[[Y_DECL]]#1 : {{.*}}) {{.*}} members(%[[MEM_MAP]] : {{.*}}) diff --git a/flang/test/Transforms/DoConcurrent/host_eval.f90 b/flang/test/Transforms/DoConcurrent/host_eval.f90 new file mode 100644 index 0..7d16a91ae6941 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/host_eval.f90 @@ -0,0 +1,63 @@ +! Tests `host_eval` clause code-gen and loop nest bounds on host vs. device. + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \ +! RUN: -fdo-concurrent-to-openmp=device %s -o - \ +! RUN: | FileCheck %s --check-prefix=HOST -vv + +! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir -fopenmp\ +! RUN: -fopenmp-is-target-device -fdo-concurrent-to-openmp=device %s -o - \ +! RUN: | FileCheck %s --check-prefix=DEVICE + +program do_concurrent_host_eval +implicit none +integer :: i, j + +do concurrent (i=1:10, j=1:20) +end do +end program do_concurrent_host_eval + +! HOST: omp.target host_eval( +! HOST-SAME:%{{[^[:space:]]+}} -> %[[I_LB:[^,]+]], +! HOST-SAME:%{{[^[:space:]]+}} -> %[[I_UB:[^,]+]], +! HOST-SAME:%{{[^[:space:]]+}} -> %[[I_ST:[^,]+]], +! HOST-SAME:%{{[^[:space:]]+}} -> %[[J_LB:[^,]+]], +! 
HOST-SAME:%{{[^[:space:]]+}} -> %[[J_UB:[^,]+]], +! HOST-SAME:%{{[^[:space:]]+}} -> %[[J_ST:[^,]+]] : {{.*}}) map_entries + +! HOST: omp.loop_nest ({{.*}}, {{.*}}) : index = (%[[I_LB]], %[[J_LB]]) to +! HOST-SAME:(%[[I_UB]], %[[J_UB]]) inclusive step +! HOST-SAME:(%[[I_ST]], %[[J_ST]]) + +! DEVICE: omp.target map_entries( +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[I_LB_MAP:[^,]+]], +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[I_UB_MAP:[^,]+]], +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[I_ST_MAP:[^,]+]], + +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[J_LB_MAP:[^,]+]], +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[J_UB_MAP:[^,]+]], +! DEVICE-SAME: %{{[^[:space:]]+}} -> %[[J_ST_MAP:[^,]+]], + +! DEVICE-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}}, +! DEVICE-SAME: %{{[^[:space:]]+}} -> %{{[^,]+}} : {{.*}}) + +! DEVICE: %[[I_LB_DECL:.*]]:2 = hlfir.declare %[[I_LB_MAP]] +! DEVICE: %[[I_LB:.*]] = fir.load %[[I_LB_DECL]]#1 : !fir.ref + +! DEVICE: %[[I_UB_DECL:.*]]:2 = hlfir.declare %[[I_UB_MAP]
[llvm-branch-commits] [llvm] [flang][do concurrent] Add saxpy offload tests for OpenMP mapping (PR #155993)
https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/155993 >From f1bbd24a5069458c008736404c000c5334424b46 Mon Sep 17 00:00:00 2001 From: ergawy Date: Fri, 29 Aug 2025 04:04:07 -0500 Subject: [PATCH] [flang][do concurent] Add saxpy offload tests for OpenMP mapping Adds end-to-end tests for `do concurrent` offloading to the device. --- .../fortran/do-concurrent-to-omp-saxpy-2d.f90 | 53 +++ .../fortran/do-concurrent-to-omp-saxpy.f90| 53 +++ 2 files changed, 106 insertions(+) create mode 100644 offload/test/offloading/fortran/do-concurrent-to-omp-saxpy-2d.f90 create mode 100644 offload/test/offloading/fortran/do-concurrent-to-omp-saxpy.f90 diff --git a/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy-2d.f90 b/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy-2d.f90 new file mode 100644 index 0..c6f576acb90b6 --- /dev/null +++ b/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy-2d.f90 @@ -0,0 +1,53 @@ +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-generic -fdo-concurrent-to-openmp=device +! 
RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic +module saxpymod + use iso_fortran_env + public :: saxpy +contains + +subroutine saxpy(a, x, y, n, m) + use iso_fortran_env + implicit none + integer,intent(in) :: n, m + real(kind=real32),intent(in) :: a + real(kind=real32), dimension(:,:),intent(in) :: x + real(kind=real32), dimension(:,:),intent(inout) :: y + integer :: i, j + + do concurrent(i=1:n, j=1:m) + y(i,j) = a * x(i,j) + y(i,j) + end do + + write(*,*) "plausibility check:" + write(*,'("y(1,1) ",f8.6)') y(1,1) + write(*,'("y(n,m) ",f8.6)') y(n,m) +end subroutine saxpy + +end module saxpymod + +program main + use iso_fortran_env + use saxpymod, ONLY:saxpy + implicit none + + integer,parameter :: n = 1000, m=1 + real(kind=real32), allocatable, dimension(:,:) :: x, y + real(kind=real32) :: a + integer :: i + + allocate(x(1:n,1:m), y(1:n,1:m)) + a = 2.0_real32 + x(:,:) = 1.0_real32 + y(:,:) = 2.0_real32 + + call saxpy(a, x, y, n, m) + + deallocate(x,y) +end program main + +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} +! CHECK: plausibility check: +! CHECK: y(1,1) 4.0 +! CHECK: y(n,m) 4.0 diff --git a/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy.f90 b/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy.f90 new file mode 100644 index 0..e094a1d7459ef --- /dev/null +++ b/offload/test/offloading/fortran/do-concurrent-to-omp-saxpy.f90 @@ -0,0 +1,53 @@ +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-generic -fdo-concurrent-to-openmp=device +! 
RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic +module saxpymod + use iso_fortran_env + public :: saxpy +contains + +subroutine saxpy(a, x, y, n) + use iso_fortran_env + implicit none + integer,intent(in) :: n + real(kind=real32),intent(in) :: a + real(kind=real32), dimension(:),intent(in) :: x + real(kind=real32), dimension(:),intent(inout) :: y + integer :: i + + do concurrent(i=1:n) + y(i) = a * x(i) + y(i) + end do + + write(*,*) "plausibility check:" + write(*,'("y(1) ",f8.6)') y(1) + write(*,'("y(n) ",f8.6)') y(n) +end subroutine saxpy + +end module saxpymod + +program main + use iso_fortran_env + use saxpymod, ONLY:saxpy + implicit none + + integer,parameter :: n = 1000 + real(kind=real32), allocatable, dimension(:) :: x, y + real(kind=real32) :: a + integer :: i + + allocate(x(1:n), y(1:n)) + a = 2.0_real32 + x(:) = 1.0_real32 + y(:) = 2.0_real32 + + call saxpy(a, x, y, n) + + deallocate(x,y) +end program main + +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} +! CHECK: plausibility check: +! CHECK: y(1) 4.0 +! CHECK: y(n) 4.0 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] release/21.x: [Clang][CMake] Use IRPGO instead of FE PGO for Cmake Caches (#155957) (PR #156271)
https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/156271 Backport 7fca1f841b4c226d50ab7bad64de5db225d4193b Requested by: @boomanaiden154 >From 08383fb44eba580749dc2254effc5b1b1f96f4c4 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 29 Aug 2025 10:06:26 -0700 Subject: [PATCH] [Clang][CMake] Use IRPGO instead of FE PGO for Cmake Caches (#155957) Currently the clang CMake caches use FE PGO for instrumentation (LLVM_BUILD_INSTRUMENTED=ON). However, IRPGO is generally regarded as better for performance. I am measuring about a 1.5% performance gain when building libLLVMSupport.a using this configuration versus what existed before this commit. I would suspect the gains are larger on other platforms like Windows where we cannot subsume any gains using PLO. (cherry picked from commit 7fca1f841b4c226d50ab7bad64de5db225d4193b) --- clang/cmake/caches/PGO.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/cmake/caches/PGO.cmake b/clang/cmake/caches/PGO.cmake index 15bc755d110d1..d6471160037c1 100644 --- a/clang/cmake/caches/PGO.cmake +++ b/clang/cmake/caches/PGO.cmake @@ -5,7 +5,7 @@ set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "") set(LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "") set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "") -set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED ON CACHE BOOL "") +set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED IR CACHE BOOL "") set(CLANG_BOOTSTRAP_TARGETS generate-profdata stage2 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] release/21.x: [Clang][CMake] Use IRPGO instead of FE PGO for Cmake Caches (#155957) (PR #156271)
https://github.com/llvmbot milestoned https://github.com/llvm/llvm-project/pull/156271 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] release/21.x: [Clang][CMake] Use IRPGO instead of FE PGO for Cmake Caches (#155957) (PR #156271)
llvmbot wrote: @llvm/pr-subscribers-clang Author: None (llvmbot) Changes Backport 7fca1f841b4c226d50ab7bad64de5db225d4193b Requested by: @boomanaiden154 --- Full diff: https://github.com/llvm/llvm-project/pull/156271.diff 1 Files Affected: - (modified) clang/cmake/caches/PGO.cmake (+1-1) ```diff diff --git a/clang/cmake/caches/PGO.cmake b/clang/cmake/caches/PGO.cmake index 15bc755d110d1..d6471160037c1 100644 --- a/clang/cmake/caches/PGO.cmake +++ b/clang/cmake/caches/PGO.cmake @@ -5,7 +5,7 @@ set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "") set(LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "") set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "") -set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED ON CACHE BOOL "") +set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED IR CACHE BOOL "") set(CLANG_BOOTSTRAP_TARGETS generate-profdata stage2 ``` https://github.com/llvm/llvm-project/pull/156271 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] release/21.x: [Clang][CMake] Use IRPGO instead of FE PGO for Cmake Caches (#155957) (PR #156271)
llvmbot wrote: @mtrofin What do you think about merging this PR to the release branch? https://github.com/llvm/llvm-project/pull/156271 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang-tools-extra] [clangd] Show type hint for simple cases of dependent 'auto' (PR #156284)
https://github.com/HighCommander4 created https://github.com/llvm/llvm-project/pull/156284 Fixes https://github.com/clangd/clangd/issues/2275 >From 67b81f57250655c41112b3c3601ca537119df5cc Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: Mon, 1 Sep 2025 02:48:44 -0400 Subject: [PATCH] [clangd] Show type hint for simple cases of dependent 'auto' Fixes https://github.com/clangd/clangd/issues/2275 --- clang-tools-extra/clangd/InlayHints.cpp | 27 ++- .../clangd/unittests/InlayHintTests.cpp | 3 ++- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/clang-tools-extra/clangd/InlayHints.cpp b/clang-tools-extra/clangd/InlayHints.cpp index cd479e1b7c9bc..819e597873d83 100644 --- a/clang-tools-extra/clangd/InlayHints.cpp +++ b/clang-tools-extra/clangd/InlayHints.cpp @@ -633,13 +633,26 @@ class InlayHintVisitor : public RecursiveASTVisitor { } if (auto *AT = D->getType()->getContainedAutoType()) { - if (AT->isDeduced() && !D->getType()->isDependentType()) { -// Our current approach is to place the hint on the variable -// and accordingly print the full type -// (e.g. for `const auto& x = 42`, print `const int&`). -// Alternatively, we could place the hint on the `auto` -// (and then just print the type deduced for the `auto`). -addTypeHint(D->getLocation(), D->getType(), /*Prefix=*/": "); + if (AT->isDeduced()) { +QualType T; +if (D->getType()->isDependentType()) { + if (D->hasInit()) { +QualType Resolved = Resolver->resolveExprToType(D->getInit()); +if (Resolved != AST.DependentTy) { + T = Resolved; +} + } +} else { + T = D->getType(); +} +if (!T.isNull()) { + // Our current approach is to place the hint on the variable + // and accordingly print the full type + // (e.g. for `const auto& x = 42`, print `const int&`). + // Alternatively, we could place the hint on the `auto` + // (and then just print the type deduced for the `auto`). 
+ addTypeHint(D->getLocation(), T, /*Prefix=*/": "); +} } } diff --git a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp index 99e728c40063d..feb4404b3d2bf 100644 --- a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp +++ b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp @@ -1441,7 +1441,8 @@ TEST(TypeHints, DependentType) { void bar(T arg) { auto [a, b] = arg; } - )cpp"); + )cpp", + ExpectedHint{": T", "var2"}); } TEST(TypeHints, LongTypeName) { ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits