https://github.com/madhur13490 updated https://github.com/llvm/llvm-project/pull/142686
>From e2f53441f1a2d35af7c0d6d177afea79c7af2c47 Mon Sep 17 00:00:00 2001 From: Sebastian Pop <s...@nvidia.com> Date: Thu, 22 May 2025 13:50:38 +0000 Subject: [PATCH 01/10] add -floop-fuse to clang and flang --- clang/include/clang/Basic/CodeGenOptions.def | 1 + clang/include/clang/Driver/Options.td | 4 ++++ clang/lib/CodeGen/BackendUtil.cpp | 2 ++ clang/lib/Driver/ToolChains/Clang.cpp | 1 + clang/lib/Driver/ToolChains/CommonArgs.cpp | 10 ++++++++-- clang/lib/Driver/ToolChains/Flang.cpp | 2 ++ clang/lib/Frontend/CompilerInvocation.cpp | 6 ++++++ clang/test/Driver/clang_f_opts.c | 7 +++++++ flang/docs/ReleaseNotes.md | 4 ++++ flang/include/flang/Frontend/CodeGenOptions.def | 1 + flang/lib/Frontend/CompilerInvocation.cpp | 3 +++ flang/lib/Frontend/FrontendActions.cpp | 1 + flang/test/Driver/loop-fuse.f90 | 17 +++++++++++++++++ llvm/include/llvm/Passes/PassBuilder.h | 3 +++ llvm/lib/Passes/PassBuilderPipelines.cpp | 13 ++++++++++++- 15 files changed, 72 insertions(+), 3 deletions(-) create mode 100644 flang/test/Driver/loop-fuse.f90 diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index fda0da99b60c0..4c4f0970dfff0 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -322,6 +322,7 @@ CODEGENOPT(TimeTrace , 1, 0, Benign) ///< Set when -ftime-trace is enabl VALUE_CODEGENOPT(TimeTraceGranularity, 32, 500, Benign) ///< Minimum time granularity (in microseconds), ///< traced by time profiler CODEGENOPT(InterchangeLoops , 1, 0, Benign) ///< Run loop-interchange. +CODEGENOPT(FuseLoops , 1, 0, Benign) ///< Run loop-fuse. CODEGENOPT(UnrollLoops , 1, 0, Benign) ///< Control whether loops are unrolled. CODEGENOPT(RerollLoops , 1, 0, Benign) ///< Control whether loops are rerolled. CODEGENOPT(NoUseJumpTables , 1, 0, Benign) ///< Set when -fno-jump-tables is enabled. diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index a7c514e809aa9..13a8502c6c7b5 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4304,6 +4304,10 @@ def floop_interchange : Flag<["-"], "floop-interchange">, Group<f_Group>, HelpText<"Enable the loop interchange pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def fno_loop_interchange: Flag<["-"], "fno-loop-interchange">, Group<f_Group>, HelpText<"Disable the loop interchange pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; +def floop_fuse : Flag<["-"], "floop-fuse">, Group<f_Group>, + HelpText<"Enable the loop fuse pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; +def fno_loop_fuse: Flag<["-"], "fno-loop-fuse">, Group<f_Group>, + HelpText<"Disable the loop fuse pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def funroll_loops : Flag<["-"], "funroll-loops">, Group<f_Group>, HelpText<"Turn on loop unroller">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def fno_unroll_loops : Flag<["-"], "fno-unroll-loops">, Group<f_Group>, diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 3f095c03397fd..b16e619727617 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -896,6 +896,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline( PipelineTuningOptions PTO; PTO.LoopUnrolling = CodeGenOpts.UnrollLoops; PTO.LoopInterchange = CodeGenOpts.InterchangeLoops; + PTO.LoopFuse = CodeGenOpts.FuseLoops; // For historical reasons, loop interleaving is set to mirror setting for loop // unrolling. PTO.LoopInterleaving = CodeGenOpts.UnrollLoops; @@ -1331,6 +1332,7 @@ runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex, Conf.SampleProfile = std::move(SampleProfile); Conf.PTO.LoopUnrolling = CGOpts.UnrollLoops; Conf.PTO.LoopInterchange = CGOpts.InterchangeLoops; + Conf.PTO.LoopFuse = CGOpts.FuseLoops; // For historical reasons, loop interleaving is set to mirror setting for loop // unrolling. Conf.PTO.LoopInterleaving = CGOpts.UnrollLoops; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 946b1e39af3b9..dcfba95558fe6 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6854,6 +6854,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_unroll_loops); Args.AddLastArg(CmdArgs, options::OPT_floop_interchange, options::OPT_fno_loop_interchange); + Args.AddLastArg(CmdArgs, options::OPT_floop_fuse, options::OPT_fno_loop_fuse); Args.AddLastArg(CmdArgs, options::OPT_fstrict_flex_arrays_EQ); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index b50549219e4e8..a3c75d1009daa 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -3334,8 +3334,14 @@ void tools::handleVectorizeSLPArgs(const ArgList &Args, void tools::handleInterchangeLoopsArgs(const ArgList &Args, ArgStringList &CmdArgs) { - if (Args.hasFlag(options::OPT_floop_interchange, - options::OPT_fno_loop_interchange, false)) + // FIXME: Instead of relying on shouldEnableVectorizerAtOLevel, we may want to + // implement a separate function to infer loop interchange from opt level. + // For now, enable loop-interchange at the same opt levels as loop-vectorize. + bool EnableInterchange = shouldEnableVectorizerAtOLevel(Args, false); + OptSpecifier InterchangeAliasOption = + EnableInterchange ? options::OPT_O_Group : options::OPT_floop_interchange; + if (Args.hasFlag(options::OPT_floop_interchange, InterchangeAliasOption, + options::OPT_fno_loop_interchange, EnableInterchange)) CmdArgs.push_back("-floop-interchange"); } diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 1535f4cebf436..77bc221d7b709 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -151,6 +151,8 @@ void Flang::addCodegenOptions(const ArgList &Args, !stackArrays->getOption().matches(options::OPT_fno_stack_arrays)) CmdArgs.push_back("-fstack-arrays"); + Args.AddLastArg(CmdArgs, options::OPT_floop_fuse, options::OPT_fno_loop_fuse); + handleInterchangeLoopsArgs(Args, CmdArgs); handleVectorizeLoopsArgs(Args, CmdArgs); handleVectorizeSLPArgs(Args, CmdArgs); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 761310813f787..d6dcc9c24e7ad 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1680,6 +1680,11 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts, else GenerateArg(Consumer, OPT_fno_loop_interchange); + if (Opts.FuseLoops) + GenerateArg(Consumer, OPT_floop_fuse); + else + GenerateArg(Consumer, OPT_fno_loop_fuse); + if (!Opts.BinutilsVersion.empty()) GenerateArg(Consumer, OPT_fbinutils_version_EQ, Opts.BinutilsVersion); @@ -2001,6 +2006,7 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, (Opts.OptimizationLevel > 1)); Opts.InterchangeLoops = Args.hasFlag(OPT_floop_interchange, OPT_fno_loop_interchange, false); + Opts.FuseLoops = Args.hasFlag(OPT_floop_fuse, OPT_fno_loop_fuse, false); Opts.BinutilsVersion = std::string(Args.getLastArgValue(OPT_fbinutils_version_EQ)); diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c index ee7ded265769b..204d5e338190f 100644 --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -52,6 +52,13 @@ // CHECK-INTERCHANGE-LOOPS: "-floop-interchange" // CHECK-NO-INTERCHANGE-LOOPS: "-fno-loop-interchange" +// RUN: %clang -### -S -floop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-FUSE-LOOPS %s +// RUN: %clang -### -S -fno-loop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FUSE-LOOPS %s +// RUN: %clang -### -S -fno-loop-fuse -floop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-FUSE-LOOPS %s +// RUN: %clang -### -S -floop-fuse -fno-loop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FUSE-LOOPS %s +// CHECK-FUSE-LOOPS: "-floop-fuse" +// CHECK-NO-FUSE-LOOPS: "-fno-loop-fuse" + // RUN: %clang -### -S -fprofile-sample-accurate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-SAMPLE-ACCURATE %s // CHECK-PROFILE-SAMPLE-ACCURATE: "-fprofile-sample-accurate" diff --git a/flang/docs/ReleaseNotes.md b/flang/docs/ReleaseNotes.md index c9623ea08c4e6..516424f689762 100644 --- a/flang/docs/ReleaseNotes.md +++ b/flang/docs/ReleaseNotes.md @@ -35,6 +35,10 @@ page](https://llvm.org/releases/). ## New Compiler Flags +* -floop-interchange is now recognized by flang. +* -floop-interchange is enabled by default at -O2 and above. +* -floop-fuse is now recognized by flang. + ## Windows Support ## Fortran Language Changes in Flang diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def index cdeea93c9aecb..49e4d292a13e1 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.def +++ b/flang/include/flang/Frontend/CodeGenOptions.def @@ -43,6 +43,7 @@ CODEGENOPT(StackArrays, 1, 0) ///< -fstack-arrays (enable the stack-arrays pass) CODEGENOPT(VectorizeLoop, 1, 0) ///< Enable loop vectorization. CODEGENOPT(VectorizeSLP, 1, 0) ///< Enable SLP vectorization. CODEGENOPT(InterchangeLoops, 1, 0) ///< Enable loop interchange. +CODEGENOPT(FuseLoops, 1, 0) ///< Enable loop fuse. CODEGENOPT(LoopVersioning, 1, 0) ///< Enable loop versioning. CODEGENOPT(UnrollLoops, 1, 0) ///< Enable loop unrolling CODEGENOPT(AliasAnalysis, 1, 0) ///< Enable alias analysis pass diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 6295a58b1bdad..d46b4c83e8bf4 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -276,6 +276,9 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, if (args.getLastArg(clang::driver::options::OPT_floop_interchange)) opts.InterchangeLoops = 1; + if (args.getLastArg(clang::driver::options::OPT_floop_fuse)) + opts.FuseLoops = 1; + if (args.getLastArg(clang::driver::options::OPT_vectorize_loops)) opts.VectorizeLoop = 1; diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 3bef6b1c31825..0c031e59e44f6 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -958,6 +958,7 @@ void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) { si.getTimePasses().setOutStream(ci.getTimingStreamLLVM()); pto.LoopUnrolling = opts.UnrollLoops; pto.LoopInterchange = opts.InterchangeLoops; + pto.LoopFuse = opts.FuseLoops; pto.LoopInterleaving = opts.UnrollLoops; pto.LoopVectorization = opts.VectorizeLoop; pto.SLPVectorization = opts.VectorizeSLP; diff --git a/flang/test/Driver/loop-fuse.f90 b/flang/test/Driver/loop-fuse.f90 new file mode 100644 index 0000000000000..240d00fdb62d7 --- /dev/null +++ b/flang/test/Driver/loop-fuse.f90 @@ -0,0 +1,17 @@ +! RUN: %flang -### -S -floop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-FUSE %s +! RUN: %flang -### -S -fno-loop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s +! RUN: %flang -### -S -O0 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s +! RUN: %flang -### -S -O1 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s +! RUN: %flang -### -S -O2 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s +! RUN: %flang -### -S -O3 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s +! RUN: %flang -### -S -Os %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s +! RUN: %flang -### -S -Oz %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s +! CHECK-LOOP-FUSE: "-floop-fuse" +! CHECK-NO-LOOP-FUSE-NOT: "-floop-fuse" +! RUN: %flang_fc1 -emit-llvm -O2 -floop-fuse -mllvm -print-pipeline-passes -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-FUSE-PASS %s +! RUN: %flang_fc1 -emit-llvm -O2 -fno-loop-fuse -mllvm -print-pipeline-passes -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE-PASS %s +! CHECK-LOOP-FUSE-PASS: loop-fusion +! CHECK-NO-LOOP-FUSE-PASS-NOT: loop-fusion + +program test +end program diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h index 9cdb7ca7dbc9b..5870f9e6baeac 100644 --- a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -65,6 +65,9 @@ class PipelineTuningOptions { /// false. bool LoopInterchange; + /// Tuning option to enable/disable loop fuse. Its default value is false. + bool LoopFuse; + /// Tuning option to forget all SCEV loops in LoopUnroll. Its default value /// is that of the flag: `-forget-scev-loop-unroll`. bool ForgetAllSCEVInLoopUnroll; diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 98821bb1408a7..fe077198dfb06 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -104,6 +104,7 @@ #include "llvm/Transforms/Scalar/LoopDeletion.h" #include "llvm/Transforms/Scalar/LoopDistribute.h" #include "llvm/Transforms/Scalar/LoopFlatten.h" +#include "llvm/Transforms/Scalar/LoopFuse.h" #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" #include "llvm/Transforms/Scalar/LoopInstSimplify.h" #include "llvm/Transforms/Scalar/LoopInterchange.h" @@ -204,6 +205,10 @@ static cl::opt<bool> EnableLoopInterchange("enable-loopinterchange", cl::init(false), cl::Hidden, cl::desc("Enable the LoopInterchange Pass")); +static cl::opt<bool> EnableLoopFuse("enable-loopfuse", cl::init(false), + cl::Hidden, + cl::desc("Enable the LoopFuse Pass")); + static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam", cl::init(false), cl::Hidden, cl::desc("Enable Unroll And Jam Pass")); @@ -313,6 +318,7 @@ PipelineTuningOptions::PipelineTuningOptions() { SLPVectorization = false; LoopUnrolling = true; LoopInterchange = EnableLoopInterchange; + LoopFuse = EnableLoopFuse; ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll; LicmMssaOptCap = SetLicmMssaOptCap; LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap; @@ -514,6 +520,9 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, invokeLoopOptimizerEndEPCallbacks(LPM2, Level); + if (PTO.LoopFuse) + FPM.addPass(LoopFusePass()); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); @@ -703,6 +712,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, invokeLoopOptimizerEndEPCallbacks(LPM2, Level); + if (PTO.LoopFuse) + FPM.addPass(LoopFusePass()); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); @@ -2115,7 +2127,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, LPM.addPass(LoopFlattenPass()); LPM.addPass(IndVarSimplifyPass()); LPM.addPass(LoopDeletionPass()); - // FIXME: Add loop interchange. // Unroll small loops and perform peeling. LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), >From 7b4fdc4e4d61e4ad6b9d76c560c1aaa07cb3dcb9 Mon Sep 17 00:00:00 2001 From: Sebastian Pop <s...@nvidia.com> Date: Thu, 12 Jun 2025 09:36:51 +0000 Subject: [PATCH 02/10] rename -floop-fuse / -fexperimental-fuse-loops --- clang/include/clang/Basic/CodeGenOptions.def | 2 +- clang/include/clang/Driver/Options.td | 8 ++++---- clang/lib/CodeGen/BackendUtil.cpp | 4 ++-- clang/lib/Driver/ToolChains/Clang.cpp | 3 ++- clang/lib/Driver/ToolChains/CommonArgs.cpp | 2 +- clang/lib/Driver/ToolChains/Flang.cpp | 3 ++- clang/lib/Frontend/CompilerInvocation.cpp | 7 ++++--- clang/test/Driver/clang_f_opts.c | 12 +++++------ flang/docs/ReleaseNotes.md | 4 +--- .../include/flang/Frontend/CodeGenOptions.def | 2 +- flang/lib/Frontend/CompilerInvocation.cpp | 2 +- flang/lib/Frontend/FrontendActions.cpp | 2 +- flang/test/Driver/loop-fuse.f90 | 12 +++++------ llvm/include/llvm/Passes/PassBuilder.h | 4 ++-- llvm/lib/Passes/PassBuilderPipelines.cpp | 20 +++++++++---------- 15 files changed, 43 insertions(+), 44 deletions(-) diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 4c4f0970dfff0..872f73ebf3810 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -322,7 +322,7 @@ CODEGENOPT(TimeTrace , 1, 0, Benign) ///< Set when -ftime-trace is enabl VALUE_CODEGENOPT(TimeTraceGranularity, 32, 500, Benign) ///< Minimum time granularity (in microseconds), ///< traced by time profiler CODEGENOPT(InterchangeLoops , 1, 0, Benign) ///< Run loop-interchange. -CODEGENOPT(FuseLoops , 1, 0, Benign) ///< Run loop-fuse. +CODEGENOPT(FuseLoops , 1, 0, Benign) ///< Run loop-fusion. CODEGENOPT(UnrollLoops , 1, 0, Benign) ///< Control whether loops are unrolled. CODEGENOPT(RerollLoops , 1, 0, Benign) ///< Control whether loops are rerolled. CODEGENOPT(NoUseJumpTables , 1, 0, Benign) ///< Set when -fno-jump-tables is enabled. diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 13a8502c6c7b5..3ac5bc1e2a477 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4304,10 +4304,10 @@ def floop_interchange : Flag<["-"], "floop-interchange">, Group<f_Group>, HelpText<"Enable the loop interchange pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def fno_loop_interchange: Flag<["-"], "fno-loop-interchange">, Group<f_Group>, HelpText<"Disable the loop interchange pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; -def floop_fuse : Flag<["-"], "floop-fuse">, Group<f_Group>, - HelpText<"Enable the loop fuse pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; -def fno_loop_fuse: Flag<["-"], "fno-loop-fuse">, Group<f_Group>, - HelpText<"Disable the loop fuse pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; +def fexperimental_loop_fusion : Flag<["-"], "fexperimental-loop-fusion">, Group<f_Group>, + HelpText<"Enable the experimental loop fusion pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; +def fno_experimental_loop_fusion: Flag<["-"], "fno-experimental-loop-fusion">, Group<f_Group>, + HelpText<"Disable the experimental loop fusion pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def funroll_loops : Flag<["-"], "funroll-loops">, Group<f_Group>, HelpText<"Turn on loop unroller">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def fno_unroll_loops : Flag<["-"], "fno-unroll-loops">, Group<f_Group>, diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index b16e619727617..8c99af2bdff83 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -896,7 +896,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline( PipelineTuningOptions PTO; PTO.LoopUnrolling = CodeGenOpts.UnrollLoops; PTO.LoopInterchange = CodeGenOpts.InterchangeLoops; - PTO.LoopFuse = CodeGenOpts.FuseLoops; + PTO.LoopFusion = CodeGenOpts.FuseLoops; // For historical reasons, loop interleaving is set to mirror setting for loop // unrolling. PTO.LoopInterleaving = CodeGenOpts.UnrollLoops; @@ -1332,7 +1332,7 @@ runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex, Conf.SampleProfile = std::move(SampleProfile); Conf.PTO.LoopUnrolling = CGOpts.UnrollLoops; Conf.PTO.LoopInterchange = CGOpts.InterchangeLoops; - Conf.PTO.LoopFuse = CGOpts.FuseLoops; + Conf.PTO.LoopFusion = CGOpts.FuseLoops; // For historical reasons, loop interleaving is set to mirror setting for loop // unrolling. Conf.PTO.LoopInterleaving = CGOpts.UnrollLoops; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index dcfba95558fe6..d3f503c8f6c77 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6854,7 +6854,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_unroll_loops); Args.AddLastArg(CmdArgs, options::OPT_floop_interchange, options::OPT_fno_loop_interchange); - Args.AddLastArg(CmdArgs, options::OPT_floop_fuse, options::OPT_fno_loop_fuse); + Args.AddLastArg(CmdArgs, options::OPT_fexperimental_loop_fusion, + options::OPT_fno_experimental_loop_fusion); Args.AddLastArg(CmdArgs, options::OPT_fstrict_flex_arrays_EQ); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index a3c75d1009daa..bbac9f1af3828 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -3334,7 +3334,7 @@ void tools::handleVectorizeSLPArgs(const ArgList &Args, void tools::handleInterchangeLoopsArgs(const ArgList &Args, ArgStringList &CmdArgs) { - // FIXME: Instead of relying on shouldEnableVectorizerAtOLevel, we may want to + // FIXME: instead of relying on shouldEnableVectorizerAtOLevel, we may want to // implement a separate function to infer loop interchange from opt level. // For now, enable loop-interchange at the same opt levels as loop-vectorize. bool EnableInterchange = shouldEnableVectorizerAtOLevel(Args, false); diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 77bc221d7b709..eae914ca68877 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -151,7 +151,8 @@ void Flang::addCodegenOptions(const ArgList &Args, !stackArrays->getOption().matches(options::OPT_fno_stack_arrays)) CmdArgs.push_back("-fstack-arrays"); - Args.AddLastArg(CmdArgs, options::OPT_floop_fuse, options::OPT_fno_loop_fuse); + Args.AddLastArg(CmdArgs, options::OPT_fexperimental_loop_fusion, + options::OPT_fno_experimental_loop_fusion); handleInterchangeLoopsArgs(Args, CmdArgs); handleVectorizeLoopsArgs(Args, CmdArgs); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index d6dcc9c24e7ad..2aec6dc02450c 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1681,9 +1681,9 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts, GenerateArg(Consumer, OPT_fno_loop_interchange); if (Opts.FuseLoops) - GenerateArg(Consumer, OPT_floop_fuse); + GenerateArg(Consumer, OPT_fexperimental_loop_fusion); else - GenerateArg(Consumer, OPT_fno_loop_fuse); + GenerateArg(Consumer, OPT_fno_experimental_loop_fusion); if (!Opts.BinutilsVersion.empty()) GenerateArg(Consumer, OPT_fbinutils_version_EQ, Opts.BinutilsVersion); @@ -2006,7 +2006,8 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, (Opts.OptimizationLevel > 1)); Opts.InterchangeLoops = Args.hasFlag(OPT_floop_interchange, OPT_fno_loop_interchange, false); - Opts.FuseLoops = Args.hasFlag(OPT_floop_fuse, OPT_fno_loop_fuse, false); + Opts.FuseLoops = Args.hasFlag(OPT_fexperimental_loop_fusion, + OPT_fno_experimental_loop_fusion, false); Opts.BinutilsVersion = std::string(Args.getLastArgValue(OPT_fbinutils_version_EQ)); diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c index 204d5e338190f..ca654cb2dd49b 100644 --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -52,12 +52,12 @@ // CHECK-INTERCHANGE-LOOPS: "-floop-interchange" // CHECK-NO-INTERCHANGE-LOOPS: "-fno-loop-interchange" -// RUN: %clang -### -S -floop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-FUSE-LOOPS %s -// RUN: %clang -### -S -fno-loop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FUSE-LOOPS %s -// RUN: %clang -### -S -fno-loop-fuse -floop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-FUSE-LOOPS %s -// RUN: %clang -### -S -floop-fuse -fno-loop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FUSE-LOOPS %s -// CHECK-FUSE-LOOPS: "-floop-fuse" -// CHECK-NO-FUSE-LOOPS: "-fno-loop-fuse" +// RUN: %clang -### -S -fexperimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-FUSE-LOOPS %s +// RUN: %clang -### -S -fno-experimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FUSE-LOOPS %s +// RUN: %clang -### -S -fno-experimental-loop-fusion -fexperimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-FUSE-LOOPS %s +// RUN: %clang -### -S -fexperimental-loop-fusion -fno-experimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FUSE-LOOPS %s +// CHECK-FUSE-LOOPS: "-fexperimental-loop-fusion" +// CHECK-NO-FUSE-LOOPS: "-fno-experimental-loop-fusion" // RUN: %clang -### -S -fprofile-sample-accurate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-SAMPLE-ACCURATE %s // CHECK-PROFILE-SAMPLE-ACCURATE: "-fprofile-sample-accurate" diff --git a/flang/docs/ReleaseNotes.md b/flang/docs/ReleaseNotes.md index 516424f689762..6a285f829053b 100644 --- a/flang/docs/ReleaseNotes.md +++ b/flang/docs/ReleaseNotes.md @@ -35,9 +35,7 @@ page](https://llvm.org/releases/). ## New Compiler Flags -* -floop-interchange is now recognized by flang. -* -floop-interchange is enabled by default at -O2 and above. -* -floop-fuse is now recognized by flang. +* -fexperimental-loop-fusion is now recognized by flang. ## Windows Support diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def index 49e4d292a13e1..f273dad9606a6 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.def +++ b/flang/include/flang/Frontend/CodeGenOptions.def @@ -43,7 +43,7 @@ CODEGENOPT(StackArrays, 1, 0) ///< -fstack-arrays (enable the stack-arrays pass) CODEGENOPT(VectorizeLoop, 1, 0) ///< Enable loop vectorization. CODEGENOPT(VectorizeSLP, 1, 0) ///< Enable SLP vectorization. CODEGENOPT(InterchangeLoops, 1, 0) ///< Enable loop interchange. -CODEGENOPT(FuseLoops, 1, 0) ///< Enable loop fuse. +CODEGENOPT(FuseLoops, 1, 0) ///< Enable loop fusion. CODEGENOPT(LoopVersioning, 1, 0) ///< Enable loop versioning. CODEGENOPT(UnrollLoops, 1, 0) ///< Enable loop unrolling CODEGENOPT(AliasAnalysis, 1, 0) ///< Enable alias analysis pass diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index d46b4c83e8bf4..4f42fbd66eac0 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -276,7 +276,7 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, if (args.getLastArg(clang::driver::options::OPT_floop_interchange)) opts.InterchangeLoops = 1; - if (args.getLastArg(clang::driver::options::OPT_floop_fuse)) + if (args.getLastArg(clang::driver::options::OPT_fexperimental_loop_fusion)) opts.FuseLoops = 1; if (args.getLastArg(clang::driver::options::OPT_vectorize_loops)) diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 0c031e59e44f6..23cc1e63e773d 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -958,7 +958,7 @@ void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) { si.getTimePasses().setOutStream(ci.getTimingStreamLLVM()); pto.LoopUnrolling = opts.UnrollLoops; pto.LoopInterchange = opts.InterchangeLoops; - pto.LoopFuse = opts.FuseLoops; + pto.LoopFusion = opts.FuseLoops; pto.LoopInterleaving = opts.UnrollLoops; pto.LoopVectorization = opts.VectorizeLoop; pto.SLPVectorization = opts.VectorizeSLP; diff --git a/flang/test/Driver/loop-fuse.f90 b/flang/test/Driver/loop-fuse.f90 index 240d00fdb62d7..ddfd9065e0fd4 100644 --- a/flang/test/Driver/loop-fuse.f90 +++ b/flang/test/Driver/loop-fuse.f90 @@ -1,15 +1,15 @@ -! RUN: %flang -### -S -floop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-FUSE %s -! RUN: %flang -### -S -fno-loop-fuse %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s +! RUN: %flang -### -S -fexperimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-FUSE %s +! RUN: %flang -### -S -fno-experimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s ! RUN: %flang -### -S -O0 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s ! RUN: %flang -### -S -O1 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s ! RUN: %flang -### -S -O2 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s ! RUN: %flang -### -S -O3 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s ! RUN: %flang -### -S -Os %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s ! RUN: %flang -### -S -Oz %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE %s -! CHECK-LOOP-FUSE: "-floop-fuse" -! CHECK-NO-LOOP-FUSE-NOT: "-floop-fuse" -! RUN: %flang_fc1 -emit-llvm -O2 -floop-fuse -mllvm -print-pipeline-passes -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-FUSE-PASS %s -! RUN: %flang_fc1 -emit-llvm -O2 -fno-loop-fuse -mllvm -print-pipeline-passes -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE-PASS %s +! CHECK-LOOP-FUSE: "-fexperimental-loop-fusion" +! CHECK-NO-LOOP-FUSE-NOT: "-fexperimental-loop-fusion" +! RUN: %flang_fc1 -emit-llvm -O2 -fexperimental-loop-fusion -mllvm -print-pipeline-passes -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-FUSE-PASS %s +! RUN: %flang_fc1 -emit-llvm -O2 -fno-experimental-loop-fusion -mllvm -print-pipeline-passes -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-FUSE-PASS %s ! CHECK-LOOP-FUSE-PASS: loop-fusion ! CHECK-NO-LOOP-FUSE-PASS-NOT: loop-fusion diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h index 5870f9e6baeac..2742ec1b71b7e 100644 --- a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -65,8 +65,8 @@ class PipelineTuningOptions { /// false. bool LoopInterchange; - /// Tuning option to enable/disable loop fuse. Its default value is false. - bool LoopFuse; + /// Tuning option to enable/disable loop fusion. Its default value is false. + bool LoopFusion; /// Tuning option to forget all SCEV loops in LoopUnroll. Its default value /// is that of the flag: `-forget-scev-loop-unroll`. diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index fe077198dfb06..390febda569c3 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -205,9 +205,9 @@ static cl::opt<bool> EnableLoopInterchange("enable-loopinterchange", cl::init(false), cl::Hidden, cl::desc("Enable the LoopInterchange Pass")); -static cl::opt<bool> EnableLoopFuse("enable-loopfuse", cl::init(false), - cl::Hidden, - cl::desc("Enable the LoopFuse Pass")); +static cl::opt<bool> EnableLoopFusion("enable-loopfusion", cl::init(false), + cl::Hidden, + cl::desc("Enable the LoopFusion Pass")); static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam", cl::init(false), cl::Hidden, @@ -318,7 +318,7 @@ PipelineTuningOptions::PipelineTuningOptions() { SLPVectorization = false; LoopUnrolling = true; LoopInterchange = EnableLoopInterchange; - LoopFuse = EnableLoopFuse; + LoopFusion = EnableLoopFusion; ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll; LicmMssaOptCap = SetLicmMssaOptCap; LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap; @@ -520,9 +520,6 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, invokeLoopOptimizerEndEPCallbacks(LPM2, Level); - if (PTO.LoopFuse) - FPM.addPass(LoopFusePass()); - FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); @@ -647,6 +644,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, if (EnableConstraintElimination) FPM.addPass(ConstraintEliminationPass()); + if (PTO.LoopFusion) + FPM.addPass(LoopFusePass()); + // Add the primary loop simplification pipeline. // FIXME: Currently this is split into two loop pass pipelines because we run // some function passes in between them. These can and should be removed @@ -712,9 +712,6 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, invokeLoopOptimizerEndEPCallbacks(LPM2, Level); - if (PTO.LoopFuse) - FPM.addPass(LoopFusePass()); - FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); @@ -2127,6 +2124,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, LPM.addPass(LoopFlattenPass()); LPM.addPass(IndVarSimplifyPass()); LPM.addPass(LoopDeletionPass()); + // FIXME: Add loop interchange. // Unroll small loops and perform peeling. LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), @@ -2366,4 +2364,4 @@ AAManager PassBuilder::buildDefaultAAPipeline() { bool PassBuilder::isInstrumentedPGOUse() const { return (PGOOpt && PGOOpt->Action == PGOOptions::IRUse) || !UseCtxProfile.empty(); -} \ No newline at end of file +} >From fe53e398179ec3d4f85e150b1fd829324f50e816 Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar <madh...@nvidia.com> Date: Thu, 31 Jul 2025 09:09:30 -0700 Subject: [PATCH 03/10] Address review comments --- llvm/lib/Passes/PassBuilderPipelines.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 390febda569c3..9dfe0180de0c4 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -207,7 +207,7 @@ static cl::opt<bool> static cl::opt<bool> EnableLoopFusion("enable-loopfusion", cl::init(false), cl::Hidden, - cl::desc("Enable the LoopFusion Pass")); + cl::desc("Enable the LoopFuse Pass")); static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam", cl::init(false), cl::Hidden, @@ -644,6 +644,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, if (EnableConstraintElimination) FPM.addPass(ConstraintEliminationPass()); + // FIXME: This may not be the right place in the pipeline. + // We need to have the data to support the right place. if (PTO.LoopFusion) FPM.addPass(LoopFusePass()); >From ab202f026e6505d35b63943b28dbd0b6fb7d08be Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar <madh...@nvidia.com> Date: Thu, 7 Aug 2025 06:42:52 -0700 Subject: [PATCH 04/10] change place of pass in pipeline --- llvm/lib/Passes/PassBuilderPipelines.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 9dfe0180de0c4..2df23c5dcf299 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -644,11 +644,6 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, if (EnableConstraintElimination) FPM.addPass(ConstraintEliminationPass()); - // FIXME: This may not be the right place in the pipeline. - // We need to have the data to support the right place. - if (PTO.LoopFusion) - FPM.addPass(LoopFusePass()); - // Add the primary loop simplification pipeline. // FIXME: Currently this is split into two loop pass pipelines because we run // some function passes in between them. These can and should be removed @@ -1559,6 +1554,11 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, if (PTO.LoopInterchange) LPM.addPass(LoopInterchangePass()); + // FIXME: This may not be the right place in the pipeline. + // We need to have the data to support the right place. + if (PTO.LoopFusion) + OptimizePM.addPass(LoopFusePass()); + OptimizePM.addPass(createFunctionToLoopPassAdaptor( std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); >From 3f88802809e8bcaafcb63b47887899bd3d34d7ab Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar <madh...@nvidia.com> Date: Fri, 8 Aug 2025 04:17:44 -0700 Subject: [PATCH 05/10] fixup! change place of pass in pipeline also add a RUN line for testing position in pass pipeline --- clang/include/clang/Driver/Options.td | 8 ++++---- clang/lib/Frontend/CompilerInvocation.cpp | 2 -- clang/test/Driver/clang_f_opts.c | 6 ++++++ llvm/lib/Passes/PassBuilderPipelines.cpp | 6 +++--- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 3ac5bc1e2a477..3b58267833009 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4304,10 +4304,10 @@ def floop_interchange : Flag<["-"], "floop-interchange">, Group<f_Group>, HelpText<"Enable the loop interchange pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def fno_loop_interchange: Flag<["-"], "fno-loop-interchange">, Group<f_Group>, HelpText<"Disable the loop interchange pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; -def fexperimental_loop_fusion : Flag<["-"], "fexperimental-loop-fusion">, Group<f_Group>, - HelpText<"Enable the experimental loop fusion pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; -def fno_experimental_loop_fusion: Flag<["-"], "fno-experimental-loop-fusion">, Group<f_Group>, - HelpText<"Disable the experimental loop fusion pass">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; +defm experimental_loop_fusion + : OptInCC1FFlag<"experimental-loop-fusion", "Enable", "Disable", + "Enable the loop fusion pass", + [ClangOption, CC1Option, FlangOption, FC1Option]>; def funroll_loops : Flag<["-"], "funroll-loops">, Group<f_Group>, HelpText<"Turn on loop unroller">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def fno_unroll_loops : Flag<["-"], "fno-unroll-loops">, Group<f_Group>, diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 2aec6dc02450c..422375240bab6 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1682,8 +1682,6 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts, if (Opts.FuseLoops) GenerateArg(Consumer, OPT_fexperimental_loop_fusion); - else - GenerateArg(Consumer, OPT_fno_experimental_loop_fusion); if (!Opts.BinutilsVersion.empty()) GenerateArg(Consumer, OPT_fbinutils_version_EQ, Opts.BinutilsVersion); diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c index ca654cb2dd49b..a9e8414db37a4 100644 --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -58,6 +58,12 @@ // RUN: %clang -### -S -fexperimental-loop-fusion -fno-experimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FUSE-LOOPS %s // CHECK-FUSE-LOOPS: "-fexperimental-loop-fusion" // CHECK-NO-FUSE-LOOPS: "-fno-experimental-loop-fusion" +// +// RUN: %clang -c -fexperimental-loop-fusion -mllvm -print-pipeline-passes -O3 %s 2>&1 | FileCheck --check-prefixes=LOOP-FUSION-ON %s +// RUN: %clang -c -mllvm -print-pipeline-passes -O3 %s 2>&1 | FileCheck --check-prefixes=LOOP-FUSION-OFF %s + +// LOOP-FUSION-ON: loop-fusion +// LOOP-FUSION-OFF-NOT: loop-fusion // RUN: %clang -### -S -fprofile-sample-accurate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-SAMPLE-ACCURATE %s // CHECK-PROFILE-SAMPLE-ACCURATE: "-fprofile-sample-accurate" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 2df23c5dcf299..f3e3fb2eafccb 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1554,14 +1554,14 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, if (PTO.LoopInterchange) LPM.addPass(LoopInterchangePass()); + OptimizePM.addPass(createFunctionToLoopPassAdaptor( + std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); + // FIXME: This may not be the right place in the pipeline. // We need to have the data to support the right place. if (PTO.LoopFusion) OptimizePM.addPass(LoopFusePass()); - OptimizePM.addPass(createFunctionToLoopPassAdaptor( - std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); - // Distribute loops to allow partial vectorization. I.e. isolate dependences // into separate loop that would otherwise inhibit vectorization. This is // currently only performed for loops marked with the metadata >From 9f264b4688ed54c9304777c5b3812a8141f58884 Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar <madh...@nvidia.com> Date: Thu, 28 Aug 2025 01:43:38 -0700 Subject: [PATCH 06/10] fixup! address review comments about optInFlag --- clang/include/clang/Driver/Options.td | 2 +- clang/lib/Driver/ToolChains/Clang.cpp | 2 +- clang/test/Driver/clang_f_opts.c | 4 ---- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 3b58267833009..47d328f862e07 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4307,7 +4307,7 @@ def fno_loop_interchange: Flag<["-"], "fno-loop-interchange">, Group<f_Group>, defm experimental_loop_fusion : OptInCC1FFlag<"experimental-loop-fusion", "Enable", "Disable", "Enable the loop fusion pass", - [ClangOption, CC1Option, FlangOption, FC1Option]>; + [ClangOption, FlangOption, FC1Option]>; def funroll_loops : Flag<["-"], "funroll-loops">, Group<f_Group>, HelpText<"Turn on loop unroller">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>; def fno_unroll_loops : Flag<["-"], "fno-unroll-loops">, Group<f_Group>, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index d3f503c8f6c77..dfcf06727847c 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6854,7 +6854,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_unroll_loops); Args.AddLastArg(CmdArgs, options::OPT_floop_interchange, options::OPT_fno_loop_interchange); - Args.AddLastArg(CmdArgs, options::OPT_fexperimental_loop_fusion, + Args.addOptInFlag(CmdArgs, options::OPT_fexperimental_loop_fusion, options::OPT_fno_experimental_loop_fusion); Args.AddLastArg(CmdArgs, options::OPT_fstrict_flex_arrays_EQ); diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c index a9e8414db37a4..eb3994ddabcd3 100644 --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -53,11 +53,7 @@ // CHECK-NO-INTERCHANGE-LOOPS: "-fno-loop-interchange" // RUN: %clang -### -S -fexperimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-FUSE-LOOPS %s -// RUN: %clang -### -S -fno-experimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FUSE-LOOPS %s -// RUN: %clang -### -S -fno-experimental-loop-fusion -fexperimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-FUSE-LOOPS %s -// RUN: %clang -### -S -fexperimental-loop-fusion -fno-experimental-loop-fusion %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FUSE-LOOPS %s // CHECK-FUSE-LOOPS: "-fexperimental-loop-fusion" -// CHECK-NO-FUSE-LOOPS: "-fno-experimental-loop-fusion" // // RUN: %clang -c -fexperimental-loop-fusion -mllvm -print-pipeline-passes -O3 %s 2>&1 | FileCheck --check-prefixes=LOOP-FUSION-ON %s // RUN: %clang -c -mllvm -print-pipeline-passes -O3 %s 2>&1 | FileCheck --check-prefixes=LOOP-FUSION-OFF %s >From 76780b04b1c718acf8be5f77b031c85316834818 Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar <madh...@nvidia.com> Date: Thu, 28 Aug 2025 10:04:33 -0700 Subject: [PATCH 07/10] fixup! addOptInFlag for Flang.cpp too. --- clang/lib/Driver/ToolChains/Flang.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index eae914ca68877..d3f4af164f672 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -151,8 +151,8 @@ void Flang::addCodegenOptions(const ArgList &Args, !stackArrays->getOption().matches(options::OPT_fno_stack_arrays)) CmdArgs.push_back("-fstack-arrays"); - Args.AddLastArg(CmdArgs, options::OPT_fexperimental_loop_fusion, - options::OPT_fno_experimental_loop_fusion); + Args.addOptInFlag(CmdArgs, options::OPT_fexperimental_loop_fusion, + options::OPT_fno_experimental_loop_fusion); handleInterchangeLoopsArgs(Args, CmdArgs); handleVectorizeLoopsArgs(Args, CmdArgs); >From 00d5ca55245daf6990905730461f66e34ca7e1ae Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar <madh...@nvidia.com> Date: Thu, 28 Aug 2025 22:09:03 -0700 Subject: [PATCH 08/10] fixup! alignments --- clang/lib/Driver/ToolChains/Clang.cpp | 2 +- flang/include/flang/Frontend/CodeGenOptions.def | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index dfcf06727847c..63efb0f02baa8 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6855,7 +6855,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.AddLastArg(CmdArgs, options::OPT_floop_interchange, options::OPT_fno_loop_interchange); Args.addOptInFlag(CmdArgs, options::OPT_fexperimental_loop_fusion, - options::OPT_fno_experimental_loop_fusion); + options::OPT_fno_experimental_loop_fusion); Args.AddLastArg(CmdArgs, options::OPT_fstrict_flex_arrays_EQ); diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def index f273dad9606a6..edab48a70d29d 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.def +++ b/flang/include/flang/Frontend/CodeGenOptions.def @@ -43,7 +43,7 @@ CODEGENOPT(StackArrays, 1, 0) ///< -fstack-arrays (enable the stack-arrays pass) CODEGENOPT(VectorizeLoop, 1, 0) ///< Enable loop vectorization. CODEGENOPT(VectorizeSLP, 1, 0) ///< Enable SLP vectorization. CODEGENOPT(InterchangeLoops, 1, 0) ///< Enable loop interchange. -CODEGENOPT(FuseLoops, 1, 0) ///< Enable loop fusion. +CODEGENOPT(FuseLoops, 1, 0) ///< Enable loop fusion. CODEGENOPT(LoopVersioning, 1, 0) ///< Enable loop versioning. CODEGENOPT(UnrollLoops, 1, 0) ///< Enable loop unrolling CODEGENOPT(AliasAnalysis, 1, 0) ///< Enable alias analysis pass >From 551813b0687c6a39373ff79743dc8a81db163650 Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar <madh...@nvidia.com> Date: Fri, 29 Aug 2025 10:20:22 -0700 Subject: [PATCH 09/10] fixup! add check for EnableFusion flag --- llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index f3e3fb2eafccb..2f69b40e0bc69 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1559,7 +1559,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // FIXME: This may not be the right place in the pipeline. // We need to have the data to support the right place. - if (PTO.LoopFusion) + if (PTO.LoopFusion || EnableLoopFusion) OptimizePM.addPass(LoopFusePass()); // Distribute loops to allow partial vectorization. I.e. isolate dependences >From a007cea14e9affac6f632f811c9eea3a43c9e15e Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar <madh...@nvidia.com> Date: Mon, 1 Sep 2025 01:47:43 -0700 Subject: [PATCH 10/10] fixup! move enable-fusion to NewPMDriver --- llvm/lib/Passes/PassBuilderPipelines.cpp | 7 +------ llvm/tools/opt/NewPMDriver.cpp | 4 ++++ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 2f69b40e0bc69..79642e650ac83 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -205,10 +205,6 @@ static cl::opt<bool> EnableLoopInterchange("enable-loopinterchange", cl::init(false), cl::Hidden, cl::desc("Enable the LoopInterchange Pass")); -static cl::opt<bool> EnableLoopFusion("enable-loopfusion", cl::init(false), - cl::Hidden, - cl::desc("Enable the LoopFuse Pass")); - static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam", cl::init(false), cl::Hidden, cl::desc("Enable Unroll And Jam Pass")); @@ -318,7 +314,6 @@ PipelineTuningOptions::PipelineTuningOptions() { SLPVectorization = false; LoopUnrolling = true; LoopInterchange = EnableLoopInterchange; - LoopFusion = EnableLoopFusion; ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll; LicmMssaOptCap = SetLicmMssaOptCap; LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap; @@ -1559,7 +1554,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // FIXME: This may not be the right place in the pipeline. // We need to have the data to support the right place. - if (PTO.LoopFusion || EnableLoopFusion) + if (PTO.LoopFusion) OptimizePM.addPass(LoopFusePass()); // Distribute loops to allow partial vectorization. I.e. isolate dependences diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp index b9b8929a0f703..0c991b71a6b26 100644 --- a/llvm/tools/opt/NewPMDriver.cpp +++ b/llvm/tools/opt/NewPMDriver.cpp @@ -60,6 +60,9 @@ cl::opt<bool> VerifyEachDebugInfoPreserve( cl::desc("Start each pass with collecting and end it with checking of " "debug info preservation.")); +static cl::opt<bool> EnableLoopFusion("enable-loopfusion", cl::init(false), + cl::Hidden, + cl::desc("Enable the LoopFuse Pass")); cl::opt<std::string> VerifyDIPreserveExport("verify-di-preserve-export", cl::desc("Export debug info preservation failures into " @@ -446,6 +449,7 @@ bool llvm::runPassPipeline( // option has been enabled. PTO.LoopUnrolling = !DisableLoopUnrolling; PTO.UnifiedLTO = UnifiedLTO; + PTO.LoopFusion = EnableLoopFusion; PassBuilder PB(TM, PTO, P, &PIC); registerEPCallbacks(PB); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits