llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Slava Zakharin (vzakhari) <details> <summary>Changes</summary> Added options: * -f[no-]repack-arrays * -f[no-]stack-repack-arrays * -frepack-arrays-contiguity=whole/innermost --- Patch is 27.25 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134002.diff 11 Files Affected: - (modified) clang/include/clang/Driver/Options.td (+61-1) - (modified) clang/lib/Driver/ToolChains/Flang.cpp (+19-9) - (modified) flang/docs/ArrayRepacking.md (+10-9) - (modified) flang/include/flang/Lower/LoweringOptions.def (+5) - (modified) flang/lib/Frontend/CompilerInvocation.cpp (+14) - (modified) flang/lib/Lower/ConvertVariable.cpp (+1-1) - (added) flang/test/Driver/frepack-arrays-contiguity.f90 (+27) - (added) flang/test/Driver/frepack-arrays.f90 (+24) - (added) flang/test/Driver/fstack-repack-arrays.f90 (+24) - (modified) flang/test/Lower/repack-arrays.f90 (+4-4) - (modified) flang/tools/bbc/bbc.cpp (+9-3) ``````````diff diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 89cb03cc33b98..49a59c0417455 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6819,7 +6819,6 @@ defm real_8_real_10 : BooleanFFlag<"real-8-real-10">, Group<gfortran_Group>; defm real_8_real_16 : BooleanFFlag<"real-8-real-16">, Group<gfortran_Group>; defm real_8_real_4 : BooleanFFlag<"real-8-real-4">, Group<gfortran_Group>; defm recursive : BooleanFFlag<"recursive">, Group<gfortran_Group>; -defm repack_arrays : BooleanFFlag<"repack-arrays">, Group<gfortran_Group>; defm second_underscore : BooleanFFlag<"second-underscore">, Group<gfortran_Group>; defm sign_zero : BooleanFFlag<"sign-zero">, Group<gfortran_Group>; defm whole_file : BooleanFFlag<"whole-file">, Group<gfortran_Group>; @@ -6961,6 +6960,51 @@ defm unsigned : OptInFC1FFlag<"unsigned", "Enables UNSIGNED type">; def fno_automatic : Flag<["-"], "fno-automatic">, Group<f_Group>, HelpText<"Implies the SAVE attribute for non-automatic local objects in subprograms unless RECURSIVE">; +defm repack_arrays + : BoolOptionWithoutMarshalling< + "f", "repack-arrays", PosFlag<SetTrue, [], [], "Pack">, + NegFlag<SetFalse, [], [], "Do not pack">, + BothFlags<[], [], + " non-contiguous assumed shape dummy arrays into " + "contiguous memory">>, + DocBrief<[{Create temporary copies of non-contiguous assumed shape dummy +arrays in subprogram prologues, and destroy them in subprotram epilogues. +The temporary copy is initialized with values from the original array +in the prologue, if needed. In the epilogue, the current values +in the temporary array are copied into the original array, if needed. + +Accessing the contiguous temporary in the program code may result +in faster execution comparing to accessing elements of the original array, +when they are sparse in memory. At the same time, the overhead +of copying values between the original and the temporary arrays +may be significant, which may slow down some programs. + +Enabling array repacking may also change the behavior of certain +programs: +* The copy actions may introduce a data race in valid OpenACC/OpenMP programs. + For example, if different threads execute the same subprogram + with a non-contiguous assumed shape dummy array, and the different threads + access unrelated parts of the array, then the whole array copy + made in each thread will cause a data race. +* OpenACC/OpenMP offload programs may behave incorrectly with regards + to the device data environment, due to the fact that the original + array and the temporary may have different presence status on the device. +* ``IS_CONTIGUOUS`` intrinsic may return ``TRUE`` with the array repacking + enabled, whereas if would return ``FALSE`` with the repacking disabled. +* The result of ``LOC`` intrinsic applied to an actual argument associated + with a non-contiguous assumed shape dummy array, may be different + from the result of ``LOC`` applied to the dummy array.}]>; + +def frepack_arrays_contiguity_EQ + : Joined<["-"], "frepack-arrays-contiguity=">, + Group<f_Group>, + Values<"whole,innermost">, + HelpText< + "When -frepack-arrays is in effect, 'whole' enables " + "repacking for arrays that are non-contiguous in any dimension, " + "'innermost' enables repacking for arrays that are non-contiguous " + "in the innermost dimension (the default)">; + defm save_main_program : BoolOptionWithoutMarshalling<"f", "save-main-program", PosFlag<SetTrue, [], [], "Place all main program variables in static memory (otherwise scalars may be placed on the stack)">, @@ -6974,6 +7018,22 @@ defm loop_versioning : BoolOptionWithoutMarshalling<"f", "version-loops-for-stri PosFlag<SetTrue, [], [ClangOption], "Create unit-strided versions of loops">, NegFlag<SetFalse, [], [ClangOption], "Do not create unit-strided loops (default)">>; +defm stack_repack_arrays + : BoolOptionWithoutMarshalling< + "f", "stack-repack-arrays", + PosFlag<SetTrue, [], [], + "Attempt to allocate array temporaries created under " + "-frepack-arrays on the stack">, + NegFlag< + SetFalse, [], [], + "Allocate -frepack-arrays temporaries on the heap (default)">>, + DocBrief<[{Controls whether the array temporaries created under +**-frepack-arrays** are allocated on the stack or on the heap. + +By default, the heap is used. Allocations of the polymorphic types +are always done on the heap, though this may change in future releases. + }]>; + def fhermetic_module_files : Flag<["-"], "fhermetic-module-files">, Group<f_Group>, HelpText<"Emit hermetic module files (no nested USE association)">; } // let Visibility = [FC1Option, FlangOption] diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index a44513a83a2d7..ef451eb618a05 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -157,15 +157,25 @@ void Flang::addCodegenOptions(const ArgList &Args, if (shouldLoopVersion(Args)) CmdArgs.push_back("-fversion-loops-for-stride"); - Args.addAllArgs(CmdArgs, - {options::OPT_flang_experimental_hlfir, - options::OPT_flang_deprecated_no_hlfir, - options::OPT_fno_ppc_native_vec_elem_order, - options::OPT_fppc_native_vec_elem_order, - options::OPT_finit_global_zero, - options::OPT_fno_init_global_zero, options::OPT_ftime_report, - options::OPT_ftime_report_EQ, options::OPT_funroll_loops, - options::OPT_fno_unroll_loops}); + for (const auto &arg : + Args.getAllArgValues(options::OPT_frepack_arrays_contiguity_EQ)) + if (arg.compare("whole") != 0 && arg.compare("innermost") != 0) { + getToolChain().getDriver().Diag(diag::err_drv_unsupported_option_argument) + << "-frepack-arrays-contiguity=" << arg; + } + + Args.addAllArgs( + CmdArgs, + {options::OPT_flang_experimental_hlfir, + options::OPT_flang_deprecated_no_hlfir, + options::OPT_fno_ppc_native_vec_elem_order, + options::OPT_fppc_native_vec_elem_order, options::OPT_finit_global_zero, + options::OPT_fno_init_global_zero, options::OPT_frepack_arrays, + options::OPT_fno_repack_arrays, + options::OPT_frepack_arrays_contiguity_EQ, + options::OPT_fstack_repack_arrays, options::OPT_fno_stack_repack_arrays, + options::OPT_ftime_report, options::OPT_ftime_report_EQ, + options::OPT_funroll_loops, options::OPT_fno_unroll_loops}); } void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const { diff --git a/flang/docs/ArrayRepacking.md b/flang/docs/ArrayRepacking.md index 87cfc5d1bb4bc..7de599f293e40 100755 --- a/flang/docs/ArrayRepacking.md +++ b/flang/docs/ArrayRepacking.md @@ -39,13 +39,13 @@ Having these results it seems reasonable to provide support for arrays repacking #### Facts and guesses about the implementation -The dynamic checks for continuity and the array copy code is located completely in the [runtime](https://github.com/gcc-mirror/gcc/blob/3e08a4ecea27c54fda90e8f58641b1986ad957e1/libgfortran/generated/in_pack_r8.c#L35), so the compiler inserts unconditional calls in the subprogram prologue/epilogue. +The dynamic checks for contiguity and the array copy code is located completely in the [runtime](https://github.com/gcc-mirror/gcc/blob/3e08a4ecea27c54fda90e8f58641b1986ad957e1/libgfortran/generated/in_pack_r8.c#L35), so the compiler inserts unconditional calls in the subprogram prologue/epilogue. It looks like `gfortran` ignores `intent(out)/intent(in)` which could have helped to avoid some of the `pack/unpack` overhead. It looks like the `pack`/`unpack` actions are inserted early in the compilation pipeline, and these extra calls affect behavior of the later optimization passes. For example, `Polyhedron/fatigue2` slows down by about 2x with `-frepack-arrays`: this slowdown is not caused by the `pack`/`unpack` overhead, but is a consequence of worse function inlining decisions made after the calls insertion. The benchmarks becomes even faster than the original version with `-frepack-arrays` and proper `-finline-limit=` settings, but it does not look like the benchmark contains code that would benefit from the array repacking. -It does not look like `gfortran` is able to eliminate the `pack`/`unpack` code after the function inlining, if the actual argument is statically known to be contiguous. So the overhead from the dynamic continuity checks is inevitable when `-frepack-arrays` is specified. +It does not look like `gfortran` is able to eliminate the `pack`/`unpack` code after the function inlining, if the actual argument is statically known to be contiguous. So the overhead from the dynamic contiguity checks is inevitable when `-frepack-arrays` is specified. It does not look like `gfortran` tries to optimize the insertion of `pack`/`unpack` code. For example, if a dummy array is only used under a condition within the subprogram, the repacking code might be inserted under the same condition to minimize the overhead on the unconditional path through the subprogram. @@ -59,7 +59,7 @@ It does not look like `gfortran` tries to optimize the insertion of `pack`/`unpa #### Facts and guesses about the implementation -The `pack` code is only generated if the actual argument may be non-contiguous in the innermost dimension, as determined statically, i.e. the compiler does not generate any dynamic continuity checks. For example: +The `pack` code is only generated if the actual argument may be non-contiguous in the innermost dimension, as determined statically, i.e. the compiler does not generate any dynamic contiguity checks. For example: ```Fortran interface @@ -132,8 +132,8 @@ So it does not seem practical/reasonable to enable the array repacking by defaul ### Performance 1. Minimize the overhead of array repacking, e.g. avoid copy-in/out whenever possible, execute copy-in/out only on the execution paths where the array is accessed. -2. Provide different modes of repacking depending on the "continuity" meaning, i.e. one - array is contiguous in the innermost dimension, two - array is contiguous in all dimensions. -3. Avoid generating repacking code, when the "continuity" can be statically proven (including after optimization passes like constant propagation, function inlining, etc.). +2. Provide different modes of repacking depending on the "contiguity" meaning, i.e. one - array is contiguous in the innermost dimension, two - array is contiguous in all dimensions. +3. Avoid generating repacking code, when the "contiguity" can be statically proven (including after optimization passes like constant propagation, function inlining, etc.). 4. Use a set of heuristics to avoid generating repacking code based on the array usage pattern, e.g. if an array is proven not to be used in an array expression or a loop, etc. 5. Use a set of heuristics to avoid repacking actions dynamically, e.g. based on the array size, element size, byte stride(s) of the [innermost] dimension(s), etc. 6. Minimize the impact of the IR changes, introduced by repacking, on the later optimization passes. @@ -156,7 +156,7 @@ Controlled by cli options, Lowering will generate a `fir.pack_array` operation i The new operations will hold all the information that customizes further handling of the `pack`/`unpack` actions, such as: * Optional array of attributes supporting an interface to generate a predicate that says if the repacking is safe in the current context. -* The continuity mode: `innermost` vs `whole`. +* The contiguity mode: `innermost` vs `whole`. * Attributes selecting the heuristics (both compiler and runtime ones) that may be applied to avoid `pack`/`unpack` actions. * Other attributes, like `stack` vs `heap` to manage the temporary allocation according to `-fstack-arrays`, etc. @@ -195,7 +195,7 @@ The operation creates a new `!fir.box/class<!fir.array<>>` value to represent ei Arguments: * `stack` - indicates if `-fstack-arrays` is in effect for compiling this function. -* `innermost` - tells that the repacking has to be done iff the array is not contiguous in the innermost dimension. This also describes what type of continuity can be expected from `%new_var`, i.e. `innermost` means that the resulting array is definitely contiguous in the innermost dimension, but may be non-contiguous in other dimensions (unless additional analysis proves otherwise). For 1-D arrays, `innermost` attribute is not valid. +* `innermost` - tells that the repacking has to be done iff the array is not contiguous in the innermost dimension. This also describes what type of contiguity can be expected from `%new_var`, i.e. `innermost` means that the resulting array is definitely contiguous in the innermost dimension, but may be non-contiguous in other dimensions (unless additional analysis proves otherwise). For 1-D arrays, `innermost` attribute is not valid. * `no_copy` - indicates that, in case a temporary array is created, `%var` to `%new_var` copy is not required (`intent(out)` dummy argument case). * `heuristics` * `loop-only` - `fir.pack_array` can be optimized away, if the array is not used in a loop. @@ -351,7 +351,7 @@ The `fir.pack_array`'s copy-in action cannot be skipped for `INTENT(OUT)` dummy #### Optional behavior -In case of the `whole` continuity mode or with 1-D array, Flang can propagate this information to `hlfir.declare` - this may improve optimizations down the road. This can be done iff the repacking has no dynamic constraints and/or heuristics. For example: +In case of the `whole` contiguity mode or with 1-D array, Flang can propagate this information to `hlfir.declare` - this may improve optimizations down the road. This can be done iff the repacking has no dynamic constraints and/or heuristics. For example: ``` %c0 = arith.constant 0 : index @@ -441,10 +441,11 @@ In cases where `fir.pack_array` is statically known to produce a copy that is co The following user options are proposed: * `-frepack-arrays` - the option forces Flang to repack a non-contiguous assumed-shape dummy array into a temporary contiguous memory, which may result in faster accesses of the array. The compiler will insert special code in subprogram prologue to allocate a temporary array and copy the original array into the temporary; in subprogram epilogue, it will insert a copy from the temporary array into the original array and deallocate the temporary. The overhead of the allocation/deallocation and the copies may be significant depending on the array size. The compiler will try to optimize the unnecessary/unprofitable repacking. +* `-fstack-repack-arrays` - attempt allocating the temporary arrays in stack memory. By default, they are allocated in heap memory (note that `-fstack-arrays` does not affect the allocation of the temporaries created for the arrays repacking). * `-frepack-arrays-opts=[none|loop-only]` - the option enables optimizations that may eliminate the array repacking code depending on the array usage pattern: * `none` - no optimizations. * `loop-only` - the array repacking code will be removed in any subprogram where the array is not used inside a loop or an array expression. -* `-frepack-arrays-continuity=[whole|innermost]`: +* `-frepack-arrays-contiguity=[whole|innermost]`: * `whole` - the option will repack arrays that are non-contiguous in any dimension (default). * `innermost` - the option will repack arrays that are non-contiguous in the innermost dimension. * `-frepack-arrays-max-size=<int>` - arrays bigger than the specified size will not be repacked. diff --git a/flang/include/flang/Lower/LoweringOptions.def b/flang/include/flang/Lower/LoweringOptions.def index 6735bea551414..d98823a0e3341 100644 --- a/flang/include/flang/Lower/LoweringOptions.def +++ b/flang/include/flang/Lower/LoweringOptions.def @@ -56,6 +56,11 @@ ENUM_LOWERINGOPT(StackArrays, unsigned, 1, 0) /// packed into contiguous memory. ENUM_LOWERINGOPT(RepackArrays, unsigned, 1, 0) +/// If true, the temporary arrays created under RepackArrays +/// control will be allocated in stack memory. If false, +/// they will be allocated in heap memory. +ENUM_LOWERINGOPT(StackRepackArrays, unsigned, 1, 0) + /// If true, the repacking (RepackArrays option above) /// will be done for arrays non-contiguous in any dimension, /// otherwise, it will be done only for arrays non-contiguous diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 229695b18d278..4c3b63f07b190 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -1448,6 +1448,19 @@ bool CompilerInvocation::createFromArgs( clang::driver::options::OPT_fno_realloc_lhs, true)) invoc.loweringOpts.setReallocateLHS(false); + invoc.loweringOpts.setRepackArrays( + args.hasFlag(clang::driver::options::OPT_frepack_arrays, + clang::driver::options::OPT_fno_repack_arrays, + /*default=*/false)); + invoc.loweringOpts.setStackRepackArrays( + args.hasFlag(clang::driver::options::OPT_fstack_repack_arrays, + clang::driver::options::OPT_fno_stack_repack_arrays, + /*default=*/false)); + if (auto *arg = args.getLastArg( + clang::driver::options::OPT_frepack_arrays_contiguity_EQ)) + invoc.loweringOpts.setRepackArraysWhole(arg->getValue() == + llvm::StringRef{"whole"}); + success &= parseFrontendArgs(invoc.getFrontendOpts(), args, diags); parseTargetArgs(invoc.getTargetOpts(), args); parsePreprocessorArgs(invoc.getPreprocessorOpts(), args); @@ -1687,6 +1700,7 @@ void CompilerInvocation::setLoweringOptions() { const Fortran::common::LangOptions &langOptions = getLangOpts(); loweringOpts.setIntegerWrapAround(langOptions.getSignedOverflowBehavior() == Fortran::common::LangOptions::SOB_Defined); + loweringOpts.setStackArrays(codegenOpts.StackArrays); Fortran::common::MathOptionsBase &mathOpts = loweringOpts.getMathOptions(); // TODO: when LangOptions are finalized, we can represent // the math related options using Fortran::commmon::MathOptionsBase, diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 0b22b743edee9..366ff328bfa27 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -2630,7 +2630,7 @@ Fortran::lower::genPackArray(Fortran::lower::AbstractConverter &converter, }); fir::FirOpBuilder &builder = converter.getFirOpBuilder(); const mlir::Location loc = genLocation(converter, sym); - bool stackAlloc = opts.getStackArrays(); + bool stackAlloc = opts.getStackRepackArrays(); // 1D arrays must always use 'whole' mode. bool isInnermostMode = !opts.getRepackArraysWhole() && sym.Rank() > 1; // Avoid copy-in for 'intent(out)' variable, unless this is a dummy diff --git a/flang/test/Driver/frepack-arrays-contiguity.f90 b/flang/test/Driver/frepack-arrays-contiguity.f90 new file mode 100644 index 0000000000000..d642cdac598af --- /dev/null +++ b/flang/test/Driver/frepack-arrays-contiguity.f90 @@ -0,0 +1,27 @@ +! Test forwarding just the forwarding of -frepack-arrays-contiguity options: +! RUN: %flang -frepack-arrays-contiguity=whole %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=WHOLECMD %s +! RUN: %flang -frepack-arrays-contiguity=innermost %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=INNERMOSTCMD %s +! RUN: %flang -frepack-arrays-contiguity=innermost -frepack-arrays-contiguity=whole %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=WHOLECMD %s +! RUN: %flang -frepack-arrays-contiguity=whole -frepack-arrays-contiguity=innermost %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=INNERMOSTCMD %s + +! Test proper setting of the lowering options: +! RUN: %flang_fc1 -frepack-arrays -frepack-arrays-contiguity=whole %s -emit-hlfir -o - | FileCheck --check-prefix=WHOLE %s +! RUN: %flang_fc1 -frepack-arrays-contiguity=whole %s -emit-hl... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/134002 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits