https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/128509
The default number of partitions is the number of cores on the machine with a cap at 16, as going above 16 is unlikely to be useful in the common case. Adds a flto-partitions option to override the number of partitions easily (without having to use -Xoffload-linker). Setting it to 1 effectively disables module splitting. Fixes SWDEV-506214 >From e58e1b4e324150e913f9c80a467b8f9a349d1bdc Mon Sep 17 00:00:00 2001 From: pvanhout <pierre.vanhoutr...@amd.com> Date: Mon, 24 Feb 2025 14:21:49 +0100 Subject: [PATCH] [clang][AMDGPU] Enable module splitting by default The default number of partitions is the number of cores on the machine with a cap at 16, as going above 16 is unlikely to be useful in the common case. Adds a flto-partitions option to override the number of partitions easily (without having to use -Xoffload-linker). Setting it to 1 effectively disables module splitting. Fixes SWDEV-506214 --- clang/include/clang/Driver/Options.td | 6 ++-- clang/lib/Driver/ToolChains/AMDGPU.cpp | 36 +++++++++++++++++-- clang/lib/Driver/ToolChains/AMDGPU.h | 2 ++ clang/lib/Driver/ToolChains/HIPAMD.cpp | 2 ++ clang/test/Driver/amdgpu-toolchain.c | 20 +++++++++-- .../hip-toolchain-rdc-flto-partitions.hip | 35 ++++++++++++++++++ .../Driver/hip-toolchain-rdc-static-lib.hip | 2 ++ clang/test/Driver/hip-toolchain-rdc.hip | 1 + 8 files changed, 98 insertions(+), 6 deletions(-) create mode 100644 clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index e521cbf678d93..6cb2fd1b755c1 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1393,6 +1393,8 @@ def fhip_emit_relocatable : Flag<["-"], "fhip-emit-relocatable">, HelpText<"Compile HIP source to relocatable">; def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">, HelpText<"Do not override toolchain to compile HIP source to relocatable">; +def flto_partitions_EQ : Joined<["--"], "flto-partitions=">, Group<hip_Group>, + HelpText<"Number of partitions to use for parallel full LTO codegen. Use 1 to disable partitioning.">; } // Clang specific/exclusive options for OpenACC. @@ -3158,7 +3160,7 @@ def modules_reduced_bmi : Flag<["-"], "fmodules-reduced-bmi">, HelpText<"Generate the reduced BMI">, MarshallingInfoFlag<FrontendOpts<"GenReducedBMI">>; -def experimental_modules_reduced_bmi : Flag<["-"], "fexperimental-modules-reduced-bmi">, +def experimental_modules_reduced_bmi : Flag<["-"], "fexperimental-modules-reduced-bmi">, Group<f_Group>, Visibility<[ClangOption, CC1Option]>, Alias<modules_reduced_bmi>; def fmodules_embed_all_files : Joined<["-"], "fmodules-embed-all-files">, @@ -7417,7 +7419,7 @@ def fuse_register_sized_bitfield_access: Flag<["-"], "fuse-register-sized-bitfie def relaxed_aliasing : Flag<["-"], "relaxed-aliasing">, HelpText<"Turn off Type Based Alias Analysis">, MarshallingInfoFlag<CodeGenOpts<"RelaxedAliasing">>; -defm pointer_tbaa: BoolOption<"", "pointer-tbaa", CodeGenOpts<"PointerTBAA">, +defm pointer_tbaa: BoolOption<"", "pointer-tbaa", CodeGenOpts<"PointerTBAA">, DefaultTrue, PosFlag<SetTrue, [], [ClangOption], "Enable">, NegFlag<SetFalse, [], [ClangOption], "Disable">, diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 6a35a2feabc9b..820a335a4b384 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -21,6 +21,7 @@ #include "llvm/Support/LineIterator.h" #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" +#include "llvm/Support/Threading.h" #include "llvm/Support/VirtualFileSystem.h" #include "llvm/TargetParser/Host.h" #include <optional> @@ -630,8 +631,11 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, getToolChain().AddFilePathLibArgs(Args, CmdArgs); AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA); if (C.getDriver().isUsingLTO()) { - addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], - C.getDriver().getLTOMode() == LTOK_Thin); + const bool ThinLTO = (C.getDriver().getLTOMode() == LTOK_Thin); + addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], ThinLTO); + + if (!ThinLTO) + addFullLTOPartitionOption(C.getDriver(), Args, CmdArgs); } else if (Args.hasArg(options::OPT_mcpu_EQ)) { CmdArgs.push_back(Args.MakeArgString( "-plugin-opt=mcpu=" + @@ -708,6 +712,34 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D, options::OPT_m_amdgpu_Features_Group); } +static unsigned GetFullLTOPartitions(const Driver &D, const ArgList &Args) { + const Arg *A = Args.getLastArg(options::OPT_flto_partitions_EQ); + // In the absence of an option, use the number of available threads with a cap + // at 16 partitions. More than 16 partitions rarely benefits code splitting + // and can lead to more empty/small modules each with their own overhead. + if (!A) + return std::max(16u, llvm::hardware_concurrency().compute_thread_count()); + int Value; + if (StringRef(A->getValue()).getAsInteger(10, Value) || (Value < 1)) { + D.Diag(diag::err_drv_invalid_int_value) + << A->getAsString(Args) << A->getValue(); + return 1; + } + + return Value; +} + +void amdgpu::addFullLTOPartitionOption(const Driver &D, + const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) { + // TODO: restrict to gpu-rdc only? + + if (unsigned NumParts = GetFullLTOPartitions(D, Args); NumParts > 1) { + CmdArgs.push_back( + Args.MakeArgString("--lto-partitions=" + std::to_string(NumParts))); + } +} + /// AMDGPU Toolchain AMDGPUToolChain::AMDGPUToolChain(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h index bc941a40445ad..08bd4fa556f78 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.h +++ b/clang/lib/Driver/ToolChains/AMDGPU.h @@ -41,6 +41,8 @@ void getAMDGPUTargetFeatures(const Driver &D, const llvm::Triple &Triple, const llvm::opt::ArgList &Args, std::vector<StringRef> &Features); +void addFullLTOPartitionOption(const Driver &D, const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs); } // end namespace amdgpu } // end namespace tools diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index 271626ed54aed..4f0511b272a98 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -116,6 +116,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA, addLinkerCompressDebugSectionsOption(TC, Args, LldArgs); + amdgpu::addFullLTOPartitionOption(D, Args, LldArgs); + // Given that host and device linking happen in separate processes, the device // linker doesn't always have the visibility as to which device symbols are // needed by a program, especially for the device symbol dependencies that are diff --git a/clang/test/Driver/amdgpu-toolchain.c b/clang/test/Driver/amdgpu-toolchain.c index c1c5aa8e90e68..6617108e59fcf 100644 --- a/clang/test/Driver/amdgpu-toolchain.c +++ b/clang/test/Driver/amdgpu-toolchain.c @@ -19,10 +19,12 @@ // AS_LINK_UR: ld.lld{{.*}} "--no-undefined"{{.*}} "--unresolved-symbols=ignore-all" // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \ -// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefixes=LTO,MCPU %s +// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=LTO %s +// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions" +// LTO: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions={{[0-9]+}}"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack" + // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \ // RUN: -L. -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=MCPU %s -// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions" // MCPU: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack" // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \ @@ -36,3 +38,17 @@ // RUN: %clang -target amdgcn-amd-amdhsa -march=gfx90a -stdlib -startfiles \ // RUN: -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=STARTUP %s // STARTUP: ld.lld{{.*}}"-lc" "-lm" "{{.*}}crt1.o" + +// Check --flto-partitions + +// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \ +// RUN: -L. -flto --flto-partitions=42 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS %s +// LTO_PARTS: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions=42" + +// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \ +// RUN: -L. -flto --flto-partitions=a %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV0 %s +// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a' + +// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \ +// RUN: -L. -flto --flto-partitions=0 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV1 %s +// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0' diff --git a/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip b/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip new file mode 100644 index 0000000000000..e345bd3f5be6b --- /dev/null +++ b/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip @@ -0,0 +1,35 @@ +// RUN: %clang -### --target=x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=42 \ +// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \ +// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefix=FIXED-PARTS + +// FIXED-PARTS-NOT: "*.llvm-link" +// FIXED-PARTS-NOT: ".*opt" +// FIXED-PARTS-NOT: ".*llc" +// FIXED-PARTS: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" +// FIXED-PARTS-SAME: "-plugin-opt=mcpu=gfx803" +// FIXED-PARTS-SAME: "--lto-partitions=42" +// FIXED-PARTS-SAME: "-o" "{{.*out}}" "{{.*bc}}" + +// RUN: not %clang -### --target=x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=a \ +// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \ +// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV0 + +// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a' + +// RUN: not %clang -### --target=x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=0 \ +// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \ +// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV1 + +// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0' diff --git a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip index 5276faf31bdc2..6f38a06f7cf31 100644 --- a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip +++ b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip @@ -49,6 +49,7 @@ // CHECK-NOT: ".*llc" // CHECK: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" // CHECK-SAME: "-plugin-opt=mcpu=gfx803" +// CHECK-SAME: "--lto-partitions={{[0-9]+}}" // CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[A_BC1]] [[B_BC1]] // generate image for device side path on gfx900 @@ -77,6 +78,7 @@ // CHECK-NOT: ".*llc" // CHECK: [[LLD]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" // CHECK-SAME: "-plugin-opt=mcpu=gfx900" +// CHECK-SAME: "--lto-partitions={{[0-9]+}}" // CHECK-SAME: "--whole-archive" // CHECK-SAME: "-o" "[[IMG_DEV2:.*out]]" [[A_BC2]] [[B_BC2]] // CHECK-SAME: "--no-whole-archive" diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip index 96da423144c1c..9015702e3211a 100644 --- a/clang/test/Driver/hip-toolchain-rdc.hip +++ b/clang/test/Driver/hip-toolchain-rdc.hip @@ -147,6 +147,7 @@ // CHECK-NOT: ".*llc" // CHECK: {{".*lld.*"}} {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" // CHECK-SAME: "-plugin-opt=mcpu=gfx900" +// CHECK-SAME: "--lto-partitions={{[0-9]+}}" // CHECK-SAME: "-o" "[[IMG_DEV2:.*.out]]" [[A_BC2]] [[B_BC2]] // combine images generated into hip fat binary object _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits