Author: Pierre van Houtryve Date: 2025-03-24T14:43:08+01:00 New Revision: ed022d93b2fbfe52b7bdee786aa5cc49fa2323c4
URL: https://github.com/llvm/llvm-project/commit/ed022d93b2fbfe52b7bdee786aa5cc49fa2323c4 DIFF: https://github.com/llvm/llvm-project/commit/ed022d93b2fbfe52b7bdee786aa5cc49fa2323c4.diff LOG: [clang][AMDGPU] Enable module splitting by default (#128509) The default number of partitions is the number of cores on the machine with a cap at 16, as going above 16 is unlikely to be useful in the common case. Adds a flto-partitions option to override the number of partitions easily (without having to use -Xoffload-linker). Setting it to 1 effectively disables module splitting. Fixes SWDEV-506214 Added: clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip Modified: clang/include/clang/Driver/Options.td clang/lib/Driver/ToolChains/AMDGPU.cpp clang/lib/Driver/ToolChains/AMDGPU.h clang/lib/Driver/ToolChains/HIPAMD.cpp clang/test/Driver/amdgpu-toolchain.c clang/test/Driver/hip-toolchain-rdc-static-lib.hip clang/test/Driver/hip-toolchain-rdc.hip Removed: ################################################################################ diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index fbd5cf632c350..59a57c83c6b89 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1392,6 +1392,8 @@ def fhip_emit_relocatable : Flag<["-"], "fhip-emit-relocatable">, HelpText<"Compile HIP source to relocatable">; def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">, HelpText<"Do not override toolchain to compile HIP source to relocatable">; +def flto_partitions_EQ : Joined<["--"], "flto-partitions=">, Group<hip_Group>, + HelpText<"Number of partitions to use for parallel full LTO codegen. Use 1 to disable partitioning.">; } // Clang specific/exclusive options for OpenACC. diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 6a35a2feabc9b..e919f4e941f47 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -630,8 +630,11 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, getToolChain().AddFilePathLibArgs(Args, CmdArgs); AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA); if (C.getDriver().isUsingLTO()) { - addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], - C.getDriver().getLTOMode() == LTOK_Thin); + const bool ThinLTO = (C.getDriver().getLTOMode() == LTOK_Thin); + addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], ThinLTO); + + if (!ThinLTO) + addFullLTOPartitionOption(C.getDriver(), Args, CmdArgs); } else if (Args.hasArg(options::OPT_mcpu_EQ)) { CmdArgs.push_back(Args.MakeArgString( "-plugin-opt=mcpu=" + @@ -708,6 +711,33 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D, options::OPT_m_amdgpu_Features_Group); } +static unsigned getFullLTOPartitions(const Driver &D, const ArgList &Args) { + const Arg *A = Args.getLastArg(options::OPT_flto_partitions_EQ); + // In the absence of an option, use 8 as the default. + if (!A) + return 8; + int Value = 0; + if (StringRef(A->getValue()).getAsInteger(10, Value) || (Value < 1)) { + D.Diag(diag::err_drv_invalid_int_value) + << A->getAsString(Args) << A->getValue(); + return 1; + } + + return Value; +} + +void amdgpu::addFullLTOPartitionOption(const Driver &D, + const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) { + // TODO: Should this be restricted to fgpu-rdc only ? Currently we'll + // also do it for non gpu-rdc LTO + + if (unsigned NumParts = getFullLTOPartitions(D, Args); NumParts > 1) { + CmdArgs.push_back( + Args.MakeArgString("--lto-partitions=" + Twine(NumParts))); + } +} + /// AMDGPU Toolchain AMDGPUToolChain::AMDGPUToolChain(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h index bc941a40445ad..08bd4fa556f78 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.h +++ b/clang/lib/Driver/ToolChains/AMDGPU.h @@ -41,6 +41,8 @@ void getAMDGPUTargetFeatures(const Driver &D, const llvm::Triple &Triple, const llvm::opt::ArgList &Args, std::vector<StringRef> &Features); +void addFullLTOPartitionOption(const Driver &D, const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs); } // end namespace amdgpu } // end namespace tools diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index 55a8f2ca87de0..dc3300b00f9ff 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -116,6 +116,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA, addLinkerCompressDebugSectionsOption(TC, Args, LldArgs); + amdgpu::addFullLTOPartitionOption(D, Args, LldArgs); + // Given that host and device linking happen in separate processes, the device // linker doesn't always have the visibility as to which device symbols are // needed by a program, especially for the device symbol dependencies that are diff --git a/clang/test/Driver/amdgpu-toolchain.c b/clang/test/Driver/amdgpu-toolchain.c index c1c5aa8e90e68..6617108e59fcf 100644 --- a/clang/test/Driver/amdgpu-toolchain.c +++ b/clang/test/Driver/amdgpu-toolchain.c @@ -19,10 +19,12 @@ // AS_LINK_UR: ld.lld{{.*}} "--no-undefined"{{.*}} "--unresolved-symbols=ignore-all" // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \ -// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefixes=LTO,MCPU %s +// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=LTO %s +// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions" +// LTO: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions={{[0-9]+}}"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack" + // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \ // RUN: -L. -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=MCPU %s -// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions" // MCPU: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack" // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \ @@ -36,3 +38,17 @@ // RUN: %clang -target amdgcn-amd-amdhsa -march=gfx90a -stdlib -startfiles \ // RUN: -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=STARTUP %s // STARTUP: ld.lld{{.*}}"-lc" "-lm" "{{.*}}crt1.o" + +// Check --flto-partitions + +// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \ +// RUN: -L. -flto --flto-partitions=42 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS %s +// LTO_PARTS: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions=42" + +// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \ +// RUN: -L. -flto --flto-partitions=a %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV0 %s +// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a' + +// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \ +// RUN: -L. -flto --flto-partitions=0 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV1 %s +// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0' diff --git a/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip b/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip new file mode 100644 index 0000000000000..e345bd3f5be6b --- /dev/null +++ b/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip @@ -0,0 +1,35 @@ +// RUN: %clang -### --target=x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=42 \ +// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \ +// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefix=FIXED-PARTS + +// FIXED-PARTS-NOT: "*.llvm-link" +// FIXED-PARTS-NOT: ".*opt" +// FIXED-PARTS-NOT: ".*llc" +// FIXED-PARTS: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" +// FIXED-PARTS-SAME: "-plugin-opt=mcpu=gfx803" +// FIXED-PARTS-SAME: "--lto-partitions=42" +// FIXED-PARTS-SAME: "-o" "{{.*out}}" "{{.*bc}}" + +// RUN: not %clang -### --target=x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=a \ +// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \ +// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV0 + +// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a' + +// RUN: not %clang -### --target=x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=0 \ +// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \ +// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV1 + +// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0' diff --git a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip index 5276faf31bdc2..6f38a06f7cf31 100644 --- a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip +++ b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip @@ -49,6 +49,7 @@ // CHECK-NOT: ".*llc" // CHECK: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" // CHECK-SAME: "-plugin-opt=mcpu=gfx803" +// CHECK-SAME: "--lto-partitions={{[0-9]+}}" // CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[A_BC1]] [[B_BC1]] // generate image for device side path on gfx900 @@ -77,6 +78,7 @@ // CHECK-NOT: ".*llc" // CHECK: [[LLD]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" // CHECK-SAME: "-plugin-opt=mcpu=gfx900" +// CHECK-SAME: "--lto-partitions={{[0-9]+}}" // CHECK-SAME: "--whole-archive" // CHECK-SAME: "-o" "[[IMG_DEV2:.*out]]" [[A_BC2]] [[B_BC2]] // CHECK-SAME: "--no-whole-archive" diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip index 96da423144c1c..9015702e3211a 100644 --- a/clang/test/Driver/hip-toolchain-rdc.hip +++ b/clang/test/Driver/hip-toolchain-rdc.hip @@ -147,6 +147,7 @@ // CHECK-NOT: ".*llc" // CHECK: {{".*lld.*"}} {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" // CHECK-SAME: "-plugin-opt=mcpu=gfx900" +// CHECK-SAME: "--lto-partitions={{[0-9]+}}" // CHECK-SAME: "-o" "[[IMG_DEV2:.*.out]]" [[A_BC2]] [[B_BC2]] // combine images generated into hip fat binary object _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits