https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/128509
>From e580bbe66a5d65e90051cb0906b87478341696e6 Mon Sep 17 00:00:00 2001 From: pvanhout <pierre.vanhoutr...@amd.com> Date: Mon, 24 Feb 2025 14:21:49 +0100 Subject: [PATCH 1/5] [clang][AMDGPU] Enable module splitting by default The default number of partitions is the number of cores on the machine with a cap at 16, as going above 16 is unlikely to be useful in the common case. Adds a flto-partitions option to override the number of partitions easily (without having to use -Xoffload-linker). Setting it to 1 effectively disables module splitting. Fixes SWDEV-506214 --- clang/include/clang/Driver/Options.td | 2 ++ clang/lib/Driver/ToolChains/AMDGPU.cpp | 36 +++++++++++++++++-- clang/lib/Driver/ToolChains/AMDGPU.h | 2 ++ clang/lib/Driver/ToolChains/HIPAMD.cpp | 2 ++ clang/test/Driver/amdgpu-toolchain.c | 20 +++++++++-- .../hip-toolchain-rdc-flto-partitions.hip | 35 ++++++++++++++++++ .../Driver/hip-toolchain-rdc-static-lib.hip | 2 ++ clang/test/Driver/hip-toolchain-rdc.hip | 1 + 8 files changed, 96 insertions(+), 4 deletions(-) create mode 100644 clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 75b1c51445942..7ff2b04ba2a82 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1393,6 +1393,8 @@ def fhip_emit_relocatable : Flag<["-"], "fhip-emit-relocatable">, HelpText<"Compile HIP source to relocatable">; def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">, HelpText<"Do not override toolchain to compile HIP source to relocatable">; +def flto_partitions_EQ : Joined<["--"], "flto-partitions=">, Group<hip_Group>, + HelpText<"Number of partitions to use for parallel full LTO codegen. Use 1 to disable partitioning.">; } // Clang specific/exclusive options for OpenACC. diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 6a35a2feabc9b..820a335a4b384 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -21,6 +21,7 @@ #include "llvm/Support/LineIterator.h" #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" +#include "llvm/Support/Threading.h" #include "llvm/Support/VirtualFileSystem.h" #include "llvm/TargetParser/Host.h" #include <optional> @@ -630,8 +631,11 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, getToolChain().AddFilePathLibArgs(Args, CmdArgs); AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA); if (C.getDriver().isUsingLTO()) { - addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], - C.getDriver().getLTOMode() == LTOK_Thin); + const bool ThinLTO = (C.getDriver().getLTOMode() == LTOK_Thin); + addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], ThinLTO); + + if (!ThinLTO) + addFullLTOPartitionOption(C.getDriver(), Args, CmdArgs); } else if (Args.hasArg(options::OPT_mcpu_EQ)) { CmdArgs.push_back(Args.MakeArgString( "-plugin-opt=mcpu=" + @@ -708,6 +712,34 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D, options::OPT_m_amdgpu_Features_Group); } +static unsigned GetFullLTOPartitions(const Driver &D, const ArgList &Args) { + const Arg *A = Args.getLastArg(options::OPT_flto_partitions_EQ); + // In the absence of an option, use the number of available threads with a cap + // at 16 partitions. More than 16 partitions rarely benefits code splitting + // and can lead to more empty/small modules each with their own overhead. + if (!A) + return std::max(16u, llvm::hardware_concurrency().compute_thread_count()); + int Value; + if (StringRef(A->getValue()).getAsInteger(10, Value) || (Value < 1)) { + D.Diag(diag::err_drv_invalid_int_value) + << A->getAsString(Args) << A->getValue(); + return 1; + } + + return Value; +} + +void amdgpu::addFullLTOPartitionOption(const Driver &D, + const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) { + // TODO: restrict to gpu-rdc only? + + if (unsigned NumParts = GetFullLTOPartitions(D, Args); NumParts > 1) { + CmdArgs.push_back( + Args.MakeArgString("--lto-partitions=" + std::to_string(NumParts))); + } +} + /// AMDGPU Toolchain AMDGPUToolChain::AMDGPUToolChain(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h index bc941a40445ad..08bd4fa556f78 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.h +++ b/clang/lib/Driver/ToolChains/AMDGPU.h @@ -41,6 +41,8 @@ void getAMDGPUTargetFeatures(const Driver &D, const llvm::Triple &Triple, const llvm::opt::ArgList &Args, std::vector<StringRef> &Features); +void addFullLTOPartitionOption(const Driver &D, const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs); } // end namespace amdgpu } // end namespace tools diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index 271626ed54aed..4f0511b272a98 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -116,6 +116,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA, addLinkerCompressDebugSectionsOption(TC, Args, LldArgs); + amdgpu::addFullLTOPartitionOption(D, Args, LldArgs); + // Given that host and device linking happen in separate processes, the device // linker doesn't always have the visibility as to which device symbols are // needed by a program, especially for the device symbol dependencies that are diff --git a/clang/test/Driver/amdgpu-toolchain.c b/clang/test/Driver/amdgpu-toolchain.c index c1c5aa8e90e68..6617108e59fcf 100644 --- a/clang/test/Driver/amdgpu-toolchain.c +++ b/clang/test/Driver/amdgpu-toolchain.c @@ -19,10 +19,12 @@ // AS_LINK_UR: ld.lld{{.*}} "--no-undefined"{{.*}} "--unresolved-symbols=ignore-all" // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \ -// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefixes=LTO,MCPU %s +// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=LTO %s +// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions" +// LTO: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions={{[0-9]+}}"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack" + // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \ // RUN: -L. -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=MCPU %s -// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions" // MCPU: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack" // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \ @@ -36,3 +38,17 @@ // RUN: %clang -target amdgcn-amd-amdhsa -march=gfx90a -stdlib -startfiles \ // RUN: -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=STARTUP %s // STARTUP: ld.lld{{.*}}"-lc" "-lm" "{{.*}}crt1.o" + +// Check --flto-partitions + +// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \ +// RUN: -L. -flto --flto-partitions=42 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS %s +// LTO_PARTS: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions=42" + +// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \ +// RUN: -L. -flto --flto-partitions=a %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV0 %s +// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a' + +// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \ +// RUN: -L. -flto --flto-partitions=0 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV1 %s +// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0' diff --git a/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip b/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip new file mode 100644 index 0000000000000..e345bd3f5be6b --- /dev/null +++ b/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip @@ -0,0 +1,35 @@ +// RUN: %clang -### --target=x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=42 \ +// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \ +// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefix=FIXED-PARTS + +// FIXED-PARTS-NOT: "*.llvm-link" +// FIXED-PARTS-NOT: ".*opt" +// FIXED-PARTS-NOT: ".*llc" +// FIXED-PARTS: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" +// FIXED-PARTS-SAME: "-plugin-opt=mcpu=gfx803" +// FIXED-PARTS-SAME: "--lto-partitions=42" +// FIXED-PARTS-SAME: "-o" "{{.*out}}" "{{.*bc}}" + +// RUN: not %clang -### --target=x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=a \ +// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \ +// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV0 + +// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a' + +// RUN: not %clang -### --target=x86_64-linux-gnu \ +// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=0 \ +// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \ +// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV1 + +// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0' diff --git a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip index 5276faf31bdc2..6f38a06f7cf31 100644 --- a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip +++ b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip @@ -49,6 +49,7 @@ // CHECK-NOT: ".*llc" // CHECK: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" // CHECK-SAME: "-plugin-opt=mcpu=gfx803" +// CHECK-SAME: "--lto-partitions={{[0-9]+}}" // CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[A_BC1]] [[B_BC1]] // generate image for device side path on gfx900 @@ -77,6 +78,7 @@ // CHECK-NOT: ".*llc" // CHECK: [[LLD]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" // CHECK-SAME: "-plugin-opt=mcpu=gfx900" +// CHECK-SAME: "--lto-partitions={{[0-9]+}}" // CHECK-SAME: "--whole-archive" // CHECK-SAME: "-o" "[[IMG_DEV2:.*out]]" [[A_BC2]] [[B_BC2]] // CHECK-SAME: "--no-whole-archive" diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip index 96da423144c1c..9015702e3211a 100644 --- a/clang/test/Driver/hip-toolchain-rdc.hip +++ b/clang/test/Driver/hip-toolchain-rdc.hip @@ -147,6 +147,7 @@ // CHECK-NOT: ".*llc" // CHECK: {{".*lld.*"}} {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" // CHECK-SAME: "-plugin-opt=mcpu=gfx900" +// CHECK-SAME: "--lto-partitions={{[0-9]+}}" // CHECK-SAME: "-o" "[[IMG_DEV2:.*.out]]" [[A_BC2]] [[B_BC2]] // combine images generated into hip fat binary object >From fb9a6fa14d8499d98f9e2a991e25eb6db76d78d7 Mon Sep 17 00:00:00 2001 From: pvanhout <pierre.vanhoutr...@amd.com> Date: Tue, 25 Feb 2025 08:17:29 +0100 Subject: [PATCH 2/5] use twine --- clang/lib/Driver/ToolChains/AMDGPU.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 820a335a4b384..2482be0eb66fb 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -736,7 +736,7 @@ void amdgpu::addFullLTOPartitionOption(const Driver &D, if (unsigned NumParts = GetFullLTOPartitions(D, Args); NumParts > 1) { CmdArgs.push_back( - Args.MakeArgString("--lto-partitions=" + std::to_string(NumParts))); + Args.MakeArgString("--lto-partitions=" + Twine(NumParts))); } } >From 14f23a5ca8ddefaec17160e03672fd52ee92db7a Mon Sep 17 00:00:00 2001 From: pvanhout <pierre.vanhoutr...@amd.com> Date: Tue, 25 Feb 2025 10:01:44 +0100 Subject: [PATCH 3/5] max -> min --- clang/lib/Driver/ToolChains/AMDGPU.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 2482be0eb66fb..3cc2f7fc99ec2 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -718,7 +718,7 @@ static unsigned GetFullLTOPartitions(const Driver &D, const ArgList &Args) { // at 16 partitions. More than 16 partitions rarely benefits code splitting // and can lead to more empty/small modules each with their own overhead. if (!A) - return std::max(16u, llvm::hardware_concurrency().compute_thread_count()); + return std::min(16u, llvm::hardware_concurrency().compute_thread_count()); int Value; if (StringRef(A->getValue()).getAsInteger(10, Value) || (Value < 1)) { D.Diag(diag::err_drv_invalid_int_value) >From 6db0c7d1ad3362e0512c021e87d1c4318bf7f346 Mon Sep 17 00:00:00 2001 From: pvanhout <pierre.vanhoutr...@amd.com> Date: Fri, 28 Feb 2025 10:24:50 +0100 Subject: [PATCH 4/5] Comments --- clang/lib/Driver/ToolChains/AMDGPU.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 3cc2f7fc99ec2..8831fb1c17634 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -712,14 +712,14 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D, options::OPT_m_amdgpu_Features_Group); } -static unsigned GetFullLTOPartitions(const Driver &D, const ArgList &Args) { +static unsigned getFullLTOPartitions(const Driver &D, const ArgList &Args) { const Arg *A = Args.getLastArg(options::OPT_flto_partitions_EQ); // In the absence of an option, use the number of available threads with a cap // at 16 partitions. More than 16 partitions rarely benefits code splitting // and can lead to more empty/small modules each with their own overhead. if (!A) return std::min(16u, llvm::hardware_concurrency().compute_thread_count()); - int Value; + int Value = 0; if (StringRef(A->getValue()).getAsInteger(10, Value) || (Value < 1)) { D.Diag(diag::err_drv_invalid_int_value) << A->getAsString(Args) << A->getValue(); @@ -732,9 +732,10 @@ static unsigned GetFullLTOPartitions(const Driver &D, const ArgList &Args) { void amdgpu::addFullLTOPartitionOption(const Driver &D, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) { - // TODO: restrict to gpu-rdc only? + // TODO: Should this be restricted to fgpu-rdc only ? Currently we'll + // also do it for non gpu-rdc LTO - if (unsigned NumParts = GetFullLTOPartitions(D, Args); NumParts > 1) { + if (unsigned NumParts = getFullLTOPartitions(D, Args); NumParts > 1) { CmdArgs.push_back( Args.MakeArgString("--lto-partitions=" + Twine(NumParts))); } >From e9997859af1bd5a6be5601f2fa7543f1d9b24986 Mon Sep 17 00:00:00 2001 From: pvanhout <pierre.vanhoutr...@amd.com> Date: Mon, 17 Mar 2025 15:09:01 +0100 Subject: [PATCH 5/5] Use 8 as default value --- clang/lib/Driver/ToolChains/AMDGPU.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 8831fb1c17634..e919f4e941f47 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -21,7 +21,6 @@ #include "llvm/Support/LineIterator.h" #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" -#include "llvm/Support/Threading.h" #include "llvm/Support/VirtualFileSystem.h" #include "llvm/TargetParser/Host.h" #include <optional> @@ -714,11 +713,9 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D, static unsigned getFullLTOPartitions(const Driver &D, const ArgList &Args) { const Arg *A = Args.getLastArg(options::OPT_flto_partitions_EQ); - // In the absence of an option, use the number of available threads with a cap - // at 16 partitions. More than 16 partitions rarely benefits code splitting - // and can lead to more empty/small modules each with their own overhead. + // In the absence of an option, use 8 as the default. if (!A) - return std::min(16u, llvm::hardware_concurrency().compute_thread_count()); + return 8; int Value = 0; if (StringRef(A->getValue()).getAsInteger(10, Value) || (Value < 1)) { D.Diag(diag::err_drv_invalid_int_value) _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits