https://github.com/AlexVlx updated https://github.com/llvm/llvm-project/pull/134753
>From a988ecf63dc79d226c2f7aa1430f65d08256888b Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.vo...@amd.com> Date: Tue, 8 Apr 2025 00:20:27 +0100 Subject: [PATCH 1/4] Re-order & adapt `hipstdpar` specific passes. --- clang/lib/CodeGen/BackendUtil.cpp | 8 ++++---- clang/lib/Driver/ToolChains/HIPAMD.cpp | 7 ++++--- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 20 ++++++++++++------- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 7557cb8408921..fa5e12d4033a5 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1115,6 +1115,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (CodeGenOpts.LinkBitcodePostopt) MPM.addPass(LinkInModulesPass(BC)); + if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice && + LangOpts.HIPStdParInterposeAlloc) + MPM.addPass(HipStdParAllocationInterpositionPass()); + // Add a verifier pass if requested. We don't have to do this if the action // requires code generation because there will already be a verifier pass in // the code-generation pipeline. @@ -1178,10 +1182,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline( return; } - if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice && - LangOpts.HIPStdParInterposeAlloc) - MPM.addPass(HipStdParAllocationInterpositionPass()); - // Now that we have all of the passes ready, run them. { PrettyStackTraceString CrashInfo("Optimizer"); diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index abb83701759ce..52e35a01be58d 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -231,10 +231,11 @@ void HIPAMDToolChain::addClangTargetOptions( CC1Args.append({"-fcuda-is-device", "-fno-threadsafe-statics"}); if (!DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, - false)) + false)) { CC1Args.append({"-mllvm", "-amdgpu-internalize-symbols"}); - if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar)) - CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"}); + if (DriverArgs.hasArgNoClaim(options::OPT_hipstdpar)) + CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"}); + } StringRef MaxThreadsPerBlock = DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 4b5c70f09155f..03b1693244879 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -802,17 +802,17 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { #define GET_PASS_REGISTRY "AMDGPUPassRegistry.def" #include "llvm/Passes/TargetPassRegistry.inc" - PB.registerPipelineStartEPCallback( - [](ModulePassManager &PM, OptimizationLevel Level) { - if (EnableHipStdPar) - PM.addPass(HipStdParAcceleratorCodeSelectionPass()); - }); - PB.registerPipelineEarlySimplificationEPCallback( [](ModulePassManager &PM, OptimizationLevel Level, ThinOrFullLTOPhase Phase) { - if (!isLTOPreLink(Phase)) + if (!isLTOPreLink(Phase)) { + // When we are not using -fgpu-rdc, we can run accelerator code + // selection relatively early, but still after linking to prevent + // eager removal of potentially reachable symbols. + if (EnableHipStdPar) + PM.addPass(HipStdParAcceleratorCodeSelectionPass()); PM.addPass(AMDGPUPrintfRuntimeBindingPass()); + } if (Level == OptimizationLevel::O0) return; @@ -883,6 +883,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PB.registerFullLinkTimeOptimizationLastEPCallback( [this](ModulePassManager &PM, OptimizationLevel Level) { + // When we are using -fgpu-rdc, we can onky run accelerator code + // selection after linking to prevent, otherwise we end up removing + // potentially reachable symbols that were exported as external in other + // modules. + if (EnableHipStdPar) + PM.addPass(HipStdParAcceleratorCodeSelectionPass()); // We want to support the -lto-partitions=N option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. >From 5cd1abb217d7fb2dd1f33c94a4f285b9aacd8dde Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.vo...@amd.com> Date: Tue, 8 Apr 2025 00:27:18 +0100 Subject: [PATCH 2/4] Fix formatting. --- clang/lib/CodeGen/BackendUtil.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index fa5e12d4033a5..f7eb853beb23c 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1117,7 +1117,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice && LangOpts.HIPStdParInterposeAlloc) - MPM.addPass(HipStdParAllocationInterpositionPass()); + MPM.addPass(HipStdParAllocationInterpositionPass()); // Add a verifier pass if requested. We don't have to do this if the action // requires code generation because there will already be a verifier pass in >From 2765739128a30b1dc94a8e8d8ab76a6f91c88e6a Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.vo...@amd.com> Date: Tue, 8 Apr 2025 18:54:28 +0100 Subject: [PATCH 3/4] Add tests. --- .../rdc-does-not-enable-hipstdpar.cpp | 11 ++++++++++ .../select-accelerator-code-pass-ordering.cpp | 21 +++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp create mode 100644 clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp diff --git a/clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp b/clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp new file mode 100644 index 0000000000000..c737fc9a42423 --- /dev/null +++ b/clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp @@ -0,0 +1,11 @@ +// Check that if we are compiling with fgpu-rdc amdgpu-enable-hipstdpar is not +// passed to CC1, to avoid eager, per TU, removal of potentially accessible +// functions. + +// RUN: %clang -### --hipstdpar --offload-arch=gfx906 %s -nogpulib -nogpuinc \ +// RUN: 2>&1 | FileCheck -check-prefix=NORDC %s +// NORDC: {{".*clang.*".* "-triple" "amdgcn-amd-amdhsa".* "-mllvm" "-amdgpu-enable-hipstdpar".*}} + +// RUN: %clang -### --hipstdpar --offload-arch=gfx906 %s -nogpulib -nogpuinc -fgpu-rdc \ +// RUN: 2>&1 | FileCheck -check-prefix=RDC %s +// RDC-NOT: {{"-mllvm" "-amdgpu-enable-hipstdpar".*}} \ No newline at end of file diff --git a/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp new file mode 100644 index 0000000000000..81aedc9cbcf03 --- /dev/null +++ b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp @@ -0,0 +1,21 @@ +// Test that the accelerator code selection pass only gets invoked after linking + +// Ensure Pass HipStdParAcceleratorCodeSelectionPass is not invoked in PreLink. +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -flto -emit-llvm-bc -fcuda-is-device -fdebug-pass-manager \ +// RUN: %s -o - 2>&1 | FileCheck --check-prefix=HIPSTDPAR-PRE %s +// HIPSTDPAR-PRE-NOT: Running pass: HipStdParAcceleratorCodeSelectionPass + +// Ensure Pass HipStdParAcceleratorCodeSelectionPass is invoked in PostLink. +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -fcuda-is-device -fdebug-pass-manager -emit-llvm \ +// RUN: %s -o - 2>&1 | FileCheck --check-prefix=HIPSTDPAR-POST %s +// HIPSTDPAR-POST: Running pass: HipStdParAcceleratorCodeSelection + +#define __device__ __attribute__((device)) + +void foo(float *a, float b) { + *a = b; +} + +__device__ void bar(float *a, float b) { + *a = b; +} >From 9df1e62007e40c91227945b24da7c634d801c587 Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.vo...@amd.com> Date: Fri, 11 Apr 2025 23:35:45 +0100 Subject: [PATCH 4/4] Fix typos. --- clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp b/clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp index c737fc9a42423..da1c3943553f0 100644 --- a/clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp +++ b/clang/test/CodeGenHipStdPar/rdc-does-not-enable-hipstdpar.cpp @@ -8,4 +8,4 @@ // RUN: %clang -### --hipstdpar --offload-arch=gfx906 %s -nogpulib -nogpuinc -fgpu-rdc \ // RUN: 2>&1 | FileCheck -check-prefix=RDC %s -// RDC-NOT: {{"-mllvm" "-amdgpu-enable-hipstdpar".*}} \ No newline at end of file +// RDC-NOT: {{"-mllvm" "-amdgpu-enable-hipstdpar".*}} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index dce6e594873aa..5fec974c5f9a7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -886,7 +886,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PB.registerFullLinkTimeOptimizationLastEPCallback( [this](ModulePassManager &PM, OptimizationLevel Level) { - // When we are using -fgpu-rdc, we can onky run accelerator code + // When we are using -fgpu-rdc, we can only run accelerator code // selection after linking to prevent, otherwise we end up removing // potentially reachable symbols that were exported as external in other // modules. _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits