https://github.com/yxsamliu updated https://github.com/llvm/llvm-project/pull/143964
>From 5f3cc287ff2c3c84ba840ebaa0931773341a02b2 Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" <yaxun....@amd.com> Date: Wed, 11 Jun 2025 14:02:59 -0400 Subject: [PATCH 1/3] Revert "Revert "[HIP] use offload wrapper for non-device-only non-rdc (#132869)" (#143432)" This reverts commit f5e499a3383c1e3b9f60e60151075e8d9c1c3166. --- clang/lib/CodeGen/CGCUDANV.cpp | 3 +- clang/lib/Driver/Driver.cpp | 59 +++++++++++----- clang/lib/Driver/ToolChains/Clang.cpp | 18 ++++- clang/test/Driver/hip-binding.hip | 6 +- clang/test/Driver/hip-phases.hip | 51 ++++++++------ clang/test/Driver/hip-toolchain-no-rdc.hip | 81 +++++++++++++--------- 6 files changed, 142 insertions(+), 76 deletions(-) diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 38f514304df5e..dd26be74e561b 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -1280,7 +1280,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() { return nullptr; } if (CGM.getLangOpts().OffloadViaLLVM || - (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)) + (CGM.getLangOpts().OffloadingNewDriver && + (CGM.getLangOpts().HIP || RelocatableDeviceCode))) createOffloadingEntries(); else return makeModuleCtorFunction(); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index eb60d907d2218..98bc4b91a0bcd 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4423,6 +4423,10 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, options::OPT_no_offload_new_driver, C.isOffloadingHostKind(Action::OFK_Cuda)); + bool HIPNoRDC = + C.isOffloadingHostKind(Action::OFK_HIP) && + !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false); + // Builder to be used to build offloading actions. 
std::unique_ptr<OffloadingActionBuilder> OffloadBuilder = !UseNewOffloadingDriver @@ -4556,7 +4560,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, // Check if this Linker Job should emit a static library. if (ShouldEmitStaticLibrary(Args)) { LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image); - } else if (UseNewOffloadingDriver || + } else if ((UseNewOffloadingDriver && !HIPNoRDC) || Args.hasArg(options::OPT_offload_link)) { LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image); LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(), @@ -4867,10 +4871,28 @@ Action *Driver::BuildOffloadingActions(Compilation &C, const InputTy &Input, StringRef CUID, Action *HostAction) const { // Don't build offloading actions if explicitly disabled or we do not have a - // valid source input and compile action to embed it in. If preprocessing only - // ignore embedding. - if (offloadHostOnly() || !types::isSrcFile(Input.first) || - !(isa<CompileJobAction>(HostAction) || + // valid source input. + if (offloadHostOnly() || !types::isSrcFile(Input.first)) + return HostAction; + + bool HIPNoRDC = + C.isOffloadingHostKind(Action::OFK_HIP) && + !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false); + + // For HIP non-rdc non-device-only compilation, create a linker wrapper + // action for each host object to link, bundle and wrap device files in + // it. + if (isa<AssembleJobAction>(HostAction) && HIPNoRDC && !offloadDeviceOnly()) { + ActionList AL{HostAction}; + HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object); + HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(), + /*BoundArch=*/nullptr); + return HostAction; + } + + // Don't build offloading actions if we do not have a compile action. If + // preprocessing only ignore embedding. 
+ if (!(isa<CompileJobAction>(HostAction) || getFinalPhase(Args) == phases::Preprocess)) return HostAction; @@ -4966,12 +4988,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C, } } - // Compiling HIP in non-RDC mode requires linking each action individually. + // Compiling HIP in device-only non-RDC mode requires linking each action + // individually. for (Action *&A : DeviceActions) { if ((A->getType() != types::TY_Object && A->getType() != types::TY_LTO_BC) || - Kind != Action::OFK_HIP || - Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)) + !HIPNoRDC || !offloadDeviceOnly()) continue; ActionList LinkerInput = {A}; A = C.MakeAction<LinkJobAction>(LinkerInput, types::TY_Image); @@ -4995,12 +5017,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C, } } - // HIP code in non-RDC mode will bundle the output if it invoked the linker. + // HIP code in device-only non-RDC mode will bundle the output if it invoked + // the linker. bool ShouldBundleHIP = - C.isOffloadingHostKind(Action::OFK_HIP) && + HIPNoRDC && offloadDeviceOnly() && Args.hasFlag(options::OPT_gpu_bundle_output, options::OPT_no_gpu_bundle_output, true) && - !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) && !llvm::any_of(OffloadActions, [](Action *A) { return A->getType() != types::TY_Image; }); @@ -5020,11 +5042,9 @@ Action *Driver::BuildOffloadingActions(Compilation &C, C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN); DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(), nullptr, Action::OFK_Cuda); - } else if (C.isOffloadingHostKind(Action::OFK_HIP) && - !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, - false)) { - // If we are not in RDC-mode we just emit the final HIP fatbinary for each - // translation unit, linking each input individually. 
+ } else if (HIPNoRDC && offloadDeviceOnly()) { + // If we are in device-only non-RDC-mode we just emit the final HIP + // fatbinary for each translation unit, linking each input individually. Action *FatbinAction = C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN); DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_HIP>(), @@ -5177,8 +5197,11 @@ Action *Driver::ConstructPhaseAction( (((Input->getOffloadingToolChain() && Input->getOffloadingToolChain()->getTriple().isAMDGPU()) || TargetDeviceOffloadKind == Action::OFK_HIP) && - (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, - false) || + ((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, + false) || + (Args.hasFlag(options::OPT_offload_new_driver, + options::OPT_no_offload_new_driver, false) && + !offloadDeviceOnly())) || TargetDeviceOffloadKind == Action::OFK_OpenMP))) { types::ID Output = Args.hasArg(options::OPT_S) && diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 1d11be1d82be8..d6b89c324936f 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7702,7 +7702,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-fcuda-include-gpubinary"); CmdArgs.push_back(CudaDeviceInput->getFilename()); } else if (!HostOffloadingInputs.empty()) { - if ((IsCuda || IsHIP) && !IsRDCMode) { + if (IsCuda && !IsRDCMode) { assert(HostOffloadingInputs.size() == 1 && "Only one input expected"); CmdArgs.push_back("-fcuda-include-gpubinary"); CmdArgs.push_back(HostOffloadingInputs.front().getFilename()); @@ -9249,8 +9249,20 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, // Add the linker arguments to be forwarded by the wrapper. 
CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") + LinkCommand->getExecutable())); - for (const char *LinkArg : LinkCommand->getArguments()) - CmdArgs.push_back(LinkArg); + + // We use action type to differentiate two use cases of the linker wrapper. + // TY_Image for normal linker wrapper work. + // TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable + // object. + assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image); + if (JA.getType() == types::TY_Object) { + CmdArgs.append({"-o", Output.getFilename()}); + for (auto Input : Inputs) + CmdArgs.push_back(Input.getFilename()); + CmdArgs.push_back("-r"); + } else + for (const char *LinkArg : LinkCommand->getArguments()) + CmdArgs.push_back(LinkArg); addOffloadCompressArgs(Args, CmdArgs); diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip index 57e57194ec87b..d8b3f1e242018 100644 --- a/clang/test/Driver/hip-binding.hip +++ b/clang/test/Driver/hip-binding.hip @@ -93,7 +93,7 @@ // RUN: -nogpulib -nogpuinc -foffload-lto --offload-arch=gfx90a --offload-arch=gfx908 -c %s 2>&1 \ // RUN: | FileCheck -check-prefix=LTO-NO-RDC %s // LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]" -// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_908]]"], output: "[[OBJ_908:.+]]" // LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]" -// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_90A]]"], output: "[[OBJ_90A:.+]]" -// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ_908]]", "[[OBJ_90A]]"], output: "[[HIPFB:.+]]" +// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]" +// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]" +// LTO-NO-RDC-NEXT: # 
"x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o" diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip index 5fd2c0216ccc3..996d72e58755a 100644 --- a/clang/test/Driver/hip-phases.hip +++ b/clang/test/Driver/hip-phases.hip @@ -8,39 +8,50 @@ // // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \ // RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=BIN,NRD,OLD %s +// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDN %s // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \ // RUN: --offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=BIN,NRD,NEW %s +// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWN %s // // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \ // RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=BIN,RDC %s +// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDR %s +// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \ +// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWR %s // // BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]]) // BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]]) // BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]]) -// RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]]) -// RDC-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]]) +// OLDR-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]]) +// OLDR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]]) // BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH:gfx803]]) // BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]]) // BIN-DAG: 
[[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]]) -// NRD-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]]) -// NRD-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]]) -// RDC-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]]) -// BIN-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]]) -// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image -// NRD-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]]) -// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]]) - -// NRD-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir -// RDC-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object -// NRD-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]]) -// NRD-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]]) -// OLD-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]]) -// NEW-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]]) -// RDC-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]]) +// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]]) +// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]]) +// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]]) +// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]]) +// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]]) +// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image +// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir +// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]]) +// NEW-DAG: [[P10:[0-9]+]]: clang-offload-packager, {[[P9]]}, image, (device-[[T]]) +// OLDR-DAG: 
[[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]]) + +// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir +// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir +// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object +// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]]) +// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]]) +// NEW-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]]) +// NEW-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]]) +// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]]) +// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]]) +// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]]) +// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]]) +// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image // // Test single gpu architecture up to the assemble phase. 
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip index 6c69d1d51a260..ddd251b67cc57 100644 --- a/clang/test/Driver/hip-toolchain-no-rdc.hip +++ b/clang/test/Driver/hip-toolchain-no-rdc.hip @@ -7,7 +7,7 @@ // RUN: -fuse-ld=lld -B%S/Inputs/lld -nogpuinc \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ -// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK %s +// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK,OLD %s // RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ @@ -17,7 +17,7 @@ // RUN: -fuse-ld=lld -B%S/Inputs/lld -nogpuinc -c \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ -// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s +// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,OLD %s // RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ @@ -27,7 +27,7 @@ // RUN: -fuse-ld=lld -B%S/Inputs/lld -nogpuinc --offload-new-driver -c \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ -// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s +// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,NEW %s // RUN: touch %t/a.o %t/b.o // RUN: %clang -### --target=x86_64-linux-gnu \ @@ -47,22 +47,23 @@ // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" -// CHECK-SAME: "-emit-obj" +// OLD-SAME: "-emit-obj" +// NEW-SAME: "-emit-llvm-bc" // CHECK-SAME: {{.*}} "-main-file-name" "a.cu" // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols" // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden" // CHECK-SAME: "-fapply-global-visibility-to-externs" // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc" // CHECK-SAME: "-target-cpu" "gfx803" -// CHECK-SAME: {{.*}} 
"-o" [[OBJ_DEV_A_803:".*o"]] "-x" "hip" +// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_803:.*(o|bc)]]" "-x" "hip" // CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]] // CHECK-NOT: {{".*llvm-link"}} // CHECK-NOT: {{".*opt"}} // CHECK-NOT: {{".*llc"}} -// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared" -// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]] +// OLD: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared" +// OLD-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" "[[OBJ_DEV_A_803]]" // // Compile device code in a.cu to code object for gfx900. @@ -70,62 +71,71 @@ // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" -// CHECK-SAME: "-emit-obj" +// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}" // CHECK-SAME: {{.*}} "-main-file-name" "a.cu" // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols" // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden" // CHECK-SAME: "-fapply-global-visibility-to-externs" // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc" // CHECK-SAME: "-target-cpu" "gfx900" -// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_900:".*o"]] "-x" "hip" +// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_900:.*(o|bc)]]" "-x" "hip" // CHECK-SAME: {{.*}} [[A_SRC]] // CHECK-NOT: {{".*llvm-link"}} // CHECK-NOT: {{".*opt"}} // CHECK-NOT: {{".*llc"}} -// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared" -// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]] +// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared" +// OLD-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" "[[OBJ_DEV_A_900]]" // // Bundle and embed device code in host object for a.cu. 
// -// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" -// CHECK-SAME: "-bundle-align=4096" -// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900" -// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]" +// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" +// OLD-SAME: "-bundle-align=4096" +// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900" +// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]" + +// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_A:.*.out]]" +// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip" +// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip" // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" // CHECK-SAME: "-emit-obj" // CHECK-SAME: {{.*}} "-main-file-name" "a.cu" -// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]" -// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip" +// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]" +// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]" +// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip" +// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip" // CHECK-SAME: {{.*}} [[A_SRC]] +// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu" +// NEW: "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r" + // // Compile device code in b.hip to code object for gfx803. 
// // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" -// CHECK-SAME: "-emit-obj" +// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}" // CHECK-SAME: {{.*}} "-main-file-name" "b.hip" // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols" // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden" // CHECK-SAME: "-fapply-global-visibility-to-externs" // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc" // CHECK-SAME: "-target-cpu" "gfx803" -// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_803:".*o"]] "-x" "hip" +// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_803:.*(o|bc)]]" "-x" "hip" // CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]] // CHECK-NOT: {{".*llvm-link"}} // CHECK-NOT: {{".*opt"}} // CHECK-NOT: {{".*llc"}} -// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared" -// CHECK-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" [[OBJ_DEV_B_803]] +// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared" +// OLD-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" "[[OBJ_DEV_B_803]]" // // Compile device code in b.hip to code object for gfx900. 
@@ -133,40 +143,49 @@ // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" -// CHECK-SAME: "-emit-obj" +// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}" // CHECK-SAME: {{.*}} "-main-file-name" "b.hip" // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols" // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden" // CHECK-SAME: "-fapply-global-visibility-to-externs" // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc" // CHECK-SAME: "-target-cpu" "gfx900" -// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_900:".*o"]] "-x" "hip" +// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_900:.*(o|bc)]]" "-x" "hip" // CHECK-SAME: {{.*}} [[B_SRC]] // CHECK-NOT: {{".*llvm-link"}} // CHECK-NOT: {{".*opt"}} // CHECK-NOT: {{".*llc"}} -// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared" -// CHECK-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" [[OBJ_DEV_B_900]] +// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared" +// OLD-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" "[[OBJ_DEV_B_900]]" // // Bundle and embed device code in host object for b.hip. 
// -// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" -// CHECK-SAME: "-bundle-align=4096" -// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900" -// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_A:.*hipfb]]" +// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" +// OLD-SAME: "-bundle-align=4096" +// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900" +// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_B:.*hipfb]]" + +// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_B:.*.out]]" +// NEW-SAME: "--image=file=[[OBJ_DEV_B_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip" +// NEW-SAME: "--image=file=[[OBJ_DEV_B_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip" // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" // CHECK-SAME: "-emit-obj" // CHECK-SAME: {{.*}} "-main-file-name" "b.hip" -// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]" -// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip" +// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_B]]" +// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_B]]" +// OLD-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip" +// NEW-SAME: {{.*}} "-o" [[B_OBJ_HOST_TMP:".*o"]] "-x" "hip" // CHECK-SAME: {{.*}} [[B_SRC]] +// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu" +// NEW: "--linker-path={{.*}}" "-o" [[B_OBJ_HOST:".*o"]] [[B_OBJ_HOST_TMP]] "-r" + // // Link host objects. // >From 501b77e186ee9518b0ee6bddbda3ff5fd6318945 Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" <yaxun....@amd.com> Date: Wed, 11 Jun 2025 17:37:29 -0400 Subject: [PATCH 2/3] Fix assertion with -flto Missing offload linker wrapper job action to wrap device binary for -flto. 
--- clang/lib/Driver/Driver.cpp | 5 ++++- clang/test/Driver/hip-phases.hip | 11 +++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 98bc4b91a0bcd..060f76fb653c9 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4882,7 +4882,10 @@ Action *Driver::BuildOffloadingActions(Compilation &C, // For HIP non-rdc non-device-only compilation, create a linker wrapper // action for each host object to link, bundle and wrap device files in // it. - if (isa<AssembleJobAction>(HostAction) && HIPNoRDC && !offloadDeviceOnly()) { + if ((isa<AssembleJobAction>(HostAction) || + (isa<BackendJobAction>(HostAction) && + HostAction->getType() == types::TY_LTO_BC)) && + HIPNoRDC && !offloadDeviceOnly()) { ActionList AL{HostAction}; HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object); HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(), diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip index 996d72e58755a..d8a58b78d6d5c 100644 --- a/clang/test/Driver/hip-phases.hip +++ b/clang/test/Driver/hip-phases.hip @@ -12,6 +12,9 @@ // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \ // RUN: --offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \ // RUN: | FileCheck -check-prefixes=BIN,NEW,NEWN %s +// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \ +// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -flto -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWLTO %s // // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \ // RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \ @@ -45,10 +48,14 @@ // OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object // OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]]) // OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]]) -// 
NEW-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]]) -// NEW-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]]) +// NEWN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]]) +// NEWN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]]) +// NEWLTO-DAG: [[P13:[0-9]+]]: backend, {[[P11]]}, lto-bc, (host-hip) +// NEWR-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]]) +// NEWR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]]) // OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]]) // NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]]) +// NEWLTO-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]]) // OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]]) // NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]]) // NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image >From 7b488bb9e44bc8541ea32fc2f8aa12962fb3babf Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" <yaxun....@amd.com> Date: Thu, 12 Jun 2025 15:53:34 -0400 Subject: [PATCH 3/3] Fix issue about multiple source files HIP now uses llvm_offload_entries as section name.
--- clang/test/Driver/linker-wrapper.c | 1 + .../ClangLinkerWrapper.cpp | 31 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c index a7e98e7351d98..80b1a5745a123 100644 --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -223,6 +223,7 @@ __attribute__((visibility("protected"), used)) int x; // RELOCATABLE-LINK-HIP: clang-offload-bundler{{.*}} -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx90a -input={{/dev/null|NUL}} -input={{.*}} -output={{.*}} // RELOCATABLE-LINK-HIP: /usr/bin/ld.lld{{.*}}-r // RELOCATABLE-LINK-HIP: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading +// RELOCATABLE-LINK-HIP: --rename-section llvm_offload_entries // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_89 \ diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 95b6f74af1f13..b8019fac4c2ec 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -310,22 +310,21 @@ Error relocateOffloadSection(const ArgList &Args, StringRef Output) { // Remove the old .llvm.offloading section to prevent further linking. ObjcopyArgs.emplace_back("--remove-section"); ObjcopyArgs.emplace_back(".llvm.offloading"); - for (StringRef Prefix : {"omp", "cuda", "hip"}) { - auto Section = (Prefix + "_offloading_entries").str(); - // Rename the offloading entires to make them private to this link unit. - ObjcopyArgs.emplace_back("--rename-section"); - ObjcopyArgs.emplace_back( - Args.MakeArgString(Section + "=" + Section + Suffix)); - - // Rename the __start_ / __stop_ symbols appropriately to iterate over the - // newly renamed section containing the offloading entries. 
- ObjcopyArgs.emplace_back("--redefine-sym"); - ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" + - "__start_" + Section + Suffix)); - ObjcopyArgs.emplace_back("--redefine-sym"); - ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" + - "__stop_" + Section + Suffix)); - } + StringRef Prefix = "llvm"; + auto Section = (Prefix + "_offload_entries").str(); + // Rename the offloading entries to make them private to this link unit. + ObjcopyArgs.emplace_back("--rename-section"); + ObjcopyArgs.emplace_back( + Args.MakeArgString(Section + "=" + Section + Suffix)); + + // Rename the __start_ / __stop_ symbols appropriately to iterate over the + // newly renamed section containing the offloading entries. + ObjcopyArgs.emplace_back("--redefine-sym"); + ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" + + "__start_" + Section + Suffix)); + ObjcopyArgs.emplace_back("--redefine-sym"); + ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" + + "__stop_" + Section + Suffix)); if (Error Err = executeCommands(*ObjcopyPath, ObjcopyArgs)) return Err; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits