https://github.com/AlexVlx updated https://github.com/llvm/llvm-project/pull/134016
>From 91eeaf02336e539f14dcb0a79ff15dbe8befe6f1 Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.vo...@amd.com> Date: Wed, 2 Apr 2025 02:47:42 +0100 Subject: [PATCH 1/5] Add the functional identity and feature queries. --- clang/docs/LanguageExtensions.rst | 110 ++++++ clang/include/clang/Basic/BuiltinsAMDGPU.def | 5 + .../clang/Basic/DiagnosticSemaKinds.td | 10 + clang/lib/Basic/Targets/SPIR.cpp | 4 + clang/lib/Basic/Targets/SPIR.h | 4 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 29 ++ clang/lib/Sema/SemaExpr.cpp | 157 ++++++++ clang/test/CodeGen/amdgpu-builtin-cpu-is.c | 65 ++++ .../CodeGen/amdgpu-builtin-is-invocable.c | 64 ++++ .../amdgpu-feature-builtins-invalid-use.cpp | 43 +++ llvm/lib/Target/AMDGPU/AMDGPU.h | 9 + .../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 207 ++++++++++ llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +- llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + ...pu-expand-feature-predicates-unfoldable.ll | 28 ++ .../amdgpu-expand-feature-predicates.ll | 359 ++++++++++++++++++ 17 files changed, 1099 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGen/amdgpu-builtin-cpu-is.c create mode 100644 clang/test/CodeGen/amdgpu-builtin-is-invocable.c create mode 100644 clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 3b8a9cac6587a..8a7cb75af13e5 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -4920,6 +4920,116 @@ If no address spaces names are provided, all address spaces are fenced. 
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local") __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global") +__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``__builtin_amdgcn_processor_is`` and ``__builtin_amdgcn_is_invocable`` provide +a functional mechanism for programatically querying: + +* the identity of the current target processor; +* the capability of the current target processor to invoke a particular builtin. + +**Syntax**: + +.. code-block:: c + + // When used as the predicate for a control structure + bool __builtin_amdgcn_processor_is(const char*); + bool __builtin_amdgcn_is_invocable(builtin_name); + // Otherwise + void __builtin_amdgcn_processor_is(const char*); + void __builtin_amdgcn_is_invocable(void); + +**Example of use**: + +.. code-block:: c++ + + if (__builtin_amdgcn_processor_is("gfx1201") || + __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) + __builtin_amdgcn_s_sleep_var(x); + + if (!__builtin_amdgcn_processor_is("gfx906")) + __builtin_amdgcn_s_wait_event_export_ready(); + else if (__builtin_amdgcn_processor_is("gfx1010") || + __builtin_amdgcn_processor_is("gfx1101")) + __builtin_amdgcn_s_ttracedata_imm(1); + + while (__builtin_amdgcn_processor_is("gfx1101")) *p += x; + + do { *p -= x; } while (__builtin_amdgcn_processor_is("gfx1010")); + + for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break; + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)) + __builtin_amdgcn_s_wait_event_export_ready(); + else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm)) + __builtin_amdgcn_s_ttracedata_imm(1); + + do { + *p -= x; + } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32)); + + for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break; + +**Description**: + +When used as the predicate value of the following control 
structures: + +.. code-block:: c++ + + if (...) + while (...) + do { } while (...) + for (...) + +be it directly, or as arguments to logical operators such as ``!, ||, &&``, the +builtins return a boolean value that: + +* indicates whether the current target matches the argument; the argument MUST + be a string literal and a valid AMDGPU target +* indicates whether the builtin function passed as the argument can be invoked + by the current target; the argument MUST be either a generic or AMDGPU + specific builtin name + +Outside of these contexts, the builtins have a ``void`` returning signature +which prevents their misuse. + +**Example of invalid use**: + +.. code-block:: c++ + + void kernel(int* p, int x, bool (*pfn)(bool), const char* str) { + if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return; + else if (__builtin_amdgcn_processor_is(str)) __builtin_trap(); + + bool a = __builtin_amdgcn_processor_is("gfx906"); + const bool b = !__builtin_amdgcn_processor_is("gfx906"); + const bool c = !__builtin_amdgcn_processor_is("gfx906"); + bool d = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + bool e = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + const auto f = + !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready) + || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + const auto g = + !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready) + || !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + __builtin_amdgcn_processor_is("gfx1201") + ? 
__builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42); + if (pfn(__builtin_amdgcn_processor_is("gfx1200"))) + __builtin_amdgcn_s_sleep_var(x); + + if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return; + else if (__builtin_amdgcn_is_invocable(x)) __builtin_trap(); + } + +When invoked while compiling for a concrete target, the builtins are evaluated +early by Clang, and never produce any CodeGen effects / have no observable +side-effects in IR. Conversely, when compiling for AMDGCN flavoured SPIR-v, +which is an abstract target, a series of predicate values are implicitly +created. These predicates get resolved when finalizing the compilation process +for a concrete target, and shall reflect the latter's identity and features. +Thus, it is possible to author high-level code, in e.g. HIP, that is target +adaptive in a dynamic fashion, contrary to macro based mechanisms. ARM/AArch64 Language Extensions ------------------------------- diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 44ef404aee72f..5d01a7e75f7e7 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -346,6 +346,11 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr") BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n") BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n") +// These are special FE only builtins intended for forwarding the requirements +// to the ME. +BUILTIN(__builtin_amdgcn_processor_is, "vcC*", "nctu") +BUILTIN(__builtin_amdgcn_is_invocable, "v", "nctu") + //===----------------------------------------------------------------------===// // R600-NI only builtins. 
//===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 5e45482584946..45f0f9eb88e55 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -13054,4 +13054,14 @@ def err_acc_decl_for_routine // AMDGCN builtins diagnostics def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">; def note_amdgcn_global_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">; +def err_amdgcn_processor_is_arg_not_literal + : Error<"the argument to __builtin_amdgcn_processor_is must be a string " + "literal">; +def err_amdgcn_processor_is_arg_invalid_value + : Error<"the argument to __builtin_amdgcn_processor_is must be a valid " + "AMDGCN processor identifier; '%0' is not valid">; +def err_amdgcn_is_invocable_arg_invalid_value + : Error<"the argument to __builtin_amdgcn_is_invocable must be either a " + "target agnostic builtin or an AMDGCN target specific builtin; `%0`" + " is not valid">; } // end of sema component. 
diff --git a/clang/lib/Basic/Targets/SPIR.cpp b/clang/lib/Basic/Targets/SPIR.cpp index 5b5f47f9647a2..eb43d9b0be283 100644 --- a/clang/lib/Basic/Targets/SPIR.cpp +++ b/clang/lib/Basic/Targets/SPIR.cpp @@ -152,3 +152,7 @@ void SPIRV64AMDGCNTargetInfo::setAuxTarget(const TargetInfo *Aux) { Float128Format = DoubleFormat; } } + +bool SPIRV64AMDGCNTargetInfo::isValidCPUName(StringRef CPU) const { + return AMDGPUTI.isValidCPUName(CPU); +} diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 78505d66d6f2f..7aa13cbeb89fd 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -432,6 +432,10 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo final } bool hasInt128Type() const override { return TargetInfo::hasInt128Type(); } + + // This is only needed for validating arguments passed to + // __builtin_amdgcn_processor_is + bool isValidCPUName(StringRef Name) const override; }; } // namespace targets diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index b56b739094ff3..7b1a3815144b4 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -284,6 +284,18 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs)); } +static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) { + auto PTy = IntegerType::getInt1Ty(CGF.getLLVMContext()); + + auto P = cast<GlobalVariable>( + CGF.CGM.getModule().getOrInsertGlobal(Name.str(), PTy)); + P->setConstant(true); + P->setExternallyInitialized(true); + + return CGF.Builder.CreateLoad(RawAddress(P, PTy, CharUnits::One(), + KnownNonNull)); +} + Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; @@ -585,6 +597,23 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Value *Env = EmitScalarExpr(E->getArg(0)); return Builder.CreateCall(F, {Env}); } + case AMDGPU::BI__builtin_amdgcn_processor_is: { + assert(CGM.getTriple().isSPIRV() && + "__builtin_amdgcn_processor_is should never reach CodeGen for " + "concrete targets!"); + StringRef Proc = cast<clang::StringLiteral>(E->getArg(0))->getString(); + return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.is." + Proc); + } + case AMDGPU::BI__builtin_amdgcn_is_invocable: { + assert(CGM.getTriple().isSPIRV() && + "__builtin_amdgcn_is_invocable should never reach CodeGen for " + "concrete targets!"); + auto FD = cast<FunctionDecl>( + cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee()); + StringRef RF = + getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); + return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.has." + RF); + } case AMDGPU::BI__builtin_amdgcn_read_exec: return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false); case AMDGPU::BI__builtin_amdgcn_read_exec_lo: diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 7cc8374e69d73..24f5262ab3cf4 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6541,6 +6541,22 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, if (Result.isInvalid()) return ExprError(); Fn = Result.get(); + // The __builtin_amdgcn_is_invocable builtin is special, and will be resolved + // later, when we check boolean conditions, for now we merely forward it + // without any additional checking. 
+ if (Fn->getType() == Context.BuiltinFnTy && ArgExprs.size() == 1 && + ArgExprs[0]->getType() == Context.BuiltinFnTy) { + auto FD = cast<FunctionDecl>(Fn->getReferencedDeclOfCallee()); + + if (FD->getName() == "__builtin_amdgcn_is_invocable") { + auto FnPtrTy = Context.getPointerType(FD->getType()); + auto R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get(); + return CallExpr::Create(Context, R, ArgExprs, Context.VoidTy, + ExprValueKind::VK_PRValue, RParenLoc, + FPOptionsOverride()); + } + } + if (CheckArgsForPlaceholders(ArgExprs)) return ExprError(); @@ -13234,6 +13250,20 @@ inline QualType Sema::CheckBitwiseOperands(ExprResult &LHS, ExprResult &RHS, return InvalidOperands(Loc, LHS, RHS); } +static inline bool IsAMDGPUPredicateBI(Expr *E) { + if (!E->getType()->isVoidType()) + return false; + + if (auto CE = dyn_cast<CallExpr>(E)) { + if (auto BI = CE->getDirectCallee()) + if (BI->getName() == "__builtin_amdgcn_processor_is" || + BI->getName() == "__builtin_amdgcn_is_invocable") + return true; + } + + return false; +} + // C99 6.5.[13,14] inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS, SourceLocation Loc, @@ -13329,6 +13359,9 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS, // The following is safe because we only use this method for // non-overloadable operands. + if (IsAMDGPUPredicateBI(LHS.get()) && IsAMDGPUPredicateBI(RHS.get())) + return Context.VoidTy; + // C++ [expr.log.and]p1 // C++ [expr.log.or]p1 // The operands are both contextually converted to type bool. 
@@ -15576,6 +15609,38 @@ static bool isOverflowingIntegerType(ASTContext &Ctx, QualType T) { return Ctx.getIntWidth(T) >= Ctx.getIntWidth(Ctx.IntTy); } +static Expr *ExpandAMDGPUPredicateBI(ASTContext &Ctx, CallExpr *CE) { + if (!CE->getBuiltinCallee()) + return CXXBoolLiteralExpr::Create(Ctx, false, Ctx.BoolTy, CE->getExprLoc()); + + if (Ctx.getTargetInfo().getTriple().isSPIRV()) { + CE->setType(Ctx.getLogicalOperationType()); + return CE; + } + + bool P = false; + auto &TI = Ctx.getTargetInfo(); + + if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") { + auto GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts()); + auto TID = TI.getTargetID(); + if (GFX && TID) { + auto N = GFX->getString(); + P = TI.isValidCPUName(GFX->getString()) && TID->find(N) == 0; + } + } else { + auto FD = cast<FunctionDecl>(CE->getArg(0)->getReferencedDeclOfCallee()); + + StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); + llvm::StringMap<bool> CF; + Ctx.getFunctionFeatureMap(CF, FD); + + P = Builtin::evaluateRequiredTargetFeatures(RF, CF); + } + + return CXXBoolLiteralExpr::Create(Ctx, P, Ctx.BoolTy, CE->getExprLoc()); +} + ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc, Expr *InputExpr, bool IsAfterAmp) { @@ -15753,6 +15818,8 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc, // Vector logical not returns the signed variant of the operand type. 
resultType = GetSignedVectorType(resultType); break; + } else if (IsAMDGPUPredicateBI(InputExpr)) { + break; } else { return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr) << resultType << Input.get()->getSourceRange()); @@ -20469,6 +20536,88 @@ void Sema::DiagnoseEqualityWithExtraParens(ParenExpr *ParenE) { } } +static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) { + if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") { + auto GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts()); + if (!GFX) { + Sema.Diag(CE->getExprLoc(), + diag::err_amdgcn_processor_is_arg_not_literal); + return false; + } + auto N = GFX->getString(); + if (!Sema.getASTContext().getTargetInfo().isValidCPUName(N) && + (!Sema.getASTContext().getAuxTargetInfo() || + !Sema.getASTContext().getAuxTargetInfo()->isValidCPUName(N))) { + Sema.Diag(CE->getExprLoc(), + diag::err_amdgcn_processor_is_arg_invalid_value) << N; + return false; + } + } else { + auto Arg = CE->getArg(0); + if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) { + Sema.Diag(CE->getExprLoc(), + diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg; + return false; + } + } + + return true; +} + +static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) { + if (auto UO = dyn_cast<UnaryOperator>(E)) { + auto SE = dyn_cast<CallExpr>(UO->getSubExpr()); + if (IsAMDGPUPredicateBI(SE)) { + assert( + UO->getOpcode() == UnaryOperator::Opcode::UO_LNot && + "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " + "can only be used as operands of logical ops!"); + + if (!ValidateAMDGPUPredicateBI(Sema, SE)) { + Invalid = true; + return nullptr; + } + + UO->setSubExpr(ExpandAMDGPUPredicateBI(Sema.getASTContext(), SE)); + UO->setType(Sema.getASTContext().getLogicalOperationType()); + + return UO; + } + } + if (auto BO = dyn_cast<BinaryOperator>(E)) { + auto LHS = dyn_cast<CallExpr>(BO->getLHS()); + auto RHS = dyn_cast<CallExpr>(BO->getRHS()); + if 
(IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) { + assert( + BO->isLogicalOp() && + "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " + "can only be used as operands of logical ops!"); + + if (!ValidateAMDGPUPredicateBI(Sema, LHS) || + !ValidateAMDGPUPredicateBI(Sema, RHS)) { + Invalid = true; + return nullptr; + } + + BO->setLHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), LHS)); + BO->setRHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), RHS)); + BO->setType(Sema.getASTContext().getLogicalOperationType()); + + return BO; + } + } + if (auto CE = dyn_cast<CallExpr>(E)) + if (IsAMDGPUPredicateBI(CE)) { + if (!ValidateAMDGPUPredicateBI(Sema, CE)) { + Invalid = true; + return nullptr; + } + return ExpandAMDGPUPredicateBI(Sema.getASTContext(), CE); + } + + return nullptr; +} + ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E, bool IsConstexpr) { DiagnoseAssignmentAsCondition(E); @@ -20480,6 +20629,14 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E, E = result.get(); if (!E->isTypeDependent()) { + if (E->getType()->isVoidType()) { + bool IsInvalidPredicate = false; + if (auto BIC = MaybeHandleAMDGPUPredicateBI(*this, E, IsInvalidPredicate)) + return BIC; + else if (IsInvalidPredicate) + return ExprError(); + } + if (getLangOpts().CPlusPlus) return CheckCXXBooleanCondition(E, IsConstexpr); // C++ 6.4p4 diff --git a/clang/test/CodeGen/amdgpu-builtin-cpu-is.c b/clang/test/CodeGen/amdgpu-builtin-cpu-is.c new file mode 100644 index 0000000000000..6e261d9f5d239 --- /dev/null +++ b/clang/test/CodeGen/amdgpu-builtin-cpu-is.c @@ -0,0 +1,65 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5 +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s 
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s + +// Test that, depending on triple and, if applicable, target-cpu, one of three +// things happens: +// 1) for gfx900 we emit a call to trap (concrete target, matches) +// 2) for gfx1010 we emit an empty kernel (concrete target, does not match) +// 3) for AMDGCNSPIRV we emit llvm.amdgcn.is.gfx900 as a bool global, and +// load from it to provide the condition a br (abstract target) +//. +// AMDGCN-GFX900: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600 +//. +// AMDGCN-GFX1010: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600 +//. +// AMDGCNSPIRV: @llvm.amdgcn.is.gfx900 = external addrspace(1) externally_initialized constant i1 +//. +// AMDGCN-GFX900-LABEL: define dso_local void @foo( +// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX900-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX900-NEXT: call void @llvm.trap() +// AMDGCN-GFX900-NEXT: ret void +// +// AMDGCN-GFX1010-LABEL: define dso_local void @foo( +// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX1010-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX1010-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: define spir_func void @foo( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx900, align 1 +// AMDGCNSPIRV-NEXT: br i1 [[TMP0]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// AMDGCNSPIRV: [[IF_THEN]]: +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.trap() +// AMDGCNSPIRV-NEXT: br label %[[IF_END]] +// AMDGCNSPIRV: [[IF_END]]: +// AMDGCNSPIRV-NEXT: ret void +// +void foo() { + if (__builtin_cpu_is("gfx900")) + return __builtin_trap(); +} +//. 
+// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +// AMDGCN-GFX900: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. +// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } +//. +// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" } +// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. 
+// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c new file mode 100644 index 0000000000000..6d2690cb75b7c --- /dev/null +++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c @@ -0,0 +1,64 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5 +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s + +// Test that, depending on triple and, if applicable, target-cpu, one of three +// things happens: +// 1) for gfx900 we emit an empty kernel (concrete target, lacks feature) +// 2) for gfx1010 we emit a call to trap (concrete target, has feature) +// 3) for AMDGCNSPIRV we emit llvm.amdgcn.has.gfx10-insts as a constant +// externally initialised bool global, and load from it to provide the +// condition to a br (abstract target) + +//. +// AMDGCNSPIRV: @llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1 +//. 
+// AMDGCN-GFX900-LABEL: define dso_local void @foo( +// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX900-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX900-NEXT: ret void +// +// AMDGCN-GFX1010-LABEL: define dso_local void @foo( +// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX1010-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX1010-NEXT: call void @llvm.trap() +// AMDGCN-GFX1010-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: define spir_func void @foo( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1 +// AMDGCNSPIRV-NEXT: [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// AMDGCNSPIRV: [[IF_THEN]]: +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.trap() +// AMDGCNSPIRV-NEXT: br label %[[IF_END]] +// AMDGCNSPIRV: [[IF_END]]: +// AMDGCNSPIRV-NEXT: ret void +// +void foo() { + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16)) + return __builtin_trap(); +} +//. +// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +//. +// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } +// AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. 
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" } +// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. +// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. 
diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp new file mode 100644 index 0000000000000..f618f54909b00 --- /dev/null +++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp @@ -0,0 +1,43 @@ +// RUN: not %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - 2>&1 | FileCheck %s +// RUN: not %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - 2>&1 | FileCheck %s + +bool predicate(bool x) { return x; } + +void invalid_uses(int* p, int x, bool (*pfn)(bool)) { + // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void' + bool invalid_use_in_init_0 = __builtin_amdgcn_processor_is("gfx906"); + // CHECK: error: cannot initialize a variable of type 'const bool' with an rvalue of type 'void' + const bool invalid_use_in_init_1 = !__builtin_amdgcn_processor_is("gfx906"); + // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void' + bool invalid_use_in_init_2 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void' + bool invalid_use_in_init_3 = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + // CHECK: error: variable has incomplete type 'const void' + const auto invalid_use_in_init_4 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready) || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + // CHECK: error: variable has incomplete type 'const void' + const auto invalid_use_in_init_5 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_processor_is("gfx900"); + // CHECK: error: variable has incomplete type 'const void' + const auto invalid_use_in_init_6 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep); + // CHECK: error: value of type 'void' is not contextually convertible to 'bool' + 
__builtin_amdgcn_processor_is("gfx1201") + ? __builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42); + // CHECK: error: no matching function for call to 'predicate' + if (predicate(__builtin_amdgcn_processor_is("gfx1200"))) __builtin_amdgcn_s_sleep_var(x); + // CHECK: note: candidate function not viable: cannot convert argument of incomplete type 'void' to 'bool' for 1st argument +} + +void invalid_invocations(int x, const char* str) { + // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a valid AMDGCN processor identifier; 'not_an_amdgcn_gfx_id' is not valid + if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return; + // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal + if (__builtin_amdgcn_processor_is(str)) return; + + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `"__builtin_amdgcn_s_sleep_var"` is not valid + if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return; + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `str` is not valid + else if (__builtin_amdgcn_is_invocable(str)) return; + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `x` is not valid + else if (__builtin_amdgcn_is_invocable(x)) return; + // CHECK: error: use of undeclared identifier '__builtin_ia32_pause' + else if (__builtin_amdgcn_is_invocable(__builtin_ia32_pause)) return; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index a8e4ea9429f50..1fe0016723a30 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -408,6 +408,15 @@ extern char &AMDGPUPrintfRuntimeBindingID; void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &); extern char 
&AMDGPUResourceUsageAnalysisID; +struct AMDGPUExpandFeaturePredicatesPass + : PassInfoMixin<AMDGPUExpandFeaturePredicatesPass> { + const AMDGPUTargetMachine &TM; + AMDGPUExpandFeaturePredicatesPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + static bool isRequired() { return true; } +}; + struct AMDGPUPrintfRuntimeBindingPass : PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> { PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp new file mode 100644 index 0000000000000..125051c6aa0cf --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp @@ -0,0 +1,207 @@ +//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file implements a pass that deals with expanding AMDGCN generic pseudo- +// intrinsics into target specific quantities / sequences. In this context, a +// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a +// specific instruction, but rather is intended as a mechanism for abstractly +// conveying target specific info to a HLL / the FE, without concretely +// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize. +// This pass should run as early as possible / immediately after Clang CodeGen, +// so that the optimisation pipeline and the BE operate with concrete target +// data. 
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetIntrinsicInfo.h" +#include "llvm/Transforms/IPO/AlwaysInliner.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" + +#include <string> +#include <utility> + +using namespace llvm; + +namespace { +inline Function *getCloneForInlining(Function *OldF) { + assert(OldF && "Must pass an existing Function!"); + + // TODO - Alias Value to clone arg. + ValueToValueMapTy VMap; + + auto NewF = CloneFunction(OldF, VMap); + + NewF->removeFnAttr(Attribute::OptimizeNone); + NewF->removeFnAttr(Attribute::NoInline); + NewF->addFnAttr(Attribute::AlwaysInline); + + return NewF; +} + +template <typename C> +inline void collectUsers(Value *V, ModulePassManager &AlwaysInliner, + ModuleAnalysisManager &MAM, + SmallDenseMap<Function *, Function *> &InlinableClones, + C &Container) { + assert(V && "Must pass an existing Value!"); + + auto A = PreservedAnalyses::all(); + + constexpr auto IsValidCall = [](auto &&U) { + if (auto CB = dyn_cast<CallBase>(U)) + if (auto F = CB->getCalledFunction()) + if (!F->isIntrinsic() && !F->isDeclaration()) + return true; + return false; + }; + + SmallVector<User *> Calls{}; + copy_if(V->users(), std::back_inserter(Calls), IsValidCall); + + while (!Calls.empty()) { + for (auto &&Call : Calls) { + auto CB = cast<CallBase>(Call); + auto &TempF = InlinableClones[CB->getCalledFunction()]; + + if (!TempF) + TempF = getCloneForInlining(CB->getCalledFunction()); + + CB->setCalledFunction(TempF); + CB->removeFnAttr(Attribute::NoInline); + 
CB->addFnAttr(Attribute::AlwaysInline); + + AlwaysInliner.run(*TempF->getParent(), MAM); + } + + Calls.clear(); + + copy_if(V->users(), std::back_inserter(Calls), IsValidCall); + } + + for (auto &&U : V->users()) + if (auto I = dyn_cast<Instruction>(U)) { + if (auto CB = dyn_cast<CallBase>(I)) { + if (CB->getCalledFunction() && !CB->getCalledFunction()->isIntrinsic()) + Container.insert(Container.end(), I); + } else { + Container.insert(Container.end(), I); + } + } +} + +std::pair<PreservedAnalyses, bool> +handlePredicate(const GCNSubtarget &ST, ModuleAnalysisManager &MAM, + SmallDenseMap<Function *, Function *>& InlinableClones, + GlobalVariable *P) { + auto PV = P->getName().substr(P->getName().rfind('.') + 1).str(); + auto Dx = PV.find(','); + while (Dx != std::string::npos) { + PV.insert(++Dx, {'+'}); + + Dx = PV.find(',', Dx); + } + + auto PTy = P->getValueType(); + P->setLinkage(GlobalValue::PrivateLinkage); + P->setExternallyInitialized(false); + + if (P->getName().starts_with("llvm.amdgcn.is")) + P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU())); + else + P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures('+' + PV))); + + ModulePassManager MPM; + MPM.addPass(AlwaysInlinerPass()); + + SmallPtrSet<Instruction *, 32> ToFold; + collectUsers(P, MPM, MAM, InlinableClones, ToFold); + + if (ToFold.empty()) + return {PreservedAnalyses::all(), true}; + + do { + auto I = *ToFold.begin(); + ToFold.erase(I); + + if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) { + collectUsers(I, MPM, MAM, InlinableClones, ToFold); + I->replaceAllUsesWith(C); + I->eraseFromParent(); + continue; + } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) { + continue; + } else if (I->users().empty()) { + continue; + } + + std::string W; + raw_string_ostream OS(W); + + auto Caller = I->getParent()->getParent(); + + OS << "Impossible to constant fold feature predicate: " << P->getName() + << ", please simplify.\n"; + + 
Caller->getContext().diagnose( + DiagnosticInfoUnsupported(*Caller, W, I->getDebugLoc(), DS_Error)); + + return {PreservedAnalyses::none(), false}; + } while (!ToFold.empty()); + + return {PreservedAnalyses::none(), true}; +} +} // Unnamed namespace. + +PreservedAnalyses +AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &MAM) { + if (M.empty()) + return PreservedAnalyses::all(); + + SmallVector<GlobalVariable *> Predicates; + for (auto &&G : M.globals()) { + if (!G.isDeclaration() || !G.hasName()) + continue; + if (G.getName().starts_with("llvm.amdgcn.")) + Predicates.push_back(&G); + } + + if (Predicates.empty()) + return PreservedAnalyses::all(); + + PreservedAnalyses Ret = PreservedAnalyses::all(); + + SmallDenseMap<Function *, Function *> InlinableClones; + const auto &ST = TM.getSubtarget<GCNSubtarget>( + *find_if(M, [](auto &&F) { return !F.isIntrinsic(); })); + + for (auto &&P : Predicates) { + auto R = handlePredicate(ST, MAM, InlinableClones, P); + + if (!R.second) + return PreservedAnalyses::none(); + + Ret.intersect(R.first); + } + + for (auto &&C : InlinableClones) + C.second->eraseFromParent(); + + return Ret; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 6a45392b5f099..c3c9e24c2efa4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -29,6 +29,8 @@ MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass()) +MODULE_PASS("amdgpu-expand-feature-predicates", + AMDGPUExpandFeaturePredicatesPass(*this)) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp 
index 4937b434bc955..8e8a6e1eda437 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -803,7 +803,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { #include "llvm/Passes/TargetPassRegistry.inc" PB.registerPipelineStartEPCallback( - [](ModulePassManager &PM, OptimizationLevel Level) { + [this](ModulePassManager &PM, OptimizationLevel Level) { + PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this)); if (EnableHipStdPar) PM.addPass(HipStdParAcceleratorCodeSelectionPass()); }); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 09a3096602fc3..a389200f0db8e 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -54,6 +54,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp AMDGPUCtorDtorLowering.cpp + AMDGPUExpandFeaturePredicates.cpp AMDGPUExportClustering.cpp AMDGPUExportKernelRuntimeHandles.cpp AMDGPUFrameLowering.cpp diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll new file mode 100644 index 0000000000000..bfc35d8c76e37 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll @@ -0,0 +1,28 @@ +; REQUIRES: amdgpu-registered-target + +; RUN: not opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' < %s 2>&1 | FileCheck %s + +; CHECK: error:{{.*}}in function kernel void (ptr addrspace(1), i32, ptr addrspace(1)): Impossible to constant fold feature predicate: @llvm.amdgcn.is.gfx803 = private addrspace(1) constant i1 false used by %call = call i1 %1(i1 zeroext false), please simplify. 
+ +@llvm.amdgcn.is.gfx803 = external addrspace(1) externally_initialized constant i1 + +declare void @llvm.amdgcn.s.sleep(i32 immarg) #1 + +define amdgpu_kernel void @kernel(ptr addrspace(1) readnone captures(none) %p.coerce, i32 %x, ptr addrspace(1) %pfn.coerce) { +entry: + %0 = ptrtoint ptr addrspace(1) %pfn.coerce to i64 + %1 = inttoptr i64 %0 to ptr + %2 = ptrtoint ptr addrspace(1) %pfn.coerce to i64 + %3 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx803, align 1 + %call = call i1 %1(i1 zeroext %3) + br i1 %call, label %if.gfx803, label %if.end + +if.gfx803: + call void @llvm.amdgcn.s.sleep(i32 0) + br label %if.end + +if.end: + ret void +} + +attributes #1 = { nocallback nofree nosync nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll new file mode 100644 index 0000000000000..277323c353260 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll @@ -0,0 +1,359 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; REQUIRES: amdgpu-registered-target + +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1010 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1101 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201-W64 %s + +;; The IR was derived from the following source: +;; extern "C" __global__ void kernel(int* p, 
int x) +;; { +;; if (__builtin_amdgcn_processor_is("gfx1201") || +;; __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) +;; __builtin_amdgcn_s_sleep_var(x); +;; if (!__builtin_amdgcn_processor_is("gfx906")) +;; __builtin_amdgcn_s_wait_event_export_ready(); +;; else if (__builtin_amdgcn_processor_is("gfx1010") || +;; __builtin_amdgcn_processor_is("gfx1101")) +;; __builtin_amdgcn_s_ttracedata_imm(1); +;; while (__builtin_amdgcn_processor_is("gfx1101")) *p += x; +;; do { +;; *p -= x; +;; } while (__builtin_amdgcn_processor_is("gfx1010")); +;; for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break; +;; +;; if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)) +;; __builtin_amdgcn_s_wait_event_export_ready(); +;; else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm)) +;; __builtin_amdgcn_s_ttracedata_imm(1); +;; +;; do { +;; *p -= x; +;; } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32)); +;; for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break; +;; } + +@llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.has.gfx12-insts = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx1010 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx1101 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.has.gfx11-insts = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1 +@"llvm.amdgcn.has.gfx12-insts,wavefrontsize64" = external addrspace(1) externally_initialized constant i1 + +declare void @llvm.amdgcn.s.sleep.var(i32) +declare void @llvm.amdgcn.s.wait.event.export.ready() +declare void @llvm.amdgcn.s.ttracedata.imm(i16 immarg) + +define amdgpu_kernel void @kernel(ptr addrspace(1) 
%p.coerce, i32 %x) { +; GFX906-LABEL: define amdgpu_kernel void @kernel( +; GFX906-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX906-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX906-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]] +; GFX906: [[IF_GFX1201_OR_GFX12_INSTS1]]: +; GFX906-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX906: [[IF_GFX1201_OR_GFX12_INSTS:.*:]] +; GFX906-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX906-NEXT: br label %[[IF_NOT_GFX906]] +; GFX906: [[IF_NOT_GFX906]]: +; GFX906-NEXT: br label %[[IF_GFX1010_OR_GFX1102:.*]] +; GFX906: [[IF_NOT_GFX907:.*:]] +; GFX906-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX906-NEXT: br label %[[IF_END6:.*]] +; GFX906: [[IF_GFX1010_OR_GFX1102]]: +; GFX906-NEXT: br label %[[LOR_NOT_GFX1010:.*]] +; GFX906: [[LOR_NOT_GFX1010]]: +; GFX906-NEXT: br label %[[FOR_COND:.*]] +; GFX906: [[IF_GFX1010_OR_GFX1101:.*:]] +; GFX906-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX906-NEXT: br label %[[IF_END6]] +; GFX906: [[IF_END6]]: +; GFX906-NEXT: call void @llvm.assume(i1 true) +; GFX906-NEXT: call void @llvm.assume(i1 true) +; GFX906-NEXT: br label %[[FOR_COND]] +; GFX906: [[FOR_COND]]: +; GFX906-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX906-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX906-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX906-NEXT: br label %[[IF_GFX10_INSTS1:.*]] +; GFX906: [[IF_GFX11_INSTS:.*:]] +; GFX906-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX906-NEXT: br label %[[IF_END11:.*]] +; GFX906: [[IF_GFX10_INSTS1]]: +; GFX906-NEXT: br label %[[IF_END11]] +; GFX906: [[IF_GFX10_INSTS:.*:]] +; GFX906-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX906-NEXT: br label %[[IF_END11]] +; GFX906: [[IF_END11]]: +; GFX906-NEXT: call void 
@llvm.assume(i1 true) +; GFX906-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX906-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX906-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX906-NEXT: ret void +; +; GFX1010-LABEL: define amdgpu_kernel void @kernel( +; GFX1010-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1010-NEXT: [[ENTRY:.*:]] +; GFX1010-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1010-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1010-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]] +; GFX1010: [[IF_GFX1201_OR_GFX12_INSTS1]]: +; GFX1010-NEXT: br label %[[IF_END:.*]] +; GFX1010: [[IF_GFX1201_OR_GFX12_INSTS:.*:]] +; GFX1010-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX1010-NEXT: br label %[[IF_END]] +; GFX1010: [[IF_END]]: +; GFX1010-NEXT: br label %[[IF_NOT_GFX907:.*]] +; GFX1010: [[IF_NOT_GFX907]]: +; GFX1010-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1010-NEXT: br label %[[IF_END6:.*]] +; GFX1010: [[IF_NOT_GFX906:.*:]] +; GFX1010-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX1010: [[LOR_NOT_GFX1010:.*:]] +; GFX1010-NEXT: br label %[[FOR_COND:.*]] +; GFX1010: [[IF_GFX1010_OR_GFX1101]]: +; GFX1010-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1010-NEXT: br label %[[IF_END6]] +; GFX1010: [[IF_END6]]: +; GFX1010-NEXT: call void @llvm.assume(i1 true) +; GFX1010-NEXT: call void @llvm.assume(i1 false) +; GFX1010-NEXT: br label %[[FOR_COND]] +; GFX1010: [[FOR_COND]]: +; GFX1010-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1010-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1010-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1010-NEXT: br label %[[IF_ELSE8:.*]] +; GFX1010: [[IF_GFX11_INSTS:.*:]] +; GFX1010-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1010-NEXT: br label %[[IF_END11:.*]] +; GFX1010: [[IF_ELSE8]]: +; 
GFX1010-NEXT: br label %[[IF_GFX10_INSTS:.*]] +; GFX1010: [[IF_GFX10_INSTS]]: +; GFX1010-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1010-NEXT: br label %[[IF_END11]] +; GFX1010: [[IF_END11]]: +; GFX1010-NEXT: call void @llvm.assume(i1 true) +; GFX1010-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1010-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1010-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1010-NEXT: ret void +; +; GFX1101-LABEL: define amdgpu_kernel void @kernel( +; GFX1101-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1101-NEXT: [[ENTRY:.*:]] +; GFX1101-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1101-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1101-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]] +; GFX1101: [[IF_GFX1201_OR_GFX12_INSTS1]]: +; GFX1101-NEXT: br label %[[IF_END:.*]] +; GFX1101: [[IF_GFX1201_OR_GFX12_INSTS:.*:]] +; GFX1101-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX1101-NEXT: br label %[[IF_END]] +; GFX1101: [[IF_END]]: +; GFX1101-NEXT: br label %[[IF_NOT_GFX907:.*]] +; GFX1101: [[IF_NOT_GFX907]]: +; GFX1101-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1101-NEXT: br label %[[IF_END6:.*]] +; GFX1101: [[IF_NOT_GFX906:.*:]] +; GFX1101-NEXT: br label %[[LOR_NOT_GFX1010:.*]] +; GFX1101: [[LOR_NOT_GFX1010]]: +; GFX1101-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX1101: [[IF_GFX1010_OR_GFX1101]]: +; GFX1101-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1101-NEXT: br label %[[IF_END6]] +; GFX1101: [[IF_END6]]: +; GFX1101-NEXT: call void @llvm.assume(i1 false) +; GFX1101-NEXT: call void @llvm.assume(i1 true) +; GFX1101-NEXT: br label %[[FOR_COND:.*]] +; GFX1101: [[FOR_COND]]: +; GFX1101-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1101-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1101-NEXT: store i32 
[[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1101-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1101: [[IF_GFX11_INSTS]]: +; GFX1101-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1101-NEXT: br label %[[IF_END11:.*]] +; GFX1101: [[IF_ELSE8:.*:]] +; GFX1101-NEXT: br label %[[IF_GFX10_INSTS:.*]] +; GFX1101: [[IF_GFX10_INSTS]]: +; GFX1101-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1101-NEXT: br label %[[IF_END11]] +; GFX1101: [[IF_END11]]: +; GFX1101-NEXT: call void @llvm.assume(i1 true) +; GFX1101-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1101-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1101-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1101-NEXT: ret void +; +; GFX1201-LABEL: define amdgpu_kernel void @kernel( +; GFX1201-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1201-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1201-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX1201: [[LOR_NOT_GFX1201:.*:]] +; GFX1201-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS]] +; GFX1201: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX1201-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX1201-NEXT: br label %[[IF_END:.*]] +; GFX1201: [[IF_END]]: +; GFX1201-NEXT: br label %[[IF_NOT_GFX907:.*]] +; GFX1201: [[IF_NOT_GFX907]]: +; GFX1201-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-NEXT: br label %[[IF_END6:.*]] +; GFX1201: [[IF_NOT_GFX906:.*:]] +; GFX1201-NEXT: br label %[[IF_GFX1010_OR_GFX1102:.*]] +; GFX1201: [[IF_GFX1010_OR_GFX1102]]: +; GFX1201-NEXT: br label %[[FOR_COND:.*]] +; GFX1201: [[IF_GFX1010_OR_GFX1101:.*:]] +; GFX1201-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1201-NEXT: br label %[[IF_END6]] +; GFX1201: [[IF_END6]]: +; GFX1201-NEXT: call void @llvm.assume(i1 true) +; GFX1201-NEXT: call void 
@llvm.assume(i1 true) +; GFX1201-NEXT: br label %[[FOR_COND]] +; GFX1201: [[FOR_COND]]: +; GFX1201-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1201-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1201: [[IF_GFX11_INSTS]]: +; GFX1201-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-NEXT: br label %[[IF_END11:.*]] +; GFX1201: [[IF_ELSE8:.*:]] +; GFX1201-NEXT: br label %[[IF_GFX10_INSTS:.*]] +; GFX1201: [[IF_GFX10_INSTS]]: +; GFX1201-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1201-NEXT: br label %[[IF_END11]] +; GFX1201: [[IF_END11]]: +; GFX1201-NEXT: call void @llvm.assume(i1 true) +; GFX1201-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1201-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-NEXT: ret void +; +; GFX1201-W64-LABEL: define amdgpu_kernel void @kernel( +; GFX1201-W64-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1201-W64-NEXT: [[ENTRY:.*:]] +; GFX1201-W64-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1201-W64-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1201-W64-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX1201-W64: [[LOR_NOT_GFX1201:.*:]] +; GFX1201-W64-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS]] +; GFX1201-W64: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX1201-W64-NEXT: br label %[[IF_END:.*]] +; GFX1201-W64: [[IF_END]]: +; GFX1201-W64-NEXT: br label %[[IF_NOT_GFX907:.*]] +; GFX1201-W64: [[IF_NOT_GFX907]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-W64-NEXT: br label %[[IF_END6:.*]] +; GFX1201-W64: [[IF_NOT_GFX906:.*:]] +; GFX1201-W64-NEXT: br label %[[IF_GFX1010_OR_GFX1102:.*]] 
+; GFX1201-W64: [[IF_GFX1010_OR_GFX1102]]: +; GFX1201-W64-NEXT: br label %[[FOR_COND:.*]] +; GFX1201-W64: [[IF_GFX1010_OR_GFX1101:.*:]] +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1201-W64-NEXT: br label %[[IF_END6]] +; GFX1201-W64: [[IF_END6]]: +; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) +; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) +; GFX1201-W64-NEXT: br label %[[FOR_COND]] +; GFX1201-W64: [[FOR_COND]]: +; GFX1201-W64-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1201-W64-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1201-W64: [[IF_GFX11_INSTS]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-W64-NEXT: br label %[[IF_END11:.*]] +; GFX1201-W64: [[IF_ELSE8:.*:]] +; GFX1201-W64-NEXT: br label %[[IF_GFX10_INSTS:.*]] +; GFX1201-W64: [[IF_GFX10_INSTS]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1201-W64-NEXT: br label %[[IF_END11]] +; GFX1201-W64: [[IF_END11]]: +; GFX1201-W64-NEXT: call void @llvm.assume(i1 false) +; GFX1201-W64-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1201-W64-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: ret void +; +entry: + %0 = ptrtoint ptr addrspace(1) %p.coerce to i64 + %1 = inttoptr i64 %0 to ptr + %2 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1 + br i1 %2, label %if.gfx1201.or.gfx12-insts, label %lor.not.gfx1201 + +lor.not.gfx1201: + %3 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx12-insts, align 1 + br i1 %3, label %if.gfx1201.or.gfx12-insts, label %if.end + +if.gfx1201.or.gfx12-insts: + call void @llvm.amdgcn.s.sleep.var(i32 %x) + br label %if.end + +if.end: + %4 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906, align 1 
+ br i1 %4, label %if.gfx906, label %if.not.gfx906 + +if.not.gfx906: + call void @llvm.amdgcn.s.wait.event.export.ready() + br label %if.end6 + +if.gfx906: + %5 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1 + br i1 %5, label %if.gfx1010.or.gfx1101, label %lor.not.gfx1010 + +lor.not.gfx1010: + %6 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1 + br i1 %6, label %if.gfx1010.or.gfx1101, label %for.cond + +if.gfx1010.or.gfx1101: + call void @llvm.amdgcn.s.ttracedata.imm(i16 1) + br label %if.end6 + +if.end6: + %.pr.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1 + %7 = icmp ne i1 %.pr.pr, true + call void @llvm.assume(i1 %7) + %.pr6.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1 + %8 = icmp ne i1 %.pr6.pr, true + call void @llvm.assume(i1 %8) + br label %for.cond + +for.cond: + %.promoted = load i32, ptr %1, align 4 + %sub.peel = sub nsw i32 %.promoted, %x + store i32 %sub.peel, ptr %1, align 4 + %9 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx11-insts, align 1 + br i1 %9, label %if.gfx11-insts, label %if.else8 + +if.gfx11-insts: + call void @llvm.amdgcn.s.wait.event.export.ready() + br label %if.end11 + +if.else8: + %10 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1 + br i1 %10, label %if.gfx10-insts, label %if.end11 + +if.gfx10-insts: + call void @llvm.amdgcn.s.ttracedata.imm(i16 1) + br label %if.end11 + +if.end11: + %.pr7 = load i1, ptr addrspace(1) @"llvm.amdgcn.has.gfx12-insts,wavefrontsize64", align 1 + %11 = icmp ne i1 %.pr7, true + call void @llvm.assume(i1 %11) + %.promoted9 = load i32, ptr %1, align 4 + %sub13.peel = sub nsw i32 %.promoted9, %x + store i32 %sub13.peel, ptr %1, align 4 + ret void +} + +declare void @llvm.assume(i1 noundef) >From 8bf116837e2bd77ff5906d025fdb80bfa5507382 Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.vo...@amd.com> Date: Wed, 2 Apr 2025 03:39:32 +0100 Subject: [PATCH 2/5] Fix format. 
--- clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 8 ++++---- clang/lib/Sema/SemaExpr.cpp | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 7b1a3815144b4..8ad1ab74f221d 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -292,8 +292,8 @@ static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) { P->setConstant(true); P->setExternallyInitialized(true); - return CGF.Builder.CreateLoad(RawAddress(P, PTy, CharUnits::One(), - KnownNonNull)); + return CGF.Builder.CreateLoad( + RawAddress(P, PTy, CharUnits::One(), KnownNonNull)); } Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, @@ -600,7 +600,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_processor_is: { assert(CGM.getTriple().isSPIRV() && "__builtin_amdgcn_processor_is should never reach CodeGen for " - "concrete targets!"); + "concrete targets!"); StringRef Proc = cast<clang::StringLiteral>(E->getArg(0))->getString(); return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.is." + Proc); } @@ -609,7 +609,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, "__builtin_amdgcn_is_invocable should never reach CodeGen for " "concrete targets!"); auto FD = cast<FunctionDecl>( - cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee()); + cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee()); StringRef RF = getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.has." 
+ RF); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 24f5262ab3cf4..bd0183ae4fb82 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -20549,14 +20549,16 @@ static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) { (!Sema.getASTContext().getAuxTargetInfo() || !Sema.getASTContext().getAuxTargetInfo()->isValidCPUName(N))) { Sema.Diag(CE->getExprLoc(), - diag::err_amdgcn_processor_is_arg_invalid_value) << N; + diag::err_amdgcn_processor_is_arg_invalid_value) + << N; return false; } } else { auto Arg = CE->getArg(0); if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) { Sema.Diag(CE->getExprLoc(), - diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg; + diag::err_amdgcn_is_invocable_arg_invalid_value) + << Arg; return false; } } @@ -20568,10 +20570,9 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) { if (auto UO = dyn_cast<UnaryOperator>(E)) { auto SE = dyn_cast<CallExpr>(UO->getSubExpr()); if (IsAMDGPUPredicateBI(SE)) { - assert( - UO->getOpcode() == UnaryOperator::Opcode::UO_LNot && - "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " - "can only be used as operands of logical ops!"); + assert(UO->getOpcode() == UnaryOperator::Opcode::UO_LNot && + "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " + "can only be used as operands of logical ops!"); if (!ValidateAMDGPUPredicateBI(Sema, SE)) { Invalid = true; @@ -20588,10 +20589,9 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) { auto LHS = dyn_cast<CallExpr>(BO->getLHS()); auto RHS = dyn_cast<CallExpr>(BO->getRHS()); if (IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) { - assert( - BO->isLogicalOp() && - "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " - "can only be used as operands of logical ops!"); + assert(BO->isLogicalOp() && + "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " + "can only be used 
as operands of logical ops!"); if (!ValidateAMDGPUPredicateBI(Sema, LHS) || !ValidateAMDGPUPredicateBI(Sema, RHS)) { >From 3421292b6e3261410734fb5a324f7dec79080fc1 Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.vo...@amd.com> Date: Wed, 2 Apr 2025 03:42:24 +0100 Subject: [PATCH 3/5] Fix broken patch merge. --- .../AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 159 ++++++++++++++ .../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 207 ------------------ 2 files changed, 159 insertions(+), 207 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp new file mode 100644 index 0000000000000..17357c452b6d3 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp @@ -0,0 +1,159 @@ +//===- AMDGPUExpandFeaturePredicates.cpp - Feature Predicate Expander Pass ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file implements a pass that deals with expanding AMDGCN generic feature +// predicates into target specific quantities / sequences. In this context, a +// generic feature predicate is an implementation detail global variable that +// is inserted by the FE when using either __builtin_amdgcn_processor_is or +// the __builtin_amdgcn_is_invocable special builtins on an abstract target +// (AMDGCNSPIRV). These placeholder globals are used to guide target specific +// lowering, once the concrete target is known, by way of constant folding their +// value all the way into a terminator (i.e. a controlled block) or into a no +// live use scenario.
The pass makes a best effort attempt to look through +// calls, i.e. a constant evaluatable passthrough of a predicate value will +// generally work, however we hard fail if the folding fails, to avoid obtuse +// BE errors or opaque run time errors. This pass should run as early as +// possible / immediately after Clang CodeGen, so that the optimisation pipeline +// and the BE operate with concrete target data. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" + +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetIntrinsicInfo.h" +#include "llvm/Transforms/Utils/Local.h" + +#include <string> +#include <utility> + +using namespace llvm; + +namespace { +template <typename C> +void collectUsers(Value *V, C &Container) { + assert(V && "Must pass an existing Value!"); + + for (auto &&U : V->users()) + if (auto I = dyn_cast<Instruction>(U)) + Container.insert(Container.end(), I); +} + +inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) { + const auto IsFeature = P->getName().starts_with("llvm.amdgcn.has"); + const auto Offset = + IsFeature ? 
sizeof("llvm.amdgcn.has") : sizeof("llvm.amdgcn.is"); + + auto PV = P->getName().substr(Offset).str(); + if (IsFeature) { + auto Dx = PV.find(','); + while (Dx != std::string::npos) { + PV.insert(++Dx, {'+'}); + + Dx = PV.find(',', Dx); + } + PV.insert(PV.cbegin(), '+'); + } + + auto PTy = P->getValueType(); + P->setLinkage(GlobalValue::PrivateLinkage); + P->setExternallyInitialized(false); + + if (IsFeature) + P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures(PV))); + else + P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU())); +} + +std::pair<PreservedAnalyses, bool> +unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) { + std::string W; + raw_string_ostream OS(W); + + OS << "Impossible to constant fold feature predicate: " << *P + << " used by " << *NoFold << ", please simplify.\n"; + + Caller->getContext().diagnose( + DiagnosticInfoUnsupported(*Caller, W, NoFold->getDebugLoc(), DS_Error)); + + return {PreservedAnalyses::none(), false}; +} + +std::pair<PreservedAnalyses, bool> +handlePredicate(const GCNSubtarget &ST, GlobalVariable *P) { + setPredicate(ST, P); + + SmallPtrSet<Instruction *, 32> ToFold; + collectUsers(P, ToFold); + + if (ToFold.empty()) + return {PreservedAnalyses::all(), true}; + + do { + auto I = *ToFold.begin(); + ToFold.erase(I); + + if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) { + collectUsers(I, ToFold); + I->replaceAllUsesWith(C); + I->eraseFromParent(); + continue; + } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) { + continue; + } else if (I->users().empty()) { + continue; + } + + return unfoldableFound(I->getParent()->getParent(), P, I); + } while (!ToFold.empty()); + + return {PreservedAnalyses::none(), true}; +} +} // Unnamed namespace. 
+ +PreservedAnalyses +AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) { + if (M.empty()) + return PreservedAnalyses::all(); + + SmallVector<GlobalVariable *> Predicates; + for (auto &&G : M.globals()) { + if (!G.isDeclaration() || !G.hasName()) + continue; + if (G.getName().starts_with("llvm.amdgcn.")) + Predicates.push_back(&G); + } + + if (Predicates.empty()) + return PreservedAnalyses::all(); + + const auto &ST = TM.getSubtarget<GCNSubtarget>( + *find_if(M, [](auto &&F) { return !F.isIntrinsic(); })); + + auto Ret = PreservedAnalyses::all(); + for (auto &&P : Predicates) { + auto R = handlePredicate(ST, P); + + if (!R.second) + break; + + Ret.intersect(R.first); + } + + for (auto &&P : Predicates) + P->eraseFromParent(); + + return Ret; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp deleted file mode 100644 index 125051c6aa0cf..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp +++ /dev/null @@ -1,207 +0,0 @@ -//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// This file implements a pass that deals with expanding AMDGCN generic pseudo- -// intrinsics into target specific quantities / sequences. In this context, a -// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a -// specific instruction, but rather is intended as a mechanism for abstractly -// conveying target specific info to a HLL / the FE, without concretely -// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize. 
-// This pass should run as early as possible / immediately after Clang CodeGen, -// so that the optimisation pipeline and the BE operate with concrete target -// data. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUTargetMachine.h" -#include "GCNSubtarget.h" - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/ConstantFolding.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Target/TargetIntrinsicInfo.h" -#include "llvm/Transforms/IPO/AlwaysInliner.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Local.h" - -#include <string> -#include <utility> - -using namespace llvm; - -namespace { -inline Function *getCloneForInlining(Function *OldF) { - assert(OldF && "Must pass an existing Function!"); - - // TODO - Alias Value to clone arg. 
- ValueToValueMapTy VMap; - - auto NewF = CloneFunction(OldF, VMap); - - NewF->removeFnAttr(Attribute::OptimizeNone); - NewF->removeFnAttr(Attribute::NoInline); - NewF->addFnAttr(Attribute::AlwaysInline); - - return NewF; -} - -template <typename C> -inline void collectUsers(Value *V, ModulePassManager &AlwaysInliner, - ModuleAnalysisManager &MAM, - SmallDenseMap<Function *, Function *> &InlinableClones, - C &Container) { - assert(V && "Must pass an existing Value!"); - - auto A = PreservedAnalyses::all(); - - constexpr auto IsValidCall = [](auto &&U) { - if (auto CB = dyn_cast<CallBase>(U)) - if (auto F = CB->getCalledFunction()) - if (!F->isIntrinsic() && !F->isDeclaration()) - return true; - return false; - }; - - SmallVector<User *> Calls{}; - copy_if(V->users(), std::back_inserter(Calls), IsValidCall); - - while (!Calls.empty()) { - for (auto &&Call : Calls) { - auto CB = cast<CallBase>(Call); - auto &TempF = InlinableClones[CB->getCalledFunction()]; - - if (!TempF) - TempF = getCloneForInlining(CB->getCalledFunction()); - - CB->setCalledFunction(TempF); - CB->removeFnAttr(Attribute::NoInline); - CB->addFnAttr(Attribute::AlwaysInline); - - AlwaysInliner.run(*TempF->getParent(), MAM); - } - - Calls.clear(); - - copy_if(V->users(), std::back_inserter(Calls), IsValidCall); - } - - for (auto &&U : V->users()) - if (auto I = dyn_cast<Instruction>(U)) { - if (auto CB = dyn_cast<CallBase>(I)) { - if (CB->getCalledFunction() && !CB->getCalledFunction()->isIntrinsic()) - Container.insert(Container.end(), I); - } else { - Container.insert(Container.end(), I); - } - } -} - -std::pair<PreservedAnalyses, bool> -handlePredicate(const GCNSubtarget &ST, ModuleAnalysisManager &MAM, - SmallDenseMap<Function *, Function *>& InlinableClones, - GlobalVariable *P) { - auto PV = P->getName().substr(P->getName().rfind('.') + 1).str(); - auto Dx = PV.find(','); - while (Dx != std::string::npos) { - PV.insert(++Dx, {'+'}); - - Dx = PV.find(',', Dx); - } - - auto PTy = 
P->getValueType(); - P->setLinkage(GlobalValue::PrivateLinkage); - P->setExternallyInitialized(false); - - if (P->getName().starts_with("llvm.amdgcn.is")) - P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU())); - else - P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures('+' + PV))); - - ModulePassManager MPM; - MPM.addPass(AlwaysInlinerPass()); - - SmallPtrSet<Instruction *, 32> ToFold; - collectUsers(P, MPM, MAM, InlinableClones, ToFold); - - if (ToFold.empty()) - return {PreservedAnalyses::all(), true}; - - do { - auto I = *ToFold.begin(); - ToFold.erase(I); - - if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) { - collectUsers(I, MPM, MAM, InlinableClones, ToFold); - I->replaceAllUsesWith(C); - I->eraseFromParent(); - continue; - } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) { - continue; - } else if (I->users().empty()) { - continue; - } - - std::string W; - raw_string_ostream OS(W); - - auto Caller = I->getParent()->getParent(); - - OS << "Impossible to constant fold feature predicate: " << P->getName() - << ", please simplify.\n"; - - Caller->getContext().diagnose( - DiagnosticInfoUnsupported(*Caller, W, I->getDebugLoc(), DS_Error)); - - return {PreservedAnalyses::none(), false}; - } while (!ToFold.empty()); - - return {PreservedAnalyses::none(), true}; -} -} // Unnamed namespace. 
- -PreservedAnalyses -AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &MAM) { - if (M.empty()) - return PreservedAnalyses::all(); - - SmallVector<GlobalVariable *> Predicates; - for (auto &&G : M.globals()) { - if (!G.isDeclaration() || !G.hasName()) - continue; - if (G.getName().starts_with("llvm.amdgcn.")) - Predicates.push_back(&G); - } - - if (Predicates.empty()) - return PreservedAnalyses::all(); - - PreservedAnalyses Ret = PreservedAnalyses::all(); - - SmallDenseMap<Function *, Function *> InlinableClones; - const auto &ST = TM.getSubtarget<GCNSubtarget>( - *find_if(M, [](auto &&F) { return !F.isIntrinsic(); })); - - for (auto &&P : Predicates) { - auto R = handlePredicate(ST, MAM, InlinableClones, P); - - if (!R.second) - return PreservedAnalyses::none(); - - Ret.intersect(R.first); - } - - for (auto &&C : InlinableClones) - C.second->eraseFromParent(); - - return Ret; -} >From 539c7e6c6357fa7330de9e23fa13cf795061b85b Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.vo...@amd.com> Date: Wed, 2 Apr 2025 03:51:08 +0100 Subject: [PATCH 4/5] Add release notes. --- clang/docs/ReleaseNotes.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c4e82678949ff..005b33da29d2d 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -418,6 +418,10 @@ AMDGPU Support ^^^^^^^^^^^^^^ - Bump the default code object version to 6. ROCm 6.3 is required to run any program compiled with COV6. 
+- Introduced a new target specific builtin ``__builtin_amdgcn_processor_is``, + a late / deferred query for the current target processor +- Introduced a new target specific builtin ``__builtin_amdgcn_is_invocable``, + which enables fine-grained, per-builtin, feature availability NVPTX Support ^^^^^^^^^^^^^^ >From 5926b9f715fce59e753756f5330f311e3f916667 Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.vo...@amd.com> Date: Wed, 2 Apr 2025 03:55:39 +0100 Subject: [PATCH 5/5] (Hopefully) Final format fix. --- .../Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp index 17357c452b6d3..8d38508eda74b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp @@ -42,8 +42,7 @@ using namespace llvm; namespace { -template <typename C> -void collectUsers(Value *V, C &Container) { +template <typename C> void collectUsers(Value *V, C &Container) { assert(V && "Must pass an existing Value!"); for (auto &&U : V->users()) @@ -82,8 +81,8 @@ unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) { std::string W; raw_string_ostream OS(W); - OS << "Impossible to constant fold feature predicate: " << *P - << " used by " << *NoFold << ", please simplify.\n"; + OS << "Impossible to constant fold feature predicate: " << *P << " used by " + << *NoFold << ", please simplify.\n"; Caller->getContext().diagnose( DiagnosticInfoUnsupported(*Caller, W, NoFold->getDebugLoc(), DS_Error)); @@ -91,8 +90,8 @@ unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) { return {PreservedAnalyses::none(), false}; } -std::pair<PreservedAnalyses, bool> -handlePredicate(const GCNSubtarget &ST, GlobalVariable *P) { +std::pair<PreservedAnalyses, bool> handlePredicate(const GCNSubtarget &ST, + GlobalVariable 
*P) { setPredicate(ST, P); SmallPtrSet<Instruction *, 32> ToFold; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits