llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

<details>
<summary>Changes</summary>

Several targets can implement llvm.ldexp more efficiently than the default lowering. This patch transforms scalar llvm.ldexp calls into target-supported intrinsics in CodeGenPrepare, before lowering. (A hand-written sketch of the resulting IR is appended after the diff below.)

---

Full diff: https://github.com/llvm/llvm-project/pull/67552.diff

10 Files Affected:

- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+3)
- (modified) clang/test/CodeGen/math-libcalls.c (+6-6)
- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+6)
- (modified) llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (+2)
- (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+4)
- (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+68)
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h (+7)
- (modified) llvm/lib/Target/X86/X86TargetTransformInfo.h (+11)
- (added) llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll (+46)
- (added) llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll (+38)


``````````diff
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 04c0325c7fd038b..da01c34731386e0 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2719,6 +2719,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
         *this, E, Intrinsic::llrint,
         Intrinsic::experimental_constrained_llrint));
+  case Builtin::BIldexp:
+  case Builtin::BIldexpf:
+  case Builtin::BIldexpl:
   case Builtin::BI__builtin_ldexp:
   case Builtin::BI__builtin_ldexpf:
   case Builtin::BI__builtin_ldexpl:
diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c
index 02df4fe5fea6018..a906bda4c88c958 100644
--- a/clang/test/CodeGen/math-libcalls.c
+++ b/clang/test/CodeGen/math-libcalls.c
@@ -71,15 +71,15 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   ldexp(f,f);    ldexpf(f,f);   ldexpl(f,f);
 
-// NO__ERRNO: declare double @ldexp(double noundef, i32 noundef) [[READNONE]]
-// NO__ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]]
+// HAS_MAYTRAP: declare double @llvm.experimental.constrained.ldexp.f64.i32(
+// HAS_MAYTRAP: declare float @llvm.experimental.constrained.ldexp.f32.i32(
+// HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.ldexp.f80.i32(
 
   modf(f,d);       modff(f,fp);      modfl(f,l);
 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1ae595d2110457d..c8805aadf146874 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1669,6 +1669,7 @@ class TargetTransformInfo {
   /// \return The maximum number of function arguments the target supports.
   unsigned getMaxNumArgs() const;
 
+  unsigned getTargetSupportedLdexpInst(Type *Ty) const;
   /// @}
 
 private:
@@ -2035,6 +2036,7 @@ class TargetTransformInfo::Concept {
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
   virtual unsigned getMaxNumArgs() const = 0;
+  virtual unsigned getTargetSupportedLdexpInst(Type *Ty) const = 0;
 };
 
 template <typename T>
@@ -2745,6 +2747,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getMaxNumArgs() const override {
     return Impl.getMaxNumArgs();
   }
+
+  unsigned getTargetSupportedLdexpInst(Type *Ty) const override {
+    return Impl.getTargetSupportedLdexpInst(Ty);
+  }
 };
 
 template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 326c3130c6cff76..6d6a715f62b201c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -891,6 +891,8 @@ class TargetTransformInfoImplBase {
 
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
+  unsigned getTargetSupportedLdexpInst(Type *Ty) const { return 0; }
+
 protected:
   // Obtain the minimum required size to hold the value (without the sign)
   // In case of a vector it returns the min required size for one element.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c751d174a48ab1f..6a58a146d0431f9 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1237,6 +1237,10 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
   return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
 }
 
+unsigned TargetTransformInfo::getTargetSupportedLdexpInst(Type *Ty) const {
+  return TTIImpl->getTargetSupportedLdexpInst(Ty);
+}
+
 TargetTransformInfo::Concept::~Concept() = default;
 
 TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index faee623d7c62fba..ce0c6b653e1c6c5 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -61,6 +61,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
@@ -486,6 +487,7 @@ class CodeGenPrepare : public FunctionPass {
     bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
     bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
     void verifyBFIUpdates(Function &F);
+    void optimizeScalarLdexp(Instruction *Ldexp, Value *X, Value *Exp);
   };
 
 } // end anonymous namespace
@@ -2432,6 +2434,13 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
       return optimizeGatherScatterInst(II, II->getArgOperand(0));
     case Intrinsic::masked_scatter:
       return optimizeGatherScatterInst(II, II->getArgOperand(1));
+    case Intrinsic::ldexp: {
+      // Vector versions of llvm.ldexp are not fully supported for all targets,
+      // only handle the scalar version currently.
+      if (!II->getType()->isVectorTy())
+        optimizeScalarLdexp(II, II->getArgOperand(0), II->getArgOperand(1));
+      break;
+    }
     }
 
     SmallVector<Value *, 2> PtrOps;
@@ -8667,3 +8676,62 @@ bool CodeGenPrepare::splitBranchCondition(Function &F, ModifyDT &ModifiedDT) {
   }
   return MadeChange;
 }
+
+// Transform llvm.ldexp.T.i32(T x, i32 exp) into target-supported instructions.
+void CodeGenPrepare::optimizeScalarLdexp(Instruction *Ldexp, Value *X,
+                                         Value *Exp) {
+  auto IID = TTI->getTargetSupportedLdexpInst(X->getType());
+  if (IID == 0)
+    return;
+
+  unsigned XScalarSize = X->getType()->getScalarSizeInBits();
+  // Target intrinsics for ldexp.f128 are not well supported, so skip types
+  // wider than 64 bits for now.
+  if (XScalarSize > 64)
+    return;
+  unsigned VL = 128 / XScalarSize;
+
+  IRBuilder<> B(Ldexp);
+  LLVMContext &C = Ldexp->getModule()->getContext();
+  Type *VXTy = nullptr, *VExpTy = nullptr;
+  Value *VX = nullptr, *VExp = nullptr, *CvtExp = nullptr;
+  Value *Ret = nullptr, *Pg = nullptr;
+  ElementCount EC;
+  switch (IID) {
+  default:
+    break;
+  case Intrinsic::aarch64_sve_fscale: {
+    EC = ElementCount::get(VL, true);
+    CvtExp = Exp;
+    if (X->getType() == Type::getDoubleTy(C))
+      CvtExp = B.CreateSExt(Exp, Type::getInt64Ty(C));
+    VExpTy = VectorType::get(CvtExp->getType(), EC);
+    VExp = B.CreateInsertElement(PoisonValue::get(VExpTy), CvtExp, uint64_t(0));
+    VXTy = VectorType::get(X->getType(), EC);
+    VX = B.CreateInsertElement(PoisonValue::get(VXTy), X, uint64_t(0));
+    Type *PTy = VectorType::get(Type::getInt1Ty(C), EC);
+    Constant *True = ConstantInt::get(Type::getInt32Ty(C), 31);
+    Pg = B.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PTy}, {True});
+    Value *FScale = B.CreateIntrinsic(IID, {VXTy}, {Pg, VX, VExp});
+    Ret = B.CreateExtractElement(FScale, (uint64_t)0);
+    Ldexp->replaceAllUsesWith(Ret);
+    break;
+  }
+  case Intrinsic::x86_avx512_mask_scalef_ss:
+  case Intrinsic::x86_avx512_mask_scalef_sd: {
+    EC = ElementCount::get(VL, false);
+    CvtExp = B.CreateSIToFP(Exp, X->getType());
+    VExpTy = VectorType::get(CvtExp->getType(), EC);
+    VExp = B.CreateInsertElement(PoisonValue::get(VExpTy), CvtExp, uint64_t(0));
+    VXTy = VectorType::get(X->getType(), EC);
+    VX = B.CreateInsertElement(PoisonValue::get(VXTy), X, uint64_t(0));
+    Pg = ConstantInt::get(Type::getInt8Ty(C), -1);
+    Constant *Round = ConstantInt::get(Type::getInt32Ty(C), 4);
+    Value *Scalef =
+        B.CreateIntrinsic(IID, std::nullopt, {VX, VExp, VX, Pg, Round});
+    Ret = B.CreateExtractElement(Scalef, (uint64_t)0);
+    Ldexp->replaceAllUsesWith(Ret);
+    break;
+  }
+  }
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a6baade412c77d2..5190572b3d386da 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
 #include <cstdint>
 #include <optional>
 
@@ -412,6 +413,12 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
   }
+
+  unsigned getTargetSupportedLdexpInst(Type *Ty) const {
+    if (!ST->hasSVE())
+      return 0;
+    return Intrinsic::aarch64_sve_fscale;
+  }
 
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 0fa0d240a548b96..4ceada4e756f6f5 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -19,6 +19,7 @@
 #include "X86TargetMachine.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/IntrinsicsX86.h"
 #include <optional>
 
 namespace llvm {
@@ -285,6 +286,16 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
   bool prefersVectorizedAddressing() const;
   bool supportsEfficientVectorElementLoadStore() const;
   bool enableInterleavedAccessVectorization();
+  unsigned getTargetSupportedLdexpInst(Type *Ty) const {
+    if (!ST->hasAVX512())
+      return 0;
+    if (Ty->isFloatTy())
+      return Intrinsic::x86_avx512_mask_scalef_ss;
+    else if (Ty->isDoubleTy())
+      return Intrinsic::x86_avx512_mask_scalef_sd;
+    else
+      return 0;
+  }
 
 private:
   bool supportsGather() const;
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll
new file mode 100644
index 000000000000000..77605844450d006
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/optimize-ldexp.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck --check-prefixes=AARCH64 %s
+
+define dso_local double @testExp(double noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExp:
+; AARCH64:       // %bb.0: // %entry
+; AARCH64-NEXT:    ptrue p0.d
+; AARCH64-NEXT:    // kill: def $w0 killed $w0 def $x0
+; AARCH64-NEXT:    sxtw x8, w0
+; AARCH64-NEXT:    // kill: def $d0 killed $d0 def $z0
+; AARCH64-NEXT:    fmov d1, x8
+; AARCH64-NEXT:    fscale z0.d, p0/m, z0.d, z1.d
+; AARCH64-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; AARCH64-NEXT:    ret
+entry:
+  %0 = tail call fast double @llvm.ldexp.f64.i32(double %val, i32 %a)
+  ret double %0
+}
+declare double @llvm.ldexp.f64.i32(double, i32)
+
+define dso_local float @testExpf(float noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpf:
+; AARCH64:       // %bb.0: // %entry
+; AARCH64-NEXT:    ptrue p0.s
+; AARCH64-NEXT:    fmov s1, w0
+; AARCH64-NEXT:    // kill: def $s0 killed $s0 def $z0
+; AARCH64-NEXT:    fscale z0.s, p0/m, z0.s, z1.s
+; AARCH64-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; AARCH64-NEXT:    ret
+entry:
+  %0 = tail call fast float @llvm.ldexp.f32.i32(float %val, i32 %a)
+  ret float %0
+}
+declare float @llvm.ldexp.f32.i32(float, i32)
+
+; Target-related intrinsics for f128 are not well supported, so f128 still
+; lowers to a call to ldexpl.
+define dso_local fp128 @testExpl(fp128 noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpl:
+; AARCH64:       // %bb.0: // %entry
+; AARCH64-NEXT:    b ldexpl
+entry:
+  %0 = tail call fast fp128 @llvm.ldexp.f128.i32(fp128 %val, i32 %a)
+  ret fp128 %0
+}
+declare fp128 @llvm.ldexp.f128.i32(fp128, i32)
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll b/llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll
new file mode 100644
index 000000000000000..97dd7bd80aa43b1
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/optimize-ldexp.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64 -mattr=+avx512f < %s -o - | FileCheck --check-prefixes=AARCH64 %s
+
+define dso_local double @testExp(double noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExp:
+; AARCH64:       # %bb.0: # %entry
+; AARCH64-NEXT:    vcvtsi2sd %edi, %xmm1, %xmm1
+; AARCH64-NEXT:    vscalefsd %xmm1, %xmm0, %xmm0
+; AARCH64-NEXT:    retq
+entry:
+  %0 = tail call fast double @llvm.ldexp.f64.i32(double %val, i32 %a)
+  ret double %0
+}
+declare double @llvm.ldexp.f64.i32(double, i32)
+
+define dso_local float @testExpf(float noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpf:
+; AARCH64:       # %bb.0: # %entry
+; AARCH64-NEXT:    vcvtsi2ss %edi, %xmm1, %xmm1
+; AARCH64-NEXT:    vscalefss %xmm1, %xmm0, %xmm0
+; AARCH64-NEXT:    retq
+entry:
+  %0 = tail call fast float @llvm.ldexp.f32.i32(float %val, i32 %a)
+  ret float %0
+}
+declare float @llvm.ldexp.f32.i32(float, i32)
+
+; Target-related intrinsics for f128 are not well supported, so f128 still
+; lowers to a call to ldexpl.
+define dso_local fp128 @testExpl(fp128 noundef %val, i32 noundef %a) {
+; AARCH64-LABEL: testExpl:
+; AARCH64:       # %bb.0: # %entry
+; AARCH64-NEXT:    jmp ldexpl@PLT # TAILCALL
+entry:
+  %0 = tail call fast fp128 @llvm.ldexp.f128.i32(fp128 %val, i32 %a)
+  ret fp128 %0
+}
+declare fp128 @llvm.ldexp.f128.i32(fp128, i32)
``````````

</details>

https://github.com/llvm/llvm-project/pull/67552
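
---

Editor's note: for readers who want the effect of the transform without tracing the whole diff, below is a rough, hand-written sketch of the IR that `optimizeScalarLdexp` is expected to emit for a scalar f64 ldexp. It is not taken from the patch or its tests; the function and value names are illustrative, and it assumes an SVE-enabled AArch64 target and an AVX512F-enabled x86-64 target respectively.

```llvm
; Sketch only: reconstructed from the optimizeScalarLdexp code above,
; not copied from the patch. Function names are illustrative.

; AArch64 + SVE: splat into a scalable vector, apply FSCALE, extract lane 0.
define double @ldexp_f64_sve_sketch(double %val, i32 %a) {
entry:
  ; replaces: %r = call double @llvm.ldexp.f64.i32(double %val, i32 %a)
  %exp = sext i32 %a to i64
  %vexp = insertelement <vscale x 2 x i64> poison, i64 %exp, i64 0
  %vx = insertelement <vscale x 2 x double> poison, double %val, i64 0
  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) ; pattern 31 = all lanes
  %fs = call <vscale x 2 x double> @llvm.aarch64.sve.fscale.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %vx, <vscale x 2 x i64> %vexp)
  %r = extractelement <vscale x 2 x double> %fs, i64 0
  ret double %r
}

; x86-64 + AVX512F: convert the exponent to FP, use scalar SCALEF, extract lane 0.
define double @ldexp_f64_avx512_sketch(double %val, i32 %a) {
entry:
  %expf = sitofp i32 %a to double
  %vexp = insertelement <2 x double> poison, double %expf, i64 0
  %vx = insertelement <2 x double> poison, double %val, i64 0
  ; mask = -1 (no masking), rounding = 4 (current direction)
  %sc = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %vx, <2 x double> %vexp, <2 x double> %vx, i8 -1, i32 4)
  %r = extractelement <2 x double> %sc, i64 0
  ret double %r
}

declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
declare <vscale x 2 x double> @llvm.aarch64.sve.fscale.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x i64>)
declare <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
```

The AArch64 half of this sketch corresponds to the fscale-based sequence checked in the new AArch64 test; the fp128 case is intentionally left as a libcall to ldexpl on both targets.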