pengfei updated this revision to Diff 238770.
pengfei added a comment.
Remove unnecessary comment.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D72820/new/
https://reviews.llvm.org/D72820
Files:
clang/lib/CodeGen/CGExprScalar.cpp
clang/test/CodeGen/constrained-math-builtins.c
llvm/docs/LangRef.rst
llvm/include/llvm/CodeGen/BasicTTIImpl.h
llvm/include/llvm/IR/ConstrainedOps.def
llvm/include/llvm/IR/Intrinsics.td
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
llvm/test/TableGen/GlobalISelEmitter-input-discard.td
Index: llvm/test/TableGen/GlobalISelEmitter-input-discard.td
===================================================================
--- llvm/test/TableGen/GlobalISelEmitter-input-discard.td
+++ llvm/test/TableGen/GlobalISelEmitter-input-discard.td
@@ -15,7 +15,7 @@
// GISEL-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
// GISEL-NEXT: GIM_CheckType, /*MI*/0, /*Op*/3, /*Type*/GILLT_s32,
// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
-// GISEL-NEXT: // (intrinsic_w_chain:{ *:[i32] } 248:{ *:[iPTR] }, srcvalue:{ *:[i32] }, i32:{ *:[i32] }:$src1) => (FOO:{ *:[i32] } (IMPLICIT_DEF:{ *:[i32] }), GPR32:{ *:[i32] }:$src1)
+// GISEL-NEXT: // (intrinsic_w_chain:{ *:[i32] } 249:{ *:[iPTR] }, srcvalue:{ *:[i32] }, i32:{ *:[i32] }:$src1) => (FOO:{ *:[i32] } (IMPLICIT_DEF:{ *:[i32] }), GPR32:{ *:[i32] }:$src1)
// GISEL-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32,
// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/TargetOpcode::IMPLICIT_DEF,
// GISEL-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/RegState::Define,
Index: llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
===================================================================
--- llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
+++ llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
@@ -3,6 +3,104 @@
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s --check-prefixes=COMMON,FMA
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefixes=COMMON,FMA
+; Verify constrained fmul and fadd aren't fused.
+define float @f11(float %0, float %1, float %2) #0 {
+; NOFMA-LABEL: f11:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: mulss %xmm1, %xmm0
+; NOFMA-NEXT: addss %xmm2, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f11:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; FMA-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; FMA-NEXT: retq
+entry:
+ %3 = call float @llvm.experimental.constrained.fmul.f32(float %0, float %1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %4 = call float @llvm.experimental.constrained.fadd.f32(float %3, float %2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret float %4
+}
+
+; Verify constrained fmul and fadd aren't fused.
+define double @f12(double %0, double %1, double %2) #0 {
+; NOFMA-LABEL: f12:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: mulsd %xmm1, %xmm0
+; NOFMA-NEXT: addsd %xmm2, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f12:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; FMA-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; FMA-NEXT: retq
+entry:
+ %3 = call double @llvm.experimental.constrained.fmul.f64(double %0, double %1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ %4 = call double @llvm.experimental.constrained.fadd.f64(double %3, double %2,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret double %4
+}
+
+; Verify that fmuladd(3.5) isn't simplified when the rounding mode is
+; unknown.
+define float @f15() #0 {
+; NOFMA-LABEL: f15:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; NOFMA-NEXT: movaps %xmm1, %xmm0
+; NOFMA-NEXT: mulss %xmm1, %xmm0
+; NOFMA-NEXT: addss %xmm1, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f15:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
+; FMA-NEXT: retq
+entry:
+ %result = call float @llvm.experimental.constrained.fmuladd.f32(
+ float 3.5,
+ float 3.5,
+ float 3.5,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret float %result
+}
+
+; Verify that fmuladd(42.1) isn't simplified when the rounding mode is
+; unknown.
+define double @f16() #0 {
+; NOFMA-LABEL: f16:
+; NOFMA: # %bb.0: # %entry
+; NOFMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; NOFMA-NEXT: movapd %xmm1, %xmm0
+; NOFMA-NEXT: mulsd %xmm1, %xmm0
+; NOFMA-NEXT: addsd %xmm1, %xmm0
+; NOFMA-NEXT: retq
+;
+; FMA-LABEL: f16:
+; FMA: # %bb.0: # %entry
+; FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
+; FMA-NEXT: retq
+entry:
+ %result = call double @llvm.experimental.constrained.fmuladd.f64(
+ double 42.1,
+ double 42.1,
+ double 42.1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret double %result
+}
+
; Verify that fma(3.5) isn't simplified when the rounding mode is
; unknown.
define float @f17() #0 {
@@ -65,5 +163,11 @@
attributes #0 = { strictfp }
+declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)
+declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
+declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
+declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)
+declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)
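
Note that the intrinsic is overloaded on llvm_anyfloat_ty, so vector forms are
legal as well; the tests above only exercise the scalar f32/f64 variants. A
sketch of a hypothetical vector use (illustrative only, not part of this patch):

  %r = call <4 x float> @llvm.experimental.constrained.fmuladd.v4f32(
           <4 x float> %a, <4 x float> %b, <4 x float> %c,
           metadata !"round.dynamic",
           metadata !"fpexcept.strict") #0

  declare <4 x float> @llvm.experimental.constrained.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)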
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6999,6 +6999,35 @@
Opers.push_back(getValue(FPI.getArgOperand(1)));
}
+ auto pushOutChain = [this](SDValue Result, fp::ExceptionBehavior EB) {
+ assert(Result.getNode()->getNumValues() == 2);
+
+ // Push node to the appropriate list so that future instructions can be
+ // chained up correctly.
+ SDValue OutChain = Result.getValue(1);
+ switch (EB) {
+ case fp::ExceptionBehavior::ebIgnore:
+ // The only reason why ebIgnore nodes still need to be chained is that
+ // they might depend on the current rounding mode, and therefore must
+ // not be moved across instructions that may change that mode.
+ LLVM_FALLTHROUGH;
+ case fp::ExceptionBehavior::ebMayTrap:
+ // These must not be moved across calls or instructions that may change
+ // floating-point exception masks.
+ PendingConstrainedFP.push_back(OutChain);
+ break;
+ case fp::ExceptionBehavior::ebStrict:
+ // These must not be moved across calls or instructions that may change
+ // floating-point exception masks or read floating-point exception flags.
+ // In addition, they cannot be optimized out even if unused.
+ PendingConstrainedFPStrict.push_back(OutChain);
+ break;
+ }
+ };
+
+ SDVTList VTs = DAG.getVTList(ValueVTs);
+ fp::ExceptionBehavior EB = FPI.getExceptionBehavior().getValue();
+
unsigned Opcode;
switch (FPI.getIntrinsicID()) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
@@ -7007,6 +7036,23 @@
Opcode = ISD::STRICT_##DAGN; \
break;
#include "llvm/IR/ConstrainedOps.def"
+ case Intrinsic::experimental_constrained_fmuladd: {
+ Opcode = ISD::STRICT_FMA;
+ // Break fmuladd into fmul and fadd.
+ if (TM.Options.AllowFPOpFusion == FPOpFusion::Strict ||
+ !TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(),
+ ValueVTs[0])) {
+ Opers.pop_back();
+ SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers);
+ pushOutChain(Mul, EB);
+ Opcode = ISD::STRICT_FADD;
+ Opers.clear();
+ Opers.push_back(Mul.getValue(1));
+ Opers.push_back(Mul.getValue(0));
+ Opers.push_back(getValue(FPI.getArgOperand(2)));
+ }
+ break;
+ }
}
// A few strict DAG nodes carry additional operands that are not
@@ -7025,32 +7071,8 @@
}
}
- SDVTList VTs = DAG.getVTList(ValueVTs);
SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers);
-
- assert(Result.getNode()->getNumValues() == 2);
-
- // Push node to the appropriate list so that future instructions can be
- // chained up correctly.
- SDValue OutChain = Result.getValue(1);
- switch (FPI.getExceptionBehavior().getValue()) {
- case fp::ExceptionBehavior::ebIgnore:
- // The only reason why ebIgnore nodes still need to be chained is that
- // they might depend on the current rounding mode, and therefore must
- // not be moved across instruction that may change that mode.
- LLVM_FALLTHROUGH;
- case fp::ExceptionBehavior::ebMayTrap:
- // These must not be moved across calls or instructions that may change
- // floating-point exception masks.
- PendingConstrainedFP.push_back(OutChain);
- break;
- case fp::ExceptionBehavior::ebStrict:
- // These must not be moved across calls or instructions that may change
- // floating-point exception masks or read floating-point exception flags.
- // In addition, they cannot be optimized out even if unused.
- PendingConstrainedFPStrict.push_back(OutChain);
- break;
- }
+ pushOutChain(Result, EB);
SDValue FPResult = Result.getValue(0);
setValue(&FPI, FPResult);
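
For intuition: when fusion is rejected, the code above builds the node-level
equivalent of the following IR decomposition (a sketch only; the actual
transformation operates on STRICT_FMUL/STRICT_FADD SDNodes and their chains,
never on IR):

  %mul = call float @llvm.experimental.constrained.fmul.f32(float %a, float %b,
             metadata !"round.dynamic", metadata !"fpexcept.strict") #0
  %res = call float @llvm.experimental.constrained.fadd.f32(float %mul, float %c,
             metadata !"round.dynamic", metadata !"fpexcept.strict") #0

Calling pushOutChain on the intermediate multiply records its output chain in
the pending lists, so the two halves cannot be reordered across instructions
that may read or change the floating-point environment.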
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -626,6 +626,13 @@
llvm_metadata_ty,
llvm_metadata_ty ]>;
+ def int_experimental_constrained_fmuladd : Intrinsic<[ llvm_anyfloat_ty ],
+ [ LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ llvm_metadata_ty,
+ llvm_metadata_ty ]>;
+
def int_experimental_constrained_fptosi : Intrinsic<[ llvm_anyint_ty ],
[ llvm_anyfloat_ty,
llvm_metadata_ty ]>;
Index: llvm/include/llvm/IR/ConstrainedOps.def
===================================================================
--- llvm/include/llvm/IR/ConstrainedOps.def
+++ llvm/include/llvm/IR/ConstrainedOps.def
@@ -95,6 +95,10 @@
DAG_FUNCTION(sqrt, 1, 1, experimental_constrained_sqrt, FSQRT)
DAG_FUNCTION(trunc, 1, 0, experimental_constrained_trunc, FTRUNC)
+// This defines the fmuladd intrinsic (3 operands, with a rounding-mode
+// argument), which is lowered to either a constrained FMA or a constrained
+// FMUL + FADD sequence.
+FUNCTION(fmuladd, 3, 1, experimental_constrained_fmuladd)
+
#undef INSTRUCTION
#undef FUNCTION
#undef CMP_INSTRUCTION
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1286,6 +1286,9 @@
case Intrinsic::fmuladd:
ISDs.push_back(ISD::FMA);
break;
+ case Intrinsic::experimental_constrained_fmuladd:
+ ISDs.push_back(ISD::STRICT_FMA);
+ break;
// FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
@@ -1509,6 +1512,12 @@
if (IID == Intrinsic::fmuladd)
return ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
+ if (IID == Intrinsic::experimental_constrained_fmuladd)
+ return ConcreteTTI->getIntrinsicCost(
+ Intrinsic::experimental_constrained_fmul, RetTy, Tys,
+ nullptr) +
+ ConcreteTTI->getIntrinsicCost(
+ Intrinsic::experimental_constrained_fadd, RetTy, Tys, nullptr);
// Else, assume that we need to scalarize this intrinsic. For math builtins
// this will emit a costly libcall, adding call overhead and spills. Make it
Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -16061,6 +16061,69 @@
performed by '``llvm.experimental.constrained.fcmps``' will raise an
exception if either operand is a NAN (QNAN or SNAN).
+'``llvm.experimental.constrained.fmuladd``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare <type>
+ @llvm.experimental.constrained.fmuladd(<type> <op1>, <type> <op2>,
+ <type> <op3>,
+ metadata <rounding mode>,
+ metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.fmuladd``' intrinsic represents
+multiply-add expressions that can be fused if the code generator determines
+that (a) the target instruction set has support for a fused operation, and
+(b) the fused operation is more efficient than the equivalent, separate pair
+of mul and add instructions.
+
+Arguments:
+""""""""""
+
+The first three arguments to the '``llvm.experimental.constrained.fmuladd``'
+intrinsic must be floating-point values or vectors of floating-point values.
+All three arguments must have identical types.
+
+The fourth and fifth arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+The expression:
+
+::
+
+ %0 = call float @llvm.experimental.constrained.fmuladd.f32(%a, %b, %c,
+ metadata <rounding mode>,
+ metadata <exception behavior>)
+
+is equivalent to the expression:
+
+::
+
+ %0 = call float @llvm.experimental.constrained.fmul.f32(%a, %b,
+ metadata <rounding mode>,
+ metadata <exception behavior>)
+ %1 = call float @llvm.experimental.constrained.fadd.f32(%0, %c,
+ metadata <rounding mode>,
+ metadata <exception behavior>)
+
+except that it is unspecified whether rounding will be performed between the
+multiplication and addition steps. Fusion is not guaranteed, even if the target
+platform supports it.
+If a fused multiply-add is required, the corresponding
+:ref:`llvm.experimental.constrained.fma <int_fma>` intrinsic function should be
+used instead.
+Like '``llvm.experimental.constrained.fma.*``', this intrinsic never sets
+errno.
+
Constrained libm-equivalent Intrinsics
--------------------------------------
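
For reference, a fully typed instance of the scalar form documented above
(the LangRef snippets elide the argument types):

  %r = call float @llvm.experimental.constrained.fmuladd.f32(
           float %a, float %b, float %c,
           metadata !"round.dynamic",
           metadata !"fpexcept.strict")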
Index: clang/test/CodeGen/constrained-math-builtins.c
===================================================================
--- clang/test/CodeGen/constrained-math-builtins.c
+++ clang/test/CodeGen/constrained-math-builtins.c
@@ -148,3 +148,15 @@
// CHECK: declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata)
};
+#pragma STDC FP_CONTRACT ON
+void bar(float f) {
+ f * f + f;
+ (double)f * f - f;
+ (long double)-f * f + f;
+
+// CHECK: call float @llvm.experimental.constrained.fmuladd.f32
+// CHECK: fneg
+// CHECK: call double @llvm.experimental.constrained.fmuladd.f64
+// CHECK: fneg
+// CHECK: call x86_fp80 @llvm.experimental.constrained.fmuladd.f80
+};
Index: clang/lib/CodeGen/CGExprScalar.cpp
===================================================================
--- clang/lib/CodeGen/CGExprScalar.cpp
+++ clang/lib/CodeGen/CGExprScalar.cpp
@@ -3361,7 +3361,7 @@
// the add operand respectively. This allows fmuladd to represent a*b-c, or
// c-a*b. Patterns in LLVM should catch the negated forms and translate them to
// efficient operations.
-static Value* buildFMulAdd(llvm::BinaryOperator *MulOp, Value *Addend,
+static Value* buildFMulAdd(llvm::Instruction *MulOp, Value *Addend,
const CodeGenFunction &CGF, CGBuilderTy &Builder,
bool negMul, bool negAdd) {
assert(!(negMul && negAdd) && "Only one of negMul and negAdd should be set.");
@@ -3373,12 +3373,23 @@
if (negAdd)
Addend = Builder.CreateFNeg(Addend, "neg");
- Value *FMulAdd = Builder.CreateCall(
- CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()),
- {MulOp0, MulOp1, Addend});
- MulOp->eraseFromParent();
+ Value *FMulAdd = nullptr;
+ if (Builder.getIsFPConstrained()) {
+ assert(isa<llvm::ConstrainedFPIntrinsic>(MulOp) &&
+ "Only constrained operation should be created when Builder is in FP "
+ "constrained mode");
+ FMulAdd = Builder.CreateCall(
+ CGF.CGM.getIntrinsic(llvm::Intrinsic::experimental_constrained_fmuladd,
+ Addend->getType()),
+ {MulOp0, MulOp1, Addend, MulOp->getOperand(2), MulOp->getOperand(3)});
+ } else {
+ FMulAdd = Builder.CreateCall(
+ CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()),
+ {MulOp0, MulOp1, Addend});
+ }
+ MulOp->eraseFromParent();
- return FMulAdd;
+ return FMulAdd;
}
// Check whether it would be legal to emit an fmuladd intrinsic call to
@@ -3413,6 +3424,19 @@
return buildFMulAdd(RHSBinOp, op.LHS, CGF, Builder, isSub, false);
}
+ if (auto *LHSBinOp = dyn_cast<llvm::CallBase>(op.LHS)) {
+ if (LHSBinOp->getIntrinsicID() ==
+ llvm::Intrinsic::experimental_constrained_fmul &&
+ LHSBinOp->use_empty())
+ return buildFMulAdd(LHSBinOp, op.RHS, CGF, Builder, false, isSub);
+ }
+ if (auto *RHSBinOp = dyn_cast<llvm::CallBase>(op.RHS)) {
+ if (RHSBinOp->getIntrinsicID() ==
+ llvm::Intrinsic::experimental_constrained_fmul &&
+ RHSBinOp->use_empty())
+ return buildFMulAdd(RHSBinOp, op.LHS, CGF, Builder, isSub, false);
+ }
+
return nullptr;
}
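
With the negMul/negAdd handling above, a strict-mode subtraction such as
(double)f * f - f from the clang test lowers to a negated addend feeding the
new intrinsic. Conceptually (illustrative value names, assuming dynamic
rounding and strict exception behavior):

  %neg = fneg double %c
  %res = call double @llvm.experimental.constrained.fmuladd.f64(
             double %a, double %b, double %neg,
             metadata !"round.dynamic", metadata !"fpexcept.strict")

This matches the fneg + fmuladd.f64 pair checked in
constrained-math-builtins.c.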