| Issue |
171891
|
| Summary |
Worse code on targets with v_fmac_f64 in rsqrt implementation
|
| Labels |
backend:AMDGPU,
llvm:regalloc,
missed-optimization
|
| Assignees |
|
| Reporter |
arsenm
|
This function has worse code when compiled on targets with v_fmac_f64: https://godbolt.org/z/aq5e6efa3
The gfx90a compile uses the two-address v_fmac_f64 and incurs two extra copies at the function return compared to gfx900, which does not have the instruction and instead uses the three-address v_fma_f64.
```
target triple = "amdgcn-amd-amdhsa"
; Worse code when v_fmac_f64 is available.
; Reproducer: takes the reciprocal-square-root estimate of %x from
; llvm.amdgcn.rsq.f64 and refines it with a chain of fmul/fma operations.
; The final fma result is returned directly.
define double @bad_2addr_mac_f64(double %x) {
entry:
; %i is the initial rsq estimate; it is reused by every later step.
%i = tail call double @llvm.amdgcn.rsq.f64(double %x)
; Class mask 608 = 0x260 = +inf | +0 | -0 (per the LangRef is.fpclass bit layout).
%or.cond = tail call i1 @llvm.is.fpclass.f64(double %x, i32 608)
; For those inputs, %i is substituted for %x in the residual computed below.
%cond = select i1 %or.cond, double %i, double %x
%fneg = fneg double %cond
%mul = fmul double %i, %fneg
; %i1 = 1.0 - %cond * %i * %i
%i1 = tail call double @llvm.fma.f64(double %mul, double %i, double 1.000000e+00)
; %mul2 = %i * %i1
%mul2 = fmul double %i, %i1
; %i2 = 0.375 * %i1 + 0.5
%i2 = tail call double @llvm.fma.f64(double %i1, double 3.750000e-01, double 5.000000e-01)
; %i3 = %i + %mul2 * %i2; the addend %i is also an operand of the earlier
; instructions, and this fma's result is the function's return value.
%i3 = tail call double @llvm.fma.f64(double %mul2, double %i2, double %i)
ret double %i3
}
; Intrinsics called by @bad_2addr_mac_f64.
declare double @llvm.amdgcn.rsq.f64(double) #0
declare double @llvm.fma.f64(double, double, double) #1
declare i1 @llvm.is.fpclass.f64(double, i32 immarg) #1
; Attribute group #1 is group #0 plus nocreateundeforpoison.
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs