https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102265
Bug ID: 102265
Summary: s390: Inefficient code for __builtin_ctzll
Product: gcc
Version: 10.2.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: jens.seifert at de dot ibm.com
Target Milestone: ---
/*
 * Reproducer: count the trailing zero bits of x using the GCC builtin.
 *
 * The builtin yields an int, which is then widened to the unsigned
 * long long return type. x is expected to be non-zero (GCC documents
 * the builtin's result as undefined for 0).
 */
unsigned long long ctzll(unsigned long long x)
{
    const int trailing_zeros = __builtin_ctzll(x);

    return (unsigned long long)trailing_zeros;
}
GCC currently generates:
lcgr %r1,%r2
ngr %r2,%r1
lghi %r1,63
flogr %r2,%r2
sgrk %r2,%r1,%r2
lgfr %r2,%r2
br %r14
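The optimal sequence for z15 uses population count. For all other
architecture levels, use "^ 63" instead of "63 -": the two are equivalent for
values in the range 0..63, and the XOR form avoids loading the constant 63 into
a register first.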
/*
 * Proposed count-trailing-zeros sequence, selected by architecture level.
 *
 * __ARCH__ >= 13: count the bits that lie below the lowest set bit of x.
 * Otherwise:      isolate the lowest set bit, count its leading zeros and
 *                 flip that count with XOR 63 rather than subtracting it
 *                 from 63.
 */
unsigned long long ctzll_opt(unsigned long long x)
{
#if __ARCH__ >= 13
    /* (x - 1) & ~x keeps exactly the bits below the lowest set bit. */
    const unsigned long long below_lowest = (x - 1) & ~x;

    return __builtin_popcountll(below_lowest);
#else
    /* x & -x keeps only the lowest set bit. */
    const unsigned long long lowest_bit = x & -x;

    return __builtin_clzll(lowest_bit) ^ 63;
#endif
}
Before z15 (__ARCH__ < 13):
lcgr %r1,%r2
ngr %r2,%r1
flogr %r2,%r2
xilf %r2,63
lgfr %r2,%r2
br %r14
=> 1 instruction saved.
z15:
.cfi_startproc
lay %r1,-1(%r2)
ncgrk %r2,%r1,%r2
popcnt %r2,%r2,8
br %r14
.cfi_endproc
=> On z15, only 3 instructions are required (not counting the return).