Right, for version 2 I've updated the ChangeLog and added a few more tweaks so that the test works on ILP32 and we support LDP into floating-point registers too:
The popcount expansion uses SIMD instructions acting on 64-bit values. As a result, a popcount of a 32-bit integer requires zero-extension before moving the zero-extended value into an FP register. This patch adds support for zero-extended int->FP moves to avoid the redundant uxtw. Similarly, add support for a 32-bit zero-extending load into an FP register. Add a missing 'fp' arch attribute to the related 8/16-bit pattern and fix an incorrect type attribute. To complete zero-extended load support, add a new alternative to load_pair_zero_extendsidi2_aarch64 to support LDP into FP registers too.

int f (int a)
{
  return __builtin_popcount (a);
}

Before:
	uxtw	x0, w0
	fmov	d0, x0
	cnt	v0.8b, v0.8b
	addv	b0, v0.8b
	fmov	w0, s0
	ret

After:
	fmov	s0, w0
	cnt	v0.8b, v0.8b
	addv	b0, v0.8b
	fmov	w0, s0
	ret

Passes regression testing on AArch64, OK for commit?

ChangeLog:
2018-09-28  Wilco Dijkstra  <wdijk...@arm.com>

gcc/
	* config/aarch64/aarch64.md (zero_extendsidi2_aarch64): Add alternatives
	to zero-extend into a floating-point register.
	(load_pair_zero_extendsidi2_aarch64): Add alternative to emit
	zero-extended ldp into floating-point registers.  Add type and arch
	attributes.
	(zero_extend<SHORT:mode><GPI:mode>2_aarch64): Add arch attribute.
	Use f_loads for type attribute.

testsuite/
	* gcc.target/aarch64/popcnt.c: Test zero-extended popcount.
--- diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index ef2368706e88a551b9d0d2db2385860112bdbdde..53f76675e9dc4164f6bb871fb6e15601dfeafa3b 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1588,26 +1588,32 @@ ) (define_insn "*zero_extendsidi2_aarch64" - [(set (match_operand:DI 0 "register_operand" "=r,r") - (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "r,m")))] + [(set (match_operand:DI 0 "register_operand" "=r,r,w,w") + (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "r,m,r,m")))] "" "@ uxtw\t%0, %w1 - ldr\t%w0, %1" - [(set_attr "type" "extend,load_4")] + ldr\t%w0, %1 + fmov\t%s0, %w1 + ldr\t%s0, %1" + [(set_attr "type" "extend,load_4,fmov,f_loads") + (set_attr "arch" "*,*,fp,fp")] ) (define_insn "*load_pair_zero_extendsidi2_aarch64" - [(set (match_operand:DI 0 "register_operand" "=r") - (zero_extend:DI (match_operand:SI 1 "aarch64_mem_pair_operand" "Ump"))) - (set (match_operand:DI 2 "register_operand" "=r") - (zero_extend:DI (match_operand:SI 3 "memory_operand" "m")))] + [(set (match_operand:DI 0 "register_operand" "=r,w") + (zero_extend:DI (match_operand:SI 1 "aarch64_mem_pair_operand" "Ump,Ump"))) + (set (match_operand:DI 2 "register_operand" "=r,w") + (zero_extend:DI (match_operand:SI 3 "memory_operand" "m,m")))] "rtx_equal_p (XEXP (operands[3], 0), plus_constant (Pmode, XEXP (operands[1], 0), GET_MODE_SIZE (SImode)))" - "ldp\\t%w0, %w2, %1" - [(set_attr "type" "load_8")] + "@ + ldp\t%w0, %w2, %1 + ldp\t%s0, %s2, %1" + [(set_attr "type" "load_8,neon_load1_2reg") + (set_attr "arch" "*,fp")] ) (define_expand "<ANY_EXTEND:optab><SHORT:mode><GPI:mode>2" @@ -1634,7 +1640,8 @@ and\t%<GPI:w>0, %<GPI:w>1, <SHORT:short_mask> ldr<SHORT:size>\t%w0, %1 ldr\t%<SHORT:size>0, %1" - [(set_attr "type" "logic_imm,load_4,load_4")] + [(set_attr "type" "logic_imm,load_4,f_loads") + (set_attr "arch" "*,*,fp")] ) (define_expand "<optab>qihi2" diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt.c index 7e957966d8e81b8633a444bb42944d0da82ae5db..2b5e9f3e2c0245438ed7bcc5d0d4e01efe01b1ee 100644 --- a/gcc/testsuite/gcc.target/aarch64/popcnt.c +++ b/gcc/testsuite/gcc.target/aarch64/popcnt.c @@ -19,5 +19,16 @@ foo2 (long long x) return __builtin_popcountll (x); } -/* { dg-final { scan-assembler-not "popcount" } } */ -/* { dg-final { scan-assembler-times "cnt\t" 3 } } */ +int +foo3 (int *p) +{ + return __builtin_popcount (*p); +} + +/* { dg-final { scan-assembler-not {popcount} } } */ +/* { dg-final { scan-assembler-times {cnt\t} 4 } } */ +/* { dg-final { scan-assembler-times {fmov\ts} 1 {target lp64} } } */ +/* { dg-final { scan-assembler-times {fmov\td} 2 {target lp64} } } */ +/* { dg-final { scan-assembler-times {fmov\ts} 2 {target ilp32} } } */ +/* { dg-final { scan-assembler-times {fmov\td} 1 {target ilp32} } } */ +/* { dg-final { scan-assembler-times {ldr\ts} 1 } } */