https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119298
--- Comment #15 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
I made a silly stand-alone test:
/* Four-element buffer: written by foo, read back element by element in main.  */
long test[4];
/* Store the four arguments into test[0] .. test[3], one element each.
   noipa is a GCC function attribute; per the GCC manual it disables
   interprocedural optimization between this function and its callers,
   so the call in main stays an out-of-line call.  */
__attribute__ ((noipa))
void
foo (unsigned long a, unsigned long b, unsigned long c, unsigned long d)
{
/* Four stores to adjacent elements; each unsigned long value is
   converted to long by the assignment.  */
test[0]=a;
test[1]=b;
test[2]=c;
test[3]=d;
}
/* Benchmark driver: calls foo 1000000000 times and, after every call,
   adds the four elements of test to a running sum.  */
int
main()
{
long s = 0;
for (int i = 0; i < 1000000000; i++)
{
/* The int arguments are converted to foo's unsigned long parameters.  */
foo (i,i+1,i+2,i+3);
/* Read back each element that foo just stored.  */
s+=test[0];
s+=test[1];
s+=test[2];
s+=test[3];
}
/* The long sum is converted to int on return; presumably it is returned
   only so that the loads above are not optimized away.  */
return s;
}
And curiously enough, it strongly prefers a cost of 2 over 3.
jh@shroud:~/trunk/build/gcc> perf stat ./test-noslp
Performance counter stats for './test-noslp':
1,211.17 msec task-clock:u # 1.000 CPUs utilized
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
55 page-faults:u # 45.411 /sec
5,000,372,342 cycles:u # 4.129 GHz
2,000,253,750 stalled-cycles-frontend:u # 40.00% frontend cycles idle
17,000,136,662 instructions:u # 3.40 insn per cycle
# 0.12 stalled cycles per insn
3,000,030,827 branches:u # 2.477 G/sec
2,592 branch-misses:u # 0.00% of all branches
1.211767440 seconds time elapsed
1.211832000 seconds user
0.000000000 seconds sys
jh@shroud:~/trunk/build/gcc> perf stat ./test-cost3
Performance counter stats for './test-cost3':
7,266.90 msec task-clock:u # 1.000 CPUs utilized
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
55 page-faults:u # 7.569 /sec
30,001,467,995 cycles:u # 4.129 GHz
1,111,876 stalled-cycles-frontend:u # 0.00% frontend cycles idle
23,000,138,491 instructions:u # 0.77 insn per cycle
# 0.00 stalled cycles per insn
3,000,032,652 branches:u # 412.835 M/sec
4,455 branch-misses:u # 0.00% of all branches
7.267898755 seconds time elapsed
7.267379000 seconds user
0.000000000 seconds sys
jh@shroud:~/trunk/build/gcc> perf stat ./test-cost2
Performance counter stats for './test-cost2':
1,089.54 msec task-clock:u # 1.000 CPUs utilized
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
55 page-faults:u # 50.480 /sec
4,501,104,318 cycles:u # 4.131 GHz
5,495,394 stalled-cycles-frontend:u # 0.12% frontend cycles idle
24,000,136,630 instructions:u # 5.33 insn per cycle
# 0.00 stalled cycles per insn
3,000,030,793 branches:u # 2.753 G/sec
2,492 branch-misses:u # 0.00% of all branches
1.090067946 seconds time elapsed
1.090267000 seconds user
0.000000000 seconds sys
Cost2 variant does:
00000000004011c0 <_Z3foommmm>:
4011c0: c4 e1 f9 6e d2 vmovq %rdx,%xmm2
4011c5: c4 e1 f9 6e df vmovq %rdi,%xmm3
4011ca: c4 e3 e9 22 c9 01 vpinsrq $0x1,%rcx,%xmm2,%xmm1
4011d0: c4 e3 e1 22 c6 01 vpinsrq $0x1,%rsi,%xmm3,%xmm0
4011d6: 62 f3 fd 28 38 c1 01 vinserti64x2 $0x1,%xmm1,%ymm0,%ymm0
4011dd: c5 fd 7f 05 5b 2e 00 vmovdqa %ymm0,0x2e5b(%rip) #
404040 <test>
4011e4: 00
4011e5: c5 f8 77 vzeroupper
4011e8: c3 ret
....
401059: c5 fd 6f 0d df 2f 00 vmovdqa 0x2fdf(%rip),%ymm1 #
404040 <test>
401060: 00
401061: 62 f3 fd 28 39 c8 01 vextracti64x2 $0x1,%ymm1,%xmm0
401068: c5 f9 d4 c1 vpaddq %xmm1,%xmm0,%xmm0
40106c: c5 f1 73 d8 08 vpsrldq $0x8,%xmm0,%xmm1
401071: c5 f9 d4 c1 vpaddq %xmm1,%xmm0,%xmm0
401075: c4 e1 f9 7e c0 vmovq %xmm0,%rax
40107a: 49 01 c4 add %rax,%r12
while cost3 variant does:
00000000004011c0 <_Z3foommmm>:
4011c0: c4 e1 f9 6e d7 vmovq %rdi,%xmm2
4011c5: c4 e1 f9 6e da vmovq %rdx,%xmm3
4011ca: c4 e3 e9 22 ce 01 vpinsrq $0x1,%rsi,%xmm2,%xmm1
4011d0: c4 e3 e1 22 c1 01 vpinsrq $0x1,%rcx,%xmm3,%xmm0
4011d6: c5 f9 7f 0d 62 2e 00 vmovdqa %xmm1,0x2e62(%rip) #
404040 <test>
4011dd: 00
4011de: c5 f9 7f 05 6a 2e 00 vmovdqa %xmm0,0x2e6a(%rip) #
404050 <test+0x10>
4011e5: 00
4011e6: c3 ret
....
401059: c5 fd 6f 0d df 2f 00 vmovdqa 0x2fdf(%rip),%ymm1 #
404040 <test>
401060: 00
401061: 62 f3 fd 28 39 c8 01 vextracti64x2 $0x1,%ymm1,%xmm0
401068: c5 f9 d4 c1 vpaddq %xmm1,%xmm0,%xmm0
40106c: c5 f1 73 d8 08 vpsrldq $0x8,%xmm0,%xmm1
401071: c5 f9 d4 c1 vpaddq %xmm1,%xmm0,%xmm0
401075: c4 e1 f9 7e c0 vmovq %xmm0,%rax
40107a: 49 01 c4 add %rax,%r12
and the noslp variant does:
00000000004011a0 <_Z3foommmm>:
4011a0: 48 89 3d 99 2e 00 00 mov %rdi,0x2e99(%rip) #
404040 <test>
4011a7: 48 89 35 9a 2e 00 00 mov %rsi,0x2e9a(%rip) #
404048 <test+0x8>
4011ae: 48 89 15 9b 2e 00 00 mov %rdx,0x2e9b(%rip) #
404050 <test+0x10>
4011b5: 48 89 0d 9c 2e 00 00 mov %rcx,0x2e9c(%rip) #
404058 <test+0x18>
4011bc: c3 ret
....
401046: 48 03 1d f3 2f 00 00 add 0x2ff3(%rip),%rbx #
404040 <test>
40104d: 48 03 1d f4 2f 00 00 add 0x2ff4(%rip),%rbx #
404048 <test+0x8>
401054: 48 03 1d f5 2f 00 00 add 0x2ff5(%rip),%rbx #
404050 <test+0x10>
40105b: 48 03 1d f6 2f 00 00 add 0x2ff6(%rip),%rbx #
404058 <test+0x18>