https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119298
--- Comment #16 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Jan Hubicka from comment #15)
> I made a silly stand-alone test:
>
> long test[4];
> __attribute__ ((noipa))
> void
> foo (unsigned long a, unsigned long b, unsigned long c, unsigned long d)
> {
> test[0]=a;
> test[1]=b;
> test[2]=c;
> test[3]=d;
> }
>
> int
> main()
> {
> long s = 0;
> for (int i = 0; i < 1000000000; i++)
> {
> foo (i,i+1,i+2,i+3);
> s+=test[0];
> s+=test[1];
> s+=test[2];
> s+=test[3];
> }
> return s;
> }
>
> And curiously enough, it strongly prefers a cost of 2 over 3.
>
> jh@shroud:~/trunk/build/gcc> perf stat ./test-noslp
>
> Performance counter stats for './test-noslp':
>
> 1,211.17 msec task-clock:u # 1.000 CPUs utilized
> 0 context-switches:u # 0.000 /sec
> 0 cpu-migrations:u # 0.000 /sec
> 55 page-faults:u # 45.411 /sec
> 5,000,372,342 cycles:u # 4.129 GHz
> 2,000,253,750 stalled-cycles-frontend:u # 40.00% frontend cycles idle
> 17,000,136,662 instructions:u # 3.40 insn per cycle
> # 0.12 stalled cycles per insn
> 3,000,030,827 branches:u # 2.477 G/sec
> 2,592 branch-misses:u # 0.00% of all branches
>
> 1.211767440 seconds time elapsed
>
> 1.211832000 seconds user
> 0.000000000 seconds sys
>
>
> jh@shroud:~/trunk/build/gcc> perf stat ./test-cost3
>
> Performance counter stats for './test-cost3':
>
> 7,266.90 msec task-clock:u # 1.000 CPUs utilized
> 0 context-switches:u # 0.000 /sec
> 0 cpu-migrations:u # 0.000 /sec
> 55 page-faults:u # 7.569 /sec
> 30,001,467,995 cycles:u # 4.129 GHz
> 1,111,876 stalled-cycles-frontend:u # 0.00% frontend cycles idle
> 23,000,138,491 instructions:u # 0.77 insn per cycle
> # 0.00 stalled cycles per insn
> 3,000,032,652 branches:u # 412.835 M/sec
> 4,455 branch-misses:u # 0.00% of all branches
>
> 7.267898755 seconds time elapsed
>
> 7.267379000 seconds user
> 0.000000000 seconds sys
>
>
> jh@shroud:~/trunk/build/gcc> perf stat ./test-cost2
>
> Performance counter stats for './test-cost2':
>
> 1,089.54 msec task-clock:u # 1.000 CPUs utilized
> 0 context-switches:u # 0.000 /sec
> 0 cpu-migrations:u # 0.000 /sec
> 55 page-faults:u # 50.480 /sec
> 4,501,104,318 cycles:u # 4.131 GHz
> 5,495,394 stalled-cycles-frontend:u # 0.12% frontend cycles idle
> 24,000,136,630 instructions:u # 5.33 insn per cycle
> # 0.00 stalled cycles per insn
> 3,000,030,793 branches:u # 2.753 G/sec
> 2,492 branch-misses:u # 0.00% of all branches
>
> 1.090067946 seconds time elapsed
>
> 1.090267000 seconds user
> 0.000000000 seconds sys
>
>
> Cost2 variant does:
>
> 00000000004011c0 <_Z3foommmm>:
> 4011c0: c4 e1 f9 6e d2 vmovq %rdx,%xmm2
> 4011c5: c4 e1 f9 6e df vmovq %rdi,%xmm3
> 4011ca: c4 e3 e9 22 c9 01 vpinsrq $0x1,%rcx,%xmm2,%xmm1
> 4011d0: c4 e3 e1 22 c6 01 vpinsrq $0x1,%rsi,%xmm3,%xmm0
> 4011d6: 62 f3 fd 28 38 c1 01 vinserti64x2 $0x1,%xmm1,%ymm0,%ymm0
> 4011dd: c5 fd 7f 05 5b 2e 00 vmovdqa %ymm0,0x2e5b(%rip) # 404040 <test>
> 4011e4: 00
> 4011e5: c5 f8 77 vzeroupper
> 4011e8: c3 ret
> ....
> 401059: c5 fd 6f 0d df 2f 00 vmovdqa 0x2fdf(%rip),%ymm1 # 404040 <test>
That will forward nicely: the 32-byte load of test is fed by the single
32-byte store to test in foo.
> 401060: 00
> 401061: 62 f3 fd 28 39 c8 01 vextracti64x2 $0x1,%ymm1,%xmm0
> 401068: c5 f9 d4 c1 vpaddq %xmm1,%xmm0,%xmm0
> 40106c: c5 f1 73 d8 08 vpsrldq $0x8,%xmm0,%xmm1
> 401071: c5 f9 d4 c1 vpaddq %xmm1,%xmm0,%xmm0
> 401075: c4 e1 f9 7e c0 vmovq %xmm0,%rax
> 40107a: 49 01 c4 add %rax,%r12
>
>
> while cost3 variant does:
> 00000000004011c0 <_Z3foommmm>:
> 4011c0: c4 e1 f9 6e d7 vmovq %rdi,%xmm2
> 4011c5: c4 e1 f9 6e da vmovq %rdx,%xmm3
> 4011ca: c4 e3 e9 22 ce 01 vpinsrq $0x1,%rsi,%xmm2,%xmm1
> 4011d0: c4 e3 e1 22 c1 01 vpinsrq $0x1,%rcx,%xmm3,%xmm0
> 4011d6: c5 f9 7f 0d 62 2e 00 vmovdqa %xmm1,0x2e62(%rip) # 404040 <test>
> 4011dd: 00
> 4011de: c5 f9 7f 05 6a 2e 00 vmovdqa %xmm0,0x2e6a(%rip) # 404050 <test+0x10>
> 4011e5: 00
> 4011e6: c3 ret
> ....
> 401059: c5 fd 6f 0d df 2f 00 vmovdqa 0x2fdf(%rip),%ymm1 # 404040 <test>
This will fail to forward, because the 32-byte load of test spans the two
separate 16-byte stores done in foo; thus a huge penalty.
> 401060: 00
> 401061: 62 f3 fd 28 39 c8 01 vextracti64x2 $0x1,%ymm1,%xmm0
> 401068: c5 f9 d4 c1 vpaddq %xmm1,%xmm0,%xmm0
> 40106c: c5 f1 73 d8 08 vpsrldq $0x8,%xmm0,%xmm1
> 401071: c5 f9 d4 c1 vpaddq %xmm1,%xmm0,%xmm0
> 401075: c4 e1 f9 7e c0 vmovq %xmm0,%rax
> 40107a: 49 01 c4 add %rax,%r12
>
>
> noslp
> 00000000004011a0 <_Z3foommmm>:
> 4011a0: 48 89 3d 99 2e 00 00 mov %rdi,0x2e99(%rip) # 404040 <test>
> 4011a7: 48 89 35 9a 2e 00 00 mov %rsi,0x2e9a(%rip) # 404048 <test+0x8>
> 4011ae: 48 89 15 9b 2e 00 00 mov %rdx,0x2e9b(%rip) # 404050 <test+0x10>
> 4011b5: 48 89 0d 9c 2e 00 00 mov %rcx,0x2e9c(%rip) # 404058 <test+0x18>
> 4011bc: c3 ret
> ....
> 401046: 48 03 1d f3 2f 00 00 add 0x2ff3(%rip),%rbx # 404040 <test>
> 40104d: 48 03 1d f4 2f 00 00 add 0x2ff4(%rip),%rbx # 404048 <test+0x8>
> 401054: 48 03 1d f5 2f 00 00 add 0x2ff5(%rip),%rbx # 404050 <test+0x10>
> 40105b: 48 03 1d f6 2f 00 00 add 0x2ff6(%rip),%rbx # 404058 <test+0x18>