https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116582
--- Comment #6 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
Here is a variant of the benchmark that needs masking:

#include <stdlib.h>
#define M (1024*1024)
T a[M], b[M];
int indices[M];
char c[M];

__attribute__ ((noipa))
void test ()
{
  /* Conditional indirect load: vectorizing this requires a masked gather.  */
  for (int i = 0; i < 1024 * 16; i++)
    if (c[i])
      a[i] += b[indices[i]];
}

int main ()
{
  for (int i = 0; i < M; i++)
    {
      indices[i] = rand () % M;
      c[i] = rand () % 2;
    }
  for (int i = 0; i < 10000; i++)
    test ();
  return 0;
}

jh@shroud:~> ~/trunk-install-znver5/bin/g++ -DT=float -march=native cnd.c -Ofast -mtune-ctrl=^use_gather_4parts -mtune-ctrl=^use_gather_8parts -mtune-ctrl=^use_gather -mtune=native -fdump-tree-all-details ; objdump -d a.out | grep gather ; perf stat -r 10 ./a.out

 Performance counter stats for './a.out' (10 runs):

            281.03 msec task-clock:u              #    0.999 CPUs utilized            ( +-  0.62% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
               659      page-faults:u             #    2.345 K/sec                    ( +-  0.06% )
     1,156,011,975      cycles:u                  #    4.113 GHz                      ( +-  0.65% )
       757,216,769      stalled-cycles-frontend:u #   65.50% frontend cycles idle     ( +-  1.59% )
     1,292,982,312      instructions:u            #    1.12  insn per cycle
                                                  #    0.59  stalled cycles per insn  ( +-  0.00% )
       360,669,069      branches:u                #    1.283 G/sec                    ( +-  0.00% )
           118,731      branch-misses:u           #    0.03% of all branches          ( +-  8.51% )

           0.28126 +- 0.00173 seconds time elapsed  ( +-  0.62% )

jh@shroud:~> ~/trunk-install-znver5/bin/g++ -DT=float -march=native cnd.c -Ofast -mtune-ctrl=use_gather_4parts -mtune-ctrl=use_gather_8parts -mtune-ctrl=use_gather -mtune=native -fdump-tree-all-details ; objdump -d a.out | grep gather ; perf stat -r 10 ./a.out

  401241:  62 f2 7d 4d 92 1c 8d   vgatherdps 0x904080(,%zmm1,4),%zmm3{%k5}
  40125b:  62 f2 7d 4e 92 14 8d   vgatherdps 0x904080(,%zmm1,4),%zmm2{%k6}
  40126a:  62 f2 7d 4f 92 0c a5   vgatherdps 0x904080(,%zmm4,4),%zmm1{%k7}
  401280:  62 f2 7d 4d 92 2c a5   vgatherdps 0x904080(,%zmm4,4),%zmm5{%k5}

 Performance counter stats for './a.out' (10 runs):

            266.73 msec task-clock:u              #    0.999 CPUs utilized            ( +-  4.31% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
               659      page-faults:u             #    2.471 K/sec                    ( +-  0.05% )
     1,097,343,324      cycles:u                  #    4.114 GHz                      ( +-  4.33% )
         4,009,606      stalled-cycles-frontend:u #    0.37% frontend cycles idle     ( +-  6.91% )
       241,592,306      instructions:u            #    0.22  insn per cycle
                                                  #    0.02  stalled cycles per insn  ( +-  0.00% )
        35,549,063      branches:u                #  133.279 M/sec                    ( +-  0.00% )
            92,191      branch-misses:u           #    0.26% of all branches          ( +-  0.06% )

            0.2670 +- 0.0115 seconds time elapsed  ( +-  4.30% )

So the difference in the number of cycles is quite small, while the frontend works much harder without gather.
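For reference, a minimal intrinsics sketch of what the vectorized body boils down to, assuming AVX-512F/BW/VL, a single 16-lane strip and unaligned accesses (GCC's actual code above is unrolled to 64 lanes per iteration and uses aligned loads; the function name and the assumption that n is a multiple of 16 are illustrative only):

#include <immintrin.h>

/* Sketch only, not GCC's output.  One 16-lane step of
   "if (c[i]) a[i] += b[indices[i]];".  */
void
test_gather_sketch (float *a, const float *b, const int *indices,
                    const char *c, int n)
{
  for (int i = 0; i < n; i += 16)
    {
      /* Mask from c[i..i+15] != 0 (the vpcmpneqb in the disassembly).  */
      __m128i cv = _mm_loadu_si128 ((const __m128i *) (c + i));
      __mmask16 k = _mm_cmpneq_epi8_mask (cv, _mm_setzero_si128 ());
      __m512i idx = _mm512_loadu_si512 ((const void *) (indices + i));
      /* Masked gather (vgatherdps ... {%k}); inactive lanes stay zero.  */
      __m512 bv = _mm512_mask_i32gather_ps (_mm512_setzero_ps (), k,
                                            idx, b, 4);
      /* Merge-masked add (vaddps ... {%k}) and a full store back.  */
      __m512 av = _mm512_loadu_ps (a + i);
      av = _mm512_mask_add_ps (av, k, av, bv);
      _mm512_storeu_ps (a + i, av);
    }
}

The masking keeps the inactive lanes of a[] bit-identical, so no branch on c[i] survives into the loop body.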
If the c array is constant 1:

jh@shroud:~> ~/trunk-install-znver5/bin/g++ -DT=float -march=native cnd.c -Ofast -mtune-ctrl=^use_gather_4parts -mtune-ctrl=^use_gather_8parts -mtune-ctrl=^use_gather -mtune=native -fdump-tree-all-details ; objdump -d a.out | grep gather ; perf stat -r 10 ./a.out

 Performance counter stats for './a.out' (10 runs):

            520.92 msec task-clock:u              #    1.000 CPUs utilized            ( +-  5.29% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
               659      page-faults:u             #    1.265 K/sec                    ( +-  0.04% )
     2,142,512,947      cycles:u                  #    4.113 GHz                      ( +-  5.31% )
       137,707,449      stalled-cycles-frontend:u #    6.43% frontend cycles idle     ( +- 94.67% )
     1,553,801,640      instructions:u            #    0.73  insn per cycle
                                                  #    0.09  stalled cycles per insn  ( +-  0.00% )
       344,940,506      branches:u                #  662.177 M/sec                    ( +-  0.00% )
            58,418      branch-misses:u           #    0.02% of all branches          ( +-  0.07% )

            0.5212 +- 0.0276 seconds time elapsed  ( +-  5.29% )

jh@shroud:~> ~/trunk-install-znver5/bin/g++ -DT=float -march=native cnd.c -Ofast -mtune-ctrl=use_gather_4parts -mtune-ctrl=use_gather_8parts -mtune-ctrl=use_gather -mtune=native -fdump-tree-all-details ; objdump -d a.out | grep gather ; perf stat -r 10 ./a.out

  401231:  62 f2 7d 4d 92 1c 8d   vgatherdps 0x904080(,%zmm1,4),%zmm3{%k5}
  40124b:  62 f2 7d 4e 92 14 8d   vgatherdps 0x904080(,%zmm1,4),%zmm2{%k6}
  40125a:  62 f2 7d 4f 92 0c a5   vgatherdps 0x904080(,%zmm4,4),%zmm1{%k7}
  401270:  62 f2 7d 4d 92 2c a5   vgatherdps 0x904080(,%zmm4,4),%zmm5{%k5}

 Performance counter stats for './a.out' (10 runs):

            545.45 msec task-clock:u              #    1.000 CPUs utilized            ( +-  5.28% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
               658      page-faults:u             #    1.206 K/sec                    ( +-  0.05% )
     2,255,436,021      cycles:u                  #    4.135 GHz                      ( +-  5.30% )
         6,191,015      stalled-cycles-frontend:u #    0.27% frontend cycles idle     ( +-  7.58% )
       171,371,626      instructions:u            #    0.08  insn per cycle
                                                  #    0.04  stalled cycles per insn  ( +-  0.00% )
        19,820,508      branches:u                #   36.338 M/sec                    ( +-  0.00% )
            58,434      branch-misses:u           #    0.29% of all branches          ( +-  0.06% )

            0.5457 +- 0.0288 seconds time elapsed  ( +-  5.28% )

jh@shroud:~> ~/trunk-install-znver5/bin/g++ -DT=float -march=native cnd.c -Ofast -mtune-ctrl=^use_gather_4parts -mtune-ctrl=^use_gather_8parts -mtune-ctrl=^use_gather -mtune=native -fdump-tree-all-details ; objdump -d a.out | grep gather ; perf stat -r 10 ./a.out

 Performance counter stats for './a.out' (10 runs):

            690.27 msec task-clock:u              #    1.000 CPUs utilized            ( +-  3.07% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
               659      page-faults:u             #  954.698 /sec                     ( +-  0.04% )
     2,862,994,747      cycles:u                  #    4.148 GHz                      ( +-  3.08% )
        60,389,319      stalled-cycles-frontend:u #    2.11% frontend cycles idle     ( +- 92.61% )
     1,477,776,699      instructions:u            #    0.52  insn per cycle
                                                  #    0.04  stalled cycles per insn  ( +-  0.00% )
       328,232,959      branches:u                #  475.514 M/sec                    ( +-  0.00% )
            24,844      branch-misses:u           #    0.01% of all branches          ( +-  1.40% )

            0.6905 +- 0.0212 seconds time elapsed  ( +-  3.07% )

jh@shroud:~> ~/trunk-install-znver5/bin/g++ -DT=float -march=native cnd.c -Ofast -mtune-ctrl=use_gather_4parts -mtune-ctrl=use_gather_8parts -mtune-ctrl=use_gather -mtune=native -fdump-tree-all-details ; objdump -d a.out | grep gather ; perf stat -r 10 ./a.out

  401271:  62 f2 7d 4d 92 1c 8d   vgatherdps 0x904080(,%zmm1,4),%zmm3{%k5}
  40128b:  62 f2 7d 4e 92 14 8d   vgatherdps 0x904080(,%zmm1,4),%zmm2{%k6}
  40129a:  62 f2 7d 4f 92 0c a5   vgatherdps 0x904080(,%zmm4,4),%zmm1{%k7}
  4012b0:  62 f2 7d 4d 92 2c a5   vgatherdps 0x904080(,%zmm4,4),%zmm5{%k5}

 Performance counter stats for './a.out' (10 runs):

            849.59 msec task-clock:u              #    1.000 CPUs utilized            ( +-  0.02% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
               659      page-faults:u             #  775.667 /sec                     ( +-  0.03% )
     3,514,169,631      cycles:u                  #    4.136 GHz                      ( +-  0.01% )
        20,390,142      stalled-cycles-frontend:u #    0.58% frontend cycles idle     ( +-  1.47% )
        95,346,747      instructions:u            #    0.03  insn per cycle
                                                  #    0.21  stalled cycles per insn  ( +-  0.00% )
         3,113,006      branches:u                #    3.664 M/sec                    ( +-  0.00% )
            24,535      branch-misses:u           #    0.79% of all branches          ( +-  0.11% )

          0.849857 +- 0.000176 seconds time elapsed  ( +-  0.02% )

What confuses me is the branch-mispredict count. The non-gather loop is:

  0.60 │   ↓ je      6c
       │     movslq  0x504080(,%rax,4),%rdx
  1.19 │     vmovss  0x904080(,%rdx,4),%xmm0
 84.09 │     vaddss  0xd04080(,%rax,4),%xmm0,%xmm0
  8.77 │     vmovss  %xmm0,0xd04080(,%rax,4)
  1.62 │6c:  inc     %rax
  3.03 │     cmp     $0x4000,%rax

while the gathering loop is:

       │10:┌─→vmovdqa64  0x404080(%rax),%zmm1
       │   │  vxorps     %xmm3,%xmm3,%xmm3
       │   │  vmovdqa32  0x504100(,%rax,4),%zmm4
       │   │  vxorps     %xmm2,%xmm2,%xmm2
  0.15 │   │  vxorps     %xmm5,%xmm5,%xmm5
       │   │  vpcmpneqb  %zmm6,%zmm1,%k1
       │   │  vmovdqa32  0x504080(,%rax,4),%zmm1
  0.07 │   │  kshiftrd   $0x10,%k1,%k4
       │   │  kmovw      %k1,%k5
       │   │  kshiftrq   $0x20,%k1,%k2
       │   │  kmovw      %k4,%k6
       │   │  kshiftrd   $0x10,%k2,%k3
  0.15 │   │  kmovw      %k2,%k7
       │   │  vgatherdps 0x904080(,%zmm1,4),%zmm3{%k5}
 25.63 │   │  vmovdqa32  0x5040c0(,%rax,4),%zmm1
       │   │  kmovw      %k3,%k5
       │   │  vgatherdps 0x904080(,%zmm1,4),%zmm2{%k6}
 26.30 │   │  vxorps     %xmm1,%xmm1,%xmm1
       │   │  vgatherdps 0x904080(,%zmm4,4),%zmm1{%k7}
 27.53 │   │  vmovdqa32  0x504140(,%rax,4),%zmm4
       │   │  vgatherdps 0x904080(,%zmm4,4),%zmm5{%k5}
 19.08 │   │  vmovaps    0xd04080(,%rax,4),%zmm4
       │   │  vaddps     %zmm4,%zmm3,%zmm4{%k1}
       │   │  vmovaps    0xd040c0(,%rax,4),%zmm3
  0.51 │   │  vmovaps    %zmm4,0xd04080(,%rax,4)
       │   │  vaddps     %zmm3,%zmm2,%zmm3{%k4}
       │   │  vmovaps    0xd04100(,%rax,4),%zmm2
       │   │  vmovaps    %zmm3,0xd040c0(,%rax,4)
       │   │  vaddps     %zmm2,%zmm1,%zmm2{%k2}
       │   │  vmovaps    0xd04140(,%rax,4),%zmm1
  0.07 │   │  vmovaps    %zmm2,0xd04100(,%rax,4)
  0.07 │   │  vaddps     %zmm1,%zmm5,%zmm1{%k3}
  0.44 │   │  vmovaps    %zmm1,0xd04140(,%rax,4)
       │   │  add        $0x40,%rax
       │   ├──cmp        $0x4000,%rax
       │   └──jne        10
  0.70 │   ↑ jne         40

How is it possible that there is no increase in mispredicts?

Curiously, enabling scatter disables vectorization again:

jh@shroud:~> ~/trunk-install-znver5/bin/g++ -DT=float -march=native cnd.c -Ofast -mtune-ctrl=use_gather_4parts -mtune-ctrl=use_gather_8parts -mtune-ctrl=use_gather -mtune=native -fdump-tree-all-details -mtune-ctrl=use_scatter_4parts -mtune-ctrl=use_scatter_8parts -mtune-ctrl=use_scatter_2parts ; objdump -d a.out | grep gather ; perf stat -r 10 ./a.out

 Performance counter stats for './a.out' (10 runs):

            475.05 msec task-clock:u              #    1.000 CPUs utilized            ( +-  0.05% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
               660      page-faults:u             #    1.389 K/sec                    ( +-  0.05% )
     1,968,735,770      cycles:u                  #    4.144 GHz                      ( +-  0.01% )
       121,150,048      stalled-cycles-frontend:u #    6.15% frontend cycles idle     ( +-  0.36% )
     1,594,448,124      instructions:u            #    0.81  insn per cycle
                                                  #    0.08  stalled cycles per insn  ( +-  0.00% )
       360,669,133      branches:u                #  759.224 M/sec                    ( +-  0.00% )
            89,997      branch-misses:u           #    0.02% of all branches          ( +-  0.54% )

          0.475287 +- 0.000236 seconds time elapsed  ( +-  0.05% )

We seem to give up on:

cnd.c:11:21: note: vect_is_simple_use: vectype vector(16) <signed-boolean:1>
cnd.c:11:21: note: vect_is_simple_use: operand indices[i_14], type of def: internal
cnd.c:11:21: note: vect_is_simple_use: vectype vector(16) int
cnd.c:11:21: missed: unsupported masked emulated gather.
cnd.c:9:1: missed: not vectorized: relevant stmt not supported: patt_31 = .MASK_LOAD (_30, 32B, patt_12);
cnd.c:11:21: missed: bad operation or unsupported loop bound.
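For context, the "emulated gather" the vectorizer punts on is the element-wise fallback used when the gather instruction itself is not selected; roughly, per 16-lane strip, it has to do something like the following (a hand-written illustration with made-up names, not what GCC emits):

/* Hypothetical element-wise lowering of one 16-lane masked gather.
   Inactive lanes must still end up well-defined (here: zero), which
   is what makes the masked variant harder than the unconditional one.  */
static void
emulated_masked_gather (float dst[16], const float *base,
                        const int idx[16], unsigned short mask)
{
  for (int j = 0; j < 16; j++)
    dst[j] = ((mask >> j) & 1) ? base[idx[j]] : 0.0f;
}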