http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55155
Bug #: 55155
Summary: Autovectorization does not use unaligned loads/stores
Classification: Unclassified
Product: gcc
Version: 4.7.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassig...@gcc.gnu.org
ReportedBy: sgunder...@bigfoot.com

Hi,

I am on gcc version 4.7.1 (Debian 4.7.1-7) and a project of mine had code that looked like this:

beklager:~> cat example.cpp
void func(float * __restrict prod_features, float * __restrict grad_prod_features, float alpha, unsigned num_prods) {
        float *pf = (float *)__builtin_assume_aligned(prod_features, 16);
        float *gpf = (float *)__builtin_assume_aligned(grad_prod_features, 16);
        for (unsigned i = 0; i < num_prods * 16; ++i) {
                prod_features[i] -= alpha * grad_prod_features[i];
                //pf[i] -= alpha * gpf[i];
        }
}

This would seem like a great case for autovectorization, so I tried:

beklager:~> g++ -Wall -O2 -ftree-vectorize -msse4.1 -c example.cpp
example.cpp: In function ‘void func(float*, float*, float, unsigned int)’:
example.cpp:2:9: warning: unused variable ‘pf’ [-Wunused-variable]
example.cpp:3:9: warning: unused variable ‘gpf’ [-Wunused-variable]

The resulting code, however, is a train wreck:

beklager:~> objdump --disassemble --demangle example.o

example.o:     file format elf64-x86-64

Disassembly of section .text:

0000000000000000 <func(float*, float*, float, unsigned int)>:
   0:   55                      push   %rbp
   1:   c1 e2 04                shl    $0x4,%edx
   4:   85 d2                   test   %edx,%edx
   6:   53                      push   %rbx
   7:   0f 84 ef 00 00 00       je     fc <func(float*, float*, float, unsigned int)+0xfc>
   d:   49 89 f8                mov    %rdi,%r8
  10:   41 83 e0 0f             and    $0xf,%r8d
  14:   49 c1 e8 02             shr    $0x2,%r8
  18:   49 f7 d8                neg    %r8
  1b:   41 83 e0 03             and    $0x3,%r8d
  1f:   44 39 c2                cmp    %r8d,%edx
  22:   44 0f 42 c2             cmovb  %edx,%r8d
  26:   83 fa 04                cmp    $0x4,%edx
  29:   0f 87 d0 00 00 00       ja     ff <func(float*, float*, float, unsigned int)+0xff>
  2f:   41 89 d0                mov    %edx,%r8d
  32:   31 c0                   xor    %eax,%eax
  34:   0f 1f 40 00             nopl   0x0(%rax)
  38:   f3 0f 10 14 86          movss  (%rsi,%rax,4),%xmm2
  3d:   8d 48 01                lea    0x1(%rax),%ecx
  40:   f3 0f 59 d0             mulss  %xmm0,%xmm2
  44:   f3 0f 10 0c 87          movss  (%rdi,%rax,4),%xmm1
  49:   f3 0f 5c ca             subss  %xmm2,%xmm1
  4d:   f3 0f 11 0c 87          movss  %xmm1,(%rdi,%rax,4)
  52:   48 83 c0 01             add    $0x1,%rax
  56:   41 39 c0                cmp    %eax,%r8d
  59:   77 dd                   ja     38 <func(float*, float*, float, unsigned int)+0x38>
  5b:   44 39 c2                cmp    %r8d,%edx
  5e:   0f 84 98 00 00 00       je     fc <func(float*, float*, float, unsigned int)+0xfc>
  64:   89 d5                   mov    %edx,%ebp
  66:   45 89 c1                mov    %r8d,%r9d
  69:   44 29 c5                sub    %r8d,%ebp
  6c:   41 89 eb                mov    %ebp,%r11d
  6f:   41 c1 eb 02             shr    $0x2,%r11d
  73:   42 8d 1c 9d 00 00 00    lea    0x0(,%r11,4),%ebx
  7a:   00
  7b:   85 db                   test   %ebx,%ebx
  7d:   74 59                   je     d8 <func(float*, float*, float, unsigned int)+0xd8>
  7f:   0f 28 c8                movaps %xmm0,%xmm1
  82:   49 c1 e1 02             shl    $0x2,%r9
  86:   0f 57 db                xorps  %xmm3,%xmm3
  89:   4e 8d 14 0f             lea    (%rdi,%r9,1),%r10
  8d:   0f c6 c9 00             shufps $0x0,%xmm1,%xmm1
  91:   49 01 f1                add    %rsi,%r9
  94:   31 c0                   xor    %eax,%eax
  96:   45 31 c0                xor    %r8d,%r8d
  99:   0f 28 e1                movaps %xmm1,%xmm4
  9c:   0f 1f 40 00             nopl   0x0(%rax)
  a0:   0f 28 cb                movaps %xmm3,%xmm1
  a3:   41 83 c0 01             add    $0x1,%r8d
  a7:   41 0f 28 14 02          movaps (%r10,%rax,1),%xmm2
  ac:   41 0f 12 0c 01          movlps (%r9,%rax,1),%xmm1
  b1:   41 0f 16 4c 01 08       movhps 0x8(%r9,%rax,1),%xmm1
  b7:   0f 59 cc                mulps  %xmm4,%xmm1
  ba:   0f 5c d1                subps  %xmm1,%xmm2
  bd:   41 0f 29 14 02          movaps %xmm2,(%r10,%rax,1)
  c2:   48 83 c0 10             add    $0x10,%rax
  c6:   45 39 d8                cmp    %r11d,%r8d
  c9:   72 d5                   jb     a0 <func(float*, float*, float, unsigned int)+0xa0>
  cb:   01 d9                   add    %ebx,%ecx
  cd:   39 dd                   cmp    %ebx,%ebp
  cf:   74 2b                   je     fc <func(float*, float*, float, unsigned int)+0xfc>
  d1:   0f 1f 80 00 00 00 00    nopl   0x0(%rax)
  d8:   41 89 c8                mov    %ecx,%r8d
  db:   83 c1 01                add    $0x1,%ecx
  de:   f3 42 0f 10 14 86       movss  (%rsi,%r8,4),%xmm2
  e4:   4a 8d 04 87             lea    (%rdi,%r8,4),%rax
  e8:   39 ca                   cmp    %ecx,%edx
  ea:   f3 0f 59 d0             mulss  %xmm0,%xmm2
  ee:   f3 0f 10 08             movss  (%rax),%xmm1
  f2:   f3 0f 5c ca             subss  %xmm2,%xmm1
  f6:   f3 0f 11 08             movss  %xmm1,(%rax)
  fa:   77 dc                   ja     d8 <func(float*, float*, float, unsigned int)+0xd8>
  fc:   5b                      pop    %rbx
  fd:   5d                      pop    %rbp
  fe:   c3                      retq
  ff:   45 85 c0                test   %r8d,%r8d
 102:   0f 85 2a ff ff ff       jne    32 <func(float*, float*, float, unsigned int)+0x32>
 108:   31 c9                   xor    %ecx,%ecx
 10a:   e9 55 ff ff ff          jmpq   64 <func(float*, float*, float, unsigned int)+0x64>

There are two potential issues here:

1. It knows that my two arrays are not necessarily 16-byte aligned, so it emits a huge body of code around the loop. (If I comment out the line in the inner loop and uncomment the one next to it, much of this code disappears.) It should simply write the loop using unaligned loads/stores (movups) instead of trying to piece together the packed values with movlps and movhps itself; a hand-written sketch of such a loop is appended at the end of this report.

2. For some reason, it doesn't understand that (num_prods * 16) is divisible by four, so it has extra code to handle the leftover elements.

If I change num_prods to a constant (e.g. 64), and use the variables that are assumed to be 16-aligned, the output is much more sane:

beklager:~> cat example.cpp
void func(float * __restrict prod_features, float * __restrict grad_prod_features, float alpha, unsigned num_prods) {
        float *pf = (float *)__builtin_assume_aligned(prod_features, 16);
        float *gpf = (float *)__builtin_assume_aligned(grad_prod_features, 16);
        for (unsigned i = 0; i < 64 * 16; ++i) {
                //prod_features[i] -= alpha * grad_prod_features[i];
                pf[i] -= alpha * gpf[i];
        }
}
beklager:~> g++ -Wall -O2 -ftree-vectorize -msse4.1 -c example.cpp
beklager:~> objdump --disassemble --demangle example.o

example.o:     file format elf64-x86-64

Disassembly of section .text:

0000000000000000 <func(float*, float*, float, unsigned int)>:
   0:   0f 28 c8                movaps %xmm0,%xmm1
   3:   31 c0                   xor    %eax,%eax
   5:   0f c6 c9 00             shufps $0x0,%xmm1,%xmm1
   9:   0f 28 d1                movaps %xmm1,%xmm2
   c:   0f 1f 40 00             nopl   0x0(%rax)
  10:   0f 28 0c 06             movaps (%rsi,%rax,1),%xmm1
  14:   0f 59 ca                mulps  %xmm2,%xmm1
  17:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  1b:   0f 5c c1                subps  %xmm1,%xmm0
  1e:   0f 29 04 07             movaps %xmm0,(%rdi,%rax,1)
  22:   48 83 c0 10             add    $0x10,%rax
  26:   48 3d 00 10 00 00       cmp    $0x1000,%rax
  2c:   75 e2                   jne    10 <func(float*, float*, float, unsigned int)+0x10>
  2e:   f3 c3                   repz retq

although in this case, one could argue that it should have fused the movaps+subps+movaps into a single subps from memory.
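For reference, the movups-based loop that issue 1 asks for is easy to write by hand with SSE intrinsics. The sketch below is illustrative only (the function name func_unaligned_sketch is made up for this example, not part of the original code); it assumes num_prods * 16 is always a multiple of 4, which it is by construction, so no scalar prologue or epilogue is needed:

#include <xmmintrin.h>

// Illustrative only: roughly what the vectorized loop could look like when
// the pointers are not known to be 16-byte aligned. num_prods * 16 is always
// a multiple of 4, so there is no scalar remainder loop.
void func_unaligned_sketch(float * __restrict prod_features,
                           float * __restrict grad_prod_features,
                           float alpha, unsigned num_prods)
{
        __m128 valpha = _mm_set1_ps(alpha);            // broadcast alpha into all four lanes
        for (unsigned i = 0; i < num_prods * 16; i += 4) {
                __m128 gpf = _mm_loadu_ps(grad_prod_features + i);  // movups
                __m128 pf  = _mm_loadu_ps(prod_features + i);       // movups
                pf = _mm_sub_ps(pf, _mm_mul_ps(valpha, gpf));       // mulps + subps
                _mm_storeu_ps(prod_features + i, pf);                // movups
        }
}

The _mm_loadu_ps/_mm_storeu_ps calls correspond to the movups instructions that issue 1 suggests the vectorizer should emit, instead of stitching each vector together with a movlps/movhps pair.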