https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61722
Bug ID: 61722 Summary: [ 4.9 ] gcc sometimes does not optimise movaps with movups Product: gcc Version: 4.9.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: rtl-optimization Assignee: unassigned at gcc dot gnu.org Reporter: pietrek.j at gmail dot com I have two functions that perform unaligned stores of __m128 values (the movups instruction). The first one is optimized well, but in the second function gcc does not eliminate the unneeded movaps loads inside the while loop. Code: typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); void __test_fill_1( __m128 *dst, __m128 v, int count ) { while ( count-- ) { __builtin_ia32_storeups((float*)(dst++),v); __builtin_ia32_storeups((float*)(dst++),v); } } void __test_fill_2( __m128 *dst, long long _v, int count ) { __m128 v; ((long long*)&v)[0]=((long long*)&v)[1]=_v; while ( count-- ) { __builtin_ia32_storeups((float*)(dst++),v); __builtin_ia32_storeups((float*)(dst++),v); } } Compilation: $ gcc -O3 test_fill.c -o test_fill -S $ cat test_fill .file "test_fill.c" .section .text.unlikely,"ax",@progbits .LCOLDB0: .text .LHOTB0: .p2align 4,,15 .globl __test_fill_1 .type __test_fill_1, @function __test_fill_1: <------------ first function, optimisation works well here .LFB0: .cfi_startproc testl %esi, %esi je .L1 subl $1, %esi leaq 16(%rdi), %rax salq $5, %rsi leaq 48(%rdi,%rsi), %rdx .p2align 4,,10 .p2align 3 .L3: v----------------------- well-optimized while loop movups %xmm0, -16(%rax) addq $32, %rax movups %xmm0, -32(%rax) cmpq %rdx, %rax jne .L3 .L1: rep ret .cfi_endproc .LFE0: .size __test_fill_1, .-__test_fill_1 .section .text.unlikely .LCOLDE0: .text .LHOTE0: .section .text.unlikely .LCOLDB1: .text .LHOTB1: .p2align 4,,15 .globl __test_fill_2 .type __test_fill_2, @function __test_fill_2: <------------ second function, problem with optimizing while loop .LFB1: .cfi_startproc testl %edx, %edx movq %rsi, -16(%rsp) movq %rsi, -24(%rsp) je .L7 subl $1, %edx leaq 16(%rdi), %rax salq $5, %rdx leaq 48(%rdi,%rdx), %rdx 
.p2align 4,,10 .p2align 3 .L9: movaps -24(%rsp), %xmm0 <-------- why movaps here? addq $32, %rax movups %xmm0, -48(%rax) movaps -24(%rsp), %xmm1 <-------- why movaps here? movups %xmm1, -32(%rax) cmpq %rdx, %rax jne .L9 .L7: rep ret .cfi_endproc .LFE1: .size __test_fill_2, .-__test_fill_2 .section .text.unlikely .LCOLDE1: .text .LHOTE1: .ident "GCC: (GNU) 4.9.0 20140604 (prerelease)" .section .note.GNU-stack,"",@progbits