[Bug rtl-optimization/61722] New: [ 4.9 ] gcc sometimes does not optimise movaps with movups

pietrek.j at gmail dot com Sat, 05 Jul 2014 08:58:23 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61722


            Bug ID: 61722
           Summary: [ 4.9 ] gcc sometimes does not optimise movaps with
                    movups
           Product: gcc
           Version: 4.9.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: pietrek.j at gmail dot com

I have two functions that write a __m128 value using unaligned stores (the
movups instruction).
The first one is optimized well, but in the second function gcc does not
eliminate the unneeded movaps loads inside the while loop.
Code:
/* GCC vector type: 16 bytes of float elements, declared with the __may_alias__ attribute. */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/*
 * Reference case: the vector to store is passed directly as the argument v.
 *
 * Each loop iteration issues two __builtin_ia32_storeups stores of v and
 * advances dst by one __m128 after each, so for a non-negative count the
 * function writes 2 * count consecutive vectors starting at dst.
 */
void __test_fill_1( __m128 *dst, __m128 v, int count )
{
    while ( count-- )
       {
           /* Two stores per iteration; dst is post-incremented after each one. */
           __builtin_ia32_storeups((float*)(dst++),v);
           __builtin_ia32_storeups((float*)(dst++),v);
       }
}

/*
 * Problem case: same store loop as __test_fill_1, but the vector is built
 * locally from the integer argument _v instead of being passed in.
 *
 * v is initialised by writing _v into elements [0] and [1] of v viewed as an
 * array of long long (this covers the whole 16-byte vector when long long is
 * 8 bytes, as on the x86-64 target of this report).  The loop then stores v
 * twice per iteration, advancing dst by one __m128 after each store.
 */
void __test_fill_2( __m128 *dst, long long _v, int count )
{
    __m128 v;
    /* Fill both halves of v through a long long* view of its storage. */
    ((long long*)&v)[0]=((long long*)&v)[1]=_v;
    while ( count-- )
       {
           /* v is not modified inside the loop; only dst changes. */
           __builtin_ia32_storeups((float*)(dst++),v);
           __builtin_ia32_storeups((float*)(dst++),v);
       }
}

Compilation:
$ gcc -O3 test_fill.c -o test_fill -S            
$ cat test_fill
        .file   "test_fill.c"
        .section        .text.unlikely,"ax",@progbits
.LCOLDB0:
        .text
.LHOTB0:
        .p2align 4,,15
        .globl  __test_fill_1
        .type   __test_fill_1, @function
__test_fill_1: <------------ first function, optimisation works well here
.LFB0:
        .cfi_startproc
        testl   %esi, %esi
        je      .L1
        subl    $1, %esi
        leaq    16(%rdi), %rax
        salq    $5, %rsi
        leaq    48(%rdi,%rsi), %rdx
        .p2align 4,,10
        .p2align 3
.L3:              v----------------------- well-optimized while loop
        movups  %xmm0, -16(%rax)
        addq    $32, %rax
        movups  %xmm0, -32(%rax)
        cmpq    %rdx, %rax
        jne     .L3
.L1:
        rep ret
        .cfi_endproc
.LFE0:
        .size   __test_fill_1, .-__test_fill_1
        .section        .text.unlikely
.LCOLDE0:
        .text
.LHOTE0:
        .section        .text.unlikely
.LCOLDB1:
        .text
.LHOTB1:
        .p2align 4,,15
        .globl  __test_fill_2
        .type   __test_fill_2, @function
__test_fill_2: <------------ second function, problem with optimizing while loop
.LFB1:
        .cfi_startproc
        testl   %edx, %edx
        movq    %rsi, -16(%rsp)
        movq    %rsi, -24(%rsp)
        je      .L7
        subl    $1, %edx
        leaq    16(%rdi), %rax
        salq    $5, %rdx
        leaq    48(%rdi,%rdx), %rdx
        .p2align 4,,10
        .p2align 3
.L9:
        movaps  -24(%rsp), %xmm0 <-------- why movaps here?
        addq    $32, %rax
        movups  %xmm0, -48(%rax)
        movaps  -24(%rsp), %xmm1 <-------- why movaps here?
        movups  %xmm1, -32(%rax)
        cmpq    %rdx, %rax
        jne     .L9
.L7:
        rep ret
        .cfi_endproc
.LFE1:
        .size   __test_fill_2, .-__test_fill_2
        .section        .text.unlikely
.LCOLDE1:
        .text
.LHOTE1:
        .ident  "GCC: (GNU) 4.9.0 20140604 (prerelease)"
        .section        .note.GNU-stack,"",@progbits

[Bug rtl-optimization/61722] New: [ 4.9 ] gcc sometimes does not optimise movaps with movups

Reply via email to