------- Comment #1 from matz at gcc dot gnu dot org  2009-02-25 13:53 -------
For reference, Intel Fortran (11.0) produces three loops, one of which uses
predictive commoning (that version is used when there are only a few
iterations):

..B1.7:                         # Preds ..B1.6
        movsd     8(%r8), %xmm1                                 #13.52
        movsd     (%r8), %xmm0                                  #13.52
                                # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 xmm0 xmm1
..B1.8:                         # Preds ..B1.8 ..B1.7
        movaps    %xmm1, %xmm2                                  #13.33
        movsd     16(%r8,%rdi,8), %xmm3                         #13.52
        addsd     %xmm3, %xmm2                                  #13.33
        addsd     %xmm0, %xmm2                                  #13.41
        movaps    %xmm1, %xmm0                                  #14.7
        movaps    %xmm3, %xmm1                                  #14.7
        addsd     8(%rdx,%rdi,8), %xmm2                         #13.9
        movsd     %xmm2, 8(%rcx,%rdi,8)                         #13.9
        incq      %rdi                                          #14.7
        cmpq      %rax, %rdi                                    #14.7
        jl        ..B1.8        # Prob 82%                      #14.7
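
In C terms, the scalar loop corresponds to roughly the following sketch
(reconstructed from the assembly; the function and variable names are made
up, and the actual Fortran source may look different).  Predictive commoning
carries a(i-1) and a(i) in registers across iterations, so each iteration
issues only one new load from the commoned array:

void stencil (double *restrict c, const double *restrict a,
              const double *restrict b, long n)
{
  double prev = a[0];              /* a(i-1), lives in %xmm0 */
  double cur  = a[1];              /* a(i),   lives in %xmm1 */
  for (long i = 1; i < n; i++)
    {
      double next = a[i + 1];      /* the only load of a() per iteration */
      c[i] = prev + cur + next + b[i];
      prev = cur;                  /* rotate the carried values, i.e. the */
      cur  = next;                 /* movaps pair in the loop above       */
    }
}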

And two others which are vectorized (and unrolled four times, i.e. eight
elements per iteration), but do _not_ use anything like predictive commoning
(i.e. no cross-iteration values).  The two loops are just versions of each
other, one for aligned destinations and the other for unaligned ones.  The
aligned variant is this:

..B1.15:                        # Preds ..B1.10 ..B1.15
        movsd     8(%rdx,%rax,8), %xmm1                         #13.18
        movhpd    16(%rdx,%rax,8), %xmm1                        #13.18
        movsd     8(%r8,%rax,8), %xmm0                          #13.34
        movhpd    16(%r8,%rax,8), %xmm0                         #13.34
        movsd     24(%rdx,%rax,8), %xmm4                        #13.18
        movhpd    32(%rdx,%rax,8), %xmm4                        #13.18
        movsd     24(%r8,%rax,8), %xmm2                         #13.34
        movhpd    32(%r8,%rax,8), %xmm2                         #13.34
        movsd     40(%rdx,%rax,8), %xmm7                        #13.18
        movhpd    48(%rdx,%rax,8), %xmm7                        #13.18
        movsd     40(%r8,%rax,8), %xmm5                         #13.34
        movhpd    48(%r8,%rax,8), %xmm5                         #13.34
        movsd     56(%rdx,%rax,8), %xmm10                       #13.18
        movhpd    64(%rdx,%rax,8), %xmm10                       #13.18
        movsd     56(%r8,%rax,8), %xmm8                         #13.34
        movhpd    64(%r8,%rax,8), %xmm8                         #13.34
        addpd     %xmm0, %xmm1                                  #13.33
        addpd     (%r8,%rax,8), %xmm1                           #13.41
        addpd     %xmm2, %xmm4                                  #13.33
        addpd     %xmm5, %xmm7                                  #13.33
        addpd     %xmm8, %xmm10                                 #13.33
        movaps    16(%r8,%rax,8), %xmm3                         #13.52
        addpd     %xmm3, %xmm1                                  #13.9
        movaps    32(%r8,%rax,8), %xmm6                         #13.52
        movaps    48(%r8,%rax,8), %xmm9                         #13.52
        movaps    %xmm1, 8(%rcx,%rax,8)                         #13.9
        addpd     %xmm3, %xmm4                                  #13.41
        addpd     %xmm6, %xmm4                                  #13.9
        movaps    %xmm4, 24(%rcx,%rax,8)                        #13.9
        addpd     %xmm6, %xmm7                                  #13.41
        addpd     %xmm9, %xmm7                                  #13.9
        movaps    %xmm7, 40(%rcx,%rax,8)                        #13.9
        addpd     %xmm9, %xmm10                                 #13.41
        addpd     64(%r8,%rax,8), %xmm10                        #13.9
        movaps    %xmm10, 56(%rcx,%rax,8)                       #13.9
        addq      $8, %rax                                      #14.7
        cmpq      %r9, %rax                                     #14.7
        jl        ..B1.15       # Prob 82%                      #14.7

Not optimal, as reusing the cross-iteration values would save two loads per
iteration, but still much better than what GCC generates.
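
For illustration, here is a 2-wide SSE2 sketch of what reusing the
cross-iteration values in the vectorized loop could look like (assumed code,
not icc's actual output; the name stencil_sse2, the alignment of a() and c(),
and the even trip count are simplifications).  The overlapping a(i-1..i) and
a(i+1..i+2) operands are built with shufpd from the two carried vectors, so
each step issues just one new load from a():

#include <emmintrin.h>

void stencil_sse2 (double *restrict c, const double *restrict a,
                   const double *restrict b, long n)
{
  __m128d prev = _mm_load_pd (a);          /* a(0), a(1) */
  __m128d cur  = _mm_load_pd (a + 2);      /* a(2), a(3) */
  for (long i = 2; i + 3 < n; i += 2)
    {
      __m128d next  = _mm_load_pd (a + i + 2);       /* only a() load  */
      __m128d left  = _mm_shuffle_pd (prev, cur, 1); /* a(i-1), a(i)   */
      __m128d right = _mm_shuffle_pd (cur, next, 1); /* a(i+1), a(i+2) */
      __m128d sum   = _mm_add_pd (_mm_add_pd (left, cur),
                                  _mm_add_pd (right, _mm_loadu_pd (b + i)));
      _mm_store_pd (c + i, sum);           /* c(i), c(i+1)           */
      prev = cur;                          /* rotate carried vectors */
      cur  = next;
    }
}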


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39300
