The following two functions are equivalent: each adds a scalar to a vector,
with the loop manually unrolled by a factor of 8 (two SSE vectors per iteration).
The first function performs the two load/add/store sequences one after the
other, while the second function interleaves the instructions of the two
sequences:
/*
 * Add the scalar f to each element of in and store the sums in out.
 *
 * Each iteration handles 8 floats as two SSE vectors, and completes the
 * first load/add/store sequence before starting the second one. That
 * serialized statement order is intentional and must be preserved.
 *
 * out, in: accessed with _mm_store_ps / _mm_load_ps, which are the aligned
 *          SSE accessors, so both pointers must be 16-byte aligned.
 * f:       value added to every processed element.
 * n:       element count. Only the first (n / 8) * 8 elements are processed;
 *          when n < 8 nothing is read or written.
 */
void bench_3(float * out, float * in, float f, unsigned int n)
{
    n /= 8;
    /*
     * Guard the do/while below: with zero blocks the body would still run
     * once and --n would wrap around to UINT_MAX, overrunning both buffers.
     */
    if (n == 0) {
        return;
    }

    __m128 scalar = _mm_set_ps1(f);
    do
    {
        /* first vector: load, add, store */
        __m128 arg = _mm_load_ps(in);
        __m128 result = _mm_add_ps(arg, scalar);
        _mm_store_ps(out, result);

        /* second vector: same sequence, reusing the same variables */
        arg = _mm_load_ps(in+4);
        result = _mm_add_ps(arg, scalar);
        _mm_store_ps(out+4, result);

        in += 8;
        out += 8;
    }
    while (--n);
}
with the generated code:
.L13:
movaps (%rsi,%rax), %xmm0
addps %xmm1, %xmm0
movaps %xmm0, (%rdi,%rax)
movaps 16(%rsi,%rax), %xmm0
addps %xmm1, %xmm0
movaps %xmm0, 16(%rdi,%rax)
addq $32, %rax
cmpq %rdx, %rax
jne .L13
/*
 * Add the scalar f to each element of in and store the sums in out.
 *
 * Each iteration handles 8 floats as two SSE vectors, with the two
 * load/add/store sequences interleaved: both loads first, then both adds,
 * then both stores, using separate variables for each vector. That
 * interleaved statement order is intentional and must be preserved.
 *
 * out, in: accessed with _mm_store_ps / _mm_load_ps, which are the aligned
 *          SSE accessors, so both pointers must be 16-byte aligned.
 * f:       value added to every processed element.
 * n:       element count. Only the first (n / 8) * 8 elements are processed;
 *          when n < 8 nothing is read or written.
 */
void bench_4(float * out, float * in, float f, unsigned int n)
{
    n /= 8;
    /*
     * Guard the do/while below: with zero blocks the body would still run
     * once and --n would wrap around to UINT_MAX, overrunning both buffers.
     */
    if (n == 0) {
        return;
    }

    __m128 scalar = _mm_set_ps1(f);
    do
    {
        /* both loads */
        __m128 arg = _mm_load_ps(in);
        __m128 arg2 = _mm_load_ps(in+4);

        /* both adds; the two results do not depend on each other */
        __m128 result = _mm_add_ps(arg, scalar);
        __m128 result2 = _mm_add_ps(arg2, scalar);

        /* both stores */
        _mm_store_ps(out, result);
        _mm_store_ps(out+4, result2);

        in += 8;
        out += 8;
    }
    while (--n);
}
generated code:
.L9:
movaps (%rsi,%rax), %xmm0
movaps 16(%rsi,%rax), %xmm1
addps %xmm2, %xmm0
addps %xmm2, %xmm1
movaps %xmm0, (%rdi,%rax)
movaps %xmm1, 16(%rdi,%rax)
addq $32, %rax
cmpq %rdx, %rax
jne .L9
The interleaved code outperforms the sequential code by about 12% on
x86_64/core2, possibly because the two instruction groups (load/add/store)
don't have any data dependencies on each other.
It would be nice if gcc could do register renaming and instruction
reordering on the first function to generate the same instructions as for the
second function.
--
Summary: missed optimization: register renaming in unrolled loop
Product: gcc
Version: 4.4.0
Status: UNCONFIRMED
Severity: enhancement
Priority: P3
Component: target
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: tim at klingt dot org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38825