------- Comment #1 from pinskia at gcc dot gnu dot org 2006-11-18 00:14 ------- The problem here, I think, is unions.
Here is how I would have written this code (without using unions, in fact): void array_sample_fun(__m128 *dst, const __m128 *src, int length) { __m128 af = _mm_set1_ps(1.20f); __m128 bf = _mm_set1_ps(2.88f); __m128 cf = _mm_set1_ps(-2.44f); __m128 df = _mm_set1_ps(4.06f); __m128 ef = _mm_set1_ps(-12.04f); __m128i mask = _mm_set1_epi32(0xff << 23); __m128i bias = _mm_set1_epi32(0x7f << 23); __m128i t; while (length-- != 0) { __m128 vec; vec = (*src++); __m128 arg = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_sub_epi32(_mm_and_si128((__m128i)vec, mask), bias), 23)); vec = (__m128)_mm_or_si128(_mm_andnot_si128(mask, (__m128i)vec), bias); *dst++ = _mm_add_ps(arg, _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps( _mm_mul_ps(af, vec), bf), vec), cf), vec), df), vec), ef)); } } ---------------------- The above gives good results for 32-bit: .L4: movaps (%eax), %xmm0 movdqa %xmm4, %xmm1 addl $1, %ecx addl $16, %eax movdqa %xmm0, %xmm2 pandn %xmm0, %xmm1 movaps .LC0, %xmm0 por %xmm3, %xmm1 pand %xmm4, %xmm2 psubd %xmm3, %xmm2 mulps %xmm1, %xmm0 psrad $23, %xmm2 cvtdq2ps %xmm2, %xmm2 addps .LC1, %xmm0 mulps %xmm1, %xmm0 addps %xmm7, %xmm0 mulps %xmm1, %xmm0 addps %xmm6, %xmm0 mulps %xmm1, %xmm0 addps %xmm5, %xmm0 addps %xmm0, %xmm2 movaps %xmm2, (%edx) addl $16, %edx cmpl %ebx, %ecx jne .L4 While your original example on 32-bit has a store to the stack. So this is just a case where a union causes a missed optimization -- pinskia at gcc dot gnu dot org changed: What |Removed |Added ---------------------------------------------------------------------------- GCC build triplet| x86_64-unknown-linux-gnu |x86_64-unknown-linux-gnu GCC host triplet| x86_64-unknown-linux-gnu |x86_64-unknown-linux-gnu GCC target triplet| x86_64-unknown-linux-gnu |x86_64-unknown-linux-gnu Summary|inefficient/incorrect xmm |union causes inefficient |registers usage |code http://gcc.gnu.org/bugzilla/show_bug.cgi?id=29881