> The problem here I think is unions.
> Here is how I would have written this code (without using unions in fact):
> void array_sample_fun(__m128 *dst, const __m128  *src, int length) {
>         __m128 af = _mm_set1_ps(1.20f);
>         __m128 bf = _mm_set1_ps(2.88f);
>         __m128 cf = _mm_set1_ps(-2.44f);
>         __m128 df = _mm_set1_ps(4.06f);
>         __m128 ef = _mm_set1_ps(-12.04f);
>         __m128i mask = _mm_set1_epi32(0xff << 23);
>         __m128i bias = _mm_set1_epi32(0x7f << 23);
>         __m128i t;
>         while (length-- != 0) {
>                 __m128 vec;
>                 vec = (*src++);
>                 __m128 arg =
> _mm_cvtepi32_ps(_mm_srai_epi32(_mm_sub_epi32(_mm_and_si128((__m128i)vec, 
> mask),
> bias),
> 23));
>                 vec = (__m128)_mm_or_si128(_mm_andnot_si128(mask,
> (__m128i)vec), bias);
>                 *dst++ = _mm_add_ps(arg,
> _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(
>                         _mm_mul_ps(af, vec), bf), vec), cf), vec), df),
> vec), ef));
>         }
> }
> ----------------------
> The above gives good results for 32bit:
> .L4:
>         movaps  (%eax), %xmm0
>         movdqa  %xmm4, %xmm1
>         addl    $1, %ecx
>         addl    $16, %eax
>         movdqa  %xmm0, %xmm2
>         pandn   %xmm0, %xmm1
>         movaps  .LC0, %xmm0
>         por     %xmm3, %xmm1
>         pand    %xmm4, %xmm2
>         psubd   %xmm3, %xmm2
>         mulps   %xmm1, %xmm0
>         psrad   $23, %xmm2
>         cvtdq2ps        %xmm2, %xmm2
>         addps   .LC1, %xmm0
>         mulps   %xmm1, %xmm0
>         addps   %xmm7, %xmm0
>         mulps   %xmm1, %xmm0
>         addps   %xmm6, %xmm0
>         mulps   %xmm1, %xmm0
>         addps   %xmm5, %xmm0
>         addps   %xmm0, %xmm2
>         movaps  %xmm2, (%edx)
>         addl    $16, %edx
>         cmpl    %ebx, %ecx
>         jne     .L4
> While your original example on 32 bits has a store to the stack.
> So this is just a case where unions cause a missed optimization.

Thanks for the hint! The assembly looks OK now - I didn't expect that this kind
of casting would work (it doesn't in MSVC compilers). Anyway, it would be nice
if the compiler detected such an optimization opportunity (the union case) as a
valid one.



