------- Comment #2 from spd at poczta dot fm 2006-11-18 10:25 -------
(In reply to comment #1)
> The problem here I think is unions.
>
> Here is how I would have written this code (without using unions in fact):
>
> #include <emmintrin.h>
>
> void array_sample_fun(__m128 *dst, const __m128 *src, int length) {
>   /* Polynomial coefficients. */
>   __m128 af = _mm_set1_ps(1.20f);
>   __m128 bf = _mm_set1_ps(2.88f);
>   __m128 cf = _mm_set1_ps(-2.44f);
>   __m128 df = _mm_set1_ps(4.06f);
>   __m128 ef = _mm_set1_ps(-12.04f);
>
>   __m128i mask = _mm_set1_epi32(0xff << 23); /* exponent field */
>   __m128i bias = _mm_set1_epi32(0x7f << 23); /* biased exponent of 1.0f */
>   __m128i t;
>
>   while (length-- != 0) {
>     __m128 vec = *src++;
>
>     /* Extract the unbiased exponent as a float.  The (__m128i) cast is a
>        GCC vector cast: a bit-for-bit reinterpretation, no conversion. */
>     __m128 arg = _mm_cvtepi32_ps(
>         _mm_srai_epi32(
>             _mm_sub_epi32(_mm_and_si128((__m128i)vec, mask), bias),
>             23));
>
>     /* Replace the exponent field with the bias, scaling vec into [1, 2). */
>     vec = (__m128)_mm_or_si128(_mm_andnot_si128(mask, (__m128i)vec), bias);
>
>     /* Horner evaluation of (((af*v + bf)*v + cf)*v + df)*v + ef,
>        then add the exponent back in. */
>     *dst++ = _mm_add_ps(arg,
>         _mm_add_ps(_mm_mul_ps(
>             _mm_add_ps(_mm_mul_ps(
>                 _mm_add_ps(_mm_mul_ps(
>                     _mm_add_ps(_mm_mul_ps(af, vec), bf),
>                     vec), cf),
>                 vec), df),
>             vec), ef));
>   }
> }
>
>
> ----------------------
> The above gives good results for 32-bit:
> .L4:
> movaps (%eax), %xmm0
> movdqa %xmm4, %xmm1
> addl $1, %ecx
> addl $16, %eax
> movdqa %xmm0, %xmm2
> pandn %xmm0, %xmm1
> movaps .LC0, %xmm0
> por %xmm3, %xmm1
> pand %xmm4, %xmm2
> psubd %xmm3, %xmm2
> mulps %xmm1, %xmm0
> psrad $23, %xmm2
> cvtdq2ps %xmm2, %xmm2
> addps .LC1, %xmm0
> mulps %xmm1, %xmm0
> addps %xmm7, %xmm0
> mulps %xmm1, %xmm0
> addps %xmm6, %xmm0
> mulps %xmm1, %xmm0
> addps %xmm5, %xmm0
> addps %xmm0, %xmm2
> movaps %xmm2, (%edx)
> addl $16, %edx
> cmpl %ebx, %ecx
> jne .L4
>
>
> Your original example, by contrast, has a store to the stack in the 32-bit
> case.
>
> So this is just a case where unions cause a missed optimization.
>
Thanks for the hint! The assembly looks OK now - I didn't expect this kind of
casting to work (it doesn't in the MSVC compilers). Anyway, it would be nice
if the compiler detected such an optimization opportunity (the union case) as
a valid one.
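
For reference, a portable way to express the reinterpretation is through the
_mm_castps_si128 / _mm_castsi128_ps intrinsics, which both GCC and MSVC
accept (assuming a compiler recent enough to provide them). The following is
only an illustrative sketch, not code from this report; the helper names are
made up:

----------------------
#include <emmintrin.h>

/* Bit-for-bit reinterpretation between float and integer vectors;
   these cast intrinsics generate no instructions. */
static inline __m128i as_si128(__m128 v) { return _mm_castps_si128(v); }
static inline __m128  as_ps(__m128i v)   { return _mm_castsi128_ps(v); }

/* Hypothetical helper showing how the two casts in the loop body above
   would look when written with the cast intrinsics instead of the
   (__m128i)/(__m128) vector casts. */
static inline __m128 split_exponent(__m128 vec, __m128i mask, __m128i bias,
                                    __m128 *exponent)
{
  /* Unbiased exponent as a float vector. */
  *exponent = _mm_cvtepi32_ps(
      _mm_srai_epi32(_mm_sub_epi32(_mm_and_si128(as_si128(vec), mask), bias),
                     23));
  /* Mantissa rescaled into [1.0, 2.0). */
  return as_ps(_mm_or_si128(_mm_andnot_si128(mask, as_si128(vec)), bias));
}

/* For comparison, a rough reconstruction of the union-based pattern (the
   exact original code is not repeated here): bitwise the same result, but
   this is the form that currently ends up with a store to the stack. */
typedef union {
  __m128  ps;
  __m128i si;
} m128_bits;

static inline __m128i as_si128_union(__m128 v)
{
  m128_bits u;
  u.ps = v;
  return u.si;
}
----------------------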
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=29881