On Mon, Feb 11, 2019 at 11:55 PM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> Emulate MMX maskmovq with SSE2 maskmovdqu in 64-bit mode by zero-extending
> source and mask operands to 128 bits.  Handle unmapped bits 64:127 at
> memory address by adjusting source and mask operands together with memory
> address.
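(Not part of the patch, just to make the address adjustment above concrete:
for a store to, say, __P == 0x100b, the misalignment is 11 > 8, so __P is
moved back by 8 bytes to 0x1003 and the source and mask are shifted left by
8 bytes; the data still lands at 0x100b..0x1012, while the 16-byte
maskmovdqu access covers 0x1003..0x1012 and touches no page that the
original 8-byte store would not have touched.  For a nonzero misalignment
of at most 8, __P is simply aligned down to the 16-byte boundary, so the
whole access stays within one 16-byte block and therefore one page.)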
>
>         PR target/89021
>         * config/i386/xmmintrin.h: Emulate MMX maskmovq with SSE2
>         maskmovdqu in 64-bit mode.
> ---
>  gcc/config/i386/xmmintrin.h | 61 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 61 insertions(+)
>
> diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
> index 58284378514..e797795f127 100644
> --- a/gcc/config/i386/xmmintrin.h
> +++ b/gcc/config/i386/xmmintrin.h
> @@ -1165,7 +1165,68 @@ _m_pshufw (__m64 __A, int const __N)
>  extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
>  {
> +#ifdef __x86_64__

We need an __MMX_WITH_SSE__ target macro defined by the compiler here.

Uros.
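
I.e., presumably something along these lines once the compiler provides it
(the exact macro spelling is taken from the comment above and is still an
assumption at this point):

-#ifdef __x86_64__
+#ifdef __MMX_WITH_SSE__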

> +  /* Emulate MMX maskmovq with SSE2 maskmovdqu and handle unmapped bits
> +     64:127 at address __P.  */
> +  typedef long long __v2di __attribute__ ((__vector_size__ (16)));
> +  typedef char __v16qi __attribute__ ((__vector_size__ (16)));
> +  /* Zero-extend __A and __N to 128 bits.  */
> +  __v2di __A128 = __extension__ (__v2di) { ((__v1di) __A)[0], 0 };
> +  __v2di __N128 = __extension__ (__v2di) { ((__v1di) __N)[0], 0 };
> +
> +  /* Check the alignment of __P.  */
> +  __SIZE_TYPE__ offset = ((__SIZE_TYPE__) __P) & 0xf;
> +  if (offset)
> +    {
> +      /* If the misalignment of __P is greater than 8, move __P back by
> +        8 bytes.  Otherwise, move __P back by the misalignment.  */
> +      if (offset > 8)
> +       offset = 8;
> +      __P = (char *) (((__SIZE_TYPE__) __P) - offset);
> +
> +      /* Shift __A128 and __N128 to the left by the adjustment.  */
> +      switch (offset)
> +       {
> +       case 1:
> +         __A128 = __builtin_ia32_pslldqi128 (__A128, 8);
> +         __N128 = __builtin_ia32_pslldqi128 (__N128, 8);
> +         break;
> +       case 2:
> +         __A128 = __builtin_ia32_pslldqi128 (__A128, 2 * 8);
> +         __N128 = __builtin_ia32_pslldqi128 (__N128, 2 * 8);
> +         break;
> +       case 3:
> +         __A128 = __builtin_ia32_pslldqi128 (__A128, 3 * 8);
> +         __N128 = __builtin_ia32_pslldqi128 (__N128, 3 * 8);
> +         break;
> +       case 4:
> +         __A128 = __builtin_ia32_pslldqi128 (__A128, 4 * 8);
> +         __N128 = __builtin_ia32_pslldqi128 (__N128, 4 * 8);
> +         break;
> +       case 5:
> +         __A128 = __builtin_ia32_pslldqi128 (__A128, 5 * 8);
> +         __N128 = __builtin_ia32_pslldqi128 (__N128, 5 * 8);
> +         break;
> +       case 6:
> +         __A128 = __builtin_ia32_pslldqi128 (__A128, 6 * 8);
> +         __N128 = __builtin_ia32_pslldqi128 (__N128, 6 * 8);
> +         break;
> +       case 7:
> +         __A128 = __builtin_ia32_pslldqi128 (__A128, 7 * 8);
> +         __N128 = __builtin_ia32_pslldqi128 (__N128, 7 * 8);
> +         break;
> +       case 8:
> +         __A128 = __builtin_ia32_pslldqi128 (__A128, 8 * 8);
> +         __N128 = __builtin_ia32_pslldqi128 (__N128, 8 * 8);
> +         break;
> +       default:
> +         break;
> +       }
> +    }
> +  __builtin_ia32_maskmovdqu ((__v16qi)__A128, (__v16qi)__N128, __P);
> +#else
>    __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
> +#endif
>  }
>
>  extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> --
> 2.20.1
>
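
For anyone who wants to exercise both code paths, here is a small,
self-contained test of the intrinsic's expected behaviour (the buffer
contents and mask values are made up for illustration; build with -msse on
ia32, and on x86-64 the emulated path from the patch above is taken):

#include <stdio.h>
#include <xmmintrin.h>

int
main (void)
{
  char buf[9] = "........";   /* 8 data bytes plus a terminating NUL.  */
  __m64 src  = _mm_set_pi8 ('H', 'G', 'F', 'E', 'D', 'C', 'B', 'A');
  __m64 mask = _mm_set_pi8 (0, (char) 0x80, 0, (char) 0x80,
                            0, (char) 0x80, 0, (char) 0x80);

  /* Bytes of src whose mask byte has its high bit set (bytes 0, 2, 4
     and 6 here) are stored to buf; the other bytes of buf are left
     untouched.  */
  _mm_maskmove_si64 (src, mask, buf);
  _mm_empty ();

  printf ("%s\n", buf);   /* expected output: A.C.E.G. */
  return 0;
}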
