On Mon, Feb 04, 2019 at 04:45:24AM -0800, H.J. Lu wrote:
> On Mon, Feb 4, 2019 at 3:19 AM Jakub Jelinek <ja...@redhat.com> wrote:
> >
> > On Sun, Feb 03, 2019 at 08:07:22AM -0800, H.J. Lu wrote:
> > > +  /* If the misalignment of __P > 8, subtract __P by 8 bytes.
> > > +     Otherwise, subtract __P by the misalignment.  */
> > > +  if (offset > 8)
> > > +    offset = 8;
> > > +  __P = (char *) (((__SIZE_TYPE__) __P) - offset);
> > > +
> > > +  /* Zero-extend __A and __N to 128 bits and shift right by the
> > > +     adjustment.  */
> > > +  unsigned __int128 __a128 = ((__v1di) __A)[0];
> > > +  unsigned __int128 __n128 = ((__v1di) __N)[0];
> > > +  __a128 <<= offset * 8;
> > > +  __n128 <<= offset * 8;
> > > +  __A128 = __extension__ (__v2di) { __a128, __a128 >> 64 };
> > > +  __N128 = __extension__ (__v2di) { __n128, __n128 >> 64 };
> >
> > We have _mm_slli_si128/__builtin_ia32_pslldqi128, why can't you use that
> > instead of doing the arithmetics in unsigned __int128 scalars?
> >
> Since "PSLLDQ xmm1, imm8" takes an immediate operand, __int128
> doesn't need a switch statement.
>
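For illustration only (not part of the patch): a minimal freestanding sketch
of the __int128 variant quoted above, written with the public SSE2 intrinsics
_mm_set_epi64x/_mm_maskmoveu_si128 instead of the GCC builtins; the function
name is made up for this sketch.

#include <emmintrin.h>
#include <stdint.h>

static void
maskmove64_int128_sketch (uint64_t data, uint64_t mask, char *p)
{
  /* Back the pointer up by the misalignment (clamped to 8) so the 16-byte
     maskmovdqu window stays within memory that the original 8-byte
     maskmovq store would have been allowed to touch.  */
  uintptr_t offset = (uintptr_t) p & 0xf;
  if (offset > 8)
    offset = 8;
  p -= offset;

  /* Zero-extend to 128 bits and shift left by the same number of bytes.
     No switch statement is needed because the shift count may be a
     variable here, unlike the PSLLDQ immediate.  */
  unsigned __int128 a = data;
  unsigned __int128 n = mask;
  a <<= offset * 8;
  n <<= offset * 8;

  __m128i a128 = _mm_set_epi64x ((long long) (a >> 64), (long long) a);
  __m128i n128 = _mm_set_epi64x ((long long) (n >> 64), (long long) n);
  _mm_maskmoveu_si128 (a128, n128, p);
}
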
This updated patch uses __builtin_ia32_pslldqi128.

H.J.
---
Emulate MMX maskmovq with SSE2 maskmovdqu by zero-extending source and
mask operands to 128 bits.  Handle unmapped bits 64:127 at memory
address by adjusting source and mask operands together with memory
address.

        PR target/89021
        * config/i386/i386.c (ix86_init_mmx_sse_builtins): Don't
        provide __builtin_ia32_maskmovq for TARGET_MMX_WITH_SSE.
        * config/i386/mmx.md (mmx_maskmovq): Add "&& !TARGET_MMX_WITH_SSE".
        (*mmx_maskmovq): Likewise.
        * config/i386/xmmintrin.h: Emulate MMX maskmovq with SSE2
        maskmovdqu.
---
 gcc/config/i386/i386.c      | 15 +++++----
 gcc/config/i386/mmx.md      |  4 +--
 gcc/config/i386/xmmintrin.h | 65 +++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5f4f7e9ddde..b7cbc3f8a2d 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -31048,12 +31048,15 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin_pure (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_stmxcsr",
                     UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);

-  /* SSE or 3DNow!A */
-  def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
-              /* As it uses V4HImode, we have to require -mmmx too.  */
-              | OPTION_MASK_ISA_MMX, 0,
-              "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
-              IX86_BUILTIN_MASKMOVQ);
+  /* SSE or 3DNow!A.  NB: We can't emulate MMX maskmovq directly with
+     SSE2 maskmovdqu since invalid memory access may happen when bits
+     64:127 at memory location are unmapped.  */
+  if (!TARGET_MMX_WITH_SSE)
+    def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
+                /* As it uses V4HImode, we have to require -mmmx too.  */
+                | OPTION_MASK_ISA_MMX, 0,
+                "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
+                IX86_BUILTIN_MASKMOVQ);

   /* SSE2 */
   def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_maskmovdqu",
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index f90574a7255..a1b732ad7be 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1748,7 +1748,7 @@
                      (match_operand:V8QI 2 "register_operand")
                      (match_dup 0)]
                     UNSPEC_MASKMOV))]
-  "TARGET_SSE || TARGET_3DNOW_A")
+  "(TARGET_SSE || TARGET_3DNOW_A) && !TARGET_MMX_WITH_SSE")

 (define_insn "*mmx_maskmovq"
   [(set (mem:V8QI (match_operand:P 0 "register_operand" "D"))
@@ -1756,7 +1756,7 @@
                       (match_operand:V8QI 2 "register_operand" "y")
                       (mem:V8QI (match_dup 0))]
                      UNSPEC_MASKMOV))]
-  "TARGET_SSE || TARGET_3DNOW_A"
+  "(TARGET_SSE || TARGET_3DNOW_A) && !TARGET_MMX_WITH_SSE"
   ;; @@@ check ordering of operands in intel/nonintel syntax
   "maskmovq\t{%2, %1|%1, %2}"
   [(set_attr "type" "mmxcvt")
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index 58284378514..95152f8b337 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1165,7 +1165,72 @@ _m_pshufw (__m64 __A, int const __N)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
 {
+#ifdef __x86_64__
+# ifdef __MMX__
+  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
+# else
+  /* Emulate MMX maskmovq with SSE2 maskmovdqu and handle unmapped bits
+     64:127 at address __P.  */
+  typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+  typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+  /* Zero-extend __A and __N to 128 bits.  */
+  __v2di __A128 = __extension__ (__v2di) { ((__v1di) __A)[0], 0 };
+  __v2di __N128 = __extension__ (__v2di) { ((__v1di) __N)[0], 0 };
+
+  /* Check the alignment of __P.  */
+  __SIZE_TYPE__ offset = ((__SIZE_TYPE__) __P) & 0xf;
+  if (offset)
+    {
+      /* If the misalignment of __P > 8, subtract __P by 8 bytes.
+         Otherwise, subtract __P by the misalignment.  */
+      if (offset > 8)
+        offset = 8;
+      __P = (char *) (((__SIZE_TYPE__) __P) - offset);
+
+      /* Shift __A128 and __N128 to the left by the adjustment.  */
+      switch (offset)
+        {
+        case 1:
+          __A128 = __builtin_ia32_pslldqi128 (__A128, 8);
+          __N128 = __builtin_ia32_pslldqi128 (__N128, 8);
+          break;
+        case 2:
+          __A128 = __builtin_ia32_pslldqi128 (__A128, 2 * 8);
+          __N128 = __builtin_ia32_pslldqi128 (__N128, 2 * 8);
+          break;
+        case 3:
+          __A128 = __builtin_ia32_pslldqi128 (__A128, 3 * 8);
+          __N128 = __builtin_ia32_pslldqi128 (__N128, 3 * 8);
+          break;
+        case 4:
+          __A128 = __builtin_ia32_pslldqi128 (__A128, 4 * 8);
+          __N128 = __builtin_ia32_pslldqi128 (__N128, 4 * 8);
+          break;
+        case 5:
+          __A128 = __builtin_ia32_pslldqi128 (__A128, 5 * 8);
+          __N128 = __builtin_ia32_pslldqi128 (__N128, 5 * 8);
+          break;
+        case 6:
+          __A128 = __builtin_ia32_pslldqi128 (__A128, 6 * 8);
+          __N128 = __builtin_ia32_pslldqi128 (__N128, 6 * 8);
+          break;
+        case 7:
+          __A128 = __builtin_ia32_pslldqi128 (__A128, 7 * 8);
+          __N128 = __builtin_ia32_pslldqi128 (__N128, 7 * 8);
+          break;
+        case 8:
+          __A128 = __builtin_ia32_pslldqi128 (__A128, 8 * 8);
+          __N128 = __builtin_ia32_pslldqi128 (__N128, 8 * 8);
+          break;
+        default:
+          break;
+        }
+    }
+  __builtin_ia32_maskmovdqu ((__v16qi)__A128, (__v16qi)__N128, __P);
+# endif
+#else
   __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
+#endif
 }

 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-- 
2.20.1
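
For reference, an illustrative scenario (not part of this patch or its
testsuite) showing the case the emulation has to guard against: the 8-byte
destination sits at the very end of a mapped page and the following page is
inaccessible, so a plain 16-byte maskmovdqu at the original address could
touch unmapped memory.  The buffer setup below is made up for this sketch.

#include <xmmintrin.h>
#include <sys/mman.h>
#include <unistd.h>

int
main (void)
{
  long pagesize = sysconf (_SC_PAGESIZE);

  /* Map two pages and make the second one inaccessible.  */
  char *buf = mmap (NULL, 2 * pagesize, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (buf == MAP_FAILED)
    return 0;
  mprotect (buf + pagesize, pagesize, PROT_NONE);

  /* Last 8 bytes of the mapped page: bits 64:127 at this address are
     unmapped, which is exactly the situation described above.  */
  char *dst = buf + pagesize - 8;

  __m64 data = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
  __m64 mask = _mm_set_pi8 (0, -1, 0, -1, 0, -1, 0, -1);
  _mm_maskmove_si64 (data, mask, dst);

  munmap (buf, 2 * pagesize);
  return 0;
}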