On Mon, Feb 04, 2019 at 04:45:24AM -0800, H.J. Lu wrote:
> On Mon, Feb 4, 2019 at 3:19 AM Jakub Jelinek <ja...@redhat.com> wrote:
> >
> > On Sun, Feb 03, 2019 at 08:07:22AM -0800, H.J. Lu wrote:
> > > +      /* If the misalignment of __P > 8, subtract __P by 8 bytes.
> > > +      Otherwise, subtract __P by the misalignment.  */
> > > +      if (offset > 8)
> > > +     offset = 8;
> > > +      __P = (char *) (((__SIZE_TYPE__) __P) - offset);
> > > +
> > > +      /* Zero-extend __A and __N to 128 bits and shift left by the
> > > +      adjustment.  */
> > > +      unsigned __int128 __a128 = ((__v1di) __A)[0];
> > > +      unsigned __int128 __n128 = ((__v1di) __N)[0];
> > > +      __a128 <<= offset * 8;
> > > +      __n128 <<= offset * 8;
> > > +      __A128 = __extension__ (__v2di) { __a128, __a128 >> 64 };
> > > +      __N128 = __extension__ (__v2di) { __n128, __n128 >> 64 };
> >
> > We have _mm_slli_si128/__builtin_ia32_pslldqi128, why can't you use that
> > instead of doing the arithmetics in unsigned __int128 scalars?
> >
> 
> Since "PSLLDQ xmm1, imm8" takes an immediate operand,  __int128
> doesn't need a switch statement.
> 

This updated patch uses __builtin_ia32_pslldqi128.
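
For reference, a minimal sketch of the contrast (illustrative only, not part
of the patch; assumes SSE2 is enabled): _mm_slli_si128 and the underlying
__builtin_ia32_pslldqi128 require a compile-time-constant shift count
(PSLLDQ only takes an imm8, and the builtin's count is in bits, hence the
"* 8" in the patch), so a runtime byte offset needs the switch statement,
whereas a scalar unsigned __int128 shift accepts a variable count:

#include <emmintrin.h>

/* OK: the byte count is a compile-time constant, so it can become the
   imm8 operand of PSLLDQ.  */
static __m128i
shift_left_3_bytes (__m128i x)
{
  return _mm_slli_si128 (x, 3);
}

/* A runtime byte count cannot be passed to PSLLDQ directly; a scalar
   unsigned __int128 shift handles it (valid for count < 16).  */
static unsigned __int128
shift_left_bytes (unsigned __int128 x, unsigned int count)
{
  return x << (count * 8);
}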


H.J.
---
Emulate MMX maskmovq with SSE2 maskmovdqu by zero-extending the source and
mask operands to 128 bits.  Handle possibly unmapped bits 64:127 at the
memory address by adjusting the source and mask operands together with the
memory address.
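
As a reference for the adjustment described above, a minimal scalar model
(illustrative only, not part of the patch; the names are made up for the
example): the address is moved down by the misalignment, capped at 8 bytes,
and the 64-bit source and mask are shifted left by the same amount, so every
byte selected by the mask still lands at its original address.

#include <stdint.h>

/* Scalar model of the adjusted store: write byte i of the (shifted)
   source to q[i] when the high bit of byte i of the (shifted) mask is
   set.  Byte j of the original source ends up at p[j], matching
   maskmovq semantics.  */
static void
model_maskmovq (uint64_t a, uint64_t n, unsigned char *p)
{
  uintptr_t off = (uintptr_t) p & 0xf;   /* misalignment of p */
  if (off > 8)
    off = 8;                             /* cap the adjustment at 8 bytes */
  unsigned char *q = p - off;            /* adjusted 16-byte destination */
  unsigned __int128 a128 = a;            /* zero-extend to 128 bits */
  unsigned __int128 n128 = n;
  a128 <<= off * 8;                      /* same left shift as the patch */
  n128 <<= off * 8;
  for (int i = 0; i < 16; i++)
    if ((unsigned char) (n128 >> (i * 8)) & 0x80)
      q[i] = (unsigned char) (a128 >> (i * 8));
}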

        PR target/89021
        * config/i386/i386.c (ix86_init_mmx_sse_builtins): Don't
        provide __builtin_ia32_maskmovq for TARGET_MMX_WITH_SSE.
        * config/i386/mmx.md (mmx_maskmovq): Add "&& !TARGET_MMX_WITH_SSE".
        (*mmx_maskmovq): Likewise.
        * config/i386/xmmintrin.h: Emulate MMX maskmovq with SSE2
        maskmovdqu.
---
 gcc/config/i386/i386.c      | 15 +++++----
 gcc/config/i386/mmx.md      |  4 +--
 gcc/config/i386/xmmintrin.h | 65 +++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5f4f7e9ddde..b7cbc3f8a2d 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -31048,12 +31048,15 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin_pure (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_stmxcsr",
                    UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
 
-  /* SSE or 3DNow!A */
-  def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
-              /* As it uses V4HImode, we have to require -mmmx too.  */
-              | OPTION_MASK_ISA_MMX, 0,
-              "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
-              IX86_BUILTIN_MASKMOVQ);
+  /* SSE or 3DNow!A.  NB: We can't emulate MMX maskmovq directly with
+     SSE2 maskmovdqu since an invalid memory access may happen when
+     bits 64:127 at the memory location are unmapped.  */
+  if (!TARGET_MMX_WITH_SSE)
+    def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
+                /* As it uses V4HImode, we have to require -mmmx too.  */
+                | OPTION_MASK_ISA_MMX, 0,
+                "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
+                IX86_BUILTIN_MASKMOVQ);
 
   /* SSE2 */
   def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_maskmovdqu",
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index f90574a7255..a1b732ad7be 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1748,7 +1748,7 @@
                      (match_operand:V8QI 2 "register_operand")
                      (match_dup 0)]
                     UNSPEC_MASKMOV))]
-  "TARGET_SSE || TARGET_3DNOW_A")
+  "(TARGET_SSE || TARGET_3DNOW_A) && !TARGET_MMX_WITH_SSE")
 
 (define_insn "*mmx_maskmovq"
   [(set (mem:V8QI (match_operand:P 0 "register_operand" "D"))
@@ -1756,7 +1756,7 @@
                      (match_operand:V8QI 2 "register_operand" "y")
                      (mem:V8QI (match_dup 0))]
                     UNSPEC_MASKMOV))]
-  "TARGET_SSE || TARGET_3DNOW_A"
+  "(TARGET_SSE || TARGET_3DNOW_A) && !TARGET_MMX_WITH_SSE"
   ;; @@@ check ordering of operands in intel/nonintel syntax
   "maskmovq\t{%2, %1|%1, %2}"
   [(set_attr "type" "mmxcvt")
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index 58284378514..95152f8b337 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1165,7 +1165,72 @@ _m_pshufw (__m64 __A, int const __N)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
 {
+#ifdef __x86_64__
+# ifdef __MMX__
+  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
+# else
+  /* Emulate MMX maskmovq with SSE2 maskmovdqu and handle unmapped bits
+     64:127 at address __P.  */
+  typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+  typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+  /* Zero-extend __A and __N to 128 bits.  */
+  __v2di __A128 = __extension__ (__v2di) { ((__v1di) __A)[0], 0 };
+  __v2di __N128 = __extension__ (__v2di) { ((__v1di) __N)[0], 0 };
+
+  /* Check the alignment of __P.  */
+  __SIZE_TYPE__ offset = ((__SIZE_TYPE__) __P) & 0xf;
+  if (offset)
+    {
+      /* If the misalignment of __P is greater than 8, subtract 8
+        bytes from __P.  Otherwise, subtract the misalignment.  */
+      if (offset > 8)
+       offset = 8;
+      __P = (char *) (((__SIZE_TYPE__) __P) - offset);
+
+      /* Shift __A128 and __N128 to the left by the adjustment.  */
+      switch (offset)
+       {
+       case 1:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 8);
+         break;
+       case 2:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 2 * 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 2 * 8);
+         break;
+       case 3:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 3 * 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 3 * 8);
+         break;
+       case 4:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 4 * 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 4 * 8);
+         break;
+       case 5:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 5 * 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 5 * 8);
+         break;
+       case 6:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 6 * 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 6 * 8);
+         break;
+       case 7:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 7 * 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 7 * 8);
+         break;
+       case 8:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 8 * 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 8 * 8);
+         break;
+       default:
+         break;
+       }
+    }
+  __builtin_ia32_maskmovdqu ((__v16qi)__A128, (__v16qi)__N128, __P);
+# endif
+#else
   __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
+#endif
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-- 
2.20.1
