Since the characters we are searching for (CR = 13, LF = 10, '\' = 92,
'?' = 63) all have distinct ASCII codes modulo 16, a single PSHUFB table
lookup can match all of them at once.
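
For illustration only (not part of the patch), here is a minimal
standalone sketch of the trick written with the <tmmintrin.h> intrinsics;
the helper name and test input are made up for the example.  Each target
character C is stored at slot C % 16 of a 16-byte table; slot 0 holds 1 so
a NUL input byte cannot compare equal, and bytes with the high bit set
shuffle to 0, so comparing the shuffled result against the input flags
exactly the four target characters.

/* Sketch of PSHUFB-based matching of '\n', '\r', '\\', '?'.
   Compile with -mssse3.  */
#include <tmmintrin.h>
#include <stdio.h>

/* Return the index of the first target character in the 16 bytes at P,
   or 16 if none is present.  */
static int
match16 (const unsigned char *p)
{
  /* Each target character C sits at slot C % 16; slot 0 holds 1 so a
     NUL input byte never compares equal.  */
  const __m128i lut = _mm_setr_epi8 (1, 0, 0, 0, 0, 0, 0, 0,
                                     0, 0, '\n', 0, '\\', '\r', 0, '?');
  __m128i data = _mm_loadu_si128 ((const __m128i *) p);
  /* PSHUFB yields lut[data[i] & 15] for bytes < 0x80 and 0 otherwise,
     so the result equals the input byte exactly for the targets.  */
  __m128i t = _mm_shuffle_epi8 (lut, data);
  int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (t, data));
  return mask ? __builtin_ctz (mask) : 16;
}

int
main (void)
{
  const char buf[16] = "int x = 1;\n  y;";
  printf ("%d\n", match16 ((const unsigned char *) buf));  /* prints 10 */
  return 0;
}

The patch proper open-codes the same operations via the GCC builtins
(__builtin_ia32_pshufb128, __builtin_ia32_pmovmskb128) and unrolls the
loop over three 16-byte chunks per iteration.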

libcpp/ChangeLog:

        * lex.cc (search_line_sse42): Replace with...
        (search_line_ssse3): ... this new function.  Adjust the use...
        (init_vectorized_lexer): ... here.
---
 libcpp/lex.cc | 118 ++++++++++++++++++++------------------------------
 1 file changed, 46 insertions(+), 72 deletions(-)

diff --git a/libcpp/lex.cc b/libcpp/lex.cc
index fa9c03614c..815b8abd29 100644
--- a/libcpp/lex.cc
+++ b/libcpp/lex.cc
@@ -345,84 +345,58 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 }
 
 #ifdef HAVE_AVX2
-/* A version of the fast scanner using SSE 4.2 vectorized string insns.  */
+/* A version of the fast scanner using SSSE3 shuffle (PSHUFB) insns.  */
 
 static const uchar *
-#ifndef __SSE4_2__
-__attribute__((__target__("sse4.2")))
+#ifndef __SSSE3__
+__attribute__((__target__("ssse3")))
 #endif
-search_line_sse42 (const uchar *s, const uchar *end)
+search_line_ssse3 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 {
   typedef char v16qi __attribute__ ((__vector_size__ (16)));
-  static const v16qi search = { '\n', '\r', '?', '\\' };
-
-  uintptr_t si = (uintptr_t)s;
-  uintptr_t index;
-
-  /* Check for unaligned input.  */
-  if (si & 15)
-    {
-      v16qi sv;
-
-      if (__builtin_expect (end - s < 16, 0)
-         && __builtin_expect ((si & 0xfff) > 0xff0, 0))
-       {
-         /* There are less than 16 bytes left in the buffer, and less
-            than 16 bytes left on the page.  Reading 16 bytes at this
-            point might generate a spurious page fault.  Defer to the
-            SSE2 implementation, which already handles alignment.  */
-         return search_line_sse2 (s, end);
-       }
-
-      /* ??? The builtin doesn't understand that the PCMPESTRI read from
-        memory need not be aligned.  */
-      sv = __builtin_ia32_loaddqu ((const char *) s);
-      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
-
-      if (__builtin_expect (index < 16, 0))
-       goto found;
-
-      /* Advance the pointer to an aligned address.  We will re-scan a
-        few bytes, but we no longer need care for reading past the
-        end of a page, since we're guaranteed a match.  */
-      s = (const uchar *)((si + 15) & -16);
-    }
-
-  /* Main loop, processing 16 bytes at a time.  */
-#ifdef __GCC_ASM_FLAG_OUTPUTS__
-  while (1)
+  typedef v16qi v16qi_u __attribute__ ((__aligned__ (1)));
+  /* Helper vector for pshufb-based matching:
+     each character C we're searching for is at position (C % 16).  */
+  v16qi lut = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' };
+  static_assert ('\n' == 10 && '\r' == 13 && '\\' == 92 && '?' == 63);
+
+  int found;
+  /* Process three 16-byte chunks per iteration.  */
+  for (; ; s += 48)
     {
-      char f;
-
-      /* By using inline assembly instead of the builtin,
-        we can use the result, as well as the flags set.  */
-      __asm ("%vpcmpestri\t$0, %2, %3"
-            : "=c"(index), "=@ccc"(f)
-            : "m"(*s), "x"(search), "a"(4), "d"(16));
-      if (f)
-       break;
-      
-      s += 16;
+      v16qi data, t;
+      /* Unaligned load.  Reading beyond the final newline is safe, since
+        files.cc:read_file_guts pads the allocation.  */
+      data = *(const v16qi_u *)s;
+      /* Prevent propagation into pshufb and pcmp as memory operand.  */
+      __asm__ ("" : "+x" (data));
+      t = __builtin_ia32_pshufb128 (lut, data);
+      if ((found = __builtin_ia32_pmovmskb128 (t == data)))
+       goto done;
+      /* Second chunk.  */
+      data = *(const v16qi_u *)(s + 16);
+      __asm__ ("" : "+x" (data));
+      t = __builtin_ia32_pshufb128 (lut, data);
+      if ((found = __builtin_ia32_pmovmskb128 (t == data)))
+       goto add_16;
+      /* Third chunk.  */
+      data = *(const v16qi_u *)(s + 32);
+      __asm__ ("" : "+x" (data));
+      t = __builtin_ia32_pshufb128 (lut, data);
+      if ((found = __builtin_ia32_pmovmskb128 (t == data)))
+       goto add_32;
     }
-#else
-  s -= 16;
-  /* By doing the whole loop in inline assembly,
-     we can make proper use of the flags set.  */
-  __asm (      ".balign 16\n"
-       "0:     add $16, %1\n"
-       "       %vpcmpestri\t$0, (%1), %2\n"
-       "       jnc 0b"
-       : "=&c"(index), "+r"(s)
-       : "x"(search), "a"(4), "d"(16));
-#endif
-
- found:
-  return s + index;
+add_32:
+  s += 16;
+add_16:
+  s += 16;
+done:
+  return s + __builtin_ctz (found);
 }
 
 #else
-/* Work around out-dated assemblers without sse4 support.  */
-#define search_line_sse42 search_line_sse2
+/* Work around out-dated assemblers without SSSE3 support.  */
+#define search_line_ssse3 search_line_sse2
 #endif
 
 /* Check the CPU capabilities.  */
@@ -440,18 +414,18 @@ init_vectorized_lexer (void)
   search_line_fast_type impl = search_line_acc_char;
   int minimum = 0;
 
-#if defined(__SSE4_2__)
+#if defined(__SSSE3__)
   minimum = 3;
 #elif defined(__SSE2__)
   minimum = 2;
 #endif
 
   if (minimum == 3)
-    impl = search_line_sse42;
+    impl = search_line_ssse3;
   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
     {
-      if (minimum == 3 || (ecx & bit_SSE4_2))
-        impl = search_line_sse42;
+      if (minimum == 3 || (ecx & bit_SSSE3))
+       impl = search_line_ssse3;
       else if (minimum == 2 || (edx & bit_SSE2))
        impl = search_line_sse2;
     }
-- 
2.44.0
