Use the same PSHUFB-based matching as in the SSSE3 helper, just 2x
wider.
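
For reference, the trick (visible in the lex.cc hunk below) is that
PSHUFB indexes a 16-byte table by each input byte's low nibble, and
substitutes 0 whenever the byte's high bit is set. The table is laid
out so that only the characters of interest ('\n', '\r', '\\', '?')
shuffle to themselves; slot 0 holds 1 rather than 0 so NUL bytes never
match. A scalar model of the matching step (illustrative only, not
part of the patch):

#include <stdio.h>

/* Slot I holds the character whose low nibble is I, for the four
   characters we scan for: '\n' = 0x0a, '\\' = 0x5c, '\r' = 0x0d,
   '?' = 0x3f.  */
static const unsigned char lut[16] = {
  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?'
};

static int
matches (unsigned char b)
{
  /* PSHUFB semantics: a set high bit selects 0, else lut[b & 0x0f].  */
  unsigned char t = (b & 0x80) ? 0 : lut[b & 0x0f];
  /* The vector code does this comparison with PCMPEQB and collects
     the per-byte results with PMOVMSKB.  */
  return t == b;
}

int
main (void)
{
  const char *s = "int a = b ? c : d; /* \\ */\n";
  for (int i = 0; s[i]; i++)
    if (matches ((unsigned char) s[i]))
      printf ("match at %d: %#x\n", i, (unsigned char) s[i]);
  return 0;
}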

Use the new helper directly when __AVX2__ is defined. That leaves the
other helpers unused, so mark them inline to prevent unused-function
warnings.
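
For the record, the reason inline helps: GCC's -Wunused-function
complains about an unused plain static function but stays quiet for a
static inline one, which is also why the separate ATTRIBUTE_UNUSED
declaration can go away. A two-line illustration, not part of the
patch:

/* Compile with -Wunused-function: only the first definition warns.  */
static int unused_plain (void) { return 0; }
static inline int unused_inlined (void) { return 0; }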

Rewrite and simplify init_vectorized_lexer.
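
The new logic requires OSXSAVE in CPUID leaf 1 before probing further,
then checks leaf 7 subleaf 0 for AVX2 and XCR0 bits 1 and 2 (XMM and
YMM state enabled by the OS), falling back to SSSE3, then SSE2, then
the portable scanner. A standalone sketch of the same selection order,
handy for checking what a given machine would pick (demo only, not
part of the patch; it uses the xgetbv mnemonic, where the patch emits
the equivalent .byte sequence to keep working with old assemblers):

#include <stdio.h>
#include <cpuid.h>

int
main (void)
{
  unsigned a, b, c, d;
  const char *choice = "acc_char";	/* Portable fallback.  */

  if (__get_cpuid (1, &a, &b, &c, &d))
    {
      if (d & bit_SSE2)
	choice = "sse2";
      if (c & bit_SSSE3)
	choice = "ssse3";
      if (c & bit_OSXSAVE)
	{
	  unsigned a7, b7, c7, d7, xl, xh;
	  if (__get_cpuid_count (7, 0, &a7, &b7, &c7, &d7)
	      && (b7 & bit_AVX2))
	    {
	      /* XCR0 bits 1 and 2: XMM and YMM state enabled.  */
	      __asm__ volatile ("xgetbv" : "=a" (xl), "=d" (xh) : "c" (0));
	      (void) xh;	/* High half not needed here.  */
	      if ((xl & 6) == 6)
		choice = "avx2";
	    }
	}
    }
  printf ("search_line_%s\n", choice);
  return 0;
}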

libcpp/ChangeLog:

        * files.cc (read_file_guts): Bump padding to 32 if HAVE_AVX2.
        * lex.cc (search_line_acc_char): Mark inline, not "unused".
        (search_line_sse2): Mark inline.
        (search_line_ssse3): Ditto.
        (search_line_avx2): New function.
        (init_vectorized_lexer): Reimplement.
---
 libcpp/files.cc |  15 +++----
 libcpp/lex.cc   | 111 ++++++++++++++++++++++++++++++++++++------------
 2 files changed, 92 insertions(+), 34 deletions(-)

diff --git a/libcpp/files.cc b/libcpp/files.cc
index 78f56e30bd..3df070d035 100644
--- a/libcpp/files.cc
+++ b/libcpp/files.cc
@@ -693,7 +693,7 @@ static bool
 read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
                const char *input_charset)
 {
-  ssize_t size, total, count;
+  ssize_t size, pad, total, count;
   uchar *buf;
   bool regular;
 
@@ -732,11 +732,10 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
        the majority of C source files.  */
     size = 8 * 1024;
 
-  /* The + 16 here is space for the final '\n' and 15 bytes of padding,
-     used to quiet warnings from valgrind or Address Sanitizer, when the
-     optimized lexer accesses aligned 16-byte memory chunks, including
-     the bytes after the malloced, area, and stops lexing on '\n'.  */
-  buf = XNEWVEC (uchar, size + 16);
+  pad = HAVE_AVX2 ? 32 : 16;
+  /* The '+ PAD' here is space for the final '\n' and PAD-1 bytes of padding,
+     allowing search_line_fast to use (possibly misaligned) vector loads.  */
+  buf = XNEWVEC (uchar, size + pad);
   total = 0;
   while ((count = read (file->fd, buf + total, size - total)) > 0)
     {
@@ -747,7 +746,7 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
          if (regular)
            break;
          size *= 2;
-         buf = XRESIZEVEC (uchar, buf, size + 16);
+         buf = XRESIZEVEC (uchar, buf, size + pad);
        }
     }
 
@@ -765,7 +764,7 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
 
   file->buffer = _cpp_convert_input (pfile,
                                     input_charset,
-                                    buf, size + 16, total,
+                                    buf, size + pad, total,
                                     &file->buffer_start,
                                     &file->st.st_size);
   file->buffer_valid = file->buffer;
diff --git a/libcpp/lex.cc b/libcpp/lex.cc
index 815b8abd29..c336281658 100644
--- a/libcpp/lex.cc
+++ b/libcpp/lex.cc
@@ -225,10 +225,7 @@ acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
    and branches without increasing the number of arithmetic operations.
    It's almost certainly going to be a win with 64-bit word size.  */
 
-static const uchar * search_line_acc_char (const uchar *, const uchar *)
-  ATTRIBUTE_UNUSED;
-
-static const uchar *
+static inline const uchar *
 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 {
   const word_type repl_nl = acc_char_replicate ('\n');
@@ -293,7 +290,7 @@ static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 
 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 
-static const uchar *
+static inline const uchar *
 #ifndef __SSE2__
 __attribute__((__target__("sse2")))
 #endif
@@ -345,9 +342,9 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 }
 
 #ifdef HAVE_AVX2
-/* A version of the fast scanner using SSSE3 shuffle (PSHUFB) insns.  */
+/* Variants of the fast scanner using SSSE3 shuffle (PSHUFB) insns.  */
 
-static const uchar *
+static inline const uchar *
 #ifndef __SSSE3__
 __attribute__((__target__("ssse3")))
 #endif
@@ -394,44 +391,106 @@ done:
   return s + __builtin_ctz (found);
 }
 
+static inline const uchar *
+#ifndef __AVX2__
+__attribute__((__target__("avx2")))
+#endif
+search_line_avx2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
+{
+  typedef char v32qi __attribute__ ((__vector_size__ (32)));
+  typedef v32qi v32qi_u __attribute__ ((__aligned__ (1)));
+  v32qi lut = {
+    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?',
+    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?'
+  };
+
+  int found;
+  /* Process three 32-byte chunks per iteration.  */
+  for (; ; s += 96)
+    {
+      v32qi data, t;
+      data = *(const v32qi_u *)s;
+      __asm__ ("" : "+x" (data));
+      t = __builtin_ia32_pshufb256 (lut, data);
+      if ((found = __builtin_ia32_pmovmskb256 (t == data)))
+       goto done;
+      /* Second chunk.  */
+      data = *(const v32qi_u *)(s + 32);
+      __asm__ ("" : "+x" (data));
+      t = __builtin_ia32_pshufb256 (lut, data);
+      if ((found = __builtin_ia32_pmovmskb256 (t == data)))
+       goto add_32;
+      /* Third chunk.  */
+      data = *(const v32qi_u *)(s + 64);
+      __asm__ ("" : "+x" (data));
+      t = __builtin_ia32_pshufb256 (lut, data);
+      if ((found = __builtin_ia32_pmovmskb256 (t == data)))
+       goto add_64;
+    }
+add_64:
+  s += 32;
+add_32:
+  s += 32;
+done:
+  return s + __builtin_ctz (found);
+}
+
 #else
-/* Work around out-dated assemblers without SSSE3 support.  */
+/* Work around out-dated assemblers without AVX2 support.  */
 #define search_line_ssse3 search_line_sse2
+#define search_line_avx2 search_line_sse2
 #endif
 
+#ifdef __AVX2__
+/* No need for CPU probing, just use the best available variant.  */
+#define search_line_fast search_line_avx2
+#else
 /* Check the CPU capabilities.  */
 
 #include "../gcc/config/i386/cpuid.h"
 
 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
-static search_line_fast_type search_line_fast;
+static search_line_fast_type search_line_fast
+#if defined(__SSE2__)
+ = search_line_sse2;
+#else
+ = search_line_acc_char;
+#endif
 
 #define HAVE_init_vectorized_lexer 1
 static inline void
 init_vectorized_lexer (void)
 {
-  unsigned dummy, ecx = 0, edx = 0;
-  search_line_fast_type impl = search_line_acc_char;
-  int minimum = 0;
-
-#if defined(__SSSE3__)
-  minimum = 3;
-#elif defined(__SSE2__)
-  minimum = 2;
-#endif
+  unsigned a1, b1, c1, d1;
+
+  if (!__get_cpuid (1, &a1, &b1, &c1, &d1))
+    return;
 
-  if (minimum == 3)
-    impl = search_line_ssse3;
-  else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
+  if (c1 & bit_OSXSAVE)
     {
-      if (minimum == 3 || (ecx & bit_SSSE3))
-       impl = search_line_ssse3;
-      else if (minimum == 2 || (edx & bit_SSE2))
-       impl = search_line_sse2;
+      /* Check leaf 7 subleaf 0 for AVX2 ISA support.  */
+      unsigned a7, b7, c7, d7;
+      if (__get_cpuid_count (7, 0, &a7, &b7, &c7, &d7)
+         && (b7 & bit_AVX2))
+       {
+         /* Check XCR0 for YMM state support in the OS.  */
+         unsigned xcr0h, xcr0l;
+         __asm__ volatile (".byte 0x0f, 0x01, 0xd0" // xgetbv
+                           : "=d" (xcr0h), "=a" (xcr0l) : "c" (0));
+         if ((xcr0l & 6) == 6)
+           {
+             search_line_fast = search_line_avx2;
+             return;
+           }
+       }
     }
 
-  search_line_fast = impl;
+  if (c1 & bit_SSSE3)
+    search_line_fast = search_line_ssse3;
+  else if (d1 & bit_SSE2)
+    search_line_fast = search_line_sse2;
 }
+#endif
 
 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 
-- 
2.44.0
