Use the same PSHUFB-based matching as in the SSSE3 helper, just 2x wider. Directly use the new helper if __AVX2__ is defined. It makes the other helpers unused, so mark them inline to prevent warnings.
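To see why the 16-entry table works, here is a rough scalar model of what the
PSHUFB-based helpers compute per byte (an illustrative sketch only; the 'lut'
and 'is_interesting' names below are not from the patch).  PSHUFB yields 0 for
any byte with the high bit set and lut[byte & 0x0f] otherwise, and a byte is a
match when the shuffled value equals the byte itself; the 1 in slot 0 keeps
NUL bytes from matching:

/* Scalar model of one PSHUFB lane as used by the SSSE3/AVX2 helpers.  */
static const unsigned char lut[16] = {
  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?'
};

static int
is_interesting (unsigned char c)
{
  /* PSHUFB: 0 if the high bit of c is set, otherwise the table entry
     selected by the low nibble of c.  */
  unsigned char t = (c & 0x80) ? 0 : lut[c & 0x0f];
  /* Equal only for '\n' (0x0a), '\r' (0x0d), '\\' (0x5c) and '?' (0x3f).  */
  return t == c;
}

The SSSE3 helper evaluates this for 16 bytes per PSHUFB/compare/PMOVMSKB
sequence; search_line_avx2 runs the identical table against 32-byte registers.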
Rewrite and simplify init_vectorized_lexer.

libcpp/ChangeLog:

	* files.cc (read_file_guts): Bump padding to 32 if HAVE_AVX2.
	* lex.cc (search_line_acc_char): Mark inline, not "unused".
	(search_line_sse2): Mark inline.
	(search_line_ssse3): Ditto.
	(search_line_avx2): New function.
	(init_vectorized_lexer): Reimplement.
---
 libcpp/files.cc |  15 +++----
 libcpp/lex.cc   | 111 ++++++++++++++++++++++++++++++++++++------------
 2 files changed, 92 insertions(+), 34 deletions(-)

diff --git a/libcpp/files.cc b/libcpp/files.cc
index 78f56e30bd..3df070d035 100644
--- a/libcpp/files.cc
+++ b/libcpp/files.cc
@@ -693,7 +693,7 @@ static bool
 read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
 		const char *input_charset)
 {
-  ssize_t size, total, count;
+  ssize_t size, pad, total, count;
   uchar *buf;
   bool regular;
 
@@ -732,11 +732,10 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
      the majority of C source files.  */
   size = 8 * 1024;
 
-  /* The + 16 here is space for the final '\n' and 15 bytes of padding,
-     used to quiet warnings from valgrind or Address Sanitizer, when the
-     optimized lexer accesses aligned 16-byte memory chunks, including
-     the bytes after the malloced, area, and stops lexing on '\n'.  */
-  buf = XNEWVEC (uchar, size + 16);
+  pad = HAVE_AVX2 ? 32 : 16;
+  /* The '+ PAD' here is space for the final '\n' and PAD-1 bytes of padding,
+     allowing search_line_fast to use (possibly misaligned) vector loads.  */
+  buf = XNEWVEC (uchar, size + pad);
   total = 0;
   while ((count = read (file->fd, buf + total, size - total)) > 0)
     {
@@ -747,7 +746,7 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
 	  if (regular)
 	    break;
 	  size *= 2;
-	  buf = XRESIZEVEC (uchar, buf, size + 16);
+	  buf = XRESIZEVEC (uchar, buf, size + pad);
 	}
     }
 
@@ -765,7 +764,7 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
 
   file->buffer = _cpp_convert_input (pfile,
 				     input_charset,
-				     buf, size + 16, total,
+				     buf, size + pad, total,
 				     &file->buffer_start,
 				     &file->st.st_size);
   file->buffer_valid = file->buffer;
diff --git a/libcpp/lex.cc b/libcpp/lex.cc
index 815b8abd29..c336281658 100644
--- a/libcpp/lex.cc
+++ b/libcpp/lex.cc
@@ -225,10 +225,7 @@ acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
    and branches without increasing the number of arithmetic operations.
    It's almost certainly going to be a win with 64-bit word size.  */
 
-static const uchar * search_line_acc_char (const uchar *, const uchar *)
-  ATTRIBUTE_UNUSED;
-
-static const uchar *
+static inline const uchar *
 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 {
   const word_type repl_nl = acc_char_replicate ('\n');
@@ -293,7 +290,7 @@ static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 
 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 
-static const uchar *
+static inline const uchar *
 #ifndef __SSE2__
 __attribute__((__target__("sse2")))
 #endif
@@ -345,9 +342,9 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 }
 
 #ifdef HAVE_AVX2
-/* A version of the fast scanner using SSSE3 shuffle (PSHUFB) insns.  */
+/* Variants of the fast scanner using SSSE3 shuffle (PSHUFB) insns.  */
 
-static const uchar *
+static inline const uchar *
 #ifndef __SSSE3__
 __attribute__((__target__("ssse3")))
 #endif
@@ -394,44 +391,106 @@ done:
   return s + __builtin_ctz (found);
 }
 
+static inline const uchar *
+#ifndef __AVX2__
+__attribute__((__target__("avx2")))
+#endif
+search_line_avx2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
+{
+  typedef char v32qi __attribute__ ((__vector_size__ (32)));
+  typedef v32qi v32qi_u __attribute__ ((__aligned__ (1)));
+  v32qi lut = {
+    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?',
+    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?'
+  };
+
+  int found;
+  /* Process three 32-byte chunks per iteration.  */
+  for (; ; s += 96)
+    {
+      v32qi data, t;
+      data = *(const v32qi_u *)s;
+      __asm__ ("" : "+x" (data));
+      t = __builtin_ia32_pshufb256 (lut, data);
+      if ((found = __builtin_ia32_pmovmskb256 (t == data)))
+	goto done;
+      /* Second chunk.  */
+      data = *(const v32qi_u *)(s + 32);
+      __asm__ ("" : "+x" (data));
+      t = __builtin_ia32_pshufb256 (lut, data);
+      if ((found = __builtin_ia32_pmovmskb256 (t == data)))
+	goto add_32;
+      /* Third chunk.  */
+      data = *(const v32qi_u *)(s + 64);
+      __asm__ ("" : "+x" (data));
+      t = __builtin_ia32_pshufb256 (lut, data);
+      if ((found = __builtin_ia32_pmovmskb256 (t == data)))
+	goto add_64;
+    }
+add_64:
+  s += 32;
+add_32:
+  s += 32;
+done:
+  return s + __builtin_ctz (found);
+}
+
 #else
 
-/* Work around out-dated assemblers without SSSE3 support.  */
+/* Work around out-dated assemblers without AVX2 support.  */
 #define search_line_ssse3 search_line_sse2
+#define search_line_avx2 search_line_sse2
 
 #endif
 
+#ifdef __AVX2__
+/* No need for CPU probing, just use the best available variant.  */
+#define search_line_fast search_line_avx2
+#else
 /* Check the CPU capabilities.  */
 
 #include "../gcc/config/i386/cpuid.h"
 
 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
-static search_line_fast_type search_line_fast;
+static search_line_fast_type search_line_fast
+#if defined(__SSE2__)
+  = search_line_sse2;
+#else
+  = search_line_acc_char;
+#endif
 
 #define HAVE_init_vectorized_lexer 1
 static inline void
 init_vectorized_lexer (void)
 {
-  unsigned dummy, ecx = 0, edx = 0;
-  search_line_fast_type impl = search_line_acc_char;
-  int minimum = 0;
-
-#if defined(__SSSE3__)
-  minimum = 3;
-#elif defined(__SSE2__)
-  minimum = 2;
-#endif
+  unsigned a1, b1, c1, d1;
+
+  if (!__get_cpuid (1, &a1, &b1, &c1, &d1))
+    return;
 
-  if (minimum == 3)
-    impl = search_line_ssse3;
-  else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
+  if (c1 & bit_OSXSAVE)
     {
-      if (minimum == 3 || (ecx & bit_SSSE3))
-	impl = search_line_ssse3;
-      else if (minimum == 2 || (edx & bit_SSE2))
-	impl = search_line_sse2;
+      /* Check leaf 7 subleaf 0 for AVX2 ISA support.  */
+      unsigned a7, b7, c7, d7;
+      if (__get_cpuid_count (7, 0, &a7, &b7, &c7, &d7)
+	  && (b7 & bit_AVX2))
+	{
+	  /* Check XCR0 for YMM state support in the OS.  */
+	  unsigned xcr0h, xcr0l;
+	  __asm__ volatile (".byte 0x0f, 0x01, 0xd0" // xgetbv
+			    : "=d" (xcr0h), "=a" (xcr0l) : "c" (0));
+	  if ((xcr0l & 6) == 6)
+	    {
+	      search_line_fast = search_line_avx2;
+	      return;
+	    }
+	}
    }
 
-  search_line_fast = impl;
+  if (c1 & bit_SSSE3)
+    search_line_fast = search_line_ssse3;
+  else if (d1 & bit_SSE2)
+    search_line_fast = search_line_sse2;
 }
+#endif
 
 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
-- 
2.44.0
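As a standalone illustration of the runtime dispatch in init_vectorized_lexer,
the AVX2 usability test amounts to the following sketch.  It assumes GCC's
<cpuid.h> (which provides __get_cpuid, __get_cpuid_count, bit_OSXSAVE and
bit_AVX2); the helper name avx2_usable is made up here, and unlike the patch
it uses the xgetbv mnemonic directly instead of its .byte encoding:

#include <cpuid.h>

/* Nonzero iff both the CPU and the OS support AVX2.  */
static int
avx2_usable (void)
{
  unsigned a, b, c, d;

  /* CPUID leaf 1: ECX.OSXSAVE tells us XGETBV may be used to query
     which register states the OS saves and restores.  */
  if (!__get_cpuid (1, &a, &b, &c, &d) || !(c & bit_OSXSAVE))
    return 0;

  /* CPUID leaf 7, subleaf 0: EBX.AVX2 is the ISA feature bit.  */
  if (!__get_cpuid_count (7, 0, &a, &b, &c, &d) || !(b & bit_AVX2))
    return 0;

  /* XGETBV with ECX = 0 reads XCR0; bits 1 (XMM) and 2 (YMM) must both
     be set, i.e. the OS preserves the full YMM state.  */
  unsigned lo, hi;
  __asm__ ("xgetbv" : "=a" (lo), "=d" (hi) : "c" (0));
  return (lo & 6) == 6;
}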