From: Andi Kleen <[email protected]>
AVX2 is widely available on x86 and allows the scanner line check to
process 32 bytes at a time. The code is similar to the SSE2 code path,
just using AVX2 with 32 bytes at a time instead of SSE2 with 16 bytes.
Also adjust the code to allow inlining when the compiler
is built for an AVX2 host, following what other architectures
do.
I see a ~0.6% compile time improvement when compiling i386
insn-recog.i with -O0.
libcpp/ChangeLog:
* config.in (HAVE_AVX2): Add.
* configure: Regenerate.
* configure.ac: Add HAVE_AVX2 check.
* lex.cc (repl_chars): Extend to 32 bytes.
(search_line_avx2): New function to scan line using AVX2.
(init_vectorized_lexer): Check for AVX2 in CPUID.
---
libcpp/config.in | 3 ++
libcpp/configure | 17 +++++++++
libcpp/configure.ac | 3 ++
libcpp/lex.cc | 91 +++++++++++++++++++++++++++++++++++++++++++--
4 files changed, 110 insertions(+), 4 deletions(-)
diff --git a/libcpp/config.in b/libcpp/config.in
index 253ef03a3dea..8fad6bd4b4f5 100644
--- a/libcpp/config.in
+++ b/libcpp/config.in
@@ -213,6 +213,9 @@
/* Define to 1 if you can assemble SSE4 insns. */
#undef HAVE_SSE4
+/* Define to 1 if you can assemble AVX2 insns. */
+#undef HAVE_AVX2
+
/* Define to 1 if you have the <stddef.h> header file. */
#undef HAVE_STDDEF_H
diff --git a/libcpp/configure b/libcpp/configure
index 32d6aaa30699..6d9286ac9601 100755
--- a/libcpp/configure
+++ b/libcpp/configure
@@ -9149,6 +9149,23 @@ if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_SSE4 1" >>confdefs.h
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int
+main ()
+{
+asm ("vpcmpeqb %%ymm0, %%ymm4, %%ymm5" : : "i"(0))
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_AVX2 1" >>confdefs.h
+
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
esac
diff --git a/libcpp/configure.ac b/libcpp/configure.ac
index b883fec776fe..c06609827924 100644
--- a/libcpp/configure.ac
+++ b/libcpp/configure.ac
@@ -200,6 +200,9 @@ case $target in
AC_TRY_COMPILE([], [asm ("pcmpestri %0, %%xmm0, %%xmm1" : : "i"(0))],
[AC_DEFINE([HAVE_SSE4], [1],
[Define to 1 if you can assemble SSE4 insns.])])
+ AC_TRY_COMPILE([], [asm ("vpcmpeqb %%ymm0, %%ymm4, %%ymm5" : : "i"(0))],
+ [AC_DEFINE([HAVE_AVX2], [1],
+ [Define to 1 if you can assemble AVX2 insns.])])
esac
# Enable --enable-host-shared.
diff --git a/libcpp/lex.cc b/libcpp/lex.cc
index 1591dcdf151a..72f3402aac99 100644
--- a/libcpp/lex.cc
+++ b/libcpp/lex.cc
@@ -278,19 +278,31 @@ search_line_acc_char (const uchar *s, const uchar *end
ATTRIBUTE_UNUSED)
/* Replicated character data to be shared between implementations.
Recall that outside of a context with vector support we can't
define compatible vector types, therefore these are all defined
- in terms of raw characters. */
-static const char repl_chars[4][16] __attribute__((aligned(16))) = {
+ in terms of raw characters.
+ GCC constant-propagates these rows and usually turns each load into
+ a vector broadcast, so the table itself normally disappears. */
+
+static const char repl_chars[4][32] __attribute__((aligned(32))) = {
{ '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
+ '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
+ '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
'\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
{ '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
+ '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
+ '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
'\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
{ '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
+ '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
+ '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
'\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
{ '?', '?', '?', '?', '?', '?', '?', '?',
+ '?', '?', '?', '?', '?', '?', '?', '?',
+ '?', '?', '?', '?', '?', '?', '?', '?',
'?', '?', '?', '?', '?', '?', '?', '?' },
};
+#ifndef __AVX2__
/* A version of the fast scanner using SSE2 vectorized byte compare insns. */
static const uchar *
@@ -343,8 +355,9 @@ search_line_sse2 (const uchar *s, const uchar *end
ATTRIBUTE_UNUSED)
found = __builtin_ctz(found);
return (const uchar *)p + found;
}
+#endif
-#ifdef HAVE_SSE4
+#if defined(HAVE_SSE4) && !defined(__AVX2__)
/* A version of the fast scanner using SSE 4.2 vectorized string insns. */
static const uchar *
@@ -425,6 +438,71 @@ search_line_sse42 (const uchar *s, const uchar *end)
#define search_line_sse42 search_line_sse2
#endif
+#ifdef HAVE_AVX2
+
+/* A version of the fast scanner using AVX2 vectorized byte compare insns: returns the address of the first '\n', '\r', '\\' or '?' at or after S. END is not consulted, so the scan stops only at a match. */
+
+static const uchar *
+#ifndef __AVX2__
+__attribute__((__target__("avx2")))
+#endif
+search_line_avx2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
+{
+ typedef char v32qi __attribute__ ((__vector_size__ (32)));
+
+ const v32qi repl_nl = *(const v32qi *)repl_chars[0];
+ const v32qi repl_cr = *(const v32qi *)repl_chars[1];
+ const v32qi repl_bs = *(const v32qi *)repl_chars[2];
+ const v32qi repl_qm = *(const v32qi *)repl_chars[3];
+
+ unsigned int misalign, found, mask;
+ const v32qi *p;
+ v32qi data, t;
+
+ /* Align the source pointer: P is S rounded down to a 32-byte boundary and MISALIGN is the offset of S within that block. */
+ misalign = (uintptr_t)s & 31;
+ p = (const v32qi *)((uintptr_t)s & -32);
+ data = *p;
+
+ /* Create a mask for the bytes that are valid within the first
+ 32-byte block, clearing the bits for bytes that precede S. The idea
+ is that the AND with the mask within the loop is "free", since we
+ need some AND or TEST insn to set the flags for the branch anyway. */
+ mask = -1u << misalign;
+
+ /* Main loop processing 32 bytes at a time; the first pass enters at START to check the block that is already loaded. */
+ goto start;
+ do
+ {
+ data = *++p;
+ mask = -1;
+
+ start:
+ t = data == repl_nl;
+ t |= data == repl_cr;
+ t |= data == repl_bs;
+ t |= data == repl_qm;
+ found = __builtin_ia32_pmovmskb256 (t);
+ found &= mask;
+ }
+ while (!found);
+
+ /* FOUND has a 1 bit for each byte that matched '\n', '\r', '\\' or '?'.
+ The lowest set bit is the offset of the first match within block P. */
+ found = __builtin_ctz (found);
+ return (const uchar *)p + found;
+}
+
+#else
+#define search_line_avx2 search_line_sse2
+#endif
+
+#ifdef __AVX2__
+/* Avoid indirect calls to encourage inlining if the compiler is built
+ with AVX2 enabled (__AVX2__ defined). */
+#define search_line_fast search_line_avx2
+#else
+
/* Check the CPU capabilities. */
#include "../gcc/config/i386/cpuid.h"
@@ -436,7 +514,7 @@ static search_line_fast_type search_line_fast;
static inline void
init_vectorized_lexer (void)
{
- unsigned dummy, ecx = 0, edx = 0;
+ unsigned dummy, ecx = 0, edx = 0, ebx = 0;
search_line_fast_type impl = search_line_acc_char;
int minimum = 0;
@@ -448,6 +526,10 @@ init_vectorized_lexer (void)
if (minimum == 3)
impl = search_line_sse42;
+ else if (__get_cpuid_max (0, &dummy) >= 7
+ && __get_cpuid_count (7, 0, &dummy, &ebx, &dummy, &dummy)
+ && (ebx & bit_AVX2))
+ impl = search_line_avx2;
else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
{
if (minimum == 3 || (ecx & bit_SSE4_2))
@@ -458,6 +540,7 @@ init_vectorized_lexer (void)
search_line_fast = impl;
}
+#endif /* !__AVX2__ */
#elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
--
2.45.2