Here is a proposed patch that both simplifies the AVX2 version of wc -l and consistently speeds it up by about 10% on inputs of up to 1 billion rows, measured on a 7800X3D (it should probably still be tested on other data samples and CPUs).
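To make the benchmark easier to reproduce on other CPUs and data shapes, something like the minimal generator below can be used to produce an N-row input for wc -l. The default row count and the fixed 64-byte row width are placeholders for illustration, not the exact data set behind the numbers above.

/* Minimal test-data generator: writes NROWS newline-terminated rows of
   filler bytes to stdout, so the output can be redirected to a file and
   timed with "wc -l".  Row count and width are arbitrary placeholders.  */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main (int argc, char **argv)
{
  long long nrows = argc > 1 ? atoll (argv[1]) : 1000000000LL;
  char row[65];

  memset (row, 'x', 64);
  row[64] = '\n';

  for (long long i = 0; i < nrows; i++)
    if (fwrite (row, 1, sizeof row, stdout) != sizeof row)
      return EXIT_FAILURE;

  return EXIT_SUCCESS;
}

For example, "./gen 1000000000 > rows.txt" followed by "time wc -l rows.txt" gives a file in the billion-row range mentioned above (gen being whatever name the sketch is compiled under).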
---
 src/wc_avx2.c | 56 ++++++++++++---------------------------------------
 1 file changed, 13 insertions(+), 43 deletions(-)

diff --git a/src/wc_avx2.c b/src/wc_avx2.c
index cc0454a46..d2fd43064 100644
--- a/src/wc_avx2.c
+++ b/src/wc_avx2.c
@@ -22,10 +22,9 @@
 
 #include <x86intrin.h>
 
-/* This must be below 16 KB (16384) or else the accumulators can
-   theoretically overflow, producing wrong result.  This is 2*32 bytes below,
-   so there is no single bytes in the optimal case.  */
-#define BUFSIZE (16320)
+/* 256 KB buffer delivers 10-15% speedup over the old 16 KB one on 7800X3D.
+   The speedup beyond this buffer size was negligible.  */
+#define BUFSIZE (256 * 1024)
 
 /* Read FD and return a summary.  */
 extern struct wc_lines
@@ -34,20 +33,11 @@ wc_lines_avx2 (int fd)
   intmax_t lines = 0;
   intmax_t bytes = 0;
 
-  __m256i
-    zeroes = _mm256_setzero_si256 (),
-    endlines = _mm256_set1_epi8 ('\n');
+  __m256i endlines = _mm256_set1_epi8 ('\n');
 
   while (true)
     {
-      /* Using two parallel accumulators gave a good performance increase.
-         Adding a third gave no additional benefit, at least on an
-         Intel Xeon E3-1231v3.  Maybe on a newer CPU with additional vector
-         execution engines it would be a win.  */
-      __m256i
-        accumulator = _mm256_setzero_si256 (),
-        accumulator2 = _mm256_setzero_si256 (),
-        avx_buf[BUFSIZE / sizeof (__m256i)];
+      __m256i avx_buf[BUFSIZE / sizeof (__m256i)];
 
       ssize_t bytes_read = read (fd, avx_buf, sizeof avx_buf);
       if (bytes_read <= 0)
@@ -56,36 +46,16 @@
       bytes += bytes_read;
 
       __m256i *datap = avx_buf;
-      while (bytes_read >= 64)
+      while (bytes_read >= 32)
         {
-          __m256i
-            to_match = _mm256_load_si256 (datap),
-            to_match2 = _mm256_load_si256 (datap + 1),
-            matches = _mm256_cmpeq_epi8 (to_match, endlines),
-            matches2 = _mm256_cmpeq_epi8 (to_match2, endlines);
-
-          /* Compare will set each 8 bit integer in the register to 0xFF
-             on match.  When we subtract it the 8 bit accumulators
-             will underflow, so this is equal to adding 1.  */
-          accumulator = _mm256_sub_epi8 (accumulator, matches);
-          accumulator2 = _mm256_sub_epi8 (accumulator2, matches2);
-
-          datap += 2;
-          bytes_read -= 64;
-        }
+          __m256i to_match = _mm256_load_si256 (datap);
+          __m256i matches = _mm256_cmpeq_epi8 (to_match, endlines);
+          int mask = _mm256_movemask_epi8 (matches);
 
-      /* Horizontally add all 8 bit integers in the register.  */
-      accumulator = _mm256_sad_epu8 (accumulator, zeroes);
-      lines += _mm256_extract_epi16 (accumulator, 0)
-               + _mm256_extract_epi16 (accumulator, 4)
-               + _mm256_extract_epi16 (accumulator, 8)
-               + _mm256_extract_epi16 (accumulator, 12);
-
-      accumulator2 = _mm256_sad_epu8 (accumulator2, zeroes);
-      lines += _mm256_extract_epi16 (accumulator2, 0)
-               + _mm256_extract_epi16 (accumulator2, 4)
-               + _mm256_extract_epi16 (accumulator2, 8)
-               + _mm256_extract_epi16 (accumulator2, 12);
+          lines += __builtin_popcount (mask);
+          datap += 1;
+          bytes_read -= 32;
+        }
 
       /* Finish up any left over bytes */
       char *end = (char *) datap + bytes_read;
--
2.44.0
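In case anyone wants to experiment with the counting kernel outside of wc, below is a self-contained sketch of the same cmpeq/movemask/popcount idea. count_newlines_avx2 is a made-up name for this example only, and it uses an unaligned load because it runs over an arbitrary string rather than the 32-byte-aligned avx_buf in the patch.

/* Standalone sketch of the movemask/popcount newline counting from the
   patch above; not part of wc itself.  Build with -mavx2.  */
#include <immintrin.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

static size_t
count_newlines_avx2 (char const *buf, size_t len)
{
  __m256i const endlines = _mm256_set1_epi8 ('\n');
  size_t lines = 0;
  size_t i = 0;

  /* One 32-byte vector per iteration: compare against '\n', extract the
     per-byte match bits as a 32-bit mask, and add its population count.  */
  for (; i + 32 <= len; i += 32)
    {
      __m256i to_match = _mm256_loadu_si256 ((__m256i const *) (buf + i));
      __m256i matches = _mm256_cmpeq_epi8 (to_match, endlines);
      lines += __builtin_popcount (_mm256_movemask_epi8 (matches));
    }

  /* Scalar tail for the last partial vector, as in the patch.  */
  for (; i < len; i++)
    lines += buf[i] == '\n';

  return lines;
}

int
main (void)
{
  char const *text = "one\ntwo\nthree\nlast line without newline";
  printf ("%zu\n", count_newlines_avx2 (text, strlen (text)));  /* prints 3 */
  return 0;
}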