Here is a proposed patch that both simplifies the AVX2 version of wc -l and consistently speeds it up by about 10% on inputs of up to 1 billion rows, measured on a 7800X3D (it should probably still be tested on other data samples and CPUs).
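To make the benchmark easier to reproduce on other CPUs and data shapes, something like the minimal generator below can be used to produce an N-row input for wc -l. The default row count and the fixed 64-byte row width are placeholders for illustration, not the exact data set behind the numbers above.

/* Minimal test-data generator: writes NROWS newline-terminated rows of
   filler bytes to stdout, so the output can be redirected to a file and
   timed with "wc -l".  Row count and width are arbitrary placeholders.  */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main (int argc, char **argv)
{
  long long nrows = argc > 1 ? atoll (argv[1]) : 1000000000LL;
  char row[65];

  memset (row, 'x', 64);
  row[64] = '\n';

  for (long long i = 0; i < nrows; i++)
    if (fwrite (row, 1, sizeof row, stdout) != sizeof row)
      return EXIT_FAILURE;

  return EXIT_SUCCESS;
}

For example, "./gen 1000000000 > rows.txt" followed by "time wc -l rows.txt" gives a file in the billion-row range mentioned above (gen being whatever name the sketch is compiled under).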
---
 src/wc_avx2.c | 56 ++++++++++++---------------------------------------
 1 file changed, 13 insertions(+), 43 deletions(-)

diff --git a/src/wc_avx2.c b/src/wc_avx2.c
index cc0454a46..d2fd43064 100644
--- a/src/wc_avx2.c
+++ b/src/wc_avx2.c
@@ -22,10 +22,9 @@
 
 #include <x86intrin.h>
 
-/* This must be below 16 KB (16384) or else the accumulators can
-   theoretically overflow, producing wrong result.  This is 2*32 bytes below,
-   so there is no single bytes in the optimal case.  */
-#define BUFSIZE (16320)
+/* 256 KB buffer delivers 10-15% speedup over the old 16 KB one on 7800X3D.
+   The speedup beyond this buffer size was negligible.  */
+#define BUFSIZE (256 * 1024)
 
 /* Read FD and return a summary.  */
 extern struct wc_lines
@@ -34,20 +33,11 @@ wc_lines_avx2 (int fd)
   intmax_t lines = 0;
   intmax_t bytes = 0;
 
-  __m256i
-    zeroes = _mm256_setzero_si256 (),
-    endlines = _mm256_set1_epi8 ('\n');
+  __m256i endlines = _mm256_set1_epi8 ('\n');
 
   while (true)
     {
-      /* Using two parallel accumulators gave a good performance increase.
-         Adding a third gave no additional benefit, at least on an
-         Intel Xeon E3-1231v3.  Maybe on a newer CPU with additional vector
-         execution engines it would be a win.  */
-      __m256i
-        accumulator = _mm256_setzero_si256 (),
-        accumulator2 = _mm256_setzero_si256 (),
-        avx_buf[BUFSIZE / sizeof (__m256i)];
+      __m256i avx_buf[BUFSIZE / sizeof (__m256i)];
 
       ssize_t bytes_read = read (fd, avx_buf, sizeof avx_buf);
       if (bytes_read <= 0)
@@ -56,36 +46,16 @@
       bytes += bytes_read;
 
       __m256i *datap = avx_buf;
-      while (bytes_read >= 64)
+      while (bytes_read >= 32)
         {
-          __m256i
-            to_match = _mm256_load_si256 (datap),
-            to_match2 = _mm256_load_si256 (datap + 1),
-            matches = _mm256_cmpeq_epi8 (to_match, endlines),
-            matches2 = _mm256_cmpeq_epi8 (to_match2, endlines);
-
-          /* Compare will set each 8 bit integer in the register to 0xFF
-             on match.  When we subtract it the 8 bit accumulators
-             will underflow, so this is equal to adding 1.  */
-          accumulator = _mm256_sub_epi8 (accumulator, matches);
-          accumulator2 = _mm256_sub_epi8 (accumulator2, matches2);
-
-          datap += 2;
-          bytes_read -= 64;
-        }
+          __m256i to_match = _mm256_load_si256 (datap);
+          __m256i matches = _mm256_cmpeq_epi8 (to_match, endlines);
+          int mask = _mm256_movemask_epi8 (matches);
 
-      /* Horizontally add all 8 bit integers in the register.  */
-      accumulator = _mm256_sad_epu8 (accumulator, zeroes);
-      lines += _mm256_extract_epi16 (accumulator, 0)
-               + _mm256_extract_epi16 (accumulator, 4)
-               + _mm256_extract_epi16 (accumulator, 8)
-               + _mm256_extract_epi16 (accumulator, 12);
-
-      accumulator2 = _mm256_sad_epu8 (accumulator2, zeroes);
-      lines += _mm256_extract_epi16 (accumulator2, 0)
-               + _mm256_extract_epi16 (accumulator2, 4)
-               + _mm256_extract_epi16 (accumulator2, 8)
-               + _mm256_extract_epi16 (accumulator2, 12);
+          lines += __builtin_popcount (mask);
+          datap += 1;
+          bytes_read -= 32;
+        }
 
       /* Finish up any left over bytes */
       char *end = (char *) datap + bytes_read;
--
2.44.0
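In case anyone wants to experiment with the counting kernel outside of wc, below is a self-contained sketch of the same cmpeq/movemask/popcount idea. count_newlines_avx2 is a made-up name for this example only, and it uses an unaligned load because it runs over an arbitrary string rather than the 32-byte-aligned avx_buf in the patch.

/* Standalone sketch of the movemask/popcount newline counting from the
   patch above; not part of wc itself.  Build with -mavx2.  */
#include <immintrin.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

static size_t
count_newlines_avx2 (char const *buf, size_t len)
{
  __m256i const endlines = _mm256_set1_epi8 ('\n');
  size_t lines = 0;
  size_t i = 0;

  /* One 32-byte vector per iteration: compare against '\n', extract the
     per-byte match bits as a 32-bit mask, and add its population count.  */
  for (; i + 32 <= len; i += 32)
    {
      __m256i to_match = _mm256_loadu_si256 ((__m256i const *) (buf + i));
      __m256i matches = _mm256_cmpeq_epi8 (to_match, endlines);
      lines += __builtin_popcount (_mm256_movemask_epi8 (matches));
    }

  /* Scalar tail for the last partial vector, as in the patch.  */
  for (; i < len; i++)
    lines += buf[i] == '\n';

  return lines;
}

int
main (void)
{
  char const *text = "one\ntwo\nthree\nlast line without newline";
  printf ("%zu\n", count_newlines_avx2 (text, strlen (text)));  /* prints 3 */
  return 0;
}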