On 31/03/2024 00:18, Evgeny Nizhibitsky wrote:
Here is the proposed patch for both simplifying and consistently speeding up
the avx version of wc -l by 10% in up to 1 billion rows scenarios on 7800X3D
(probably should be tested on different data samples and CPUs).
The patch was mangled, but I manually applied it.
Probably best to attach rather than pasting any further patches.
Attaching here in case others want to try.
This is good as it simplifies the code,
and should have the same portability, to machines and compilers.
I'll adjust the configure.ac check to be more aligned.
As for performance, I tested on my laptop with no change:
# on an i7-5600U with 1 billion short lines
$ yes | head -n1000000000 > /dev/shm/yes
$ time src/wc-old -l /dev/shm/yes
1000000000 /dev/shm/yes
real 0m0.351s
user 0m0.060s
sys 0m0.288s
$ time src/wc-new -l /dev/shm/yes
1000000000 /dev/shm/yes
real 0m0.356s
user 0m0.098s
sys 0m0.255s
Since you change the I/O size from 16 to 256 KiB,
it's more aligned with the recent I/O size adjustment in:
https://github.com/coreutils/coreutils/commit/fcfba90d0
In fact perhaps much of the speedup is just from that change.
Can you test on your system with the buffer reduced back to 16KiB
to see how much that impacts the performance?
thanks,
Pádraig
From ffce0cb65c6eea926b0efbdb6b0a1a665ef4d317 Mon Sep 17 00:00:00 2001
From: Evgeny Nizhibitsky <nizhibit...@gmail.com>
Date: Sun, 31 Mar 2024 12:23:32 +0100
Subject: [PATCH] wc: speed-up by simplifying AVX code
* src/wc_avx2.c (_mm256_sad_epu8): Change from
_mm256_sub_epi8() + _mm256_sad_epu8() to
_mm256_movemask_epi8() + __builtin_popcount().
Also adjust the I/O size from 16 to 256 KiB.
---
src/wc_avx2.c | 56 +++++++++++----------------------------------------
1 file changed, 12 insertions(+), 44 deletions(-)
diff --git a/src/wc_avx2.c b/src/wc_avx2.c
index cc0454a46..4afc28a8e 100644
--- a/src/wc_avx2.c
+++ b/src/wc_avx2.c
@@ -22,10 +22,9 @@
#include <x86intrin.h>
-/* This must be below 16 KB (16384) or else the accumulators can
- theoretically overflow, producing wrong result. This is 2*32 bytes below,
- so there is no single bytes in the optimal case. */
-#define BUFSIZE (16320)
+/* 256 KB buffer delivers 10-15% speedup over the old 16 KB one on 7800X3D.
+ The speedup beyond this buffer size was negligible. */
+#define BUFSIZE (256 * 1024)
/* Read FD and return a summary. */
extern struct wc_lines
@@ -34,21 +33,11 @@ wc_lines_avx2 (int fd)
intmax_t lines = 0;
intmax_t bytes = 0;
- __m256i
- zeroes = _mm256_setzero_si256 (),
- endlines = _mm256_set1_epi8 ('\n');
+ __m256i endlines = _mm256_set1_epi8 ('\n');
while (true)
{
- /* Using two parallel accumulators gave a good performance increase.
- Adding a third gave no additional benefit, at least on an
- Intel Xeon E3-1231v3. Maybe on a newer CPU with additional vector
- execution engines it would be a win. */
- __m256i
- accumulator = _mm256_setzero_si256 (),
- accumulator2 = _mm256_setzero_si256 (),
- avx_buf[BUFSIZE / sizeof (__m256i)];
-
+ __m256i avx_buf[BUFSIZE / sizeof (__m256i)];
ssize_t bytes_read = read (fd, avx_buf, sizeof avx_buf);
if (bytes_read <= 0)
return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes };
@@ -56,37 +45,16 @@ wc_lines_avx2 (int fd)
bytes += bytes_read;
__m256i *datap = avx_buf;
- while (bytes_read >= 64)
+ while (bytes_read >= 32)
{
- __m256i
- to_match = _mm256_load_si256 (datap),
- to_match2 = _mm256_load_si256 (datap + 1),
- matches = _mm256_cmpeq_epi8 (to_match, endlines),
- matches2 = _mm256_cmpeq_epi8 (to_match2, endlines);
-
- /* Compare will set each 8 bit integer in the register to 0xFF
- on match. When we subtract it the 8 bit accumulators
- will underflow, so this is equal to adding 1. */
- accumulator = _mm256_sub_epi8 (accumulator, matches);
- accumulator2 = _mm256_sub_epi8 (accumulator2, matches2);
-
- datap += 2;
- bytes_read -= 64;
+ __m256i to_match = _mm256_load_si256 (datap);
+ __m256i matches = _mm256_cmpeq_epi8 (to_match, endlines);
+ int mask = _mm256_movemask_epi8 (matches);
+ lines += __builtin_popcount (mask);
+ datap += 1;
+ bytes_read -= 32;
}
- /* Horizontally add all 8 bit integers in the register. */
- accumulator = _mm256_sad_epu8 (accumulator, zeroes);
- lines += _mm256_extract_epi16 (accumulator, 0)
- + _mm256_extract_epi16 (accumulator, 4)
- + _mm256_extract_epi16 (accumulator, 8)
- + _mm256_extract_epi16 (accumulator, 12);
-
- accumulator2 = _mm256_sad_epu8 (accumulator2, zeroes);
- lines += _mm256_extract_epi16 (accumulator2, 0)
- + _mm256_extract_epi16 (accumulator2, 4)
- + _mm256_extract_epi16 (accumulator2, 8)
- + _mm256_extract_epi16 (accumulator2, 12);
-
/* Finish up any left over bytes */
char *end = (char *) datap + bytes_read;
for (char *p = (char *) datap; p < end; p++)
--
2.44.0