Pádraig Brady <[email protected]> writes: > On 18/02/2026 08:16, Collin Funk wrote: >> I will have to add support for Neon in the 'cpu-supports' Gnulib >> module and update tests/wc/wc-cpu.sh before pushing this. >> However, that should be fairly trivial. So I thought it was best to >> send this for others to review anyways. > > Very nice. 3x is a nice win.
I pushed the attached patch with the Gnulib update, plus the missing documentation and test fix. I chose to use the name "neon" in the messages, as my impression is that name is much more commonly used than "asimd" (Advanced SIMD). For the environment variable though, I think we should stick to the HWCAP_* macros, which unfortunately use ASIMD. Collin
>From 6a3dde5dd2d77fa1aa5fac65d8f71bdad81471a1 Mon Sep 17 00:00:00 2001 Message-ID: <6a3dde5dd2d77fa1aa5fac65d8f71bdad81471a1.1771474640.git.collin.fu...@gmail.com> From: Collin Funk <[email protected]> Date: Wed, 18 Feb 2026 00:03:05 -0800 Subject: [PATCH] wc: add aarch64 Neon optimization for wc -l Here is an example of the performance improvement: $ yes abcdefghijklmnopqrstuvwxyz | head -n 100000000 > input $ time ./src/wc-prev -l < input 100000000 real 0m0.793s user 0m0.630s sys 0m0.162s $ time ./src/wc -l < input 100000000 real 0m0.230s user 0m0.065s sys 0m0.164s * NEWS: Mention the performance improvement. * gnulib: Update to the latest commit. * configure.ac: Check for the necessary intrinsics and functions. * src/local.mk (noinst_LIBRARIES) [USE_NEON_WC_LINECOUNT]: Add src/libwc_neon.a. (src_libwc_neon_a_SOURCES, wc_neon_ldadd, src_libwc_neon_a_CFLAGS) [USE_NEON_WC_LINECOUNT]: New variables. (src_wc_LDADD) [USE_NEON_WC_LINECOUNT]: Add $(wc_neon_ldadd). * src/wc.c [USE_NEON_WC_LINECOUNT]: Include sys/auxv.h and asm/hwcap.h. (neon_supported) [USE_NEON_WC_LINECOUNT]: New function. (wc_lines) [USE_NEON_WC_LINECOUNT]: Use neon_supported and wc_lines_neon. * src/wc.h (wc_lines_neon): Add declaration. * src/wc_neon.c: New file. * doc/coreutils.texi (Hardware Acceleration): Document the "-ASIMD" hwcap and the variable used in ./configure to override detection of Neon instructions. * tests/wc/wc-cpu.sh: Also add "-ASIMD" to disable the use of Neon instructions. 
--- NEWS | 3 ++ configure.ac | 37 +++++++++++++++ doc/coreutils.texi | 3 +- gnulib | 2 +- src/local.mk | 7 +++ src/wc.c | 27 +++++++++++ src/wc.h | 1 + src/wc_neon.c | 109 +++++++++++++++++++++++++++++++++++++++++++++ tests/wc/wc-cpu.sh | 4 +- 9 files changed, 189 insertions(+), 4 deletions(-) create mode 100644 src/wc_neon.c diff --git a/NEWS b/NEWS index 179deca90..c509cbc1e 100644 --- a/NEWS +++ b/NEWS @@ -20,6 +20,9 @@ GNU coreutils NEWS -*- outline -*- 'nl' now supports multi-byte --section-delimiter characters. + 'wc -l' now operates up to three times faster on hosts that support Neon + instructions. + ** Build-related ./configure --enable-single-binary=hardlinks is now supported on systems diff --git a/configure.ac b/configure.ac index 2da648987..fdf8d067f 100644 --- a/configure.ac +++ b/configure.ac @@ -807,6 +807,43 @@ fi AM_CONDITIONAL([USE_AVX512_WC_LINECOUNT], [test $utils_cv_avx512_intrinsic_exists = yes]) +CFLAGS=$ac_save_CFLAGS + +CFLAGS="-march=armv8-a+simd $CFLAGS" +AC_MSG_CHECKING([for neon intrinsics]) +AC_CACHE_VAL([utils_cv_neon_intrinsic_exists],[ +AC_LINK_IFELSE( + [AC_LANG_SOURCE([[ + #include <sys/auxv.h> + #include <asm/hwcap.h> + #include <arm_neon.h> + + int + main (void) + { + char buffer[128] = {0}; + uint8x16_t v = vld1q_u8 (buffer); + uint8x16_t m = vceqq_u8 (v, v); + uint8x16_t s = vandq_u8 (m, m); + uint16x8_t a = vpaddlq_u8 (s); + uint32x4_t b = vpaddlq_u16 (a); + uint64x2_t c = vpaddlq_u32 (b); + int value = vgetq_lane_u64 (c, 0) + vgetq_lane_u64 (c, 1); + return value && 0 < (getauxval (AT_HWCAP) & HWCAP_ASIMD); + } + ]]) + ],[ + utils_cv_neon_intrinsic_exists=yes + ],[ + utils_cv_neon_intrinsic_exists=no + ])]) +AC_MSG_RESULT([$utils_cv_neon_intrinsic_exists]) +if test $utils_cv_neon_intrinsic_exists = yes; then + AC_DEFINE([USE_NEON_WC_LINECOUNT], [1], [Counting lines with Neon enabled]) +fi +AM_CONDITIONAL([USE_NEON_WC_LINECOUNT], + [test $utils_cv_neon_intrinsic_exists = yes]) + CFLAGS=$ac_save_CFLAGS 
############################################################################ diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 455930746..def1a8820 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -1599,6 +1599,7 @@ @node Hardware Acceleration @item utils_cv_avx2_intrinsic_exists @item utils_cv_avx2_pclmul_intrinsic_exists @item utils_cv_avx512_pclmul_intrinsic_exists +@item utils_cv_neon_intrinsic_exists @item utils_cv_pclmul_intrinsic_exists @item utils_cv_vmull_intrinsic_exists @end table @@ -1623,7 +1624,7 @@ @node Hardware Acceleration @example export OPENSSL_ia32cap='0x0' export OPENSSL_armcap='0x0' -export GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512F,-AVX2,-AVX,-PMULL' +export GLIBC_TUNABLES='glibc.cpu.hwcaps=-ASIMD,-AVX512F,-AVX2,-AVX,-PMULL' @end example The @option{--debug} option is available on all utilities supporting diff --git a/gnulib b/gnulib index a5a25302c..08e222ff7 160000 --- a/gnulib +++ b/gnulib @@ -1 +1 @@ -Subproject commit a5a25302cc8d20832f08c828f1d2de0961607e0f +Subproject commit 08e222ff788896d872ff6526d755d869786bbee7 diff --git a/src/local.mk b/src/local.mk index 29f07f254..bf88f7d0e 100644 --- a/src/local.mk +++ b/src/local.mk @@ -509,6 +509,13 @@ wc_avx2_ldadd = src/libwc_avx2.a src_wc_LDADD += $(wc_avx2_ldadd) src_libwc_avx2_a_CFLAGS = -mavx2 $(AM_CFLAGS) endif +if USE_NEON_WC_LINECOUNT +noinst_LIBRARIES += src/libwc_neon.a +src_libwc_neon_a_SOURCES = src/wc_neon.c +wc_neon_ldadd = src/libwc_neon.a +src_wc_LDADD += $(wc_neon_ldadd) +src_libwc_neon_a_CFLAGS = -march=armv8-a+simd $(AM_CFLAGS) +endif # Ensure we don't link against libcoreutils.a as that lib is # not compiled with -fPIC which causes issues on 64 bit at least diff --git a/src/wc.c b/src/wc.c index 76f8d21a4..eb6c3d9e4 100644 --- a/src/wc.c +++ b/src/wc.c @@ -31,6 +31,11 @@ #include <stat-size.h> #include <xbinary-io.h> +#ifdef USE_NEON_WC_LINECOUNT +# include <sys/auxv.h> +# include <asm/hwcap.h> +#endif + #include "system.h" #include "cpu-supports.h" 
#include "ioblksize.h" @@ -159,6 +164,21 @@ avx512_supported (void) } #endif +#ifdef USE_NEON_WC_LINECOUNT +static bool +neon_supported (void) +{ + bool neon_enabled = (cpu_may_support ("asimd") + && 0 < (getauxval (AT_HWCAP) & HWCAP_ASIMD)); + if (debug) + error (0, 0, (neon_enabled + ? _("using neon hardware support") + : _("neon support not detected"))); + + return neon_enabled; +} +#endif + void usage (int status) { @@ -298,6 +318,13 @@ wc_lines (int fd) if (0 < use_avx2) return wc_lines_avx2 (fd); #endif +#ifdef USE_NEON_WC_LINECOUNT + static signed char use_neon; + if (!use_neon) + use_neon = neon_supported () ? 1 : -1; + if (0 < use_neon) + return wc_lines_neon (fd); +#endif intmax_t lines = 0, bytes = 0; bool long_lines = false; diff --git a/src/wc.h b/src/wc.h index f151e92f2..fa0aef990 100644 --- a/src/wc.h +++ b/src/wc.h @@ -2,3 +2,4 @@ struct wc_lines { int err; intmax_t lines; intmax_t bytes; }; struct wc_lines wc_lines_avx2 (int); struct wc_lines wc_lines_avx512 (int); +struct wc_lines wc_lines_neon (int fd); diff --git a/src/wc_neon.c b/src/wc_neon.c new file mode 100644 index 000000000..53f82b8b4 --- /dev/null +++ b/src/wc_neon.c @@ -0,0 +1,109 @@ +/* wc_neon - Count the number of newlines with neon instructions. + Copyright (C) 2026 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Collin Funk <[email protected]>, 2026. 
*/ + +#include <config.h> + +#include "wc.h" +#include "system.h" +#include "ioblksize.h" + +#include <arm_neon.h> + +/* Read FD and return a summary. */ +extern struct wc_lines +wc_lines_neon (int fd) +{ + intmax_t lines = 0; + intmax_t bytes = 0; + + uint8x16_t endlines = vdupq_n_u8 ('\n'); + uint8x16_t ones = vdupq_n_u8 (1); + + while (true) + { + unsigned char neon_buf[IO_BUFSIZE]; + ssize_t bytes_read = read (fd, neon_buf, sizeof neon_buf); + if (bytes_read <= 0) + return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes }; + + bytes += bytes_read; + unsigned char *datap = neon_buf; + + while (64 <= bytes_read) + { + /* Load 64 bytes from NEON_BUF. */ + uint8x16_t v0 = vld1q_u8 (datap); + uint8x16_t v1 = vld1q_u8 (datap + 16); + uint8x16_t v2 = vld1q_u8 (datap + 32); + uint8x16_t v3 = vld1q_u8 (datap + 48); + + /* Bitwise equal with ENDLINES. */ + uint8x16_t m0 = vceqq_u8 (v0, endlines); + uint8x16_t m1 = vceqq_u8 (v1, endlines); + uint8x16_t m2 = vceqq_u8 (v2, endlines); + uint8x16_t m3 = vceqq_u8 (v3, endlines); + + /* Bitwise and with ONES. */ + uint8x16_t s0 = vandq_u8 (m0, ones); + uint8x16_t s1 = vandq_u8 (m1, ones); + uint8x16_t s2 = vandq_u8 (m2, ones); + uint8x16_t s3 = vandq_u8 (m3, ones); + + /* Sum the vectors. */ + uint16x8_t a0 = vpaddlq_u8 (s0); + uint16x8_t a1 = vpaddlq_u8 (s1); + uint16x8_t a2 = vpaddlq_u8 (s2); + uint16x8_t a3 = vpaddlq_u8 (s3); + uint32x4_t b0 = vpaddlq_u16 (a0); + uint32x4_t b1 = vpaddlq_u16 (a1); + uint32x4_t b2 = vpaddlq_u16 (a2); + uint32x4_t b3 = vpaddlq_u16 (a3); + uint64x2_t c0 = vpaddlq_u32 (b0); + uint64x2_t c1 = vpaddlq_u32 (b1); + uint64x2_t c2 = vpaddlq_u32 (b2); + uint64x2_t c3 = vpaddlq_u32 (b3); + + /* Extract the vectors. 
*/ + lines += (vgetq_lane_u64 (c0, 0) + vgetq_lane_u64 (c0, 1) + + vgetq_lane_u64 (c1, 0) + vgetq_lane_u64 (c1, 1) + + vgetq_lane_u64 (c2, 0) + vgetq_lane_u64 (c2, 1) + + vgetq_lane_u64 (c3, 0) + vgetq_lane_u64 (c3, 1)); + + datap += 64; + bytes_read -= 64; + } + + while (16 <= bytes_read) + { + uint8x16_t v = vld1q_u8 (datap); + uint8x16_t m = vceqq_u8 (v, endlines); + uint8x16_t s = vandq_u8 (m, ones); + uint16x8_t a = vpaddlq_u8 (s); + uint32x4_t b = vpaddlq_u16 (a); + uint64x2_t c = vpaddlq_u32 (b); + lines += vgetq_lane_u64 (c, 0) + vgetq_lane_u64 (c, 1); + datap += 16; + bytes_read -= 16; + } + + /* Finish up any left over bytes. */ + unsigned char *end = (unsigned char *) datap + bytes_read; + for (unsigned char *p = (unsigned char *) datap; p < end; p++) + lines += *p == '\n'; + } +} diff --git a/tests/wc/wc-cpu.sh b/tests/wc/wc-cpu.sh index 13366f59c..8aed6f808 100755 --- a/tests/wc/wc-cpu.sh +++ b/tests/wc/wc-cpu.sh @@ -19,7 +19,7 @@ . "${srcdir=.}/tests/init.sh"; path_prepend_ ./src print_ver_ wc -GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2,-AVX512F' \ +GLIBC_TUNABLES='glibc.cpu.hwcaps=-ASIMD,-AVX2,-AVX512F' \ wc -l --debug /dev/null 2>debug || fail=1 grep 'using.*hardware support' debug && fail=1 @@ -32,7 +32,7 @@ wc_accelerated_no_avx512=$( wc -l < lines ) || fail=1 wc_base=$( - GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2,-AVX512F' \ + GLIBC_TUNABLES='glibc.cpu.hwcaps=-ASIMD,-AVX2,-AVX512F' \ wc -l < lines ) || fail=1 -- 2.53.0
