I will have to add support for Neon in the 'cpu-supports' Gnulib
module and update tests/wc/wc-cpu.sh before pushing this.
However, that should be fairly trivial. So I thought it was best to
send this for others to review anyways.
-- 8< --
Here is an example of the performance improvement:
$ yes abcdefghijklmnopqrstuvwxyz | head -n 100000000 > input
$ time ./src/wc-prev -l < input
100000000
real 0m0.793s
user 0m0.630s
sys 0m0.162s
$ time ./src/wc -l < input
100000000
real 0m0.230s
user 0m0.065s
sys 0m0.164s
* NEWS: Mention the performance improvement.
* configure.ac: Check for the necessary intrinsics.
* src/local.mk (noinst_LIBRARIES) [USE_NEON_WC_LINECOUNT]: Add
src/libwc_neon.a.
(src_libwc_neon_a_SOURCES, wc_neon_ldadd, src_libwc_neon_a_CFLAGS)
[USE_NEON_WC_LINECOUNT]: New variables.
(src_wc_LDADD) [USE_NEON_WC_LINECOUNT]: Add $(wc_neon_ldadd).
* src/wc.c (neon_supported) [USE_NEON_WC_LINECOUNT]: New function.
(wc_lines) [USE_NEON_WC_LINECOUNT]: Use neon_supported and
wc_lines_neon.
* src/wc.h (wc_lines_neon): Add declaration.
* src/wc_neon.c: New file.
---
NEWS | 3 ++
configure.ac | 35 ++++++++++++++++
src/local.mk | 7 ++++
src/wc.c | 23 +++++++++++
src/wc.h | 1 +
src/wc_neon.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 178 insertions(+)
create mode 100644 src/wc_neon.c
diff --git a/NEWS b/NEWS
index 179deca90..c509cbc1e 100644
--- a/NEWS
+++ b/NEWS
@@ -20,6 +20,9 @@ GNU coreutils NEWS -*- outline -*-
'nl' now supports multi-byte --section-delimiter characters.
+ 'wc -l' now operates up to three times faster on hosts that support Neon
+ instructions.
+
** Build-related
./configure --enable-single-binary=hardlinks is now supported on systems
diff --git a/configure.ac b/configure.ac
index 2da648987..5e41563a0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -807,6 +807,41 @@ fi
AM_CONDITIONAL([USE_AVX512_WC_LINECOUNT],
[test $utils_cv_avx512_intrinsic_exists = yes])
+CFLAGS=$ac_save_CFLAGS
+
+CFLAGS="-march=armv8-a+simd $CFLAGS"
+AC_MSG_CHECKING([for neon intrinsics])
+AC_CACHE_VAL([utils_cv_neon_intrinsic_exists],[
+AC_LINK_IFELSE(
+ [AC_LANG_SOURCE([[
+ #include <arm_neon.h>
+
+ int
+ main (void)
+ {
+ char buffer[128] = {0};
+ uint8x16_t v = vld1q_u8 (buffer);
+ uint8x16_t m = vceqq_u8 (v, v);
+ uint8x16_t s = vandq_u8 (m, m);
+ uint16x8_t a = vpaddlq_u8 (s);
+ uint32x4_t b = vpaddlq_u16 (a);
+ uint64x2_t c = vpaddlq_u32 (b);
+ int value = vgetq_lane_u64 (c, 0) + vgetq_lane_u64 (c, 1);
+ return value;
+ }
+ ]])
+ ],[
+ utils_cv_neon_intrinsic_exists=yes
+ ],[
+ utils_cv_neon_intrinsic_exists=no
+ ])])
+AC_MSG_RESULT([$utils_cv_neon_intrinsic_exists])
+if test $utils_cv_neon_intrinsic_exists = yes; then
+ AC_DEFINE([USE_NEON_WC_LINECOUNT], [1], [Counting lines with Neon enabled])
+fi
+AM_CONDITIONAL([USE_NEON_WC_LINECOUNT],
+ [test $utils_cv_neon_intrinsic_exists = yes])
+
CFLAGS=$ac_save_CFLAGS
############################################################################
diff --git a/src/local.mk b/src/local.mk
index 29f07f254..bf88f7d0e 100644
--- a/src/local.mk
+++ b/src/local.mk
@@ -509,6 +509,13 @@ wc_avx2_ldadd = src/libwc_avx2.a
src_wc_LDADD += $(wc_avx2_ldadd)
src_libwc_avx2_a_CFLAGS = -mavx2 $(AM_CFLAGS)
endif
+if USE_NEON_WC_LINECOUNT
+noinst_LIBRARIES += src/libwc_neon.a
+src_libwc_neon_a_SOURCES = src/wc_neon.c
+wc_neon_ldadd = src/libwc_neon.a
+src_wc_LDADD += $(wc_neon_ldadd)
+src_libwc_neon_a_CFLAGS = -march=armv8-a+simd $(AM_CFLAGS)
+endif
# Ensure we don't link against libcoreutils.a as that lib is
# not compiled with -fPIC which causes issues on 64 bit at least
diff --git a/src/wc.c b/src/wc.c
index 76f8d21a4..11f1ef6ab 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -159,6 +159,22 @@ avx512_supported (void)
}
#endif
+#ifdef USE_NEON_WC_LINECOUNT
+static bool
+neon_supported (void)
+{
+ /* FIXME: Neon is assumed present until cpu_supports can detect it, i.e.: */
+ /* bool neon_enabled = cpu_supports ("neon"); */
+ bool neon_enabled = true;
+ if (debug)
+ error (0, 0, (neon_enabled
+ ? _("using neon hardware support")
+ : _("neon support not detected")));
+
+ return neon_enabled;
+}
+#endif
+
void
usage (int status)
{
@@ -298,6 +314,13 @@ wc_lines (int fd)
if (0 < use_avx2)
return wc_lines_avx2 (fd);
#endif
+#ifdef USE_NEON_WC_LINECOUNT
+ static signed char use_neon;
+ if (!use_neon)
+ use_neon = neon_supported () ? 1 : -1;
+ if (0 < use_neon)
+ return wc_lines_neon (fd);
+#endif
intmax_t lines = 0, bytes = 0;
bool long_lines = false;
diff --git a/src/wc.h b/src/wc.h
index f151e92f2..fa0aef990 100644
--- a/src/wc.h
+++ b/src/wc.h
@@ -2,3 +2,4 @@
struct wc_lines { int err; intmax_t lines; intmax_t bytes; };
struct wc_lines wc_lines_avx2 (int);
struct wc_lines wc_lines_avx512 (int);
+struct wc_lines wc_lines_neon (int fd);
diff --git a/src/wc_neon.c b/src/wc_neon.c
new file mode 100644
index 000000000..53f82b8b4
--- /dev/null
+++ b/src/wc_neon.c
@@ -0,0 +1,109 @@
+/* wc_neon - Count the number of newlines with neon instructions.
+ Copyright (C) 2026 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+/* Written by Collin Funk <[email protected]>, 2026. */
+
+#include <config.h>
+
+#include "wc.h"
+#include "system.h"
+#include "ioblksize.h"
+
+#include <arm_neon.h>
+
+/* Read FD to EOF or error. Return errno (0 at EOF) with line and byte counts. */
+extern struct wc_lines
+wc_lines_neon (int fd)
+{
+ intmax_t lines = 0;
+ intmax_t bytes = 0;
+
+ uint8x16_t endlines = vdupq_n_u8 ('\n');
+ uint8x16_t ones = vdupq_n_u8 (1);
+
+ while (true)
+ {
+ unsigned char neon_buf[IO_BUFSIZE];
+ ssize_t bytes_read = read (fd, neon_buf, sizeof neon_buf);
+ if (bytes_read <= 0)
+ return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes };
+
+ bytes += bytes_read;
+ unsigned char *datap = neon_buf;
+
+ while (64 <= bytes_read)
+ {
+ /* Load the next 64 bytes of NEON_BUF as four 16-byte vectors. */
+ uint8x16_t v0 = vld1q_u8 (datap);
+ uint8x16_t v1 = vld1q_u8 (datap + 16);
+ uint8x16_t v2 = vld1q_u8 (datap + 32);
+ uint8x16_t v3 = vld1q_u8 (datap + 48);
+
+ /* Compare each byte with ENDLINES; equal lanes are set to all ones. */
+ uint8x16_t m0 = vceqq_u8 (v0, endlines);
+ uint8x16_t m1 = vceqq_u8 (v1, endlines);
+ uint8x16_t m2 = vceqq_u8 (v2, endlines);
+ uint8x16_t m3 = vceqq_u8 (v3, endlines);
+
+ /* Mask with ONES so each matching lane holds 1 and can be summed. */
+ uint8x16_t s0 = vandq_u8 (m0, ones);
+ uint8x16_t s1 = vandq_u8 (m1, ones);
+ uint8x16_t s2 = vandq_u8 (m2, ones);
+ uint8x16_t s3 = vandq_u8 (m3, ones);
+
+ /* Pairwise widening adds: 16 x u8 -> 8 x u16 -> 4 x u32 -> 2 x u64. */
+ uint16x8_t a0 = vpaddlq_u8 (s0);
+ uint16x8_t a1 = vpaddlq_u8 (s1);
+ uint16x8_t a2 = vpaddlq_u8 (s2);
+ uint16x8_t a3 = vpaddlq_u8 (s3);
+ uint32x4_t b0 = vpaddlq_u16 (a0);
+ uint32x4_t b1 = vpaddlq_u16 (a1);
+ uint32x4_t b2 = vpaddlq_u16 (a2);
+ uint32x4_t b3 = vpaddlq_u16 (a3);
+ uint64x2_t c0 = vpaddlq_u32 (b0);
+ uint64x2_t c1 = vpaddlq_u32 (b1);
+ uint64x2_t c2 = vpaddlq_u32 (b2);
+ uint64x2_t c3 = vpaddlq_u32 (b3);
+
+ /* Add both 64-bit lanes of each vector to the running line count. */
+ lines += (vgetq_lane_u64 (c0, 0) + vgetq_lane_u64 (c0, 1)
+ + vgetq_lane_u64 (c1, 0) + vgetq_lane_u64 (c1, 1)
+ + vgetq_lane_u64 (c2, 0) + vgetq_lane_u64 (c2, 1)
+ + vgetq_lane_u64 (c3, 0) + vgetq_lane_u64 (c3, 1));
+
+ datap += 64;
+ bytes_read -= 64;
+ }
+
+ while (16 <= bytes_read)
+ {
+ uint8x16_t v = vld1q_u8 (datap);
+ uint8x16_t m = vceqq_u8 (v, endlines);
+ uint8x16_t s = vandq_u8 (m, ones);
+ uint16x8_t a = vpaddlq_u8 (s);
+ uint32x4_t b = vpaddlq_u16 (a);
+ uint64x2_t c = vpaddlq_u32 (b);
+ lines += vgetq_lane_u64 (c, 0) + vgetq_lane_u64 (c, 1);
+ datap += 16;
+ bytes_read -= 16;
+ }
+
+ /* Count newlines one byte at a time in the tail of fewer than 16 bytes. */
+ unsigned char *end = (unsigned char *) datap + bytes_read;
+ for (unsigned char *p = (unsigned char *) datap; p < end; p++)
+ lines += *p == '\n';
+ }
+}
--
2.53.0