git: a74c77cc7bed - main - grep(1): optimize -w/--word-regexp word boundary check

Baptiste Daroussin Sun, 14 Jun 2026 11:14:46 -0700

The branch main has been updated by bapt:

URL: https://cgit.FreeBSD.org/src/commit/?id=a74c77cc7bed8dba50e976a7be2aa0094ee27b61


commit a74c77cc7bed8dba50e976a7be2aa0094ee27b61
Author:     Baptiste Daroussin <[email protected]>
AuthorDate: 2026-06-10 14:41:39 +0000
Commit:     Baptiste Daroussin <[email protected]>
CommitDate: 2026-06-14 18:14:31 +0000

    grep(1): optimize -w/--word-regexp word boundary check
    
    The -w option checks word boundaries before and after each potential
    match by decoding the adjacent character.  This was done via the
    heavyweight sscanf(3) with "%lc", which goes through the full scanf
    parser and locale-aware mbrtowc(3) machinery even for simple ASCII.
    
    Replace with a three-tier fast path:
    
    1. ASCII bytes (< 0x80): simple isalnum(3) / '_' comparison
    2. UTF-8 continuation bytes (0x80-0xBF): interior bytes of a multi-byte
       character are always word characters -> no further decoding needed
    3. Multi-byte start bytes (>= 0xC0): decode with mbrtowc(3) directly
       instead of sscanf(3)/%lc, avoiding scanf parser overhead
    
    Benchmark with ministat(1) (10 runs each):
    
    Worst-case ASCII (100k lines of 100 'a' chars, -w 'a'):
        Difference at 95.0% confidence: -15.3% +/- 3.1%
    
    Worst-case Unicode (50k lines of 100 accented 'e', -w 'e'):
        Difference at 95.0% confidence: -11.2% +/- 4.7%
    
    Normal -w (500k lines, -w 'the'):
        Difference at 95.0% confidence: -18.1% +/- 3.6%
    
    French text (100k lines, -w accented 'ete'):
        Difference at 95.0% confidence: -18.0% +/- 4.1%
    
    Non -w case shows no regression.
    
    Reviewed by:    kevans
    Differential Revision:  https://reviews.freebsd.org/D57587
---
 usr.bin/grep/util.c | 44 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/usr.bin/grep/util.c b/usr.bin/grep/util.c
index dbb21dcfd78e..bbb174370bd5 100644
--- a/usr.bin/grep/util.c
+++ b/usr.bin/grep/util.c
@@ -490,6 +490,35 @@ litexec(const struct pat *pat, const char *string, size_t nmatch,
 
 #define iswword(x)     (iswalnum((x)) || (x) == L'_')
 
+/*
+ * Check if the byte at the given offset in the line is a word character
+ * (alphanumeric or _).  Handles ASCII fast path, UTF-8 continuation bytes,
+ * and multi-byte decoding via mbrtowc(3).
+ */
+static bool
+iswordchar(const char *dat, size_t len, size_t offset)
+{
+       unsigned char ch;
+       mbstate_t mbstate;
+       wchar_t wc;
+       size_t n;
+
+       if (offset >= len)
+               return (false);
+
+       ch = (unsigned char)dat[offset];
+       if (ch < 0x80)
+               return (isalnum(ch) || ch == '_');
+       if ((ch & 0xC0) == 0x80)
+               /* Continuation byte: part of a word */
+               return (true);
+
+       /* Multi-byte start byte: decode with mbrtowc */
+       memset(&mbstate, 0, sizeof(mbstate));
+       n = mbrtowc(&wc, &dat[offset], MB_CUR_MAX, &mbstate);
+       return (n == (size_t)-1 || n == (size_t)-2 || iswword(wc));
+}
+
 /*
  * Processes a line comparing it with the specified patterns.  Each pattern
  * is looped to be compared along with the full string, saving each and every
@@ -501,7 +530,6 @@ static bool
 procline(struct parsec *pc)
 {
        regmatch_t pmatch, lastmatch, chkmatch;
-       wchar_t wbegin, wend;
        size_t st, nst;
        unsigned int i;
        int r = 0, leflags = eflags;
@@ -567,18 +595,14 @@ procline(struct parsec *pc)
                                continue;
                        /* Check for whole word match */
                        if (wflag) {
-                               wbegin = wend = L' ';
                                if (pmatch.rm_so != 0 &&
-                                   sscanf(&pc->ln.dat[pmatch.rm_so - 1],
-                                   "%lc", &wbegin) != 1)
+                                   iswordchar(pc->ln.dat, pc->ln.len,
+                                   pmatch.rm_so - 1))
                                        r = REG_NOMATCH;
-                               else if ((size_t)pmatch.rm_eo !=
+                               if (r == 0 && (size_t)pmatch.rm_eo !=
                                    pc->ln.len &&
-                                   sscanf(&pc->ln.dat[pmatch.rm_eo],
-                                   "%lc", &wend) != 1)
-                                       r = REG_NOMATCH;
-                               else if (iswword(wbegin) ||
-                                   iswword(wend))
+                                   iswordchar(pc->ln.dat, pc->ln.len,
+                                   pmatch.rm_eo))
                                        r = REG_NOMATCH;
                                /*
                                 * If we're doing whole word matching and we

git: a74c77cc7bed - main - grep(1): optimize -w/--word-regexp word boundary check

Reply via email to