bug#62267: grep-3.9 bug: \d matches multibyte digits

Paul Eggert Sun, 19 Mar 2023 01:30:05 -0700

On 2023-03-18 23:33, Jim Meyering wrote:

> By the way, have you ever used \D? I think I have not.

No, I'm not much of a Perl user these days (last seriously used it in the 1990s...).

> -  char *new_keys = xnmalloc (len / 2 + 1, 5);
> +  char *new_keys = xnmalloc (len / 2 + 1, 6);


This could be xnmalloc (len + 1, 3).

Or if you want to show the work, you can replace it with something like:

   int origlen = sizeof "\\D" - 1;
   int repllen = sizeof "[^0-9]" - 1;
   int expansion = repllen / origlen + (repllen % origlen != 0);
   char *new_keys = xnmalloc (len + 1, expansion);

(Isn't memory allocation fun? :-)

> Doesn't Perl have the same issue?


Oh, you're right. Not being a Perl expert, all I did was run this:

  echo '٠١٢٣٤٥٦٧٨٩' | perl -ne 'print if /\d/'

and I observed no output. However, I now see that I need to use perl's -C option too, to get the kind of regular-expression behavior that plain grep has.

Looking at the source code again, how about if we move the PCRE-specific changes from src/grep.c to src/pcresearch.c which is where it really belongs, and more importantly use the bleeding-edge PCRE2_EXTRA_ASCII_BSD macro if available?

Something like the attached patch, say. This patch doesn't take your \D fixes (or the above suggestions) into account.

From ed7fa801963aaf526f7725741d095c80ad944731 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 19 Mar 2023 01:23:51 -0700
Subject: [PATCH] grep: forward port to PCRE2 10.43

* doc/grep.texi: Document this, and version hassles.
* src/grep.c: Move recent changes into pcresearch.c.
(P_MATCHER_INDEX): Remove.
(pcre_pattern_expand_backslash_d): Move from here ...
* src/pcresearch.c: ... to here.
(PCRE2_EXTRA_ASCII_BSD): Default to 0.
(Pcompile): Use PCRE2_EXTRA_ASCII_BSD if available,
and expand \d to [0-9] otherwise.
---
 doc/grep.texi    | 24 ++++++++++----
 src/grep.c       | 82 +---------------------------------------------
 src/pcresearch.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 101 insertions(+), 89 deletions(-)

diff --git a/doc/grep.texi b/doc/grep.texi
index eaad6e1..8c8baa9 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -1144,18 +1144,28 @@ combined with the @option{-z} (@option{--null-data}) option, and note that
 For documentation, refer to @url{https://www.pcre.org/}, with these caveats:
 @itemize
 @item
-@samp{\d} always matches only the ten ASCII digits, regardless of locale or
-in-regexp directives like @samp{(?aD)}.
-Use @samp{\p@{Nd@}} if you require to match non-ASCII digits.
-Once pcre2 support for @samp{(?aD)} is widespread enough,
-we expect to make that the default, so it will be overridable.
-@c Using pcre2 git commit pcre2-10.40-112-g6277357, this demonstrates how
-@c we'll prefix with (?aD) to make \d's ASCII-only behavior the default:
+@samp{\d} matches only the ten ASCII digits, regardless of locale.
+Use @samp{\p@{Nd@}} to also match non-ASCII digits.
+
+When @command{grep} is built with PCRE2 10.42 and earlier, @samp{\d}
+ignores in-regexp directives like @samp{(?aD)} and matches only ASCII
+digits regardless of these directives.  However, later versions of
+PCRE2 likely will fix this, and the plan is for @command{grep} to
+respect those directives if possible.
+@c Using PCRE2 git commit pcre2-10.40-112-g6277357, this demonstrates
+@c the equivalent of how grep could use PCRE2_EXTRA_ASCII_BSD to make \d's
+@c ASCII-only behavior the default:
 @c $ LC_ALL=en_US.UTF-8 ./pcre2grep -u '(?aD)^\d+' <<< '٠١٢٣٤٥٦٧٨٩'
 @c [Exit 1]
 @c $ LC_ALL=en_US.UTF-8 ./pcre2grep -u '^\d+' <<< '٠١٢٣٤٥٦٧٨٩'
 @c ٠١٢٣٤٥٦٧٨٩
 
+@item
+Although PCRE2 tracks the syntax and semantics of Perl's regular
+expressions, the match is not always exact, partly because Perl
+evolves and a Perl installation may predate or postdate the PCRE2
+installation on the same host.
+
 @item
 By default, @command{grep} applies each regexp to a line at a time,
 so the @samp{(?s)} directive (making @samp{.} match line breaks)
diff --git a/src/grep.c b/src/grep.c
index 6ba881e..7547b64 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -2089,8 +2089,7 @@ static struct
 #endif
 };
 /* Keep these in sync with the 'matchers' table.  */
-enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0,
-       P_MATCHER_INDEX = 6 };
+enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0 };
 
 /* Return the index of the matcher corresponding to M if available.
    MATCHER is the index of the previous matcher, or -1 if none.
@@ -2379,80 +2378,6 @@ fgrep_to_grep_pattern (char **keys_p, idx_t *len_p)
   *len_p = p - new_keys;
 }
 
-/* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII
-   digits.  Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise
-   match non-ASCII digits in some locales.  Use \p{Nd} if you require to match
-   those.  */
-static void
-pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
-{
-  idx_t len = *len_p;
-  char *keys = *keys_p;
-  mbstate_t mb_state = { 0 };
-  char *new_keys = xnmalloc (len / 2 + 1, 5);
-  char *p = new_keys;
-  bool prev_backslash = false;
-
-  for (ptrdiff_t n; len; keys += n, len -= n)
-    {
-      n = mb_clen (keys, len, &mb_state);
-      switch (n)
-        {
-        case -2:
-          n = len;
-          FALLTHROUGH;
-        default:
-          if (prev_backslash)
-            {
-              prev_backslash = false;
-              *p++ = '\\';
-            }
-          p = mempcpy (p, keys, n);
-          break;
-
-        case -1:
-          if (prev_backslash)
-            {
-              prev_backslash = false;
-              *p++ = '\\';
-            }
-          memset (&mb_state, 0, sizeof mb_state);
-          n = 1;
-          FALLTHROUGH;
-        case 1:
-          if (prev_backslash)
-            {
-              prev_backslash = false;
-              switch (*keys)
-                {
-                case 'd':
-                  p = mempcpy (p, "[0-9]", 5);
-                  break;
-                default:
-                  *p++ = '\\';
-                  *p++ = *keys;
-                  break;
-                }
-            }
-          else
-            {
-              if (*keys == '\\')
-                prev_backslash = true;
-              else
-                *p++ = *keys;
-            }
-          break;
-        }
-    }
-
-  if (prev_backslash)
-    *p++ = '\\';
-  *p = '\n';
-  free (*keys_p);
-  *keys_p = new_keys;
-  *len_p = p - new_keys;
-}
-
 /* If it is easy, convert the MATCHER-style patterns KEYS (of size
    *LEN_P) to -F style, update *LEN_P to a possibly-smaller value, and
    return F_MATCHER_INDEX.  If not, leave KEYS and *LEN_P alone and
@@ -3045,11 +2970,6 @@ main (int argc, char **argv)
         matcher = try_fgrep_pattern (matcher, keys, &keycc);
     }
 
-  /* If -P, replace each \d with [0-9].
-     Those who want to match non-ASCII digits must use \p{Nd}.  */
-  if (matcher == P_MATCHER_INDEX)
-    pcre_pattern_expand_backslash_d (&keys, &keycc);
-
   execute = matchers[matcher].execute;
   compiled_pattern =
     matchers[matcher].compile (keys, keycc, matchers[matcher].syntax,
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 5b111be..3a0fa60 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -35,6 +35,9 @@
 # define PCRE2_ERROR_DEPTHLIMIT PCRE2_ERROR_RECURSIONLIMIT
 # define pcre2_set_depth_limit pcre2_set_recursion_limit
 #endif
+#ifndef PCRE2_EXTRA_ASCII_BSD
+# define PCRE2_EXTRA_ASCII_BSD 0
+#endif
 
 struct pcre_comp
 {
@@ -130,12 +133,89 @@ bad_utf8_from_pcre2 (int e)
 #endif
 }
 
+/* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII
+   digits.  Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise
+   match non-ASCII digits in some locales.  Use \p{Nd} if you require to match
+   those.  */
+static void
+pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
+{
+  idx_t len = *len_p;
+  char *keys = *keys_p;
+  mbstate_t mb_state = { 0 };
+  char *new_keys = xnmalloc (len / 2 + 1, 5);
+  char *p = new_keys;
+  bool prev_backslash = false;
+
+  for (ptrdiff_t n; len; keys += n, len -= n)
+    {
+      n = mb_clen (keys, len, &mb_state);
+      switch (n)
+        {
+        case -2:
+          n = len;
+          FALLTHROUGH;
+        default:
+          if (prev_backslash)
+            {
+              prev_backslash = false;
+              *p++ = '\\';
+            }
+          p = mempcpy (p, keys, n);
+          break;
+
+        case -1:
+          if (prev_backslash)
+            {
+              prev_backslash = false;
+              *p++ = '\\';
+            }
+          memset (&mb_state, 0, sizeof mb_state);
+          n = 1;
+          FALLTHROUGH;
+        case 1:
+          if (prev_backslash)
+            {
+              prev_backslash = false;
+              switch (*keys)
+                {
+                case 'd':
+                  p = mempcpy (p, "[0-9]", 5);
+                  break;
+                default:
+                  *p++ = '\\';
+                  *p++ = *keys;
+                  break;
+                }
+            }
+          else
+            {
+              if (*keys == '\\')
+                prev_backslash = true;
+              else
+                *p++ = *keys;
+            }
+          break;
+        }
+    }
+
+  if (prev_backslash)
+    *p++ = '\\';
+  *p = '\n';
+  free (*keys_p);
+  *keys_p = new_keys;
+  *len_p = p - new_keys;
+}
+
 /* Compile the -P style PATTERN, containing SIZE bytes that are
    followed by '\n'.  Return a description of the compiled pattern.  */
 
 void *
 Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
 {
+  if (! PCRE2_EXTRA_ASCII_BSD)
+    pcre_pattern_expand_backslash_d (&pattern, &size);
+
   PCRE2_SIZE e;
   int ec;
   int flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0);
@@ -172,7 +252,9 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
   if (match_lines)
     {
 #ifdef PCRE2_EXTRA_MATCH_LINE
-      pcre2_set_compile_extra_options (ccontext, PCRE2_EXTRA_MATCH_LINE);
+      pcre2_set_compile_extra_options (ccontext,
+                                       (PCRE2_EXTRA_MATCH_LINE
+                                        | PCRE2_EXTRA_ASCII_BSD));
 #else
       static char const /* These sizes omit trailing NUL.  */
         xprefix[4] = "^(?:", xsuffix[2] = ")$";
-- 
2.39.2

bug#62267: grep-3.9 bug: \d matches multibyte digits

Reply via email to