From 1b091165d1ed2d1ed9e575bfab4f1b1808a85f04 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@fb.com>
Date: Sat, 18 Mar 2023 23:25:03 -0700
Subject: [PATCH] grep: -P (--perl-regexp) \D once again works like [^0-9]

* NEWS: Mention \D, too.
* doc/grep.texi: Likewise.
* src/grep.c (pcre_pattern_expand_backslash_d): Handle \D.
* tests/pcre-ascii-digits: Test \D, too. Add comments.
Tighten one test by using returns_ 1.
Reported by Paul Eggert in https://bugs.gnu.org/62267#8
---
 NEWS                    | 2 +-
 doc/grep.texi           | 1 +
 src/grep.c              | 7 +++++--
 tests/pcre-ascii-digits | 9 ++++++++-
 4 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/NEWS b/NEWS
index a24cebd..6f77d16 100644
--- a/NEWS
+++ b/NEWS
@@ -9,7 +9,7 @@ GNU grep NEWS                                    -*- outline -*-
   properly had the undesirable side effect of making \d also match
   e.g., the Arabic digits: ٠١٢٣٤٥٦٧٨٩.  With grep-3.9, -P '\d+'
   would match that ten-digit (20-byte) string. Now, to match such
-  a digit, you would use \p{Nd}.
+  a digit, you would use \p{Nd}. Similarly, \D is now mapped to [^0-9].
   [bug introduced in grep 3.9]


diff --git a/doc/grep.texi b/doc/grep.texi
index eaad6e1..ad034f1 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -1149,6 +1149,7 @@ in-regexp directives like @samp{(?aD)}.
 Use @samp{\p@{Nd@}} if you require to match non-ASCII digits.
 Once pcre2 support for @samp{(?aD)} is widespread enough,
 we expect to make that the default, so it will be overridable.
+Similarly, @samp{\D} matches anything but those ten ASCII digits.
 @c Using pcre2 git commit pcre2-10.40-112-g6277357, this demonstrates how
 @c we'll prefix with (?aD) to make \d's ASCII-only behavior the default:
 @c $ LC_ALL=en_US.UTF-8 ./pcre2grep -u '(?aD)^\d+' <<< '٠١٢٣٤٥٦٧٨٩'
diff --git a/src/grep.c b/src/grep.c
index 6ba881e..79459f3 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -2382,14 +2382,14 @@ fgrep_to_grep_pattern (char **keys_p, idx_t *len_p)
 /* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII
    digits.  Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise
    match non-ASCII digits in some locales.  Use \p{Nd} if you require to match
-   those.  */
+   those.  Similarly, replace each \D with [^0-9].  */
 static void
 pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
 {
   idx_t len = *len_p;
   char *keys = *keys_p;
   mbstate_t mb_state = { 0 };
-  char *new_keys = xnmalloc (len / 2 + 1, 5);
+  char *new_keys = xnmalloc (len / 2 + 1, 6);
   char *p = new_keys;
   bool prev_backslash = false;

@@ -2428,6 +2428,9 @@ pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
                 case 'd':
                   p = mempcpy (p, "[0-9]", 5);
                   break;
+                case 'D':
+                  p = mempcpy (p, "[^0-9]", 6);
+                  break;
                 default:
                   *p++ = '\\';
                   *p++ = *keys;
diff --git a/tests/pcre-ascii-digits b/tests/pcre-ascii-digits
index ae713f7..0159286 100755
--- a/tests/pcre-ascii-digits
+++ b/tests/pcre-ascii-digits
@@ -1,6 +1,7 @@
 #!/bin/sh
 # Ensure that grep -P's \d matches only the 10 ASCII digits.
 # With, grep-3.9, \d would match e.g., the multibyte Arabic digits.
+# Likewise, ensure that \D matches anything but those 10 ASCII digits.
 #
 # Copyright (C) 2023 Free Software Foundation, Inc.
 #
@@ -24,8 +25,14 @@ fail=0
 # \331\245\331\246\331\247\331\250\331\251
 printf '\331\240\331\241\331\242\331\243\331\244' > in || framework_failure_
 printf '\331\245\331\246\331\247\331\250\331\251' >> in || framework_failure_
+printf '\n' >> in || framework_failure_

-grep -P '\d+' in > out && fail=1
+# Ensure that \d matches no character.
+returns_ 1 grep -P '\d' in > out || fail=1
 compare /dev/null out || fail=1

+# Ensure that ^\D+$ matches the entire line.
+grep -P '^\D+$' in > out || fail=1
+compare in out || fail=1
+
 Exit $fail
-- 
2.40.0.rc2

