On 2023-03-18 23:33, Jim Meyering wrote:
By the way, have you ever used \D? I think I have not.
No, I'm not much of a Perl user these days (last seriously used it in
the 1990s...).
- char *new_keys = xnmalloc (len / 2 + 1, 5);
+ char *new_keys = xnmalloc (len / 2 + 1, 6);
This could be xnmalloc (len + 1, 3).
Or if you want to show the work, you can replace it with something like:
int origlen = sizeof "\\D" - 1;
int repllen = sizeof "[^0-9]" - 1;
int expansion = repllen / origlen + (repllen % origlen != 0);
char *new_keys = xnmalloc (len + 1, expansion);
(Isn't memory allocation fun? :-)
Doesn't Perl have the same issue?
Oh, you're right. Not being a Perl expert, all I did was run this:
echo '٠١٢٣٤٥٦٧٨٩' | perl -ne 'print if /\d/'
and I observed no output. However, I now see that I need to use perl's
-C option too, to get the kind of regular-expression behavior that plain
grep has.
Looking at the source code again, how about if we move the PCRE-specific
changes from src/grep.c to src/pcresearch.c which is where it really
belongs, and more importantly use the bleeding-edge
PCRE2_EXTRA_ASCII_BSD macro if available?
Something like the attached patch, say. This patch doesn't take your \D
fixes (or the above suggestions) into account.
From ed7fa801963aaf526f7725741d095c80ad944731 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 19 Mar 2023 01:23:51 -0700
Subject: [PATCH] grep: forward port to PCRE2 10.43
* doc/grep.texi: Document this, and version hassles.
* src/grep.c: Move recent changes into pcresearch.c.
(P_MATCHER_INDEX): Remove.
(pcre_pattern_expand_backslash_d): Move from here ...
* src/pcresearch.c: ... to here.
(PCRE2_EXTRA_ASCII_BSD): Default to 0.
(Pcompile): Use PCRE2_EXTRA_ASCII_BSD if available,
and expand \d to [0-9] otherwise.
---
doc/grep.texi | 24 ++++++++++----
src/grep.c | 82 +---------------------------------------------
src/pcresearch.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 101 insertions(+), 89 deletions(-)
diff --git a/doc/grep.texi b/doc/grep.texi
index eaad6e1..8c8baa9 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -1144,18 +1144,28 @@ combined with the @option{-z} (@option{--null-data}) option, and note that
For documentation, refer to @url{https://www.pcre.org/}, with these caveats:
@itemize
@item
-@samp{\d} always matches only the ten ASCII digits, regardless of locale or
-in-regexp directives like @samp{(?aD)}.
-Use @samp{\p@{Nd@}} if you require to match non-ASCII digits.
-Once pcre2 support for @samp{(?aD)} is widespread enough,
-we expect to make that the default, so it will be overridable.
-@c Using pcre2 git commit pcre2-10.40-112-g6277357, this demonstrates how
-@c we'll prefix with (?aD) to make \d's ASCII-only behavior the default:
+@samp{\d} matches only the ten ASCII digits, regardless of locale.
+Use @samp{\p@{Nd@}} to also match non-ASCII digits.
+
+When @command{grep} is built with PCRE2 10.42 or earlier, @samp{\d}
+ignores in-regexp directives like @samp{(?aD)} and matches only ASCII
+digits regardless of these directives. However, later versions of
+PCRE2 will likely fix this, and the plan is for @command{grep} to
+respect those directives if possible.
+@c Using PCRE2 git commit pcre2-10.40-112-g6277357, this demonstrates
+@c the equivalent of how grep could use PCRE2_EXTRA_ASCII_BSD to make \d's
+@c ASCII-only behavior the default:
@c $ LC_ALL=en_US.UTF-8 ./pcre2grep -u '(?aD)^\d+' <<< '٠١٢٣٤٥٦٧٨٩'
@c [Exit 1]
@c $ LC_ALL=en_US.UTF-8 ./pcre2grep -u '^\d+' <<< '٠١٢٣٤٥٦٧٨٩'
@c ٠١٢٣٤٥٦٧٨٩
+@item
+Although PCRE2 tracks the syntax and semantics of Perl's regular
+expressions, the match is not always exact, partly because Perl
+evolves and a Perl installation may predate or postdate the PCRE2
+installation on the same host.
+
@item
By default, @command{grep} applies each regexp to a line at a time,
so the @samp{(?s)} directive (making @samp{.} match line breaks)
diff --git a/src/grep.c b/src/grep.c
index 6ba881e..7547b64 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -2089,8 +2089,7 @@ static struct
#endif
};
/* Keep these in sync with the 'matchers' table. */
-enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0,
- P_MATCHER_INDEX = 6 };
+enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0 };
/* Return the index of the matcher corresponding to M if available.
MATCHER is the index of the previous matcher, or -1 if none.
@@ -2379,80 +2378,6 @@ fgrep_to_grep_pattern (char **keys_p, idx_t *len_p)
*len_p = p - new_keys;
}
-/* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII
- digits. Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise
- match non-ASCII digits in some locales. Use \p{Nd} if you require to match
- those. */
-static void
-pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
-{
- idx_t len = *len_p;
- char *keys = *keys_p;
- mbstate_t mb_state = { 0 };
- char *new_keys = xnmalloc (len / 2 + 1, 5);
- char *p = new_keys;
- bool prev_backslash = false;
-
- for (ptrdiff_t n; len; keys += n, len -= n)
- {
- n = mb_clen (keys, len, &mb_state);
- switch (n)
- {
- case -2:
- n = len;
- FALLTHROUGH;
- default:
- if (prev_backslash)
- {
- prev_backslash = false;
- *p++ = '\\';
- }
- p = mempcpy (p, keys, n);
- break;
-
- case -1:
- if (prev_backslash)
- {
- prev_backslash = false;
- *p++ = '\\';
- }
- memset (&mb_state, 0, sizeof mb_state);
- n = 1;
- FALLTHROUGH;
- case 1:
- if (prev_backslash)
- {
- prev_backslash = false;
- switch (*keys)
- {
- case 'd':
- p = mempcpy (p, "[0-9]", 5);
- break;
- default:
- *p++ = '\\';
- *p++ = *keys;
- break;
- }
- }
- else
- {
- if (*keys == '\\')
- prev_backslash = true;
- else
- *p++ = *keys;
- }
- break;
- }
- }
-
- if (prev_backslash)
- *p++ = '\\';
- *p = '\n';
- free (*keys_p);
- *keys_p = new_keys;
- *len_p = p - new_keys;
-}
-
/* If it is easy, convert the MATCHER-style patterns KEYS (of size
*LEN_P) to -F style, update *LEN_P to a possibly-smaller value, and
return F_MATCHER_INDEX. If not, leave KEYS and *LEN_P alone and
@@ -3045,11 +2970,6 @@ main (int argc, char **argv)
matcher = try_fgrep_pattern (matcher, keys, &keycc);
}
- /* If -P, replace each \d with [0-9].
- Those who want to match non-ASCII digits must use \p{Nd}. */
- if (matcher == P_MATCHER_INDEX)
- pcre_pattern_expand_backslash_d (&keys, &keycc);
-
execute = matchers[matcher].execute;
compiled_pattern =
matchers[matcher].compile (keys, keycc, matchers[matcher].syntax,
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 5b111be..3a0fa60 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -35,6 +35,9 @@
# define PCRE2_ERROR_DEPTHLIMIT PCRE2_ERROR_RECURSIONLIMIT
# define pcre2_set_depth_limit pcre2_set_recursion_limit
#endif
+#ifndef PCRE2_EXTRA_ASCII_BSD /* option is missing from older PCRE2 headers */
+# define PCRE2_EXTRA_ASCII_BSD 0 /* 0 makes Pcompile rewrite \d to [0-9] itself */
+#endif /* NOTE(review): the visible Pcompile hunk passes this flag only under match_lines; confirm it is also set otherwise */
struct pcre_comp
{
@@ -130,12 +133,89 @@ bad_utf8_from_pcre2 (int e)
#endif
}
+/* Replace each \d in *KEYS_P with [0-9], so that \d matches only ASCII digits;
+ with PCRE2_UCP enabled, \d would otherwise match non-ASCII digits in some
+ locales (use \p{Nd} to match those). Free the old *KEYS_P, then set *KEYS_P
+ and *LEN_P to the new buffer and its length; a newline follows those bytes. */
+static void
+pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
+{
+ idx_t len = *len_p;
+ char *keys = *keys_p;
+ mbstate_t mb_state = { 0 };
+ char *new_keys = xnmalloc (len / 2 + 1, 5); /* sized for the worst case: every 2-byte \d grows to 5 bytes, plus the newline (assumes xnmalloc (n, s) yields n * s bytes) */
+ char *p = new_keys; /* next output position */
+ bool prev_backslash = false; /* a backslash was read but not yet written */
+
+ for (ptrdiff_t n; len; keys += n, len -= n) /* N is the number of input bytes consumed per step */
+ {
+ n = mb_clen (keys, len, &mb_state);
+ switch (n)
+ {
+ case -2: /* presumably an incomplete multibyte sequence: consume all remaining bytes */
+ n = len;
+ FALLTHROUGH;
+ default: /* N is not 1 or -1 here: copy N bytes unchanged, after any pending backslash */
+ if (prev_backslash)
+ {
+ prev_backslash = false;
+ *p++ = '\\';
+ }
+ p = mempcpy (p, keys, n);
+ break;
+
+ case -1: /* presumably an encoding error: flush any pending backslash, then handle one byte */
+ if (prev_backslash)
+ {
+ prev_backslash = false;
+ *p++ = '\\';
+ }
+ memset (&mb_state, 0, sizeof mb_state); /* return to the initial conversion state */
+ n = 1;
+ FALLTHROUGH;
+ case 1: /* single byte: the only case where an escape is interpreted */
+ if (prev_backslash)
+ {
+ prev_backslash = false;
+ switch (*keys)
+ {
+ case 'd': /* \d becomes [0-9]; NOTE(review): no bracket tracking, so [\d] becomes [[0-9]] */
+ p = mempcpy (p, "[0-9]", 5);
+ break;
+ default: /* every other escape, including an escaped backslash, is copied unchanged */
+ *p++ = '\\';
+ *p++ = *keys;
+ break;
+ }
+ }
+ else
+ {
+ if (*keys == '\\')
+ prev_backslash = true; /* hold it until the next byte shows what it escapes */
+ else
+ *p++ = *keys;
+ }
+ break;
+ }
+ }
+
+ if (prev_backslash) /* input ended with a pending backslash: keep it */
+ *p++ = '\\';
+ *p = '\n'; /* trailing newline, not counted in *LEN_P */
+ free (*keys_p);
+ *keys_p = new_keys;
+ *len_p = p - new_keys;
+}
+
/* Compile the -P style PATTERN, containing SIZE bytes that are
followed by '\n'. Return a description of the compiled pattern. */
void *
Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
{
+ if (! PCRE2_EXTRA_ASCII_BSD)
+ pcre_pattern_expand_backslash_d (&pattern, &size);
+
PCRE2_SIZE e;
int ec;
int flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0);
@@ -172,7 +252,9 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
if (match_lines)
{
#ifdef PCRE2_EXTRA_MATCH_LINE
- pcre2_set_compile_extra_options (ccontext, PCRE2_EXTRA_MATCH_LINE);
+ pcre2_set_compile_extra_options (ccontext,
+ (PCRE2_EXTRA_MATCH_LINE
+ | PCRE2_EXTRA_ASCII_BSD));
#else
static char const /* These sizes omit trailing NUL. */
xprefix[4] = "^(?:", xsuffix[2] = ")$";
--
2.39.2