This was reported to the PCRE2 project [1], with a note that GNU grep is also affected.

[1] https://github.com/PCRE2Project/pcre2/issues/185
From c2d4a43b5b15df7c8853d591bf6ae872c602ed14 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= <care...@gmail.com>
Date: Fri, 6 Jan 2023 19:34:56 -0800
Subject: [PATCH] pcre: use UCP in UTF mode

* src/pcresearch.c: set PCRE2_UCP together with PCRE2_UTF
* tests/pcre-utf8-w: add test
---
Note: this copy was re-flowed from a whitespace-collapsed archive, so the
indentation of context lines is reconstructed; use
"git apply --ignore-whitespace" if it does not apply cleanly.  The blob id
on the index line of tests/pcre-utf8-w predates the corrections made to
that file below.

 src/pcresearch.c  |  2 +-
 tests/Makefile.am |  1 +
 tests/pcre-utf8-w | 30 ++++++++++++++++++++++++++++++
 3 files changed, 32 insertions(+), 1 deletion(-)
 create mode 100755 tests/pcre-utf8-w

diff --git a/src/pcresearch.c b/src/pcresearch.c
index a107f4d..45b67ee 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -149,7 +149,7 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
     {
       if (! localeinfo.using_utf8)
         die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
-      flags |= PCRE2_UTF;
+      flags |= (PCRE2_UTF | PCRE2_UCP);
 #if 0
       /* Do not match individual code units but only UTF-8. */
       flags |= PCRE2_NEVER_BACKSLASH_C;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index e0b0503..a47cf5c 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -147,6 +147,7 @@ TESTS = \
   pcre-jitstack \
   pcre-o \
   pcre-utf8 \
+  pcre-utf8-w \
   pcre-w \
   pcre-wx-backref \
   pcre-z \
diff --git a/tests/pcre-utf8-w b/tests/pcre-utf8-w
new file mode 100755
index 0000000..431685c
--- /dev/null
+++ b/tests/pcre-utf8-w
@@ -0,0 +1,30 @@
+#!/bin/sh
+# UTF-8 characters are correctly identified as part of a word
+#
+# Copyright (C) 2023 Free Software Foundation, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_en_utf8_locale_
+LC_ALL=en_US.UTF-8
+export LC_ALL
+require_pcre_
+
+fail=0
+
+echo 'Perú'> in || framework_failure_
+
+# A word boundary must be found after the trailing non-ASCII letter.
+echo 'ú' > exp || framework_failure_
+grep -Po '.\b' in > out || fail=1
+compare exp out || fail=1
+
+# \w must match the non-ASCII letter, so grep has to succeed here.
+echo 'rú' > exp || framework_failure_
+grep -Po 'r\w' in > out || fail=1
+compare exp out || fail=1
+
+Exit $fail
-- 
2.30.2