From f66dafc2181bf997f8e7192ad49d3d6ec9dc2b87 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <noritnk@kcn.ne.jp>
Date: Sat, 25 Oct 2014 01:46:01 +0900
Subject: [PATCH] dfa: make \w and \W work in multibyte locales

Reported by Jaroslav Skarvada in: http://bugs.gnu.org/18817
Now, \w and \W are supported not only in single-byte locales but also in
multibyte locales.

* src/dfa.c (PUSH_LEX_STATE, POP_LEX_STATE): Move definitions "up",
so they are not within the function.
(lex): Make \w and \W work in a multibyte locale, the same way
we made \s and \S work.
* tests/word-multibyte: New test for this change.
* tests/Makefile.am (TESTS): Add it.
* NEWS (Bug fixes): Mention it.
---
 NEWS                 |  3 +++
 src/dfa.c            | 61 ++++++++++++++++++++++++++++++++++------------------
 tests/Makefile.am    |  1 +
 tests/word-multibyte | 23 ++++++++++++++++++++
 4 files changed, 67 insertions(+), 21 deletions(-)
 create mode 100644 tests/word-multibyte

diff --git a/NEWS b/NEWS
index 94eeeeb..183b7f0 100644
--- a/NEWS
+++ b/NEWS
@@ -21,6 +21,9 @@ GNU grep NEWS                                    -*- outline -*-

 ** Bug fixes

+  grep no longer mishandles patterns that contain \w or \W in multibyte
+  locales.
+
   grep would fail to count newlines internally when operating in non-UTF8
   multibyte locales, leading it to print potentially many lines that did
   not match.  E.g., the command, "seq 10 | env LC_ALL=zh_CN src/grep -n .."
diff --git a/src/dfa.c b/src/dfa.c
index 5b9d154..e0fc120 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1249,6 +1249,20 @@ parse_bracket_exp (void)
   return CSET + charclass_index (ccl);
 }

+#define PUSH_LEX_STATE(s)  /* Save lexptr/lexleft, then point them at S.  */ \
+  do						\
+    {  /* Left open on purpose; POP_LEX_STATE supplies the closing brace.  */ \
+      char const *lexptr_saved = lexptr;	\
+      size_t lexleft_saved = lexleft;		\
+      lexptr = (s);				\
+      lexleft = strlen (lexptr)
+
+#define POP_LEX_STATE()  /* Restore the values PUSH_LEX_STATE saved.  */ \
+      lexptr = lexptr_saved;			\
+      lexleft = lexleft_saved;			\
+    }						\
+  while (0)
+
 static token
 lex (void)
 {
@@ -1496,20 +1510,6 @@ lex (void)
               return lasttok = CSET + charclass_index (ccl);
             }

-#define PUSH_LEX_STATE(s)			\
-  do						\
-    {						\
-      char const *lexptr_saved = lexptr;	\
-      size_t lexleft_saved = lexleft;		\
-      lexptr = (s);				\
-      lexleft = strlen (lexptr)
-
-#define POP_LEX_STATE()				\
-      lexptr = lexptr_saved;			\
-      lexleft = lexleft_saved;			\
-    }						\
-  while (0)
-
           /* FIXME: see if optimizing this, as is done with ANYCHAR and
              add_utf8_anychar, makes sense.  */

@@ -1529,14 +1529,33 @@ lex (void)
         case 'W':
           if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
             goto normal_char;
-          zeroset (ccl);
-          for (c2 = 0; c2 < NOTCHAR; ++c2)
-            if (IS_WORD_CONSTITUENT (c2))
-              setbit (c2, ccl);
-          if (c == 'W')
-            notset (ccl);
+
+          if (!dfa->multibyte)
+            {
+              zeroset (ccl);
+              for (c2 = 0; c2 < NOTCHAR; ++c2)
+                if (IS_WORD_CONSTITUENT (c2))
+                  setbit (c2, ccl);
+              if (c == 'W')
+                notset (ccl);
+              laststart = false;
+              return lasttok = CSET + charclass_index (ccl);
+            }
+
+          /* FIXME: see if optimizing this, as is done with ANYCHAR and
+             add_utf8_anychar, makes sense.  */
+
+          /* \w and \W are documented to be equivalent to [_[:alnum:]] and
+             [^_[:alnum:]] respectively, so tell the lexer to process those
+             strings, each minus its "already processed" '['.  */
+          PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]");
+
+          lasttok = parse_bracket_exp ();
+
+          POP_LEX_STATE ();
+
           laststart = false;
-          return lasttok = CSET + charclass_index (ccl);
+          return lasttok;

         case '[':
           if (backslash)
diff --git a/tests/Makefile.am b/tests/Makefile.am
index f6f051c..c006e58 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -114,6 +114,7 @@ TESTS =						\
   warn-char-classes				\
   word-delim-multibyte				\
   word-multi-file				\
+  word-multibyte				\
   yesno

 EXTRA_DIST =					\
diff --git a/tests/word-multibyte b/tests/word-multibyte
new file mode 100644
index 0000000..e067a37
--- /dev/null
+++ b/tests/word-multibyte
@@ -0,0 +1,23 @@
+#!/bin/sh
+# Ensure \w matches and \W rejects a multibyte letter; failed in grep-2.20.
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+require_en_utf8_locale_
+# Input is U+00E1 encoded as UTF-8; use octal, as POSIX printf lacks \x.
+printf '\303\241\n' > in || framework_failure_
+LC_ALL=en_US.UTF-8
+export LC_ALL
+
+fail=0
+
+for LOC in en_US.UTF-8 zh_CN $LOCALE_FR_UTF8; do
+  out=out1-$LOC
+  LC_ALL=$LOC grep '\w' in >$out || fail=1
+  compare in $out || fail=1
+
+  out=out2-$LOC
+  LC_ALL=$LOC grep '\W' in >$out && fail=1
+  compare /dev/null $out || fail=1
+done
+
+Exit $fail
-- 
2.0.0.421.g786a89d