John P. Linderman wrote:
> Using what is to me the more obvious [0-9] pattern takes almost 50 times as
> long as using the [[:digit:]] pattern. Seems very strange.

Thanks for reporting that. In general, patterns like [a-z] can be much slower than [[:lower:]] due to poorly-thought-out POSIX interfaces. However, [0-9] is a special case: we can optimize such patterns safely if both ends are ASCII digits. I installed the attached patch to Gnulib to do that; it fixes the performance glitch you noticed, at least for me.
From 6afba02d7869d39ed7f61981045ddbdcb2814101 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 21 Mar 2017 19:05:17 -0700
Subject: [PATCH] dfa: make [0-9] faster in non-C locales

Problem reported by John P. Linderman (Bug#26193).
* lib/dfa.c (parse_bracket_exp): Remove redundant assignment.
If both ends of the range are ASCII digits, do not worry about
multi-character collating sequences and the like.  Be consistent
about using isalpha as a precondition for setbit_case_fold_c.
---
 ChangeLog |  9 +++++++++
 lib/dfa.c | 35 ++++++++++++-----------------------
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 625f007..33bd6a1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2017-03-21  Paul Eggert  <egg...@cs.ucla.edu>
+
+	dfa: make [0-9] faster in non-C locales
+	Problem reported by John P. Linderman (Bug#26193).
+	* lib/dfa.c (parse_bracket_exp): Remove redundant assignment.
+	If both ends of the range are ASCII digits, do not worry about
+	multi-character collating sequences and the like.  Be consistent
+	about using isalpha as a precondition for setbit_case_fold_c.
+
 2017-03-19  Bruno Haible  <br...@clisp.org>
 
 	lock: Fix compilation error with HP-UX IA64 cc.
diff --git a/lib/dfa.c b/lib/dfa.c
index 5bac288..e97dae1c 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -551,8 +551,9 @@ struct dfa
                     bool, size_t *, bool *);
 
   /* The locale is simple, like the C locale.  These locales can be
-     processed more efficiently, e.g., the relationship between lower-
-     and upper-case letters is 1-1.  */
+     processed more efficiently, as they are single-byte, their native
+     character set is in collating-sequence order, and they do not
+     have multi-character collating elements.  */
   bool simple_locale;
 
   /* Other cached information derived from the locale.  */
@@ -1012,7 +1013,6 @@ parse_bracket_exp (struct dfa *dfa)
   if (invert)
     {
       c = bracket_fetch_wc (dfa);
-      invert = true;
       known_bracket_exp = dfa->simple_locale;
     }
   wint_t wc = dfa->lex.wctok;
@@ -1143,24 +1143,14 @@ parse_bracket_exp (struct dfa *dfa)
               /* Treat [x-y] as a range if x != y.  */
               if (wc != wc2 || wc == WEOF)
                 {
-                  if (dfa->localeinfo.multibyte)
-                    known_bracket_exp = false;
-                  else if (dfa->simple_locale)
+                  if (dfa->simple_locale
+                      || (isasciidigit (c) & isasciidigit (c2)))
                     {
-                      int ci;
-                      for (ci = c; ci <= c2; ci++)
-                        setbit (ci, &ccl);
-                      if (dfa->syntax.case_fold)
-                        {
-                          int uc = toupper (c);
-                          int uc2 = toupper (c2);
-                          for (ci = 0; ci < NOTCHAR; ci++)
-                            {
-                              int uci = toupper (ci);
-                              if (uc <= uci && uci <= uc2)
-                                setbit (ci, &ccl);
-                            }
-                        }
+                      for (int ci = c; ci <= c2; ci++)
+                        if (dfa->syntax.case_fold && isalpha (ci))
+                          setbit_case_fold_c (ci, &ccl);
+                        else
+                          setbit (ci, &ccl);
                     }
                   else
                     known_bracket_exp = false;
@@ -1174,7 +1164,7 @@ parse_bracket_exp (struct dfa *dfa)
 
       if (!dfa->localeinfo.multibyte)
         {
-          if (dfa->syntax.case_fold)
+          if (dfa->syntax.case_fold && isalpha (c))
             setbit_case_fold_c (c, &ccl);
           else
             setbit (c, &ccl);
@@ -1209,7 +1199,7 @@ parse_bracket_exp (struct dfa *dfa)
   if (! known_bracket_exp)
     return BACKREF;
 
-  if (dfa->localeinfo.multibyte)
+  if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
     {
       dfa->lex.brack.invert = invert;
       dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
@@ -1218,7 +1208,6 @@ parse_bracket_exp (struct dfa *dfa)
 
   if (invert)
     {
-      assert (!dfa->localeinfo.multibyte);
       notset (&ccl);
       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
         clrbit ('\n', &ccl);
-- 
2.7.4

Reply via email to