Thanks, this patch was installed into grep in June so I'm marking the bug report as done.

Also, I installed the attached two patches as followup. The first is because I've run into too many compilers that complain about 'static const int x;' on the grounds that x should have an initializer. Rewriting the code to avoid these zero constants makes the code a tiny bit smaller on x86-64, so it's likely a tiny win anyway.

The 2nd patch changes this code to use 64-bit words on x86-64 GNU/Linux, much as other parts of grep already do. This makes the code a tiny bit smaller too.

The runtime performance change is likely so small I haven't bothered to try to measure it. Perhaps some day compilers will be smart enough to put those 256-bit arrays into registers and regex nerds can tune it some more....
From d0345459737e9f4ee436fa3a19a076ac681db2a5 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 16 Aug 2016 03:04:26 -0700
Subject: [PATCH 1/2] dfa: avoid uninitialized constants

Some compilers warn about 'static int const x;' on the grounds
that X should have an initializer.  Instead of worrying about
this, rewrite to avoid this sort of thing.
* src/dfa.c (emptyset): New function.
(parse_bracket_exp): Use it instead of 'equal' and a zero constant.
* src/dfasearch.c (struct patterns): Remove tag 'patterns'.
(patterns0): Remove zero constant.
(GEAcompile): Use memset instead of the zero constant.
---
 src/dfa.c       | 13 +++++++++++--
 src/dfasearch.c |  8 +++-----
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index b64a176..f970766 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -608,6 +608,16 @@ equal (charclass const s1, charclass const s2)
   return memcmp (s1, s2, sizeof (charclass)) == 0;
 }
 
+static bool
+emptyset (charclass const s)
+{
+  charclass_word w = 0;
+  int i;
+  for (i = 0; i < CHARCLASS_WORDS; i++)
+    w |= s[i];
+  return w == 0;
+}
+
 /* Ensure that the array addressed by PTR holds at least NITEMS +
    (PTR || !NITEMS) items.  Either return PTR, or reallocate the array
    and return its new address.  Although PTR may be null, the returned
@@ -1184,9 +1194,8 @@ parse_bracket_exp (void)
 
   if (dfa->multibyte)
     {
-      static charclass const zeroclass;
       work_mbc->invert = invert;
-      work_mbc->cset = equal (ccl, zeroclass) ? -1 : charclass_index (ccl);
+      work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (ccl);
       return MBCSET;
     }
 
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 9a523c8..222232c 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -38,15 +38,13 @@ static kwset_t kwset;
 static struct dfa *dfa;
 
 /* The Regex compiled patterns.  */
-static const struct patterns
+static struct
 {
   /* Regex compiled regexp. */
   struct re_pattern_buffer regexbuf;
   struct re_registers regs; /* This is here on account of a BRAIN-DEAD
                                Q@#%!# library interface in regex.c.  */
-} patterns0;
-
-static struct patterns *patterns;
+} *patterns;
 static size_t pcount;
 
 /* Number of compiled fixed strings known to exactly match the regexp.
@@ -153,7 +151,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
         }
 
       patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns);
-      patterns[pcount] = patterns0;
+      memset (&patterns[pcount], 0, sizeof patterns[pcount]);
 
       char const *err = re_compile_pattern (p, len,
                                             &(patterns[pcount].regexbuf));
-- 
2.7.4

From 7e704da5aa5a09a8c45fc843805020c1ff025407 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Wed, 17 Aug 2016 18:41:16 -0700
Subject: [PATCH 2/2] dfa: use 64-bit when ulong is at least that wide

* src/dfa.c (charclass_word): Now unsigned long instead of unsigned.
(CHARCLASS_WORD_BITS): Now 64 on 64-bit platforms.
(CHARCLASS_PAIR, CHARCLASS_INIT): New macros.
(CHARCLASS_WORD_MASK): Now a static const, since it no longer
needs to be a macro.
(equal): Open-code rather than calling memcmp.
(add_utf8_anychar): Use CHARCLASS_INIT.
---
 src/dfa.c | 42 ++++++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index f970766..f7d2a37 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -63,15 +63,30 @@ enum { NOTCHAR = 1 << CHAR_BIT };
 
 /* This represents part of a character class.  It must be unsigned and
    at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
-typedef unsigned int charclass_word;
+typedef unsigned long int charclass_word;
 
-/* The number of bits used in a charclass word.  utf8_classes assumes
-   this is exactly 32.  */
+/* CHARCLASS_WORD_BITS is the number of bits used in a charclass word.
+   CHARCLASS_PAIR (LO, HI) is part of a charclass initializer, and
+   represents 64 bits' worth of a charclass, where LO and HI are the
+   low and high-order 32 bits of the 64-bit quantity.  */
+#if ULONG_MAX >> 31 >> 31 < 3
 enum { CHARCLASS_WORD_BITS = 32 };
+# define CHARCLASS_PAIR(lo, hi) lo, hi
+#else
+enum { CHARCLASS_WORD_BITS = 64 };
+# define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
+#endif
+
+/* An initializer for a charclass whose 32-bit words are A through H.  */
+#define CHARCLASS_INIT(a, b, c, d, e, f, g, h)		\
+    {							\
+      CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d),	\
+      CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h)	\
+    }
 
 /* The maximum useful value of a charclass_word; all used bits are 1.  */
-#define CHARCLASS_WORD_MASK \
-  (((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1)
+static charclass_word const CHARCLASS_WORD_MASK
+  = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
 
 /* Number of words required to hold a bit for every character.  */
 enum
@@ -605,7 +620,11 @@ notset (charclass s)
 static bool
 equal (charclass const s1, charclass const s2)
 {
-  return memcmp (s1, s2, sizeof (charclass)) == 0;
+  charclass_word w = 0;
+  int i;
+  for (i = 0; i < CHARCLASS_WORDS; i++)
+    w |= s1[i] ^ s2[i];
+  return w == 0;
 }
 
 static bool
@@ -1675,20 +1694,19 @@ add_utf8_anychar (void)
 {
   static charclass const utf8_classes[5] = {
     /* 80-bf: non-leading bytes.  */
-    {0, 0, 0, 0, CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK, 0, 0},
+    CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
 
     /* 00-7f: 1-byte sequence.  */
-    {CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK,
-     CHARCLASS_WORD_MASK, 0, 0, 0, 0},
+    CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
 
     /* c2-df: 2-byte sequence.  */
-    {0, 0, 0, 0, 0, 0, ~3 & CHARCLASS_WORD_MASK, 0},
+    CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
 
     /* e0-ef: 3-byte sequence.  */
-    {0, 0, 0, 0, 0, 0, 0, 0xffff},
+    CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xffff),
 
     /* f0-f7: 4-byte sequence.  */
-    {0, 0, 0, 0, 0, 0, 0, 0xff0000}
+    CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xff0000)
   };
   const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
   unsigned int i;
-- 
2.7.4

Reply via email to