Thanks, this patch was installed into grep in June so I'm marking the bug report
as done.
Also, I installed the attached two patches as followup. The first is because
I've run into too many compilers that complain about 'static const int x;' on
the grounds that x should have an initializer. Rewriting the code to avoid these
zero constants makes the code a tiny bit smaller on x86-64, so it's likely a
tiny win anyway.
The 2nd patch changes this code to use 64-bit words on x86-64 GNU/Linux, much as
other parts of grep already do. This makes the code a tiny bit smaller too.
The runtime performance change is likely so small I haven't bothered to try to
measure it. Perhaps some day compilers will be smart enough to put those 256-bit
arrays into registers and regex nerds can tune it some more....
From d0345459737e9f4ee436fa3a19a076ac681db2a5 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 16 Aug 2016 03:04:26 -0700
Subject: [PATCH 1/2] dfa: avoid uninitialized constants
Some compilers warn about 'static int const x;' on the grounds
that X should have an initializer. Instead of worrying about
this, rewrite to avoid this sort of thing.
* src/dfa.c (emptyset): New function.
(parse_bracket_exp): Use it instead of 'equal' and a zero constant.
* src/dfasearch.c (struct patterns): Remove tag 'patterns'.
(patterns0): Remove zero constant.
(GEAcompile): Use memset instead of the zero constant.
---
src/dfa.c | 13 +++++++++++--
src/dfasearch.c | 8 +++-----
2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index b64a176..f970766 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -608,6 +608,16 @@ equal (charclass const s1, charclass const s2)
return memcmp (s1, s2, sizeof (charclass)) == 0;
}
+static bool
+emptyset (charclass const s)
+{
+ charclass_word w = 0;
+ int i;
+ for (i = 0; i < CHARCLASS_WORDS; i++)
+ w |= s[i];
+ return w == 0;
+}
+
/* Ensure that the array addressed by PTR holds at least NITEMS +
(PTR || !NITEMS) items. Either return PTR, or reallocate the array
and return its new address. Although PTR may be null, the returned
@@ -1184,9 +1194,8 @@ parse_bracket_exp (void)
if (dfa->multibyte)
{
- static charclass const zeroclass;
work_mbc->invert = invert;
- work_mbc->cset = equal (ccl, zeroclass) ? -1 : charclass_index (ccl);
+ work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (ccl);
return MBCSET;
}
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 9a523c8..222232c 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -38,15 +38,13 @@ static kwset_t kwset;
static struct dfa *dfa;
/* The Regex compiled patterns. */
-static const struct patterns
+static struct
{
/* Regex compiled regexp. */
struct re_pattern_buffer regexbuf;
struct re_registers regs; /* This is here on account of a BRAIN-DEAD
Q@#%!# library interface in regex.c. */
-} patterns0;
-
-static struct patterns *patterns;
+} *patterns;
static size_t pcount;
/* Number of compiled fixed strings known to exactly match the regexp.
@@ -153,7 +151,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
}
patterns = xnrealloc (patterns, pcount + 1, sizeof *patterns);
- patterns[pcount] = patterns0;
+ memset (&patterns[pcount], 0, sizeof patterns[pcount]);
char const *err = re_compile_pattern (p, len,
&(patterns[pcount].regexbuf));
--
2.7.4
From 7e704da5aa5a09a8c45fc843805020c1ff025407 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Wed, 17 Aug 2016 18:41:16 -0700
Subject: [PATCH 2/2] dfa: use 64-bit when ulong is at least that wide
* src/dfa.c (charclass_word): Now unsigned long instead of unsigned.
(CHARCLASS_WORD_BITS): Now 64 on 64-bit platforms.
(CHARCLASS_PAIR, CHARCLASS_INIT): New macros.
(CHARCLASS_WORD_MASK): Now a static const, since it no longer
needs to be a macro.
(equal): Open-code rather than calling memcmp.
(add_utf8_anychar): Use CHARCLASS_INIT.
---
src/dfa.c | 42 ++++++++++++++++++++++++++++++------------
1 file changed, 30 insertions(+), 12 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index f970766..f7d2a37 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -63,15 +63,30 @@ enum { NOTCHAR = 1 << CHAR_BIT };
/* This represents part of a character class. It must be unsigned and
at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
-typedef unsigned int charclass_word;
+typedef unsigned long int charclass_word;
-/* The number of bits used in a charclass word. utf8_classes assumes
- this is exactly 32. */
+/* CHARCLASS_WORD_BITS is the number of bits used in a charclass word.
+ CHARCLASS_PAIR (LO, HI) is part of a charclass initializer, and
+ represents 64 bits' worth of a charclass, where LO and HI are the
+ low and high-order 32 bits of the 64-bit quantity. */
+#if ULONG_MAX >> 31 >> 31 < 3
enum { CHARCLASS_WORD_BITS = 32 };
+# define CHARCLASS_PAIR(lo, hi) lo, hi
+#else
+enum { CHARCLASS_WORD_BITS = 64 };
+# define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
+#endif
+
+/* An initializer for a charclass whose 32-bit words are A through H. */
+#define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
+ { \
+ CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
+ CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
+ }
/* The maximum useful value of a charclass_word; all used bits are 1. */
-#define CHARCLASS_WORD_MASK \
- (((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1)
+static charclass_word const CHARCLASS_WORD_MASK
+ = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
/* Number of words required to hold a bit for every character. */
enum
@@ -605,7 +620,11 @@ notset (charclass s)
static bool
equal (charclass const s1, charclass const s2)
{
- return memcmp (s1, s2, sizeof (charclass)) == 0;
+ charclass_word w = 0;
+ int i;
+ for (i = 0; i < CHARCLASS_WORDS; i++)
+ w |= s1[i] ^ s2[i];
+ return w == 0;
}
static bool
@@ -1675,20 +1694,19 @@ add_utf8_anychar (void)
{
static charclass const utf8_classes[5] = {
/* 80-bf: non-leading bytes. */
- {0, 0, 0, 0, CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK, 0, 0},
+ CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
/* 00-7f: 1-byte sequence. */
- {CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK,
- CHARCLASS_WORD_MASK, 0, 0, 0, 0},
+ CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
/* c2-df: 2-byte sequence. */
- {0, 0, 0, 0, 0, 0, ~3 & CHARCLASS_WORD_MASK, 0},
+ CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
/* e0-ef: 3-byte sequence. */
- {0, 0, 0, 0, 0, 0, 0, 0xffff},
+ CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xffff),
/* f0-f7: 4-byte sequence. */
- {0, 0, 0, 0, 0, 0, 0, 0xff0000}
+ CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xff0000)
};
const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
unsigned int i;
--
2.7.4