bug#43040: grep 3.4: memory leak

Paul Eggert Mon, 07 Sep 2020 20:00:20 -0700

I don't see the extra memory consumption as necessarily being a bug, if grep istrading space (which is relatively cheap) for time (which is less so). Perhapssomeone with some time to spare could look into that in more detail.

The excess CPU consumption is clearly problematic, though. I installed theattached patches, which on your example causes 'grep' to be 3x faster than grep3.3 was. I hope this addresses any practical performance problems uncovered bythat artificial test case. The first patch is a bit of a hack but does the realwork; the rest are merely cleanups or very minor performance improvements.

>From 33e4602c96e639ec7d56b92ffe3614aa700d3d76 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 7 Sep 2020 17:20:11 -0700
Subject: [PATCH 1/4] Omit duplicate regexps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit


Do not pass two copies of the same regexp to the
regular-expression engine.  Although the engines should
perform nearly as well even with the copies, in practice they do not.
Problem reported by Luca Borzacchiello (Bug#43040).
* bootstrap.conf (gnulib_modules): Add hash.
* src/grep.c: Include stdint.h, for SIZE_WIDTH.
Include hash.h.
(struct patloc, patloc, patlocs_allocated, patlocs_used):
Rename from struct FL_pair, fl_pair, n_fl_pair_slots, n_pattern_files,
respectively, since the data type is no longer a pair.
All uses changed.
(struct patloc): New member FILELINE.  The lineno member is now
ptrdiff_t since nowadays we prefer signed types.
(pattern_array, patterns_table): New static vars.
(count_nl_bytes, fl_add): Remove; no longer used.
(hash_pattern, compare_patterns, update_patterns): New functions.
update_patterns does what fl_add used to do, plus remove dups.
(pattern_file_name): Adjust to change from fl_pair to patloc.
(main): Move some variables to inner blocks for clarity.
Maintain the pattern_table hash of all patterns.
Update pattern_array to match keys, and use update_patterns
instead of fl_add to remove duplicate keys.
* tests/filename-lineno.pl (invalid-re-2-files)
(invalid-re-2-files2, invalid-re-2e): Ensure regexps are unique in
tests so that dups aren’t removed in diagnostics.
(invalid-re-line-numbers): New test.
---
 NEWS                     |   4 +
 bootstrap.conf           |   1 +
 src/grep.c               | 277 +++++++++++++++++++++++++--------------
 tests/filename-lineno.pl |  19 ++-
 4 files changed, 198 insertions(+), 103 deletions(-)

diff --git a/NEWS b/NEWS
index 5f4c0b4..f661adf 100644
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,10 @@ GNU grep NEWS                                    -*- outline -*-
   line is selected, not when a file is listed.  The behavior in grep
   3.2 through 3.4 was causing compatibility problems.
 
+** Bug fixes
+
+  A performance regression with many duplicate patterns has been fixed.
+  [Bug#43040 introduced in grep 3.4]
 
 * Noteworthy changes in release 3.4 (2020-01-02) [stable]
 
diff --git a/bootstrap.conf b/bootstrap.conf
index ef142fc..fceb318 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -49,6 +49,7 @@ git-version-gen
 gitlog-to-changelog
 gnu-web-doc-update
 gnupload
+hash
 ignore-value
 intprops
 inttypes
diff --git a/src/grep.c b/src/grep.c
index 5764b2a..c359ea9 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -24,6 +24,7 @@
 #include <wchar.h>
 #include <inttypes.h>
 #include <stdarg.h>
+#include <stdint.h>
 #include <stdio.h>
 #include "system.h"
 
@@ -41,6 +42,7 @@
 #include "getopt.h"
 #include "getprogname.h"
 #include "grep.h"
+#include "hash.h"
 #include "intprops.h"
 #include "propername.h"
 #include "quote.h"
@@ -82,72 +84,136 @@ static bool align_tabs;
 /* Print width of line numbers and byte offsets.  Nonzero if ALIGN_TABS.  */
 static int offset_width;
 
-/* See below */
-struct FL_pair
+/* An entry in the PATLOC array saying where patterns came from.  */
+struct patloc
   {
+    /* Line number of the pattern in PATTERN_ARRAY.  Line numbers
+       start at 0, and each pattern is terminated by '\n'.  */
+    ptrdiff_t lineno;
+
+    /* Input location of the pattern.  The FILENAME "-" represents
+       standard input, and "" represents the command line.  FILELINE is
+       origin-1 for files and is irrelevant for the command line.  */
     char const *filename;
-    size_t lineno;
+    ptrdiff_t fileline;
   };
 
-/* A list of lineno,filename pairs corresponding to -f FILENAME
-   arguments. Since we store the concatenation of all patterns in
-   a single array, KEYS, be they from the command line via "-e PAT"
-   or read from one or more -f-specified FILENAMES.  Given this
-   invocation, grep -f <(seq 5) -f <(seq 2) -f <(seq 3) FILE, there
-   will be three entries in LF_PAIR: {1, x} {6, y} {8, z}, where
-   x, y and z are just place-holders for shell-generated names.  */
-static struct FL_pair *fl_pair;
-static size_t n_fl_pair_slots;
-/* Count not only -f-specified files, but also individual -e operands
-   and any command-line argument that serves as a regular expression.  */
-static size_t n_pattern_files;
-
-/* The number of patterns seen so far.
-   It is advanced by fl_add and, when needed, used in pattern_file_name
-   to derive a file-relative line number.  */
+/* The array of pattern locations.  The concatenation of all patterns
+   is stored in a single array, KEYS.  Given the invocation
+   'grep -f <(seq 5) -f <(seq 6) -f <(seq 3)', there will initially be
+   28 bytes in KEYS.  After duplicate patterns are removed, KEYS
+   will have 12 bytes and PATLOC will be {0,x,1}, {10,y,1}
+   where x, y and z are just place-holders for shell-generated names
+   since and z is omitted as it contains only duplicates.  Sometimes
+   removing duplicates will grow PATLOC, since each run of
+   removed patterns not at a file start or end requires another
+   PATLOC entry for the first non-removed pattern.  */
+static struct patloc *patloc;
+static size_t patlocs_allocated, patlocs_used;
+
+/* Pointer to the array of patterns, each terminated by newline.  */
+static char *pattern_array;
+
+/* The number of unique patterns seen so far.  */
 static size_t n_patterns;
 
-/* Return the number of newline bytes in BUF with size SIZE.  */
+/* Hash table of patterns seen so far.  */
+static Hash_table *pattern_table;
+
+/* Hash and compare newline-terminated patterns for textual equality.
+   Patterns are represented by origin-1 offsets into PATTERN_ARRAY,
+   cast to void *.  The origin-1 is so that the first pattern offset
+   does not appear to be a null pointer when cast to void *.  */
 static size_t _GL_ATTRIBUTE_PURE
-count_nl_bytes (char const *buf, size_t size)
-{
-  char const *p = buf;
-  char const *end_p = buf + size;
-  size_t n = 0;
-  while ((p = memchr (p, '\n', end_p - p)))
-    p++, n++;
-  return n;
+hash_pattern (void const *pat, size_t n_buckets)
+{
+  size_t h = 0;
+  intptr_t pat_offset = (intptr_t) pat - 1;
+  for (char const *s = pattern_array + pat_offset; *s != '\n'; s++)
+    h = *s + ((h << 9) | (h >> (SIZE_WIDTH - 9)));
+  return h % n_buckets;
+}
+static bool _GL_ATTRIBUTE_PURE
+compare_patterns (void const *a, void const *b)
+{
+  intptr_t a_offset = (intptr_t) a - 1;
+  intptr_t b_offset = (intptr_t) b - 1;
+  char const *p = pattern_array + a_offset;
+  char const *q = pattern_array + b_offset;
+  for (; *p == *q; p++, q++)
+    if (*p == '\n')
+      return true;
+  return false;
 }
 
-/* Append a FILENAME,line-number pair to FL_PAIR, and update
-   pattern-related counts from the contents of BUF with SIZE bytes.  */
-static void
-fl_add (char const *buf, size_t size, char const *filename)
+/* Update KEYS to remove duplicate patterns, and return the number of
+   bytes in the resulting KEYS.  KEYS contains a sequence of patterns
+   each terminated by '\n'.  The first DUPFREE_SIZE bytes are a
+   sequence of patterns with no duplicates; SIZE is the total number
+   of bytes in KEYS.  If some patterns past the first DUPFREE_SIZE
+   bytes are not duplicates, update PATLOCS accordingly.  */
+static ptrdiff_t
+update_patterns (char *keys, ptrdiff_t dupfree_size, ptrdiff_t size,
+                 char const *filename)
 {
-  if (n_fl_pair_slots <= n_pattern_files)
-    fl_pair = x2nrealloc (fl_pair, &n_fl_pair_slots, sizeof *fl_pair);
+  char *dst = keys + dupfree_size;
+  ptrdiff_t fileline = 1;
+  int prev_inserted = 0;
+
+  char const *srclim = keys + size;
+  ptrdiff_t patsize;
+  for (char const *src = keys + dupfree_size; src < srclim; src += patsize)
+    {
+      char const *patend = memchr (src, '\n', srclim - src);
+      patsize = patend + 1 - src;
+      memmove (dst, src, patsize);
+
+      intptr_t dst_offset_1 = dst - keys + 1;
+      int inserted = hash_insert_if_absent (pattern_table,
+                                            (void *) dst_offset_1, NULL);
+      if (inserted)
+        {
+          if (inserted < 0)
+            xalloc_die ();
+          dst += patsize;
 
-  fl_pair[n_pattern_files].lineno = n_patterns + 1;
-  fl_pair[n_pattern_files].filename = filename;
-  n_pattern_files++;
-  n_patterns += count_nl_bytes (buf, size);
+          /* Add a PATLOCS entry unless this input line is simply the
+             next one in the same file.  */
+          if (!prev_inserted)
+            {
+              if (patlocs_used == patlocs_allocated)
+                patloc = x2nrealloc (patloc, &patlocs_allocated,
+                                     sizeof *patloc);
+              patloc[patlocs_used++]
+                = (struct patloc) { .lineno = n_patterns,
+                                    .filename = filename,
+                                    .fileline = fileline };
+            }
+          n_patterns++;
+        }
+
+      prev_inserted = inserted;
+      fileline++;
+    }
+
+  return dst - keys;
 }
 
-/* Map the line number, LINENO, of one of the input patterns to the
-   name of the file from which it came.  If it was read from stdin
-   or if it was specified on the command line, return "-".  */
+/* Map LINENO, the origin-1 line number of one of the input patterns,
+   to the name of the file from which it came.  Return "-" if it was
+   read from stdin, "" if it was specified on the command line.
+   Set *NEW_LINENO to the origin-1 line number of PATTERN in the file,
+   or to an unspecified value if PATTERN came from the command line.  */
 char const * _GL_ATTRIBUTE_PURE
 pattern_file_name (size_t lineno, size_t *new_lineno)
 {
-  size_t i;
-  for (i = 1; i < n_pattern_files; i++)
-    {
-      if (lineno < fl_pair[i].lineno)
-        break;
-    }
-
-  *new_lineno = lineno - fl_pair[i - 1].lineno + 1;
-  return fl_pair[i - 1].filename;
+  lineno--;
+  ptrdiff_t i;
+  for (i = 1; i < patlocs_used; i++)
+    if (lineno < patloc[i].lineno)
+      break;
+  *new_lineno = lineno - patloc[i - 1].lineno + patloc[i - 1].fileline;
+  return patloc[i - 1].filename;
 }
 
 #if HAVE_ASAN
@@ -2330,6 +2396,7 @@ fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
         }
     }
 
+  *p = '\n';
   free (*keys_p);
   *keys_p = new_keys;
   *len_p = p - new_keys;
@@ -2424,9 +2491,8 @@ int
 main (int argc, char **argv)
 {
   char *keys = NULL;
-  size_t keycc = 0, oldcc, keyalloc = 0;
+  size_t keycc = 0, keyalloc = 0;
   int matcher = -1;
-  size_t cc;
   int opt, prepended;
   int prev_optind, last_recursive;
   int fread_errno;
@@ -2467,6 +2533,10 @@ main (int argc, char **argv)
 
   last_recursive = 0;
 
+  pattern_table = hash_initialize (0, 0, hash_pattern, compare_patterns, 0);
+  if (!pattern_table)
+    xalloc_die ();
+
   prepended = prepend_default_options (getenv ("GREP_OPTIONS"), &argc, &argv);
   if (prepended)
     error (0, 0, _("warning: GREP_OPTIONS is deprecated;"
@@ -2565,50 +2635,52 @@ main (int argc, char **argv)
         break;
 
       case 'e':
-        cc = strlen (optarg);
-        if (keyalloc < keycc + cc + 1)
-          {
-            keyalloc = keycc + cc + 1;
-            keys = x2realloc (keys, &keyalloc);
-          }
-        oldcc = keycc;
-        memcpy (keys + oldcc, optarg, cc);
-        keycc += cc;
-        keys[keycc++] = '\n';
-        fl_add (keys + oldcc, cc + 1, "");
+        {
+          ptrdiff_t cc = strlen (optarg);
+          if (keyalloc < keycc + cc + 1)
+            {
+              keyalloc = keycc + cc + 1;
+              pattern_array = keys = x2realloc (keys, &keyalloc);
+            }
+          char *keyend = mempcpy (keys + keycc, optarg, cc);
+          *keyend = '\n';
+          keycc = update_patterns (keys, keycc, keycc + cc + 1, "");
+        }
         break;
 
       case 'f':
-        if (STREQ (optarg, "-"))
-          {
-            if (binary)
-              xset_binary_mode (STDIN_FILENO, O_BINARY);
-            fp = stdin;
-          }
-        else
-          {
-            fp = fopen (optarg, binary ? "rb" : "r");
-            if (!fp)
-              die (EXIT_TROUBLE, errno, "%s", optarg);
-          }
-        oldcc = keycc;
-        for (;; keycc += cc)
-          {
-            if (keyalloc <= keycc + 1)
-              keys = x2realloc (keys, &keyalloc);
-            cc = fread (keys + keycc, 1, keyalloc - (keycc + 1), fp);
-            if (cc == 0)
-              break;
-          }
-        fread_errno = errno;
-        if (ferror (fp))
-          die (EXIT_TROUBLE, fread_errno, "%s", optarg);
-        if (fp != stdin)
-          fclose (fp);
-        /* Append final newline if file ended in non-newline. */
-        if (oldcc != keycc && keys[keycc - 1] != '\n')
-          keys[keycc++] = '\n';
-        fl_add (keys + oldcc, keycc - oldcc, optarg);
+        {
+          if (STREQ (optarg, "-"))
+            {
+              if (binary)
+                xset_binary_mode (STDIN_FILENO, O_BINARY);
+              fp = stdin;
+            }
+          else
+            {
+              fp = fopen (optarg, binary ? "rb" : "r");
+              if (!fp)
+                die (EXIT_TROUBLE, errno, "%s", optarg);
+            }
+          ptrdiff_t newkeycc = keycc, cc;
+          for (;; newkeycc += cc)
+            {
+              if (keyalloc <= newkeycc + 1)
+                pattern_array = keys = x2realloc (keys, &keyalloc);
+              cc = fread (keys + newkeycc, 1, keyalloc - (newkeycc + 1), fp);
+              if (cc == 0)
+                break;
+            }
+          fread_errno = errno;
+          if (ferror (fp))
+            die (EXIT_TROUBLE, fread_errno, "%s", optarg);
+          if (fp != stdin)
+            fclose (fp);
+          /* Append final newline if file ended in non-newline. */
+          if (newkeycc != keycc && keys[newkeycc - 1] != '\n')
+            keys[newkeycc++] = '\n';
+          keycc = update_patterns (keys, keycc, newkeycc, optarg);
+        }
         break;
 
       case 'h':
@@ -2800,22 +2872,26 @@ main (int argc, char **argv)
           /* No keys were specified (e.g. -f /dev/null).  Match nothing.  */
           out_invert ^= true;
           match_lines = match_words = false;
+          keys[keycc++] = '\n';
         }
-      else
-        /* Strip trailing newline. */
-        --keycc;
     }
   else if (optind < argc)
     {
       /* Make a copy so that it can be reallocated or freed later.  */
-      keycc = strlen (argv[optind]);
-      keys = xmemdup (argv[optind++], keycc + 1);
-      fl_add (keys, keycc, "");
+      pattern_array = keys = xstrdup (argv[optind++]);
+      ptrdiff_t patlen = strlen (keys);
+      keys[patlen] = '\n';
+      keycc = update_patterns (keys, 0, patlen + 1, "");
       n_patterns++;
     }
   else
     usage (EXIT_TROUBLE);
 
+  /* Strip trailing newline from keys.  */
+  keycc--;
+
+  hash_free (pattern_table);
+
   bool possibly_tty = false;
   struct stat tmp_stat;
   if (! exit_on_match && fstat (STDOUT_FILENO, &tmp_stat) == 0)
@@ -2887,7 +2963,8 @@ main (int argc, char **argv)
           : (contains_encoding_error (keys, keycc)
              || (match_icase && !fgrep_icase_available (keys, keycc)))))
     {
-      fgrep_to_grep_pattern (&keys, &keycc);
+      fgrep_to_grep_pattern (&pattern_array, &keycc);
+      keys = pattern_array;
       matcher = G_MATCHER_INDEX;
     }
   /* With two or more patterns, if -F works then switch from either -E
diff --git a/tests/filename-lineno.pl b/tests/filename-lineno.pl
index 998af73..8940b1c 100755
--- a/tests/filename-lineno.pl
+++ b/tests/filename-lineno.pl
@@ -48,7 +48,7 @@ my @Tests =
    # Show that with two or more errors, grep now prints all diagnostics:
    ['invalid-re-2-files', '-f g -f h', {EXIT=>2},
     {AUX=>{g=>"1\n2[[\n3\n4[[\n"}},
-    {AUX=>{h=>"\n\n[[\n"}},
+    {AUX=>{h=>"5\n6\n7[[\n"}},
     $err_subst,
     {ERR => "$prog: g:2: Unmatched [...\n"
          . "$prog: g:4: Unmatched [...\n"
@@ -59,7 +59,7 @@ my @Tests =
    # Like the above, but on the other lines.
    ['invalid-re-2-files2', '-f g -f h', {EXIT=>2},
     {AUX=>{g=>"1[[\n2\n3[[\n4\n"}},
-    {AUX=>{h=>"[[\n[[\n\n"}},
+    {AUX=>{h=>"5[[\n6[[\n7\n"}},
     $err_subst,
     {ERR => "$prog: g:1: Unmatched [...\n"
          . "$prog: g:3: Unmatched [...\n"
@@ -68,9 +68,22 @@ my @Tests =
     },
    ],
 
+   # Make sure the line numbers are right when some regexps are duplicates.
+   ['invalid-re-line-numbers', '-f g -f h', {EXIT=>2},
+    {AUX=>{g=>"1[[\n\n3[[\n\n5[[\n"}},
+    {AUX=>{h=>"1[[\n\n\n4[[\n\n6[[\n"}},
+    $err_subst,
+    {ERR => "$prog: g:1: Unmatched [...\n"
+         . "$prog: g:3: Unmatched [...\n"
+         . "$prog: g:5: Unmatched [...\n"
+         . "$prog: h:4: Unmatched [...\n"
+         . "$prog: h:6: Unmatched [...\n"
+    },
+   ],
+
    # Show that with two '-e'-specified erroneous regexps,
    # there is no file name or line number.
-   ['invalid-re-2e', '-e "[[" -e "[["', {EXIT=>2},
+   ['invalid-re-2e', '-e "1[[" -e "2[["', {EXIT=>2},
     $err_subst,
     {ERR => "$prog: Unmatched [...\n" x 2},
    ],
-- 
2.17.1

>From 71b5c685d0dd3e9b0298e1a9c37b32fbedece340 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 7 Sep 2020 17:20:11 -0700
Subject: [PATCH 2/4] Simplify regex_compile
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* src/dfasearch.c (regex_compile): "" suffices; we don’t need "\0".
No need to initialize pat_lineno.
---
 src/dfasearch.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/dfasearch.c b/src/dfasearch.c
index 337345f..2ddab33 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -162,9 +162,9 @@ regex_compile (struct dfa_comp *dc, char const *p, ptrdiff_t len,
     return true;
 
   /* Emit a filename:lineno: prefix for patterns taken from files.  */
-  size_t pat_lineno = lineno;
+  size_t pat_lineno;
   char const *pat_filename
-    = lineno < 0 ? "\0" : pattern_file_name (lineno + 1, &pat_lineno);
+    = lineno < 0 ? "" : pattern_file_name (lineno + 1, &pat_lineno);
 
   if (*pat_filename == '\0')
     error (0, 0, "%s", err);
-- 
2.17.1

>From 0ede35a6cd21093560de8bd9843263ba199abf1f Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 7 Sep 2020 17:20:11 -0700
Subject: [PATCH 3/4] Simplify pattern_file_name

* src/grep.c (pattern_file_name): Make first argument
origin-0, not origin-1, as this simplifies both caller and
callee.  All uses changed.
---
 src/dfasearch.c | 2 +-
 src/grep.c      | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/dfasearch.c b/src/dfasearch.c
index 2ddab33..256cd39 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -164,7 +164,7 @@ regex_compile (struct dfa_comp *dc, char const *p, ptrdiff_t len,
   /* Emit a filename:lineno: prefix for patterns taken from files.  */
   size_t pat_lineno;
   char const *pat_filename
-    = lineno < 0 ? "" : pattern_file_name (lineno + 1, &pat_lineno);
+    = lineno < 0 ? "" : pattern_file_name (lineno, &pat_lineno);
 
   if (*pat_filename == '\0')
     error (0, 0, "%s", err);
diff --git a/src/grep.c b/src/grep.c
index c359ea9..ce2f291 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -199,7 +199,7 @@ update_patterns (char *keys, ptrdiff_t dupfree_size, ptrdiff_t size,
   return dst - keys;
 }
 
-/* Map LINENO, the origin-1 line number of one of the input patterns,
+/* Map LINENO, the origin-0 line number of one of the input patterns,
    to the name of the file from which it came.  Return "-" if it was
    read from stdin, "" if it was specified on the command line.
    Set *NEW_LINENO to the origin-1 line number of PATTERN in the file,
@@ -207,7 +207,6 @@ update_patterns (char *keys, ptrdiff_t dupfree_size, ptrdiff_t size,
 char const * _GL_ATTRIBUTE_PURE
 pattern_file_name (size_t lineno, size_t *new_lineno)
 {
-  lineno--;
   ptrdiff_t i;
   for (i = 1; i < patlocs_used; i++)
     if (lineno < patloc[i].lineno)
-- 
2.17.1

>From 9393b977015bf7944cec1d71ad3972c101bdb4b8 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 7 Sep 2020 19:44:21 -0700
Subject: [PATCH 4/4] =?UTF-8?q?Prefer=20rawmemchr=20to=20memchr=20when=20i?=
 =?UTF-8?q?t=E2=80=99s=20easy?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* bootstrap.conf (gnulib_modules): Add rawmemchr.
* src/dfasearch.c (GEAcompile, EGexecute):
* src/grep.c (update_patterns, prpending, prtext):
* src/kwsearch.c (Fcompile, Fexecute):
* src/pcresearch.c (Pcompile, Pexecute):
Simplify (and presumably speed up a little) by using rawmemchr
with a sentinel, instead of using memchr.
---
 bootstrap.conf   |  1 +
 src/dfasearch.c  | 50 +++++++++++++++++++++++++-----------------------
 src/grep.c       | 11 +++++------
 src/kwsearch.c   | 45 +++++++++++++++++++++----------------------
 src/pcresearch.c | 18 ++++++++++-------
 5 files changed, 65 insertions(+), 60 deletions(-)

diff --git a/bootstrap.conf b/bootstrap.conf
index fceb318..4268623 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -73,6 +73,7 @@ openat-safer
 perl
 propername
 quote
+rawmemchr
 readme-release
 realloc-gnu
 regex
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 256cd39..4d3f4b2 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -174,6 +174,10 @@ regex_compile (struct dfa_comp *dc, char const *p, ptrdiff_t len,
   return false;
 }
 
+/* Compile PATTERN, containing SIZE bytes that are followed by '\n'.
+   SYNTAX_BITS specifies whether PATTERN uses style -G, -E, or -A.
+   Return a description of the compiled pattern.  */
+
 void *
 GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
 {
@@ -213,15 +217,8 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
 
   do
     {
-      size_t len;
-      char const *sep = memchr (p, '\n', patlim - p);
-      if (sep)
-        {
-          len = sep - p;
-          sep++;
-        }
-      else
-        len = patlim - p;
+      char const *sep = rawmemchr (p, '\n');
+      ptrdiff_t len = sep - p;
 
       bool backref = possible_backrefs_in_pattern (p, len, bs_safe);
 
@@ -247,7 +244,7 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
       if (!regex_compile (dc, p, len, dc->pcount, lineno, !backref))
         compilation_failed = true;
 
-      p = sep;
+      p = sep + 1;
       lineno++;
 
       if (backref)
@@ -256,12 +253,12 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
           prev = p;
         }
     }
-  while (p);
+  while (p <= patlim);
 
   if (compilation_failed)
     exit (EXIT_TROUBLE);
 
-  if (prev != NULL)
+  if (prev <= patlim)
     {
       if (pattern < prev)
         {
@@ -383,14 +380,19 @@ EGexecute (void *vdc, char const *buf, size_t size, size_t *match_size,
                  greater of the latter two values; this temporarily prefers
                  the DFA to KWset.  */
               exact_kwset_match = kwsm.index < dc->kwset_exact_matches;
-              end = ((exact_kwset_match || !dfafast
-                      || MAX (16, match - beg) < (match - prev_beg) >> 2)
-                     ? match
-                     : MAX (16, match - beg) < (buflim - prev_beg) >> 2
-                     ? prev_beg + 4 * MAX (16, match - beg)
-                     : buflim);
-              end = memchr (end, eol, buflim - end);
-              end = end ? end + 1 : buflim;
+              if (exact_kwset_match || !dfafast
+                  || MAX (16, match - beg) < (match - prev_beg) >> 2)
+                {
+                  end = rawmemchr (match, eol);
+                  end++;
+                }
+              else if (MAX (16, match - beg) < (buflim - prev_beg) >> 2)
+                {
+                  end = rawmemchr (prev_beg + 4 * MAX (16, match - beg), eol);
+                  end++;
+                }
+              else
+                end = buflim;
 
               if (exact_kwset_match)
                 {
@@ -425,8 +427,8 @@ EGexecute (void *vdc, char const *buf, size_t size, size_t *match_size,
                   beg++;
                   dfa_beg = beg;
                 }
-              end = memchr (next_beg, eol, buflim - next_beg);
-              end = end ? end + 1 : buflim;
+              end = rawmemchr (next_beg, eol);
+              end++;
 
               count = 0;
             }
@@ -446,8 +448,8 @@ EGexecute (void *vdc, char const *buf, size_t size, size_t *match_size,
               beg = memrchr (buf, eol, next_beg - buf);
               beg++;
             }
-          end = memchr (next_beg, eol, buflim - next_beg);
-          end = end ? end + 1 : buflim;
+          end = rawmemchr (next_beg, eol);
+          end++;
 
           /* Successful, no back-references encountered! */
           if (!backref)
diff --git a/src/grep.c b/src/grep.c
index ce2f291..d058a76 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -164,7 +164,7 @@ update_patterns (char *keys, ptrdiff_t dupfree_size, ptrdiff_t size,
   ptrdiff_t patsize;
   for (char const *src = keys + dupfree_size; src < srclim; src += patsize)
     {
-      char const *patend = memchr (src, '\n', srclim - src);
+      char const *patend = rawmemchr (src, '\n');
       patsize = patend + 1 - src;
       memmove (dst, src, patsize);
 
@@ -1104,8 +1104,7 @@ static void
 nlscan (char const *lim)
 {
   size_t newlines = 0;
-  char const *beg;
-  for (beg = lastnl; beg < lim; beg++)
+  for (char const *beg = lastnl; beg < lim; beg++)
     {
       beg = memchr (beg, eolbyte, lim - beg);
       if (!beg)
@@ -1353,7 +1352,7 @@ prpending (char const *lim)
     lastout = bufbeg;
   for (; 0 < pending && lastout < lim; pending--)
     {
-      char *nl = memchr (lastout, eolbyte, lim - lastout);
+      char *nl = rawmemchr (lastout, eolbyte);
       prline (lastout, nl + 1, SEP_CHAR_REJECTED);
     }
 }
@@ -1394,7 +1393,7 @@ prtext (char *beg, char *lim)
 
       while (p < beg)
         {
-          char *nl = memchr (p, eol, beg - p);
+          char *nl = rawmemchr (p, eol);
           nl++;
           prline (p, nl, SEP_CHAR_REJECTED);
           p = nl;
@@ -1407,7 +1406,7 @@ prtext (char *beg, char *lim)
       /* One or more lines are output.  */
       for (n = 0; p < lim && n < outleft; n++)
         {
-          char *nl = memchr (p, eol, lim - p);
+          char *nl = rawmemchr (p, eol);
           nl++;
           if (!out_quiet)
             prline (p, nl, SEP_CHAR_SELECTED);
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 6f6d4d0..7081060 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -43,14 +43,13 @@ struct kwsearch
   void *re;
 };
 
-/* Compile the -F style PATTERN, containing SIZE bytes.  Return a
-   description of the compiled pattern.  */
+/* Compile the -F style PATTERN, containing SIZE bytes that are
+   followed by '\n'.  Return a description of the compiled pattern.  */
 
 void *
 Fcompile (char *pattern, size_t size, reg_syntax_t ignored)
 {
   kwset_t kwset;
-  ptrdiff_t total = size;
   char *buf = NULL;
   size_t bufalloc = 0;
 
@@ -59,23 +58,12 @@ Fcompile (char *pattern, size_t size, reg_syntax_t ignored)
   char const *p = pattern;
   do
     {
-      ptrdiff_t len;
-      char const *sep = memchr (p, '\n', total);
-      if (sep)
-        {
-          len = sep - p;
-          sep++;
-          total -= (len + 1);
-        }
-      else
-        {
-          len = total;
-          total = 0;
-        }
+      char const *sep = rawmemchr (p, '\n');
+      ptrdiff_t len = sep - p;
 
       if (match_lines)
         {
-          if (eolbyte == '\n' && pattern < p && sep)
+          if (eolbyte == '\n' && pattern < p)
             p--;
           else
             {
@@ -94,9 +82,9 @@ Fcompile (char *pattern, size_t size, reg_syntax_t ignored)
         }
       kwsincr (kwset, p, len);
 
-      p = sep;
+      p = sep + 1;
     }
-  while (p);
+  while (p <= pattern + size);
 
   free (buf);
   ptrdiff_t words = kwswords (kwset);
@@ -259,8 +247,14 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
                                                kwsearch->size,
                                                RE_SYNTAX_GREP);
                   }
-                end = memchr (beg + len, eol, (buf + size) - (beg + len));
-                end = end ? end + 1 : buf + size;
+                if (beg + len < buf + size)
+                  {
+                    end = rawmemchr (beg + len, eol);
+                    end++;
+                  }
+                else
+                  end = buf + size;
+
                 if (EGexecute (kwsearch->re, beg, end - beg, match_size, NULL)
                     != (size_t) -1)
                   goto success_match_words;
@@ -285,8 +279,13 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
   return -1;
 
  success:
-  end = memchr (beg + len, eol, (buf + size) - (beg + len));
-  end = end ? end + 1 : buf + size;
+  if (beg + len < buf + size)
+    {
+      end = rawmemchr (beg + len, eol);
+      end++;
+    }
+  else
+    end = buf + size;
  success_match_words:
   beg = memrchr (buf, eol, beg - buf);
   beg = beg ? beg + 1 : buf;
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 15a6a59..2fcbf8e 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -107,6 +107,9 @@ jit_exec (struct pcre_comp *pc, char const *subject, int search_bytes,
     }
 }
 
+/* Compile the -P style PATTERN, containing SIZE bytes that are
+   followed by '\n'.  Return a description of the compiled pattern.  */
+
 void *
 Pcompile (char *pattern, size_t size, reg_syntax_t ignored)
 {
@@ -120,7 +123,7 @@ Pcompile (char *pattern, size_t size, reg_syntax_t ignored)
                          sizeof xprefix - 1 + sizeof xsuffix - 1);
   char *re = xnmalloc (4, size + (fix_len_max + 4 - 1) / 4);
   int flags = PCRE_DOLLAR_ENDONLY | (match_icase ? PCRE_CASELESS : 0);
-  char const *patlim = pattern + size;
+  char *patlim = pattern + size;
   char *n = re;
   char const *p;
   char const *pnul;
@@ -134,7 +137,7 @@ Pcompile (char *pattern, size_t size, reg_syntax_t ignored)
     }
 
   /* FIXME: Remove this restriction.  */
-  if (memchr (pattern, '\n', size))
+  if (rawmemchr (pattern, '\n') != patlim)
     die (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
 
   *n = '\0';
@@ -148,7 +151,8 @@ Pcompile (char *pattern, size_t size, reg_syntax_t ignored)
      replace each NUL byte in the pattern with the four characters
      "\000", removing a preceding backslash if there are an odd
      number of backslashes before the NUL.  */
-  for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
+  *patlim = '\0';
+  for (p = pattern; (pnul = p + strlen (p)) < patlim; p = pnul + 1)
     {
       memcpy (n, p, pnul - p);
       n += pnul - p;
@@ -158,10 +162,10 @@ Pcompile (char *pattern, size_t size, reg_syntax_t ignored)
       strcpy (n, "\\000");
       n += 4;
     }
-
-  memcpy (n, p, patlim - p);
+  memcpy (n, p, patlim - p + 1);
   n += patlim - p;
-  *n = '\0';
+  *patlim = '\n';
+
   if (match_words)
     strcpy (n, wsuffix);
   if (match_lines)
@@ -219,7 +223,7 @@ Pexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
          PCRE_MULTILINE for performance, the performance wasn't always
          better and the correctness issues were too puzzling.  See
          Bug#22655.  */
-      line_end = memchr (p, eolbyte, buf + size - p);
+      line_end = rawmemchr (p, eolbyte);
       if (INT_MAX < line_end - p)
         die (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
 
-- 
2.17.1

bug#43040: grep 3.4: memory leak

Reply via email to