On 01/23/2013 11:58 PM, Lei Zhang wrote:
> Hi All,
> 
> In the `split' program of coreutils 8.20, we found a bug which is similar
> to the bug reported in bug#13530:
> 
> Invoking `split -C T' or `split -C P' or `split -C E' will cause memory
> exhaustion.
> 
> However, smaller units (e.g., K, M, G) work fine; bigger units (e.g., Z, Y)
> fail properly (by outputing "invalid number of bytes").
> 
> A bit dig reveals that the bug is introduced at line 1412 of split.c
> (coreutils 8.20):
> 
> 1400: switch (split_type)
> 1401: {
>       ...
> 1411:   case type_byteslines:
> 1412:     line_bytes_split (n_units); /* this function calls
> xmalloc(n_bytes), which fails. */
> 1413:     break;
>       ...
> 1434: }
> 
> `n_units' before that statement can be very large. In the case of `split -C
> T', it is 1099511627776.
> 
> We think this is a bug similar to the one reported in bug#13530, so we
> bring this issue to your attention. Thanks!

I intend to push the attached soon to fix this.

thanks,
Pádraig.

>From f3548e6fd20755629c33e7e0d87b1c455677aedd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]>
Date: Sat, 20 Apr 2013 08:46:43 +0100
Subject: [PATCH] split: change -C to only allocate memory when required

* src/split.c (line_bytes_split): Rewrite to only buffer
when necessary.  I.E. only increase the buffer when we've
already lines output in a split and we encounter a line
larger than the input buffer size, in which case a hold
buffer will be increased in increments of the input buffer size.
* tests/split/line-bytes.sh: Add a new test for this
function which previously had no test coverage.
* tests/local.mk: Reference the new test.
* NEWS: Mention the improvement.
Fixes http://bugs.gnu.org/13537
---
 NEWS                      |    3 +
 src/split.c               |  137 +++++++++++++++++++++++++++++++--------------
 tests/local.mk            |    1 +
 tests/split/line-bytes.sh |   85 ++++++++++++++++++++++++++++
 4 files changed, 184 insertions(+), 42 deletions(-)
 create mode 100755 tests/split/line-bytes.sh

diff --git a/NEWS b/NEWS
index fc05b8a..e9a8efa 100644
--- a/NEWS
+++ b/NEWS
@@ -41,6 +41,9 @@ GNU coreutils NEWS                                    -*- outline -*-
   Reservoir sampling is used to limit memory usage based on the number of
   outputs, rather than the number of inputs.
 
+  split --line-bytes=SIZE, now only allocates memory as needed rather
+  than allocating SIZE bytes at program start.
+
 ** Build-related
 
   factor now builds on aarch64 based systems [bug introduced in coreutils-8.20]
diff --git a/src/split.c b/src/split.c
index e5e75f0..e6a7ce7 100644
--- a/src/split.c
+++ b/src/split.c
@@ -616,62 +616,115 @@ lines_split (uintmax_t n_lines, char *buf, size_t bufsize)
     }
   while (n_read == bufsize);
 }
-
+
 /* Split into pieces that are as large as possible while still not more
    than N_BYTES bytes, and are split on line boundaries except
-   where lines longer than N_BYTES bytes occur.
-   FIXME: Allow N_BYTES to be any uintmax_t value, and don't require a
-   buffer of size N_BYTES, in case N_BYTES is very large.  */
+   where lines longer than N_BYTES bytes occur. */
 
 static void
-line_bytes_split (size_t n_bytes)
+line_bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize)
 {
-  char *bp;
-  bool eof = false;
-  size_t n_buffered = 0;
-  char *buf = xmalloc (n_bytes);
+  size_t n_read;
+  uintmax_t n_out = 0;      /* for each split.  */
+  size_t n_hold = 0;
+  char *hold = NULL;        /* for lines > bufsize.  */
+  size_t hold_size = 0;
+  bool split_line = false;  /* Whether a \n was output in a split.  */
 
   do
     {
-      /* Fill up the full buffer size from the input file.  */
-
-      size_t to_read = n_bytes - n_buffered;
-      size_t n_read = full_read (STDIN_FILENO, buf + n_buffered, to_read);
-      if (n_read < to_read && errno)
+      n_read = full_read (STDIN_FILENO, buf, bufsize);
+      if (n_read < bufsize && errno)
         error (EXIT_FAILURE, errno, "%s", infile);
-
-      n_buffered += n_read;
-      if (n_buffered != n_bytes)
+      size_t n_left = n_read;
+      char *sob = buf;
+      while (n_left)
         {
-          if (n_buffered == 0)
-            break;
-          eof = true;
-        }
+          size_t split_rest = 0;
+          char *eoc = NULL;
+          char *eol;
 
-      /* Find where to end this chunk.  */
-      bp = buf + n_buffered;
-      if (n_buffered == n_bytes)
-        {
-          while (bp > buf && bp[-1] != '\n')
-            bp--;
-        }
+          /* Determine End Of Chunk and/or End of Line,
+             which are used below to select what to write or buffer.  */
+          if (n_bytes - n_out - n_hold <= n_left)
+            {
+              /* Have enough for split.  */
+              split_rest = n_bytes - n_out - n_hold;
+              eoc = sob + split_rest - 1;
+              eol = memrchr (sob, '\n', split_rest);
+            }
+          else
+            eol = memrchr (sob, '\n', n_left);
+
+          /* Output hold space if possible.  */
+          if (n_hold && !(!eol && n_out))
+            {
+              cwrite (n_out == 0, hold, n_hold);
+              n_out += n_hold;
+              if (n_hold > bufsize)
+                hold = xrealloc (hold, bufsize);
+              n_hold = 0;
+              hold_size = bufsize;
+            }
+
+          /* Output to eol if present.  */
+          if (eol)
+            {
+              split_line = true;
+              size_t n_write = eol - sob + 1;
+              cwrite (n_out == 0, sob, n_write);
+              n_out += n_write;
+              n_left -= n_write;
+              sob += n_write;
+              if (eoc)
+                split_rest -= n_write;
+            }
 
-      /* If chunk has no newlines, use all the chunk.  */
-      if (bp == buf)
-        bp = buf + n_buffered;
+          /* Output to eoc or eob if possible.  */
+          if (n_left && !split_line)
+            {
+              size_t n_write = eoc ? split_rest : n_left;
+              cwrite (n_out == 0, sob, n_write);
+              n_out += n_write;
+              n_left -= n_write;
+              sob += n_write;
+              if (eoc)
+                split_rest -= n_write;
+            }
 
-      /* Output the chars as one output file.  */
-      cwrite (true, buf, bp - buf);
+          /* Update hold if needed.  */
+          if ((eoc && split_rest) || (!eoc && n_left))
+            {
+              size_t n_buf = eoc ? split_rest : n_left;
+              if (hold_size - n_hold < n_buf)
+                {
+                  if (hold_size <= SIZE_MAX - bufsize)
+                    hold_size += bufsize;
+                  else
+                    error (EXIT_FAILURE, 0, "%s", _("memory exhausted"));
+                  hold = xrealloc (hold, hold_size);
+                }
+              memcpy (hold + n_hold, sob, n_buf);
+              n_hold += n_buf;
+              n_left -= n_buf;
+              sob += n_buf;
+            }
 
-      /* Discard the chars we just output; move rest of chunk
-         down to be the start of the next chunk.  Source and
-         destination probably overlap.  */
-      n_buffered -= bp - buf;
-      if (n_buffered > 0)
-        memmove (buf, bp, n_buffered);
+          /* Reset for new split.  */
+          if (eoc)
+            {
+              n_out = 0;
+              split_line = false;
+            }
+        }
     }
-  while (!eof);
-  free (buf);
+  while (n_read == bufsize);
+
+  /* Handle no eol at end of file.  */
+  if (n_hold)
+    cwrite (n_out == 0, hold, n_hold);
+
+  free (hold);
 }
 
 /* -n l/[K/]N: Write lines to files of approximately file size / N.
@@ -1408,7 +1461,7 @@ main (int argc, char **argv)
       break;
 
     case type_byteslines:
-      line_bytes_split (n_units);
+      line_bytes_split (n_units, buf, in_blk_size);
       break;
 
     case type_chunk_bytes:
diff --git a/tests/local.mk b/tests/local.mk
index f47da8d..b255464 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -345,6 +345,7 @@ all_tests =					\
   tests/split/b-chunk.sh			\
   tests/split/fail.sh				\
   tests/split/lines.sh				\
+  tests/split/line-bytes.sh			\
   tests/split/l-chunk.sh			\
   tests/split/r-chunk.sh			\
   tests/split/numeric.sh			\
diff --git a/tests/split/line-bytes.sh b/tests/split/line-bytes.sh
new file mode 100755
index 0000000..ed2baac
--- /dev/null
+++ b/tests/split/line-bytes.sh
@@ -0,0 +1,85 @@
+#!/bin/sh
+# test -C, --lines-bytes
+
+# Copyright (C) 2013 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ split
+
+
+# Ensure memory is not allocated up front
+(ulimit -v 20000; split -C 'E' /dev/null || fail=1)
+
+
+# Ensure correct operation with various split and buffer size combinations
+
+lines=\
+1~2222~3~4
+
+printf '%s' "$lines" | tr '~' '\n' > in || framework_failure_
+
+cat <<\EOF > splits_exp
+1 1 1 1 1 1 1 1 1 1
+2 2 2 1 2 1
+2 3 2 2 1
+2 4 3 1
+2 5 3
+2 5 3
+7 3
+7 3
+9 1
+9 1
+10
+EOF
+
+seq 0 9 | tr -d '\n' > no_eol_in
+
+cat <<\EOF > no_eol_splits_exp
+1 1 1 1 1 1 1 1 1 1
+2 2 2 2 2
+3 3 3 1
+4 4 2
+5 5
+6 4
+7 3
+8 2
+9 1
+10
+10
+EOF
+
+for b in $(seq 10); do
+  : > splits
+  : > no_eol_splits
+  for s in $(seq 11); do
+    rm x??
+    split ---io=$b -C$s in || fail=1
+    cat x* > out || framework_failure_
+    compare in out || fail=1
+    stat -c %s x* | paste -s -d ' ' >> splits
+
+    rm x??
+    split ---io=$b -C$s no_eol_in || fail=1
+    cat x* > out || framework_failure_
+    cat xaa
+    compare no_eol_in out || fail=1
+    stat -c %s x* | paste -s -d ' ' >> no_eol_splits
+  done
+  compare splits_exp splits || fail=1
+  compare no_eol_splits_exp no_eol_splits || fail=1
+done
+
+Exit $fail
-- 
1.7.7.6

Reply via email to