On 04/26/2013 05:07 PM, Pádraig Brady wrote:
> I've rebased this to master and attached.
> The rebase wasn't trivial so I might have messed up.
> The cut.pl test is now failing on master. Could you have a look.
> Also could you add a test (or just a bit of shell) to demonstrate
> which options the memory is not allocated for example.
> Ideally some pathological option combo that no longer
> allocates huge amounts of RAM.
I refactored a little more (see next_item()).
Also split to two patches and added some benchmarks.
Will push the attached in a few hours.
thanks,
Pádraig.
>From d28d239bec5371f88d0779836af2107abd624258 Mon Sep 17 00:00:00 2001
From: Cojocaru Alexandru
Date: Sun, 9 Dec 2012 10:43:10 +0100
Subject: [PATCH 1/2] cut: make memory allocation independent of range width
The current implementation of cut, uses a bit array,
an array of `struct range_pair's, and (when --output-delimiter
is specified) a hash_table. The new implementation will use
only an array of `struct range_pair's.
The old implementation is memory inefficient because:
1. When -b with a big num is specified, it allocates a lot of
memory for `printable_field'.
2. When --output-delimiter is specified, it will allocate 31 buckets.
Even if a few ranges are specified.
Note CPU overhead is increased to determine if an item is to be printed,
as shown by:
$ yes abcdfeg | head -n1MB > big-file
$ for c in with-bitarray without-bitarray; do
src/cut-$c 2>/dev/null
echo -ne "\n== $c =="
time src/cut-$c -b1,3 big-file > /dev/null
done
== with-bitarray ==
real0m0.084s
user0m0.078s
sys 0m0.006s
== without-bitarray ==
real0m0.111s
user0m0.108s
sys 0m0.002s
Further patches will reduce this overhead without using a bit array.
* src/cut.c (set_fields): Set and initialize RP
instead of printable_field.
* src/cut.c (is_range_start_index): Use CURRENT_RP rather than a hash.
* tests/misc/cut.pl: Check if `eol_range_start' is set correctly.
* tests/misc/cut-huge-range.sh: Rename from cut-huge-to-eol-range.sh,
and add a test to verify large amounts of mem aren't allocated.
Fixes http://bugs.gnu.org/13127
---
src/cut.c | 294 +++-
tests/local.mk |2 +-
...{cut-huge-to-eol-range.sh => cut-huge-range.sh} |4 +
tests/misc/cut.pl |4 +
4 files changed, 111 insertions(+), 193 deletions(-)
rename tests/misc/{cut-huge-to-eol-range.sh => cut-huge-range.sh} (84%)
diff --git a/src/cut.c b/src/cut.c
index 494aad7..8738c46 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -53,8 +53,31 @@
} \
while (0)
+
+struct range_pair
+ {
+size_t lo;
+size_t hi;
+ };
+
+/* Array of `struct range_pair' holding all the finite ranges. */
+static struct range_pair *rp;
+
+/* Pointer inside RP. When checking if a byte or field is selected
+ by a finite range, we check if it is between CURRENT_RP.LO
+ and CURRENT_RP.HI. If the byte or field index is greater than
+ CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */
+static struct range_pair *current_rp;
+
+/* Number of finite ranges specified by the user. */
+static size_t n_rp;
+
+/* Number of `struct range_pair's allocated. */
+static size_t n_rp_allocated;
+
+
/* Append LOW, HIGH to the list RP of range pairs, allocating additional
- space if necessary. Update local variable N_RP. When allocating,
+ space if necessary. Update global variable N_RP. When allocating,
update global variable N_RP_ALLOCATED. */
#define ADD_RANGE_PAIR(rp, low, high) \
@@ -72,11 +95,6 @@
} \
while (0)
-struct range_pair
- {
-size_t lo;
-size_t hi;
- };
/* This buffer is used to support the semantics of the -s option
(or lack of same) when the specified field list includes (does
@@ -90,26 +108,11 @@ static char *field_1_buffer;
/* The number of bytes allocated for FIELD_1_BUFFER. */
static size_t field_1_bufsize;
-/* The largest field or byte index used as an endpoint of a closed
- or degenerate range specification; this doesn't include the starting
- index of right-open-ended ranges. For example, with either range spec
- '2-5,9-', '2-3,5,9-' this variable would be set to 5. */
-static size_t max_range_endpoint;
/* If nonzero, this is the index of the first field in a range that goes
to end of line. */
static size_t eol_range_start;
-/* This is a bit vector.
- In byte mode, which bytes to output.
- In field mode, which DELIM-separated fields to output.
- Both bytes and fields are numbered starting with 1,
- so the zeroth bit of this array is unused.
- A field or byte K has been selected if
- (K <= MAX_RANGE_ENDPOINT and is_printable_field(K))
-|| (EOL_RANGE_START > 0 && K >= EOL_RANGE_START). */
-static unsigned char *printable_field;
-
enum operating_mode
{
undefined_mode,
@@ -148,15 +151,6 @@ static char *output