Hello,
Attached is a patch that gives 'csplit' the ability to split files by the
content of a field.
A typical usage is:
## the "@1" pattern means "start a new file when field 1 changes"
$ printf "A\nA\nB\nB\nB\nC\n" | csplit - @1 {*}
$ wc -l xx*
2 xx00
3 xx01
1 xx02
6 total
$ head xx*
==> xx00 <==
A
A
==> xx01 <==
B
B
B
==> xx02 <==
C
This is just a proof of concept, and the pattern specification can be changed
(I think "@N" doesn't conflict with any existing pattern).
The same can probably be achieved using other programs (awk comes to mind), but
it won't be as simple and clean (with all of csplit's output features).
Let me know if you're willing to consider such an addition.
Thanks,
-gordon
>From 074614c0764c278e8abd9d41af4ce626fefd6cfc Mon Sep 17 00:00:00 2001
From: Assaf Gordon <[email protected]>
Date: Wed, 6 Feb 2013 16:40:00 -0500
Subject: [PATCH] csplit: split files by field-change
src/csplit.c: create a new output file whenever field content changes.
---
src/csplit.c | 237 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 230 insertions(+), 7 deletions(-)
diff --git a/src/csplit.c b/src/csplit.c
index 22f3ad4..ec725d2 100644
--- a/src/csplit.c
+++ b/src/csplit.c
@@ -44,6 +44,13 @@
/* The default prefix for output file names. */
#define DEFAULT_PREFIX "xx"
+enum csplit_type /* How a control record splits the input. */
+ {
+ CSPLIT_LINE, /* Handled by process_line_count. */
+ CSPLIT_REGEXPR, /* Handled by process_regexp. */
+ CSPLIT_FIELD_CHANGE /* Handled by process_field_change. */
+ };
+
/* A compiled pattern arg. */
struct control
{
@@ -53,8 +60,9 @@ struct control
int argnum; /* ARGV index. */
bool repeat_forever; /* True if '*' used as a repeat count. */
bool ignore; /* If true, produce no output (for regexp). */
- bool regexpr; /* True if regular expression was used. */
+ enum csplit_type type; /* Split type: line/regex/field */
struct re_pattern_buffer re_compiled; /* Compiled regular expression. */
+ uintmax_t field; /* Field to monitor for change */
};
/* Initial size of data area in buffers. */
@@ -176,6 +184,16 @@ static size_t control_used;
/* The set of signals that are caught. */
static sigset_t caught_signals;
+/* If delimiter has this value, blanks separate fields.  The value is one
+   more than CHAR_MAX, so it cannot equal any char read from the input. */
+enum { DELIMITER_DEFAULT = CHAR_MAX + 1 };
+
+/* The delimiter to use for field extraction. */
+static int delimiter = DELIMITER_DEFAULT;
+
+/* The content of the monitored field from an earlier line, to be compared
+   with the same field of the current line. */
+static struct cstring last_field;
+
static struct option const longopts[] =
{
{"digits", required_argument, NULL, 'n'},
@@ -185,6 +203,7 @@ static struct option const longopts[] =
{"elide-empty-files", no_argument, NULL, 'z'},
{"prefix", required_argument, NULL, 'f'},
{"suffix-format", required_argument, NULL, 'b'},
+ {"delimiter", required_argument, NULL, 'd'},
{GETOPT_HELP_OPTION_DECL},
{GETOPT_VERSION_OPTION_DECL},
{NULL, 0, NULL, 0}
@@ -867,6 +886,169 @@ process_regexp (struct control *p, uintmax_t repetition)
current_line = break_line;
}
+/* Skip FIELDS fields in the LEN bytes starting at BUF.
+   Return a pointer to the *delimiter* that follows the last skipped field,
+   or a pointer to an empty string if the end of the input was reached.
+   If FIELDS is zero and LEN is nonzero, return BUF.
+
+   NOTE: BUF is *not* expected to be a NUL-terminated string.
+   The end of the input is determined by LEN.  */
+static inline char *
+__attribute ((pure))
+skip_fields (char *buf, size_t len, uintmax_t fields)
+{
+  static char null_str[] = "";
+
+  char *ptr = buf;
+  if (delimiter != DELIMITER_DEFAULT)
+    {
+      /* A leading delimiter terminates an empty first field.  Test LEN
+         and FIELDS first so that an empty buffer is never read and the
+         unsigned count cannot wrap around.  */
+      if (len && fields && *ptr == delimiter)
+        fields--;
+      for (; len && fields; fields--)
+        {
+          while (len && *ptr == delimiter)
+            {
+              ++ptr;
+              --len;
+            }
+          while (len && *ptr != delimiter)
+            {
+              ++ptr;
+              --len;
+            }
+        }
+    }
+  else
+    for (; len && fields; fields--)
+      {
+        /* Convert to unsigned char before calling isblank: passing a
+           negative char value is undefined behavior.  */
+        while (len && isblank ((unsigned char) *ptr))
+          {
+            --len;
+            ++ptr;
+          }
+        while (len && !isblank ((unsigned char) *ptr))
+          {
+            ++ptr;
+            --len;
+          }
+      }
+
+  if (len == 0)
+    return null_str;
+
+  return ptr;
+}
+
+/* Store a copy of the LEN bytes at STR in last_field, resizing its
+   buffer to LEN bytes.  */
+static void
+set_last_field (const char* str, size_t len)
+{
+  char *copy = xrealloc (last_field.str, len);
+
+  memcpy (copy, str, len);
+  last_field.str = copy;
+  last_field.len = len;
+}
+
+/* Set the recorded length of last_field to zero; its buffer is kept.  */
+static void
+reset_last_field (void)
+{
+  last_field.len = 0;
+}
+
+/* Release the buffer held by last_field and leave it empty.  */
+static void
+free_last_field (void)
+{
+  free (last_field.str);
+  last_field.str = NULL;
+  last_field.len = 0;
+}
+
+/* Copy input lines to a new output file until the content of field
+   P->field differs from its content on the first line copied.  The line
+   on which the field changes is left in the input, so the next pattern
+   starts with it.
+   If the input ends first: when P->repeat_forever is set, write out the
+   rest of the file and exit successfully; otherwise report an error and
+   call cleanup_fatal.  A line with fewer than P->field fields is also
+   reported as an error.  */
+static void
+process_field_change (struct control *p)
+{
+  struct cstring *line;         /* From input file.  */
+  char *field_start;
+  char *field_end;
+  size_t field_len;
+  size_t line_len;
+  size_t eol_len;               /* Length from field_start to EOL.  */
+  bool have_last = false;       /* True once the first line's field has
+                                   been recorded.  A separate flag is
+                                   needed because a field may be empty,
+                                   so a zero length cannot mean "unset".  */
+
+  create_output_file ();
+
+  reset_last_field ();
+
+  while (true)
+    {
+      line = find_line (++current_line);
+      if (line == NULL)
+        {
+          /* No more input lines.  */
+          if (p->repeat_forever)
+            {
+              dump_rest_of_file ();
+              close_output_file ();
+              exit (EXIT_SUCCESS);
+            }
+          else
+            {
+              error (0, 0, _("not enough input lines for pattern %d "
+                             "field @%ju"), p->argnum, p->field);
+              cleanup_fatal ();
+            }
+        }
+
+      /* Exclude the trailing newline, if any, from field parsing.  */
+      line_len = line->len;
+      if (line_len && line->str[line_len - 1] == '\n')
+        --line_len;
+
+      /* Find the beginning of the field.  */
+      if (p->field > 1)
+        {
+          field_start = skip_fields (line->str, line_len, p->field - 1);
+          if (*field_start == '\0')
+            {
+              /* Line number first, then field number, matching the
+                 order of the conversions in the message.  */
+              error (0, 0, _("not enough input fields on line %ju "
+                             "(Looking for field @%ju)"),
+                     (uintmax_t) current_line, p->field);
+              cleanup_fatal ();
+            }
+          ++field_start;        /* Skip delimiter.  */
+        }
+      else
+        field_start = line->str;
+
+      /* Find the end of the field.  */
+      eol_len = line_len - (field_start - line->str);
+      field_end = skip_fields (field_start, eol_len, 1);
+      if (*field_end == '\0')
+        field_len = eol_len;
+      else
+        field_len = field_end - field_start;
+
+      if (!have_last)
+        {
+          /* First line of this piece: record its field value.  An empty
+             field needs no copy, since reset_last_field already set the
+             recorded length to zero.  */
+          if (field_len != 0)
+            set_last_field (field_start, field_len);
+          have_last = true;
+        }
+      else if (last_field.len != field_len
+               || (field_len != 0
+                   && memcmp (last_field.str, field_start, field_len) != 0))
+        {
+          /* Field changed, get out (but use the same line next time).  */
+          --current_line;
+          break;
+        }
+
+      /* Same value as the first line of this piece: copy the line.  */
+      line = remove_line ();
+      save_line_to_file (line);
+    }
+
+  close_output_file ();
+}
+
+
+
/* Split the input file according to the control records we have built. */
static void
@@ -877,17 +1059,25 @@ split_file (void)
for (i = 0; i < control_used; i++)
{
uintmax_t j;
- if (controls[i].regexpr)
+ switch (controls[i].type)
{
+ case CSPLIT_REGEXPR:
for (j = 0; (controls[i].repeat_forever
|| j <= controls[i].repeat); j++)
process_regexp (&controls[i], j);
- }
- else
- {
+ break;
+
+ case CSPLIT_LINE:
for (j = 0; (controls[i].repeat_forever
|| j <= controls[i].repeat); j++)
process_line_count (&controls[i], j);
+ break;
+
+ case CSPLIT_FIELD_CHANGE:
+ for (j = 0; (controls[i].repeat_forever
+ || j <= controls[i].repeat); j++)
+ process_field_change (&controls[i]);
+ break;
}
}
@@ -1039,7 +1229,7 @@ new_control_record (void)
if (control_used == control_allocated)
controls = X2NREALLOC (controls, &control_allocated);
p = &controls[control_used++];
- p->regexpr = false;
+ p->type = CSPLIT_LINE;
p->repeat = 0;
p->repeat_forever = false;
p->lines_required = 0;
@@ -1116,7 +1306,7 @@ extract_regexp (int argnum, bool ignore, char const *str)
p->argnum = argnum;
p->ignore = ignore;
- p->regexpr = true;
+ p->type = CSPLIT_REGEXPR;
p->re_compiled.buffer = NULL;
p->re_compiled.allocated = 0;
p->re_compiled.fastmap = xmalloc (UCHAR_MAX + 1);
@@ -1136,6 +1326,30 @@ extract_regexp (int argnum, bool ignore, char const *str)
return p;
}
+/* Parse the field pattern STR, whose first character is '@' and whose
+   remainder must be a decimal field number greater than zero.
+   Return a new control record of type CSPLIT_FIELD_CHANGE for it.
+   ARGNUM is the ARGV index of STR.
+   Report an error with exit status EXIT_FAILURE if the number is
+   invalid or zero.  */
+static struct control *
+extract_field_pattern (int argnum, char const *str)
+{
+  char const *digits = str + 1;         /* Text after the leading '@'.  */
+  uintmax_t field_number = 0;
+  struct control *rec;
+
+  rec = new_control_record ();
+  rec->type = CSPLIT_FIELD_CHANGE;
+  rec->argnum = argnum;
+
+  if (xstrtoumax (digits, NULL, 10, &field_number, "") != LONGINT_OK)
+    error (EXIT_FAILURE, 0, _("%s: invalid field number"), digits);
+  if (field_number == 0)
+    error (EXIT_FAILURE, 0,
+           _("%s: field number must be greater than zero"),
+           digits);
+
+  rec->field = field_number;
+  return rec;
+}
+
/* Extract the break patterns from args START through ARGC - 1 of ARGV.
After each pattern, check if the next argument is a repeat count. */
@@ -1154,6 +1368,11 @@ parse_patterns (int argc, int start, char **argv)
p = extract_regexp (i, *argv[i] == '%', argv[i]);
}
else
+ if (*argv[i] == '@')
+ {
+ p = extract_field_pattern (i, argv[i]);
+ }
+ else
{
p = new_control_record ();
p->argnum = i;
@@ -1432,6 +1651,8 @@ main (int argc, char **argv)
split_file ();
+ free_last_field ();
+
if (close (STDIN_FILENO) != 0)
{
error (0, errno, _("read error"));
@@ -1461,6 +1682,7 @@ and output byte counts of each piece to standard output.\n\
fputs (_("\
-b, --suffix-format=FORMAT use sprintf FORMAT instead of %02d\n\
+ -d, --delimiter=X use X instead of whitespace for field delimiter\n\
-f, --prefix=PREFIX use PREFIX instead of 'xx'\n\
-k, --keep-files do not remove output files on errors\n\
"), stdout);
@@ -1480,6 +1702,7 @@ Read standard input if FILE is -. Each PATTERN may be:\n\
INTEGER copy up to but not including specified line number\n\
/REGEXP/[OFFSET] copy up to but not including a matching line\n\
%REGEXP%[OFFSET] skip to, but not including a matching line\n\
+ @N start a new file every time field N changes\n\
{INTEGER} repeat the previous pattern specified number of times\n\
{*} repeat the previous pattern as many times as possible\n\
\n\
--
1.7.7.4