Hi all, > Pádraig Brady writes:
> > > For reference bash and ksh support ansi c quoting like $'\0' > > so you could specify -t $'\0'. Since this is only bash and ksh, I'm guessing it'd be best to write our own parsing, to support something like -t\n or -t\0 for shells that don't support ANSI C quoting? At any rate, I figure it's about time I shared what I have so far, in case you guys want to look at it or play around with it. This is everything, minus parsing issues (i.e. -t is implemented, but depends on shell parsing) and minus handling of non-seekable files. From 8296bb34e51b93a339c51cdf16160cf3e9753d2b Mon Sep 17 00:00:00 2001 From: Chen Guo <cheng...@yahoo.com> Date: Wed, 16 Dec 2009 15:07:38 +0100 Subject: [PATCH] Split: extend --bytes and --lines to divide file into N roughly equal pieces, or to extract Kth of N said pieces Add --number option (equivalent to --bytes=/N and --bytes=K/N). Add -t option to specify delineation character --- src/split.c | 285 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 files changed, 275 insertions(+), 10 deletions(-) diff --git a/src/split.c b/src/split.c index d1a0e0d..98a4e0d 100644 --- a/src/split.c +++ b/src/split.c @@ -72,6 +72,9 @@ static int output_desc; output file is opened. */ static bool verbose; +/* End of line character */ +static char eol; + /* For long options that have no equivalent short option, use a non-character as a pseudo short option, starting with CHAR_MAX + 1. 
*/ enum @@ -81,9 +84,11 @@ enum static struct option const longopts[] = { + {"break", required_argument, NULL, 't'}, {"bytes", required_argument, NULL, 'b'}, {"lines", required_argument, NULL, 'l'}, {"line-bytes", required_argument, NULL, 'C'}, + {"number", required_argument, NULL, 'n'}, {"suffix-length", required_argument, NULL, 'a'}, {"numeric-suffixes", no_argument, NULL, 'd'}, {"verbose", no_argument, NULL, VERBOSE_OPTION}, @@ -116,9 +121,18 @@ Mandatory arguments to long options are mandatory for short options too.\n\ fprintf (stdout, _("\ -a, --suffix-length=N use suffixes of length N (default %d)\n\ -b, --bytes=SIZE put SIZE bytes per output file\n\ + -b, --bytes=/N generate N output files\n\ + -b, --bytes=K/N print Kth of N chunks of file\n\ -C, --line-bytes=SIZE put at most SIZE bytes of lines per output file\n\ -d, --numeric-suffixes use numeric suffixes instead of alphabetic\n\ -l, --lines=NUMBER put NUMBER lines per output file\n\ + -l, --lines=/N generate N newline delineated output files\n\ + -l, --lines=K/N print Kth of N newline delineated chunks\n\ + -n, --number=N same as --bytes=/N\n\ + -n, --number=K/N same as --bytes=K/N\n\ + -t specify delineation character. This will also\n\ + convert options such as -b to their delineated\n\ + equivalent (-l or -C, depending on context)\n\ "), DEFAULT_SUFFIX_LENGTH); fputs (_("\ --verbose print a diagnostic just before each\n\ @@ -218,13 +232,14 @@ cwrite (bool new_file_flag, const char *bp, size_t bytes) Use buffer BUF, whose size is BUFSIZE. 
*/ static void -bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize) +bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, uintmax_t max_files) { size_t n_read; bool new_file_flag = true; size_t to_read; uintmax_t to_write = n_bytes; char *bp_out; + uintmax_t opened = 1; do { @@ -251,7 +266,8 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize) cwrite (new_file_flag, bp_out, w); bp_out += w; to_read -= w; - new_file_flag = true; + new_file_flag = (opened++ < max_files || !max_files)? + true : false; to_write = n_bytes; } } @@ -277,10 +293,10 @@ lines_split (uintmax_t n_lines, char *buf, size_t bufsize) error (EXIT_FAILURE, errno, "%s", infile); bp = bp_out = buf; eob = bp + n_read; - *eob = '\n'; + *eob = eol; for (;;) { - bp = memchr (bp, '\n', eob - bp + 1); + bp = memchr (bp, eol, eob - bp + 1); if (bp == eob) { if (eob != bp_out) /* do not write 0 bytes! */ @@ -323,7 +339,6 @@ line_bytes_split (size_t n_bytes) do { /* Fill up the full buffer size from the input file. */ - n_read = full_read (STDIN_FILENO, buf + n_buffered, n_bytes - n_buffered); if (n_read == SAFE_READ_ERROR) error (EXIT_FAILURE, errno, "%s", infile); @@ -340,7 +355,7 @@ line_bytes_split (size_t n_bytes) bp = buf + n_buffered; if (n_buffered == n_bytes) { - while (bp > buf && bp[-1] != '\n') + while (bp > buf && bp[-1] != eol) bp--; } @@ -362,6 +377,164 @@ line_bytes_split (size_t n_bytes) free (buf); } +/* Split into NUMBER newline delineated chunks. */ + +static void +line_chunk_split (uintmax_t number, char *buf, size_t bufsize, size_t file_size) +{ + size_t n_read; + uintmax_t chunk_no = 1; + off_t chunk_end = file_size / number - 1; + off_t offset = 0; + bool new_file_flag = true; + char *bp, *bp_out, *eob; + + while (offset < file_size) + { + bp = bp_out = buf; + n_read = full_read (STDIN_FILENO, buf, bufsize); + if (n_read == SAFE_READ_ERROR) + error (EXIT_FAILURE, errno, "%s", infile); + eob = buf + n_read; + + while (1) + { + char *seek = (offset < chunk_end)? 
bp + chunk_end - offset : bp; + if (seek > eob) + seek = eob; + bp_out = memchr (seek, eol, eob - seek); + if (!bp_out) + { + cwrite (new_file_flag, bp, eob - bp); + new_file_flag = false; + offset += eob - bp; + break; + } + else + bp_out++; + + cwrite (new_file_flag, bp, bp_out - bp); + chunk_end = (++chunk_no < number)? + chunk_end + file_size / number : file_size; + new_file_flag = true; + offset += bp_out - bp; + bp = bp_out; + /* A line could have been so long that it skipped + entire chunks. */ + while (chunk_end < offset) + { + chunk_end += file_size / number; + chunk_no++; + /* Create blank file: this ensures NUMBER files are + created. */ + cwrite (true, bp, 0); + } + } + } +} + +/* If file is seekable, extract Nth of TOTAL chunks. + FIXME: Handle non-ssekable files. */ + +static void +byte_chunk_extract (uintmax_t n, uintmax_t total, char *buf, size_t bufsize, + size_t file_size) +{ + off_t start = (n == 0)? 0 : (n - 1) * (file_size / total); + off_t end = (n == total)? file_size : n * (file_size / total); + ssize_t n_read; + size_t n_write; + + while (1) + { + n_read = pread (STDIN_FILENO, buf, bufsize, start); + if (n_read < 0) + error (EXIT_FAILURE, errno, "%s", infile); + n_write = (start + n_read <= end)? n_read : end - start; + if (write (STDOUT_FILENO, buf, n_write) != n_write) + error (EXIT_FAILURE, errno, "output error"); + start += n_read; + if (end <= start) + return; + } +} + +/* If file is seekable, extract lines whose first byte begins in the Nth of + TOTAL chunks. + FIXME: Handle non-seekable files. */ + +static void +line_chunk_extract (uintmax_t n, uintmax_t total, char* buf, size_t bufsize, + size_t file_size) +{ + ssize_t n_read; + bool end_of_chunk = false; + bool skip = true; + char *bp = buf, *bp_out = buf, *eob; + off_t start; + off_t end; + + /* For n != 1, start reading 1 byte before nth chunk of file. This is to + detect if the first byte of chunk is the first byte of a line. 
*/ + if (n == 1) + { + start = 0; + skip = false; + } + else + start = (n - 1) * (file_size / total) - 1; + end = (n == total)? file_size - 1 : n * (file_size / total) - 1; + + while (1) + { + n_read = pread (STDIN_FILENO, buf, bufsize, start); + if (n_read < 0) + error (EXIT_FAILURE, errno, "%s", infile); + bp = buf; + bp_out = buf + n_read; + eob = bp_out; + + /* Unless n == 1, skip past the first eol character + encountered. */ + if (skip) + { + bp = memchr (buf, eol, n_read); + if (bp) + { + if (bp - buf >= end - start) + return; + bp++; + skip = false; + } + else + if (start + n_read < end) + { + start += n_read; + continue; + } + else + return; + } + if (start + n_read >= end && end == file_size -1) + end_of_chunk = true; + else if (start + n_read >= end) + { + char *base = (buf + end - start < buf)? buf : buf + end - start; + bp_out = memchr (base, eol, eob - base); + if (bp_out) + { + bp_out++; + end_of_chunk = true; + } + } + start += n_read; + if (write (STDOUT_FILENO, bp, bp_out - bp) != bp_out - bp) + error (EXIT_FAILURE, errno, "output error"); + if (end_of_chunk) + return; + } +} + #define FAIL_ONLY_ONE_WAY() \ do \ { \ @@ -370,21 +543,47 @@ line_bytes_split (size_t n_bytes) } \ while (0) +/* Parse K/N syntax of chunk options. 
*/ + +static inline void +chunk_parse (uintmax_t *m_units, uintmax_t *n_units, char *slash) +{ + *slash = '\0'; + if (slash != optarg + && xstrtoumax (optarg, NULL, 10, m_units, "") != LONGINT_OK + || SIZE_MAX < *m_units) + { + error (0, 0, _("%s: invalid chunk number"), optarg); + usage (EXIT_FAILURE); + } + if (xstrtoumax (++slash, NULL, 10, n_units, "") != LONGINT_OK + || *n_units == 0 || *n_units < *m_units || SIZE_MAX < *n_units) + { + error (0, 0, _("%s: invalid number of total chunks"), slash); + usage (EXIT_FAILURE); + } +} + int main (int argc, char **argv) { struct stat stat_buf; enum { - type_undef, type_bytes, type_byteslines, type_lines, type_digits + type_undef, type_bytes, type_byteslines, type_lines, type_digits, + type_chunk_bytes, type_chunk_eol } split_type = type_undef; size_t in_blk_size; /* optimal block size of input file device */ char *buf; /* file i/o buffer */ size_t page_size = getpagesize (); + uintmax_t m_units = 0; uintmax_t n_units; static char const multipliers[] = "bEGKkMmPTYZ0"; int c; int digits_optind = 0; + size_t file_size; + char *slash; + bool eol_char = false; initialize_main (&argc, &argv); set_program_name (argv[0]); @@ -404,7 +603,7 @@ main (int argc, char **argv) /* This is the argv-index of the option we will read next. */ int this_optind = optind ? 
optind : 1; - c = getopt_long (argc, argv, "0123456789C:a:b:dl:", longopts, NULL); + c = getopt_long (argc, argv, "0123456789C:a:b:c:dl:n:t:", longopts, NULL); if (c == -1) break; @@ -426,6 +625,13 @@ main (int argc, char **argv) case 'b': if (split_type != type_undef) FAIL_ONLY_ONE_WAY (); + slash = strchr (optarg, '/'); + if (slash) + { + split_type = type_chunk_bytes; + chunk_parse (&m_units, &n_units, slash); + break; + } split_type = type_bytes; if (xstrtoumax (optarg, NULL, 10, &n_units, multipliers) != LONGINT_OK || n_units == 0) @@ -438,6 +644,13 @@ main (int argc, char **argv) case 'l': if (split_type != type_undef) FAIL_ONLY_ONE_WAY (); + slash = strchr (optarg, '/'); + if (slash) + { + split_type = type_chunk_eol; + chunk_parse (&m_units, &n_units, slash); + break; + } split_type = type_lines; if (xstrtoumax (optarg, NULL, 10, &n_units, "") != LONGINT_OK || n_units == 0) @@ -447,6 +660,24 @@ main (int argc, char **argv) } break; + case 'n': + if (split_type != type_undef) + FAIL_ONLY_ONE_WAY (); + split_type = type_chunk_bytes; + slash = strchr (optarg, '/'); + if (slash) + { + chunk_parse (&m_units, &n_units, slash); + break; + } + if (xstrtoumax (optarg, NULL, 10, &n_units, "") != LONGINT_OK + || n_units == 0 || SIZE_MAX < n_units) + { + error (0, 0, _("%s: invalid number of total chunks"), optarg); + usage (EXIT_FAILURE); + } + break; + case 'C': if (split_type != type_undef) FAIL_ONLY_ONE_WAY (); @@ -492,6 +723,11 @@ main (int argc, char **argv) suffix_alphabet = "0123456789"; break; + case 't': + eol = *optarg; + eol_char = true; + break; + case VERBOSE_OPTION: verbose = true; break; @@ -505,6 +741,17 @@ main (int argc, char **argv) } } + /* Default eol to \n if none specified. */ + if (!eol_char) + eol = '\n'; + else + { + if (split_type == type_chunk_bytes) + split_type = type_chunk_eol; + if (split_type == type_bytes) + split_type = type_byteslines; + } + /* Handle default case. 
*/ if (split_type == type_undef) { @@ -546,10 +793,14 @@ main (int argc, char **argv) output_desc = -1; /* Get the optimal block size of input device and make a buffer. */ - if (fstat (STDIN_FILENO, &stat_buf) != 0) error (EXIT_FAILURE, errno, "%s", infile); in_blk_size = io_blksize (stat_buf); + file_size = stat_buf.st_size; + + if (split_type == type_chunk_bytes || split_type == type_chunk_eol) + if (file_size < n_units) + error (EXIT_FAILURE, errno, "number of chunks exceed file size"); buf = ptr_align (xmalloc (in_blk_size + 1 + page_size - 1), page_size); @@ -561,13 +812,27 @@ main (int argc, char **argv) break; case type_bytes: - bytes_split (n_units, buf, in_blk_size); + bytes_split (n_units, buf, in_blk_size, 0); break; case type_byteslines: line_bytes_split (n_units); break; + case type_chunk_bytes: + if (m_units == 0) + bytes_split (file_size / n_units, buf, in_blk_size, n_units); + else + byte_chunk_extract (m_units, n_units, buf, in_blk_size, file_size); + break; + + case type_chunk_eol: + if (m_units == 0) + line_chunk_split (n_units, buf, in_blk_size, file_size); + else + line_chunk_extract (m_units, n_units, buf, in_blk_size, file_size); + break; + default: abort (); } -- 1.6.3.3