split

Jan Schaumann Mon, 30 Jan 2023 07:22:11 -0800

Module Name:    src
Committed By:   jschauma
Date:           Mon Jan 30 15:22:03 UTC 2023


Modified Files:
        src/usr.bin/split: split.1 split.c

Log Message:
auto-extend suffix length if required

If the input cannot be split into the number of files resulting from the
default suffix length, automatically extend the suffix length rather than
bailing out with 'too many files'.

Suffixes are extended such that the resulting files continue to sort
lexically and "cat *" would reproduce the input.  For example, splitting
a 1M lines file into (default) 1000 lines per file would yield files
named 'xaa', 'xab', ..., 'xyy', 'xyz', 'xzaaa', 'xzaab', ..., 'xzanl'.

If '-a' is specified, the suffix length is not auto-extended.

This behavior matches GNU split(1) since around version 8.16.


To generate a diff of this commit:
cvs rdiff -u -r1.15 -r1.16 src/usr.bin/split/split.1
cvs rdiff -u -r1.28 -r1.29 src/usr.bin/split/split.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/usr.bin/split/split.1
diff -u src/usr.bin/split/split.1:1.15 src/usr.bin/split/split.1:1.16
--- src/usr.bin/split/split.1:1.15	Thu May 31 01:35:35 2007
+++ src/usr.bin/split/split.1	Mon Jan 30 15:22:02 2023
@@ -1,4 +1,4 @@
-.\"	$NetBSD: split.1,v 1.15 2007/05/31 01:35:35 jschauma Exp $
+.\"	$NetBSD: split.1,v 1.16 2023/01/30 15:22:02 jschauma Exp $
 .\"
 .\" Copyright (c) 1990, 1991, 1993, 1994
 .\"	The Regents of the University of California.  All rights reserved.
@@ -29,7 +29,7 @@
 .\"
 .\"	@(#)split.1	8.3 (Berkeley) 4/16/94
 .\"
-.Dd May 28, 2007
+.Dd January 28, 2023
 .Dt SPLIT 1
 .Os
 .Sh NAME
@@ -99,7 +99,12 @@ characters in the range
 .Dq Li a-z .
 If
 .Fl a
-is not specified, two letters are used as the suffix.
+is not specified, two letters are used as the initial
+suffix.
+If the output does not fit into the resulting number
+of files, then the suffix length is automatically
+extended as needed such that all output files continue
+to sort in lexical order.
 .Pp
 If the
 .Ar name

Index: src/usr.bin/split/split.c
diff -u src/usr.bin/split/split.c:1.28 src/usr.bin/split/split.c:1.29
--- src/usr.bin/split/split.c:1.28	Fri Jan 27 19:39:04 2023
+++ src/usr.bin/split/split.c	Mon Jan 30 15:22:02 2023
@@ -1,4 +1,4 @@
-/*	$NetBSD: split.c,v 1.28 2023/01/27 19:39:04 jschauma Exp $	*/
+/*	$NetBSD: split.c,v 1.29 2023/01/30 15:22:02 jschauma Exp $	*/
 
 /*
  * Copyright (c) 1987, 1993, 1994
@@ -39,7 +39,7 @@ __COPYRIGHT("@(#) Copyright (c) 1987, 19
 #if 0
 static char sccsid[] = "@(#)split.c	8.3 (Berkeley) 4/25/94";
 #endif
-__RCSID("$NetBSD: split.c,v 1.28 2023/01/27 19:39:04 jschauma Exp $");
+__RCSID("$NetBSD: split.c,v 1.29 2023/01/30 15:22:02 jschauma Exp $");
 #endif /* not lint */
 
 #include <sys/param.h>
@@ -60,6 +60,7 @@ static int file_open;		/* If a file is o
 static int ifd = STDIN_FILENO, ofd = -1; /* Input/output file descriptors. */
 static char *fname;		/* File name prefix. */
 static size_t sfxlen = 2;	/* Suffix length. */
+static int autosfx = 1;		/* Whether to auto-extend the suffix length. */
 
 static void newfile(void);
 static void split1(off_t, int) __dead;
@@ -120,6 +121,7 @@ main(int argc, char *argv[])
 			    (sfxlen = (size_t)strtoul(optarg, &ep, 10)) == 0 ||
 			    *ep != '\0')
 				errx(1, "%s: illegal suffix length.", optarg);
+			autosfx = 0;
 			break;
 		case 'n':		/* Chunks. */
 			if (!isdigit((unsigned char)optarg[0]) ||
@@ -323,6 +325,38 @@ newfile(void)
 		err(1, "%s", fname);
 
 	quot = fnum;
+
+	/* If '-a' is not specified, then we automatically expand the
+	 * suffix length to accommodate splitting all input.  We do this
+	 * by moving the suffix pointer (fpnt) forward and incrementing
+	 * sfxlen by one, thereby yielding an additional two characters
+	 * and allowing all output files to sort such that 'cat *' yields
+	 * the input in order.  I.e., the order is '... xyy xyz xzaaa
+	 * xzaab ... xzyzy, xzyzz, xzzaaaa, xzzaaab' and so on. */
+	if (autosfx && (fpnt[0] == 'y') && (strspn(fpnt+1, "z") == strlen(fpnt+1))) {
+		if ((fname = realloc(fname, strlen(fname) + sfxlen + 2 + 1)) == NULL)
+			err(EXIT_FAILURE, NULL);
+			/* NOTREACHED */
+
+		fpnt = fname + strlen(fname) - sfxlen;
+		fpnt[sfxlen + 2] = '\0';
+
+		fpnt[0] = 'z';
+		fpnt[1] = 'a';
+
+		/*  Basename | Suffix
+		 *  before:
+		 *  x        | yz
+		 *  after:
+		 *  xz       | a.. */
+		fpnt++;
+		sfxlen++;
+
+		/* Reset so we start back at all 'a's in our extended suffix. */
+		quot = 0;
+		fnum = 0;
+	}
+
 	for (i = sfxlen - 1; i >= 0; i--) {
 		fpnt[i] = quot % 26 + 'a';
 		quot = quot / 26;

CVS commit: src/usr.bin/split

Reply via email to