Module Name: src
Committed By: gutteridge
Date: Tue Mar 4 03:54:19 UTC 2025

Modified Files:
    src/usr.bin/cut: cut.1 cut.c

Log Message:
cut(1): implement the -n option (for use with -b)

This command had long advertised the existence of -n (in its usage
message and man page) but had never implemented it. Here we borrow the
implementation written by Tim J. Robbins for FreeBSD, which provides
most code changes and almost all documentation changes applied here. We
also borrow some options handling simplifications from OpenBSD, with
some minor tweaks to code and documentation by me.

Addresses PR bin/59029. This is a pretty obscure feature, it seems, so
it's unlikely it will be pulled up to stable branches.

To generate a diff of this commit:
cvs rdiff -u -r1.18 -r1.19 src/usr.bin/cut/cut.1
cvs rdiff -u -r1.30 -r1.31 src/usr.bin/cut/cut.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files: Index: src/usr.bin/cut/cut.1 diff -u src/usr.bin/cut/cut.1:1.18 src/usr.bin/cut/cut.1:1.19 --- src/usr.bin/cut/cut.1:1.18 Wed Jun 20 17:53:19 2012 +++ src/usr.bin/cut/cut.1 Tue Mar 4 03:54:19 2025 @@ -1,4 +1,4 @@ -.\" $NetBSD: cut.1,v 1.18 2012/06/20 17:53:19 wiz Exp $ +.\" $NetBSD: cut.1,v 1.19 2025/03/04 03:54:19 gutteridge Exp $ .\" .\" Copyright (c) 1989, 1990, 1993 .\" The Regents of the University of California. All rights reserved. @@ -32,7 +32,7 @@ .\" .\" @(#)cut.1 8.1 (Berkeley) 6/6/93 .\" -.Dd June 12, 2012 +.Dd March 4, 2025 .Dt CUT 1 .Os .Sh NAME @@ -110,11 +110,29 @@ specifies fields, separated by the field The selected fields are output, separated by the field delimiter character. .It Fl n -Do not split multi-byte characters. +Do not split multi-byte characters when the +.Fl b +option is used. +Characters will only be output if at least one byte is selected, and, +after a prefix of zero or more unselected bytes, the rest of the bytes +that form the character are selected. .It Fl s Suppress lines with no field delimiter characters. Unless specified, lines with no delimiters are passed through unmodified. .El +.Sh ENVIRONMENT +The +.Ev LANG , +.Ev LC_ALL , +and +.Ev LC_CTYPE +environment variables affect the execution of +.Nm +if the +.Fl n +option is specified. +Their effect is described in +.Xr environ 7 . 
.Sh EXIT STATUS .Ex -std .Sh SEE ALSO Index: src/usr.bin/cut/cut.c diff -u src/usr.bin/cut/cut.c:1.30 src/usr.bin/cut/cut.c:1.31 --- src/usr.bin/cut/cut.c:1.30 Wed Feb 19 17:34:14 2025 +++ src/usr.bin/cut/cut.c Tue Mar 4 03:54:19 2025 @@ -1,4 +1,4 @@ -/* $NetBSD: cut.c,v 1.30 2025/02/19 17:34:14 gutteridge Exp $ */ +/* $NetBSD: cut.c,v 1.31 2025/03/04 03:54:19 gutteridge Exp $ */ /* * Copyright (c) 1989, 1993 @@ -42,7 +42,7 @@ __COPYRIGHT("@(#) Copyright (c) 1989, 19 #if 0 static char sccsid[] = "@(#)cut.c 8.3 (Berkeley) 5/4/95"; #endif -__RCSID("$NetBSD: cut.c,v 1.30 2025/02/19 17:34:14 gutteridge Exp $"); +__RCSID("$NetBSD: cut.c,v 1.31 2025/03/04 03:54:19 gutteridge Exp $"); #endif /* not lint */ #include <ctype.h> @@ -63,9 +63,11 @@ static int cflag; static char dchar; static int dflag; static int fflag; +static int nflag; static int sflag; static void b_cut(FILE *, const char *); +static void b_n_cut(FILE *, const char *); static void c_cut(FILE *, const char *); static void f_cut(FILE *, const char *); static void get_list(char *); @@ -83,8 +85,6 @@ main(int argc, char *argv[]) dchar = '\t'; /* default delimiter is \t */ - /* Since we don't support multi-byte characters, the -c and -b - options are equivalent, and the -n option is meaningless. 
*/ while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1) switch(ch) { case 'b': @@ -110,6 +110,7 @@ main(int argc, char *argv[]) sflag = 1; break; case 'n': + nflag = 1; break; case '?': default: @@ -118,14 +119,14 @@ main(int argc, char *argv[]) argc -= optind; argv += optind; - if (fflag) { - if (cflag || bflag) - usage(); - } else if ((!cflag && !bflag) || dflag || sflag) - usage(); - else if (bflag && cflag) + if (bflag + cflag + fflag != 1 || + (nflag && !bflag) || + ((dflag || sflag) && !fflag)) usage(); + if (nflag) + fcn = b_n_cut; + rval = 0; if (*argv) for (; *argv; ++argv) { @@ -219,6 +220,72 @@ get_list(char *list) (void)memset(positions + 1, '1', autostart); } +/* + * Cut based on byte positions, taking care not to split multibyte characters. + * Although this function also handles the case where -n is not specified, + * b_cut() ought to be much faster. + */ +static void +b_n_cut(FILE *fp, const char *fname) +{ + size_t col, i, lbuflen; + char *lbuf; + int canwrite, clen, warned; + mbstate_t mbs; + + memset(&mbs, 0, sizeof(mbs)); + warned = 0; + while ((lbuf = fgetln(fp, &lbuflen)) != NULL) { + for (col = 0; lbuflen > 0; col += clen) { + if ((clen = mbrlen(lbuf, lbuflen, &mbs)) < 0) { + if (!warned) { + warn("%s", fname); + warned = 1; + } + memset(&mbs, 0, sizeof(mbs)); + clen = 1; + } + if (clen == 0 || *lbuf == '\n') + break; + if (col < maxval && !positions[1 + col]) { + /* + * Print the character if (1) after an initial + * segment of un-selected bytes, the rest of + * it is selected, and (2) the last byte is + * selected. + */ + i = col; + while (i < col + clen && i < maxval && + !positions[1 + i]) + i++; + canwrite = i < col + clen; + for (; i < col + clen && i < maxval; i++) + canwrite &= positions[1 + i]; + if (canwrite) + fwrite(lbuf, 1, clen, stdout); + } else { + /* + * Print the character if all of it has + * been selected. 
+ */ + canwrite = 1; + for (i = col; i < col + clen; i++) + if ((i >= maxval && !autostop) || + (i < maxval && !positions[1 + i])) { + canwrite = 0; + break; + } + if (canwrite) + fwrite(lbuf, 1, clen, stdout); + } + lbuf += clen; + lbuflen -= clen; + } + if (lbuflen > 0) + putchar('\n'); + } +} + static void /*ARGSUSED*/ f_cut(FILE *fp, const char *fname __unused)