On 07/08/2021 14:07, Michael Debertol wrote:
Hi,

after https://lists.gnu.org/archive/html/coreutils/2021-02/msg00003.html
(unreleased), the behavior of cat -E was changed so that it prints "^M$"
for "\r\n" line endings.

Whenever it sees a \r "cat -E" checks if the byte after is a \n, however
that \n might be the sentinel value that is inserted at the end of a buffer.

This is a problem in two cases:

- When a \r is at the end of the input. `printf "\r" | cat -E` will
print "^M", even though there is no "\n" after the "\r". FWIW,
tests/misc/cat-E.sh expects a "^M" for a trailing "\r", but I think
that's wrong.

This was intentional (as per the test) as I was thinking
we can provide more info here in the edge case that \r is the last char of a 
file.
However it's incorrect as you suggest, as cat can't treat files independently.

- When the file is too big to fit into one buffer. If you try to "cat
-E" a big file (mutliple megabytes) that consists of only "\r", cat will
print a few "^M" whenever it hits the end of a buffer in the middle of
the file and at the end.

That indeed is a bug.

So we need to track handling of \r across buffer and file boundaries.
The attached does that, and I'll apply later.

marking this as done,

thanks!
Pádraig
>From 2952343144654eaac158ef65292d885a61759e75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <p...@draigbrady.com>
Date: Sat, 7 Aug 2021 18:47:57 +0100
Subject: [PATCH] cat: with -E fix handling of \r\n spanning buffers

We must delay handling when \r is the last character
of the buffer being processed, as the next character
may or may not be \n.

* src/cat.c (pending_cr): A new global to record whether
the last character processed (in -E mode) is '\r'.
(cat): Honor pending_cr when processing the start of the buffer.
(main): Honor pending_cr if no more files to process.
* tests/misc/cat-E.sh: Add test cases.
Fixes https://bugs.gnu.org/49925
---
 src/cat.c           | 39 +++++++++++++++++++++++++++++++++------
 tests/misc/cat-E.sh | 16 ++++++++++++++--
 2 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/src/cat.c b/src/cat.c
index 28f6e8e4e..17bc4fab9 100644
--- a/src/cat.c
+++ b/src/cat.c
@@ -78,6 +78,9 @@ static char *line_num_end = line_buf + LINE_COUNTER_BUF_LEN - 3;
 /* Preserves the 'cat' function's local 'newlines' between invocations.  */
 static int newlines2 = 0;
 
+/* Whether there is a pending CR to process.  */
+static bool pending_cr = false;
+
 void
 usage (int status)
 {
@@ -397,9 +400,16 @@ cat (
                 }
 
               /* Output a currency symbol if requested (-e).  */
-
               if (show_ends)
-                *bpout++ = '$';
+                {
+                  if (pending_cr)
+                    {
+                      *bpout++ = '^';
+                      *bpout++ = 'M';
+                      pending_cr = false;
+                    }
+                  *bpout++ = '$';
+                }
 
               /* Output the newline.  */
 
@@ -409,6 +419,14 @@ cat (
         }
       while (ch == '\n');
 
+      /* Here CH cannot contain a newline character.  */
+
+      if (pending_cr)
+        {
+          *bpout++ = '\r';
+          pending_cr = false;
+        }
+
       /* Are we at the beginning of a line, and line numbers are requested?  */
 
       if (newlines >= 0 && number)
@@ -417,8 +435,6 @@ cat (
           bpout = stpcpy (bpout, line_num_print);
         }
 
-      /* Here CH cannot contain a newline character.  */
-
       /* The loops below continue until a newline character is found,
          which means that the buffer is empty or that a proper newline
          has been found.  */
@@ -489,8 +505,13 @@ cat (
                 {
                   if (ch == '\r' && *bpin == '\n' && show_ends)
                     {
-                      *bpout++ = '^';
-                      *bpout++ = 'M';
+                      if (bpin == eob)
+                        pending_cr = true;
+                      else
+                        {
+                          *bpout++ = '^';
+                          *bpout++ = 'M';
+                        }
                     }
                   else
                     *bpout++ = ch;
@@ -768,6 +789,12 @@ main (int argc, char **argv)
     }
   while (++argind < argc);
 
+  if (pending_cr)
+    {
+      if (full_write (STDOUT_FILENO, "\r", 1) != 1)
+        die (EXIT_FAILURE, errno, _("write error"));
+    }
+
   if (have_read_stdin && close (STDIN_FILENO) < 0)
     die (EXIT_FAILURE, errno, _("closing standard input"));
 
diff --git a/tests/misc/cat-E.sh b/tests/misc/cat-E.sh
index 401b6d591..1131eb3a5 100755
--- a/tests/misc/cat-E.sh
+++ b/tests/misc/cat-E.sh
@@ -21,10 +21,22 @@ print_ver_ cat
 # Up to and including 8.32 the $ would have displayed at the start of the line
 # overwriting the first character
 printf 'a\rb\r\nc\n\r\nd\r' > 'in' || framework_failure_
-printf 'a\rb^M$\nc$\n^M$\nd^M' > 'exp' || framework_failure_
-
+printf 'a\rb^M$\nc$\n^M$\nd\r' > 'exp' || framework_failure_
 cat -E 'in' > out || fail=1
+compare exp out || fail=1
+
+# Ensure \r\n spanning files (or buffers) is handled
+printf '1\r' > in2 || framework_failure_
+printf '\n2\r\n' > in2b || framework_failure_
+printf '1^M$\n2^M$\n' > 'exp' || framework_failure_
+cat -E 'in2' 'in2b' > out || fail=1
+compare exp out || fail=1
 
+# Ensure \r at end of buffer is handled
+printf '1\r' > in2 || framework_failure_
+printf '2\r\n' > in2b || framework_failure_
+printf '1\r2^M$\n' > 'exp' || framework_failure_
+cat -E 'in2' 'in2b' > out || fail=1
 compare exp out || fail=1
 
 Exit $fail
-- 
2.26.2

Reply via email to