Force lookahead in COPY FROM parsing

Heikki Linnakangas Thu, 04 Mar 2021 01:13:58 -0800

I posted this earlier athttps://www.postgresql.org/message-id/9ec25819-0a8a-d51a-17dc-4150bb3cc...@iki.fi,and that led to removing FE/BE protocol version 2 support. That's beencommitted now, so here's COPY FROM patch again, rebased. To recap:

Previously COPY FROM could not look ahead in the COPY stream, because inthe v2 protocol, it had to detect the end-of-copy marker correctly. Withv2 protocol gone, that's no longer an issue, and we can simplify theparsing slightly. Simpler should also mean faster, but I haven't triedthat measuring that.


- Heikki

>From 82680ff836fd5b43d1f83ed01e490d831ffa2b09 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Thu, 4 Mar 2021 11:10:39 +0200
Subject: [PATCH v2 1/1] Simplify COPY FROM parsing by forcing lookahead.

Now that we don't support the old-style COPY protocol anymore, we can
force four bytes of lookahead like was suggested in an existing comment,
and simplify the loop in CopyReadLineText().

Discussion: https://www.postgresql.org/message-id/9ec25819-0a8a-d51a-17dc-4150bb3cca3b%40iki.fi
---
 src/backend/commands/copyfromparse.c     | 119 ++++++++---------------
 src/include/commands/copyfrom_internal.h |   3 +-
 2 files changed, 40 insertions(+), 82 deletions(-)

diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index ce24a1528bd..c2efe7e0782 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -46,21 +46,6 @@
  * empty statements.  See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
  */
 
-/*
- * This keeps the character read at the top of the loop in the buffer
- * even if there is more than one read-ahead.
- */
-#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
-if (1) \
-{ \
-	if (raw_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
-	{ \
-		raw_buf_ptr = prev_raw_ptr; /* undo fetch */ \
-		need_data = true; \
-		continue; \
-	} \
-} else ((void) 0)
-
 /* This consumes the remainder of the buffer and breaks */
 #define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
 if (1) \
@@ -118,7 +103,7 @@ static int	CopyGetData(CopyFromState cstate, void *databuf,
 						int minread, int maxread);
 static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
 static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
-static bool CopyLoadRawBuf(CopyFromState cstate);
+static bool CopyLoadRawBuf(CopyFromState cstate, int minread);
 static int	CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
 
 void
@@ -209,7 +194,7 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not read from COPY file: %m")));
-			if (bytesread == 0)
+			if (bytesread < maxread)
 				cstate->reached_eof = true;
 			break;
 		case COPY_FRONTEND:
@@ -278,6 +263,8 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
 			break;
 		case COPY_CALLBACK:
 			bytesread = cstate->data_source_cb(databuf, minread, maxread);
+			if (bytesread < minread)
+				cstate->reached_eof = true;
 			break;
 	}
 
@@ -329,14 +316,13 @@ CopyGetInt16(CopyFromState cstate, int16 *val)
 /*
  * CopyLoadRawBuf loads some more data into raw_buf
  *
- * Returns true if able to obtain at least one more byte, else false.
+ * Returns true if able to obtain at least 'minread' bytes, else false.
  *
  * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
- * of the buffer and then we load more data after that.  This case occurs only
- * when a multibyte character crosses a bufferload boundary.
+ * of the buffer and then we load more data after that.
  */
 static bool
-CopyLoadRawBuf(CopyFromState cstate)
+CopyLoadRawBuf(CopyFromState cstate, int minread)
 {
 	int			nbytes = RAW_BUF_BYTES(cstate);
 	int			inbytes;
@@ -347,14 +333,15 @@ CopyLoadRawBuf(CopyFromState cstate)
 				nbytes);
 
 	inbytes = CopyGetData(cstate, cstate->raw_buf + nbytes,
-						  1, RAW_BUF_SIZE - nbytes);
+						  minread, RAW_BUF_SIZE - nbytes);
 	nbytes += inbytes;
 	cstate->raw_buf[nbytes] = '\0';
 	cstate->raw_buf_index = 0;
 	cstate->raw_buf_len = nbytes;
 	cstate->bytes_processed += inbytes;
 	pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
-	return (inbytes > 0);
+
+	return (inbytes >= minread);
 }
 
 /*
@@ -389,7 +376,7 @@ CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
 			/* Load more data if buffer is empty. */
 			if (RAW_BUF_BYTES(cstate) == 0)
 			{
-				if (!CopyLoadRawBuf(cstate))
+				if (!CopyLoadRawBuf(cstate, 1))
 					break;		/* EOF */
 			}
 
@@ -678,7 +665,7 @@ CopyReadLine(CopyFromState cstate)
 			do
 			{
 				cstate->raw_buf_index = cstate->raw_buf_len;
-			} while (CopyLoadRawBuf(cstate));
+			} while (CopyLoadRawBuf(cstate, 1));
 		}
 	}
 	else
@@ -747,7 +734,6 @@ CopyReadLineText(CopyFromState cstate)
 	char	   *copy_raw_buf;
 	int			raw_buf_ptr;
 	int			copy_buf_len;
-	bool		need_data = false;
 	bool		hit_eof = false;
 	bool		result = false;
 	char		mblen_str[2];
@@ -794,6 +780,7 @@ CopyReadLineText(CopyFromState cstate)
 	copy_raw_buf = cstate->raw_buf;
 	raw_buf_ptr = cstate->raw_buf_index;
 	copy_buf_len = cstate->raw_buf_len;
+	hit_eof = cstate->reached_eof;
 
 	for (;;)
 	{
@@ -801,38 +788,40 @@ CopyReadLineText(CopyFromState cstate)
 		char		c;
 
 		/*
-		 * Load more data if needed.  Ideally we would just force four bytes
-		 * of read-ahead and avoid the many calls to
-		 * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(), but the COPY_OLD_FE protocol
-		 * does not allow us to read too far ahead or we might read into the
-		 * next data, so we read-ahead only as far we know we can.  One
-		 * optimization would be to read-ahead four byte here if
-		 * cstate->copy_src != COPY_OLD_FE, but it hardly seems worth it,
-		 * considering the size of the buffer.
+		 * Load more data if needed.
+		 *
+		 * We need to look ahead max three bytes in one iteration of the loop
+		 * (for the sequence \.<CR><NL>), so make sure we have at least four
+		 * bytes in the buffer.  Note that we always guarantee that there is
+		 * one \0 in the buffer, after last valid byte; the lookaheads below
+		 * rely on that.
 		 */
-		if (raw_buf_ptr >= copy_buf_len || need_data)
+#define COPY_READ_LINE_LOOKAHEAD	4
+		if (raw_buf_ptr + COPY_READ_LINE_LOOKAHEAD >= copy_buf_len)
 		{
-			REFILL_LINEBUF;
+			if (!hit_eof)
+			{
+				REFILL_LINEBUF;
 
-			/*
-			 * Try to read some more data.  This will certainly reset
-			 * raw_buf_index to zero, and raw_buf_ptr must go with it.
-			 */
-			if (!CopyLoadRawBuf(cstate))
-				hit_eof = true;
-			raw_buf_ptr = 0;
-			copy_buf_len = cstate->raw_buf_len;
+				/*
+				 * Try to read some more data.  This will certainly reset
+				 * raw_buf_index to zero, and raw_buf_ptr must go with it.
+				 */
+				if (!CopyLoadRawBuf(cstate, COPY_READ_LINE_LOOKAHEAD))
+					hit_eof = true;
+				raw_buf_ptr = 0;
+				copy_buf_len = cstate->raw_buf_len;
+			}
 
 			/*
 			 * If we are completely out of data, break out of the loop,
 			 * reporting EOF.
 			 */
-			if (copy_buf_len <= 0)
+			if (copy_buf_len - raw_buf_ptr <= 0)
 			{
 				result = true;
 				break;
 			}
-			need_data = false;
 		}
 
 		/* OK to fetch a character */
@@ -841,20 +830,6 @@ CopyReadLineText(CopyFromState cstate)
 
 		if (cstate->opts.csv_mode)
 		{
-			/*
-			 * If character is '\\' or '\r', we may need to look ahead below.
-			 * Force fetch of the next character if we don't already have it.
-			 * We need to do this before changing CSV state, in case one of
-			 * these characters is also the quote or escape character.
-			 *
-			 * Note: old-protocol does not like forced prefetch, but it's OK
-			 * here since we cannot validly be at EOF.
-			 */
-			if (c == '\\' || c == '\r')
-			{
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-			}
-
 			/*
 			 * Dealing with quotes and escapes here is mildly tricky. If the
 			 * quote char is also the escape char, there's no problem - we
@@ -888,14 +863,9 @@ CopyReadLineText(CopyFromState cstate)
 				cstate->eol_type == EOL_CRNL)
 			{
 				/*
-				 * If need more data, go back to loop top to load it.
-				 *
-				 * Note that if we are at EOF, c will wind up as '\0' because
-				 * of the guaranteed pad of raw_buf.
+				 * Look at the next character.  If we're at EOF, c2 will wind
+				 * up as '\0' because of the guaranteed pad of raw_buf.
 				 */
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-
-				/* get next char */
 				c = copy_raw_buf[raw_buf_ptr];
 
 				if (c == '\n')
@@ -961,7 +931,6 @@ CopyReadLineText(CopyFromState cstate)
 		{
 			char		c2;
 
-			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 			IF_NEED_REFILL_AND_EOF_BREAK(0);
 
 			/* -----
@@ -976,16 +945,9 @@ CopyReadLineText(CopyFromState cstate)
 			{
 				raw_buf_ptr++;	/* consume the '.' */
 
-				/*
-				 * Note: if we loop back for more data here, it does not
-				 * matter that the CSV state change checks are re-executed; we
-				 * will come back here with no important state changed.
-				 */
 				if (cstate->eol_type == EOL_CRNL)
 				{
-					/* Get the next character */
-					IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-					/* if hit_eof, c2 will become '\0' */
+					/* Get next character.  If hit_eof, c2 will become '\0' */
 					c2 = copy_raw_buf[raw_buf_ptr++];
 
 					if (c2 == '\n')
@@ -1008,9 +970,7 @@ CopyReadLineText(CopyFromState cstate)
 					}
 				}
 
-				/* Get the next character */
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-				/* if hit_eof, c2 will become '\0' */
+				/* Get next character.  If hit_eof, c2 will become '\0' */
 				c2 = copy_raw_buf[raw_buf_ptr++];
 
 				if (c2 != '\r' && c2 != '\n')
@@ -1095,7 +1055,6 @@ not_end_of_copy:
 			mblen_str[0] = c;
 			mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str);
 
-			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1);
 			IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1);
 			raw_buf_ptr += mblen - 1;
 		}
diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h
index 705f5b615be..c088c4facdb 100644
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -70,8 +70,7 @@ typedef struct CopyFromStateData
 	CopySource	copy_src;		/* type of copy source */
 	FILE	   *copy_file;		/* used if copy_src == COPY_FILE */
 	StringInfo	fe_msgbuf;		/* used if copy_src == COPY_NEW_FE */
-	bool		reached_eof;	/* true if we read to end of copy data (not
-								 * all copy_src types maintain this) */
+	bool		reached_eof;	/* true if we read to end of copy data */
 
 	EolType		eol_type;		/* EOL type of input */
 	int			file_encoding;	/* file or remote side's character encoding */
-- 
2.30.1

Force lookahead in COPY FROM parsing

Reply via email to