Hi,
9e2d870 enabled the COPY command to skip soft error, and I think we can
add another option which specifies the maximum tolerable number of soft
errors.
I remember this was discussed in [1], and feel it would be useful when
loading 'dirty' data but there is a limit to how dirty it can be.
Attached a patch for this.
What do you think?
[1]
https://www.postgresql.org/message-id/752672.1699474336%40sss.pgh.pa.us
--
Regards,
--
Atsushi Torikoshi
NTT DATA Group Corporation
From 7f111e98e21654c4ca338c93d7cbb4ec9acaabcb Mon Sep 17 00:00:00 2001
From: Atsushi Torikoshi <torikos...@oss.nttdata.com>
Date: Fri, 26 Jan 2024 18:32:40 +0900
Subject: [PATCH v1] Add new COPY option REJECT_LIMIT
REJECT_LIMIT specifies the maximum tolerable number of malformed rows.
If input data has more malformed errors than this value, entire COPY fails.
This option must be used with ON_ERROR to be set to other than stop.
---
doc/src/sgml/ref/copy.sgml | 13 +++++++++++++
src/backend/commands/copy.c | 16 ++++++++++++++++
src/backend/commands/copyfrom.c | 6 ++++++
src/include/commands/copy.h | 1 +
src/test/regress/expected/copy2.out | 10 ++++++++++
src/test/regress/sql/copy2.sql | 21 +++++++++++++++++++++
6 files changed, 67 insertions(+)
diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml
index 21a5c4a052..8982e8464a 100644
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@@ -393,6 +393,19 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><literal>REJECT_LIMIT</literal></term>
+ <listitem>
+ <para>
+ Specifies the maximum tolerable number of malformed rows.
+ If input data has caused more malformed errors than this value, entire
+ <command>COPY</command> fails.
+ This option must be used with <literal>ON_ERROR</literal> to be set to
+ other than <literal>stop</literal>.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><literal>ENCODING</literal></term>
<listitem>
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index cc0786c6f4..ca5263d588 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -615,6 +615,22 @@ ProcessCopyOptions(ParseState *pstate,
on_error_specified = true;
opts_out->on_error = defGetCopyOnErrorChoice(defel, pstate, is_from);
}
+ else if (strcmp(defel->defname, "reject_limit") == 0)
+ {
+ int64 reject_limit = defGetInt64(defel);
+
+ if (!opts_out->on_error)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("REJECT_LIMIT requires ON_ERROR to be set to other than stop")));
+ if (opts_out->reject_limit > 0)
+ errorConflictingDefElem(defel, pstate);
+ if (reject_limit <= 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("REJECT_LIMIT must be greater than zero")));
+ opts_out->reject_limit = reject_limit;
+ }
else
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index 1fe70b9133..15066887ea 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -1017,6 +1017,12 @@ CopyFrom(CopyFromState cstate)
pgstat_progress_update_param(PROGRESS_COPY_TUPLES_SKIPPED,
++skipped);
+ if (cstate->opts.reject_limit > 0 && skipped > cstate->opts.reject_limit)
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("exceeded the number specified by REJECT LIMIT \"%d\"",
+ cstate->opts.reject_limit)));
+
continue;
}
diff --git a/src/include/commands/copy.h b/src/include/commands/copy.h
index b3da3cb0be..8f8dab9524 100644
--- a/src/include/commands/copy.h
+++ b/src/include/commands/copy.h
@@ -73,6 +73,7 @@ typedef struct CopyFormatOptions
bool *force_null_flags; /* per-column CSV FN flags */
bool convert_selectively; /* do selective binary conversion? */
CopyOnErrorChoice on_error; /* what to do when error happened */
+ int reject_limit; /* tolerable number of malformed rows */
List *convert_select; /* list of column names (can be NIL) */
} CopyFormatOptions;
diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out
index 25c401ce34..28de7a2685 100644
--- a/src/test/regress/expected/copy2.out
+++ b/src/test/regress/expected/copy2.out
@@ -108,6 +108,10 @@ COPY x to stdin (format BINARY, on_error unsupported);
ERROR: COPY ON_ERROR cannot be used with COPY TO
LINE 1: COPY x to stdin (format BINARY, on_error unsupported);
^
+COPY x from stdin with (reject_limit 3);
+ERROR: REJECT_LIMIT requires ON_ERROR to be set to other than stop
+COPY x from stdin with (on_error ignore, reject_limit 0);
+ERROR: REJECT_LIMIT must be greater than zero
-- too many columns in column list: should fail
COPY x (a, b, c, d, e, d, c) from stdin;
ERROR: column "d" specified more than once
@@ -751,6 +755,12 @@ CONTEXT: COPY check_ign_err, line 1: "1 {1}"
COPY check_ign_err FROM STDIN WITH (on_error ignore);
ERROR: extra data after last expected column
CONTEXT: COPY check_ign_err, line 1: "1 {1} 3 abc"
+-- tests for reject_limit option
+COPY check_ign_err FROM STDIN WITH (on_error ignore, reject_limit 3);
+ERROR: exceeded the number specified by REJECT LIMIT "3"
+CONTEXT: COPY check_ign_err, line 5, column n: ""
+COPY check_ign_err FROM STDIN WITH (on_error ignore, reject_limit 4);
+NOTICE: 4 rows were skipped due to data type incompatibility
-- clean up
DROP TABLE forcetest;
DROP TABLE vistest;
diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql
index b5e549e856..e5299b1308 100644
--- a/src/test/regress/sql/copy2.sql
+++ b/src/test/regress/sql/copy2.sql
@@ -80,6 +80,8 @@ COPY x to stdin (format CSV, force_not_null(a));
COPY x to stdout (format TEXT, force_null(a));
COPY x to stdin (format CSV, force_null(a));
COPY x to stdin (format BINARY, on_error unsupported);
+COPY x from stdin with (reject_limit 3);
+COPY x from stdin with (on_error ignore, reject_limit 0);
-- too many columns in column list: should fail
COPY x (a, b, c, d, e, d, c) from stdin;
@@ -534,6 +536,25 @@ COPY check_ign_err FROM STDIN WITH (on_error ignore);
1 {1} 3 abc
\.
+-- tests for reject_limit option
+COPY check_ign_err FROM STDIN WITH (on_error ignore, reject_limit 3);
+6 {6} 6
+a {7} 7
+8 {8} 8888888888
+9 {a, 9} 9
+
+10 {10} 10
+\.
+
+COPY check_ign_err FROM STDIN WITH (on_error ignore, reject_limit 4);
+6 {6} 6
+a {7} 7
+8 {8} 8888888888
+9 {a, 9} 9
+
+10 {10} 10
+\.
+
-- clean up
DROP TABLE forcetest;
DROP TABLE vistest;
base-commit: f2743a7d70e7b2891277632121bb51e739743a47
--
2.39.2