Dear Kuroda-san, hackers,
On 2025/06/04 21:57, Hayato Kuroda (Fujitsu) wrote:
Dear Ikeda-san,
Thanks for starting the new thread! I have never known the issue before I heard
at
PGConf.dev.
Few comments:
1.
This parameter seems a type of benchmark option. So should we set
benchmarking_option_set as well?
2.
Not sure, but exit-on-abort seems a similar option. What if both are specified?
Is it allowed?
3.
Can you consider a test case for the new parameter?
Best regards,
Hayato Kuroda
FUJITSU LIMITED
Thank you for your valuable comment!
1. I should've also set benchmarking_option_set. I've modified it accordingly.
2. The exit-on-abort option and continue-on-error option are mutually exclusive.
Therefore, I've updated the patch to throw a FATAL error when two options are
set simultaneously. Corresponding explanation was also added.
(I'm wondering the name of parameter should be continue-on-abort so that users
understand the two option are mutually exclusive.)
3. I've added the test.
Additionally, I modified the patch so that st->state does not transition to
CSTATE_RETRY when a transaction fails and continue-on-error option is enabled.
In the previous patch, we retry the failed transaction up to max-try times,
which is unnecessary for our purpose: clients does not exit when its
transactions fail.
I've attached the updated patch.
v3-0001-Add-continue-on-error-option-to-pgbench.patch is identical to
v4-0001-Add-continue-on-error-option-to-pgbench.patch. The v4-0002 patch is the
diff from the previous patch.
Best regards,
Rintaro Ikeda
From 6a539c2d467b671728a564a0368eb474d398310c Mon Sep 17 00:00:00 2001
From: "Rintaro.Ikeda" <ikedarinta...@oss.nttdata.com>
Date: Mon, 12 May 2025 21:57:39 +0900
Subject: [PATCH 1/2] Add continue-on-error option to pgbench
When the option is set, client rolls back the failed transaction and starts a
new one when its transaction fails due to the reason other than the deadlock and
serialization failure.
---
doc/src/sgml/ref/pgbench.sgml | 12 +++++++++++
src/bin/pgbench/pgbench.c | 39 +++++++++++++++++++++++++++++++----
2 files changed, 47 insertions(+), 4 deletions(-)
diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml
index ab252d9fc74..dcb8c1c487c 100644
--- a/doc/src/sgml/ref/pgbench.sgml
+++ b/doc/src/sgml/ref/pgbench.sgml
@@ -914,6 +914,18 @@ pgbench <optional> <replaceable>options</replaceable>
</optional> <replaceable>d
</listitem>
</varlistentry>
+ <varlistentry id="pgbench-option-continue-on-error">
+ <term><option>--continue-on-error</option></term>
+ <listitem>
+ <para>
+ Client rolls back the failed transaction and starts a new one when its
+ transaction fails due to the reason other than the deadlock and
+ serialization failure. This allows all clients specified with -c option
+ to continuously apply load to the server, even if some transactions
fail.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</para>
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 497a936c141..5db222f2c1e 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -440,6 +440,10 @@ typedef struct StatsData
int64 deadlock_failures; /* number of transactions that
were not
*
successfully retried after a deadlock
*
error */
+ int64 other_sql_failures; /* number of failed
transactions for reasons
+ *
other than serialization/deadlock failure
+ * ,
which is enabled if --continue-on-error
+ * is
used */
SimpleStats latency;
SimpleStats lag;
} StatsData;
@@ -770,6 +774,7 @@ static int64 total_weight = 0;
static bool verbose_errors = false; /* print verbose messages of all errors */
static bool exit_on_abort = false; /* exit when any client is aborted */
+static bool continue_on_error = false; /* continue after errors */
/* Builtin test scripts */
typedef struct BuiltinScript
@@ -954,6 +959,8 @@ usage(void)
" --log-prefix=PREFIX prefix for transaction time log
file\n"
" (default: \"pgbench_log\")\n"
" --max-tries=NUM max number of tries to run
transaction (default: 1)\n"
+ " --continue-on-error\n"
+ " Continue and retry transactions
that failed due to errors other than serialization or deadlocks.\n"
" --progress-timestamp use Unix epoch timestamps for
progress\n"
" --random-seed=SEED set random seed (\"time\",
\"rand\", integer)\n"
" --sampling-rate=NUM fraction of transactions to log
(e.g., 0.01 for 1%%)\n"
@@ -1467,6 +1474,7 @@ initStats(StatsData *sd, pg_time_usec_t start)
sd->retried = 0;
sd->serialization_failures = 0;
sd->deadlock_failures = 0;
+ sd->other_sql_failures = 0;
initSimpleStats(&sd->latency);
initSimpleStats(&sd->lag);
}
@@ -1516,6 +1524,9 @@ accumStats(StatsData *stats, bool skipped, double lat,
double lag,
case ESTATUS_DEADLOCK_ERROR:
stats->deadlock_failures++;
break;
+ case ESTATUS_OTHER_SQL_ERROR:
+ stats->other_sql_failures++;
+ break;
default:
/* internal error which should never occur */
pg_fatal("unexpected error status: %d", estatus);
@@ -3247,7 +3258,8 @@ static bool
canRetryError(EStatus estatus)
{
return (estatus == ESTATUS_SERIALIZATION_ERROR ||
- estatus == ESTATUS_DEADLOCK_ERROR);
+ estatus == ESTATUS_DEADLOCK_ERROR ||
+ (continue_on_error && estatus ==
ESTATUS_OTHER_SQL_ERROR));
}
/*
@@ -4528,7 +4540,8 @@ static int64
getFailures(const StatsData *stats)
{
return (stats->serialization_failures +
- stats->deadlock_failures);
+ stats->deadlock_failures +
+ stats->other_sql_failures);
}
/*
@@ -4548,6 +4561,8 @@ getResultString(bool skipped, EStatus estatus)
return "serialization";
case ESTATUS_DEADLOCK_ERROR:
return "deadlock";
+ case ESTATUS_OTHER_SQL_ERROR:
+ return "error (except serialization/deadlock)";
default:
/* internal error which should never occur */
pg_fatal("unexpected error status: %d",
estatus);
@@ -4603,6 +4618,7 @@ doLog(TState *thread, CState *st,
int64 skipped = 0;
int64 serialization_failures = 0;
int64 deadlock_failures = 0;
+ int64 other_sql_failures = 0;
int64 retried = 0;
int64 retries = 0;
@@ -4643,10 +4659,12 @@ doLog(TState *thread, CState *st,
{
serialization_failures =
agg->serialization_failures;
deadlock_failures = agg->deadlock_failures;
+ other_sql_failures = agg->other_sql_failures;
}
- fprintf(logfile, " " INT64_FORMAT " " INT64_FORMAT,
+ fprintf(logfile, " " INT64_FORMAT " " INT64_FORMAT " "
INT64_FORMAT,
serialization_failures,
- deadlock_failures);
+ deadlock_failures,
+ other_sql_failures);
fputc('\n', logfile);
@@ -6285,6 +6303,7 @@ printProgressReport(TState *threads, int64 test_start,
pg_time_usec_t now,
cur.serialization_failures +=
threads[i].stats.serialization_failures;
cur.deadlock_failures += threads[i].stats.deadlock_failures;
+ cur.other_sql_failures += threads[i].stats.other_sql_failures;
}
/* we count only actually executed transactions */
@@ -6443,6 +6462,9 @@ printResults(StatsData *total,
printf("number of deadlock failures: " INT64_FORMAT "
(%.3f%%)\n",
total->deadlock_failures,
100.0 * total->deadlock_failures / total_cnt);
+ printf("number of other failures: " INT64_FORMAT " (%.3f%%)\n",
+ total->other_sql_failures,
+ 100.0 * total->other_sql_failures / total_cnt);
}
/* it can be non-zero only if max_tries is not equal to one */
@@ -6546,6 +6568,10 @@ printResults(StatsData *total,
sstats->deadlock_failures,
(100.0 *
sstats->deadlock_failures /
script_total_cnt));
+ printf(" - number of other
failures: " INT64_FORMAT " (%.3f%%)\n",
+
sstats->other_sql_failures,
+ (100.0 *
sstats->other_sql_failures /
+
script_total_cnt));
}
/*
@@ -6705,6 +6731,7 @@ main(int argc, char **argv)
{"verbose-errors", no_argument, NULL, 15},
{"exit-on-abort", no_argument, NULL, 16},
{"debug", no_argument, NULL, 17},
+ {"continue-on-error", no_argument, NULL, 18},
{NULL, 0, NULL, 0}
};
@@ -7058,6 +7085,9 @@ main(int argc, char **argv)
case 17: /* debug */
pg_logging_increase_verbosity();
break;
+ case 18: /* continue-on-error */
+ continue_on_error = true;
+ break;
default:
/* getopt_long already emitted a complaint */
pg_log_error_hint("Try \"%s --help\" for more
information.", progname);
@@ -7413,6 +7443,7 @@ main(int argc, char **argv)
stats.retried += thread->stats.retried;
stats.serialization_failures +=
thread->stats.serialization_failures;
stats.deadlock_failures += thread->stats.deadlock_failures;
+ stats.other_sql_failures += thread->stats.other_sql_failures;
latency_late += thread->latency_late;
conn_total_duration += thread->conn_duration;
--
2.39.5 (Apple Git-154)
From d9d363c7c298f44063a2aa33530622548ee45cbf Mon Sep 17 00:00:00 2001
From: Rintaro Ikeda <ikedarintaro@ikedarintarous-MacBook-Air.local>
Date: Sun, 8 Jun 2025 23:40:32 +0900
Subject: [PATCH 2/2] 1. Do not retry failed transaction due to
other_sql_failures. 2. modify documentation and comments. 3. add test.
---
doc/src/sgml/ref/pgbench.sgml | 3 +++
src/bin/pgbench/pgbench.c | 26 +++++++++++++++-----
src/bin/pgbench/t/001_pgbench_with_server.pl | 22 +++++++++++++++++
3 files changed, 45 insertions(+), 6 deletions(-)
diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml
index dcb8c1c487c..2086dd59cb3 100644
--- a/doc/src/sgml/ref/pgbench.sgml
+++ b/doc/src/sgml/ref/pgbench.sgml
@@ -923,6 +923,9 @@ pgbench <optional> <replaceable>options</replaceable>
</optional> <replaceable>d
serialization failure. This allows all clients specified with -c option
to continuously apply load to the server, even if some transactions
fail.
</para>
+ <para>
+ Note that this option can not be used together with
+ <option>--exit-on-abort</option>.
</listitem>
</varlistentry>
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 5db222f2c1e..2333110c29f 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -287,6 +287,9 @@ static int main_pid; /* main process
id used in log filename */
/*
* We cannot retry a transaction after the serialization/deadlock error if its
* number of tries reaches this maximum; if its value is zero, it is not used.
+ * We can ignore errors including serialization/deadlock errors and other
errors
+ * if --continue-on-error is set, but in this case the failed transaction is
not
+ * retried.
*/
static uint32 max_tries = 1;
@@ -402,7 +405,8 @@ typedef struct StatsData
* directly successful transactions (they were successfully completed
on
* the first try).
*
- * A failed transaction is defined as unsuccessfully retried
transactions.
+ * A failed transaction is defined as unsuccessfully retried
transactions
+ * unless continue-on-error option is specified.
* It can be one of two types:
*
* failed (the number of failed transactions) =
@@ -411,6 +415,11 @@ typedef struct StatsData
* 'deadlock_failures' (they got a deadlock error and were not
* successfully retried).
*
+ * When continue-on-error option is specified,
+ * failed (the number of failed transactions) =
+ * 'other_sql_failures' (they got a error when continue-on-error
option
+ * was specified).
+ *
* If the transaction was retried after a serialization or a deadlock
* error this does not guarantee that this retry was successful. Thus
*
@@ -960,7 +969,7 @@ usage(void)
" (default: \"pgbench_log\")\n"
" --max-tries=NUM max number of tries to run
transaction (default: 1)\n"
" --continue-on-error\n"
- " Continue and retry transactions
that failed due to errors other than serialization or deadlocks.\n"
+ " continue to process transactions
after a trasaction fails due to errors other than serialization or deadlocks.\n"
" --progress-timestamp use Unix epoch timestamps for
progress\n"
" --random-seed=SEED set random seed (\"time\",
\"rand\", integer)\n"
" --sampling-rate=NUM fraction of transactions to log
(e.g., 0.01 for 1%%)\n"
@@ -3258,8 +3267,7 @@ static bool
canRetryError(EStatus estatus)
{
return (estatus == ESTATUS_SERIALIZATION_ERROR ||
- estatus == ESTATUS_DEADLOCK_ERROR ||
- (continue_on_error && estatus ==
ESTATUS_OTHER_SQL_ERROR));
+ estatus == ESTATUS_DEADLOCK_ERROR);
}
/*
@@ -4019,7 +4027,7 @@ advanceConnectionState(TState *thread, CState *st,
StatsData *agg)
if (PQpipelineStatus(st->con) !=
PQ_PIPELINE_ON)
st->state = CSTATE_END_COMMAND;
}
- else if (canRetryError(st->estatus))
+ else if (canRetryError(st->estatus) |
continue_on_error)
st->state = CSTATE_ERROR;
else
st->state = CSTATE_ABORTED;
@@ -4111,6 +4119,7 @@ advanceConnectionState(TState *thread, CState *st,
StatsData *agg)
* can retry the error.
*/
st->state = timer_exceeded ?
CSTATE_FINISHED :
+ continue_on_error ?
CSTATE_FAILURE :
doRetry(st, &now) ?
CSTATE_RETRY : CSTATE_FAILURE;
}
else
@@ -6446,7 +6455,8 @@ printResults(StatsData *total,
/*
* Remaining stats are nonsensical if we failed to execute any xacts due
- * to others than serialization or deadlock errors
+ * to others than serialization or deadlock errors and
--continue-on-error
+ * is not set.
*/
if (total_cnt <= 0)
return;
@@ -7086,6 +7096,7 @@ main(int argc, char **argv)
pg_logging_increase_verbosity();
break;
case 18: /* continue-on-error */
+ benchmarking_option_set = true;
continue_on_error = true;
break;
default:
@@ -7242,6 +7253,9 @@ main(int argc, char **argv)
pg_fatal("an unlimited number of transaction tries can
only be used with --latency-limit or a duration (-T)");
}
+ if (exit_on_abort && continue_on_error)
+ pg_fatal("--exit-on-abort and --continue-on-error are mutually
exclusive options");
+
/*
* save main process id in the global variable because process id will
be
* changed after fork.
diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl
b/src/bin/pgbench/t/001_pgbench_with_server.pl
index 7dd78940300..afb49b554d0 100644
--- a/src/bin/pgbench/t/001_pgbench_with_server.pl
+++ b/src/bin/pgbench/t/001_pgbench_with_server.pl
@@ -1813,6 +1813,28 @@ update counter set i = i+1 returning i \gset
# Clean up
$node->safe_psql('postgres', 'DROP TABLE counter;');
+# Test --continue-on-error
+$node->safe_psql('postgres',
+ 'CREATE TABLE unique_table(i int unique); ' . 'INSERT INTO unique_table
VALUES (0);');
+
+$node->pgbench(
+ '-t 10 --continue-on-error --failures-detailed',
+ 0,
+ [
+ qr{processed: 0/10\b},
+ qr{other failures: 10\b}
+ ],
+ [],
+ 'test --continue-on-error',
+ {
+ '002_continue_on_error' => q{
+ insert into unique_table values 0;
+ }
+ });
+
+# Clean up
+$node->safe_psql('postgres', 'DROP TABLE unique_table;');
+
# done
$node->safe_psql('postgres', 'DROP TABLESPACE regress_pgbench_tap_1_ts');
$node->stop;
--
2.39.5 (Apple Git-154)